In [None]:
import csv
import json
import time
import sys
import re
import pandas as pd
import numpy as np
from urllib import request, parse, robotparser, error
from bs4 import BeautifulSoup
import os
import networkx as nx
from itertools import combinations
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import re


## Scrape Data

In [None]:
# ---------- Configuration ----------
# Site root: relative hrefs found on pages are resolved against it and
# collection URLs are built from it.
BASE = "https://www.bbc.co.uk"
# Sent as the User-Agent header on every request issued by fetch().
UA = "Mozilla/5.0 (compatible; RecipeScraper/1.0)"

# ---------- HTTP ----------
def fetch(url, timeout=20, sleep=0.8):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to request.
        timeout: Socket timeout in seconds passed to ``urlopen``.
        sleep: Politeness delay in seconds applied after a successful download.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        for any reason (the failure is printed, not raised).
    """
    try:
        # Build the request inside the try block so a malformed URL is
        # reported like any other failure instead of aborting the crawl.
        req = request.Request(url, headers={"User-Agent": UA})
        with request.urlopen(req, timeout=timeout) as resp:
            body = resp.read()
    except Exception as e:
        # Deliberate best-effort: any failure is logged and signalled as None.
        print(f"Failed to fetch {url}: {e}")
        return None
    # Throttle only after the body is read and the connection is closed, so
    # the socket is not held open idle during the delay.
    time.sleep(sleep)
    return body

# ---------- Parsing ----------
def get_links(html, pattern):
    """Return the sorted, de-duplicated absolute links whose href begins with ``pattern``.

    Hrefs containing a query string (``?``) or a fragment (``#``) are
    skipped; the remaining ones are resolved against ``BASE``.
    """
    anchors = BeautifulSoup(html, "lxml").select(f'a[href^="{pattern}"]')
    hrefs = (anchor.get("href", "") for anchor in anchors)
    unique_urls = {
        parse.urljoin(BASE, href)
        for href in hrefs
        if href.startswith(pattern) and not ("?" in href or "#" in href)
    }
    return sorted(unique_urls)

def get_next_page(html):
    """Return the absolute URL of the ``rel="next"`` anchor, or ``None`` if there is none."""
    anchor = BeautifulSoup(html, "lxml").find("a", attrs={"rel": "next"})
    target = anchor.get("href") if anchor else None
    # An anchor without a usable href counts as "no further page".
    return parse.urljoin(BASE, target) if target else None

def iso_to_minutes(iso_str):
    """Convert an ISO 8601 duration such as ``PT1H30M`` to whole minutes.

    Args:
        iso_str: Duration string using day/hour/minute/second designators
            (case-insensitive, surrounding whitespace ignored).

    Returns:
        The duration in minutes, or ``None`` when ``iso_str`` is not a string
        or does not match the supported pattern. Seconds are not converted:
        they only add one minute when the duration has neither hours nor
        minutes, and are otherwise dropped.
    """
    # Callers may hand over a missing field (None) or a non-string JSON
    # value; treat those as "no duration" instead of raising on .strip().
    if not isinstance(iso_str, str):
        return None
    m = re.match(r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$", iso_str.strip(), re.I)
    if not m:
        return None
    days, hours, mins, secs = [int(x or 0) for x in m.groups()]
    return days*1440 + hours*60 + mins + (1 if secs and not (mins or hours) else 0)

def extract_text(obj):
    """Flatten a JSON-LD value (scalar, list or dict) into one string.

    Lists are joined with ``" || "``. A dict yields the text of its
    ``"text"`` entry when it has one; otherwise the texts of all values
    whose key does not start with ``"@"`` are joined with single spaces.
    Anything falsy, or anything that flattens to an empty string, gives
    ``None``.
    """
    if not obj:
        return None

    if isinstance(obj, list):
        pieces = []
        for element in obj:
            rendered = extract_text(element)
            if rendered:
                pieces.append(rendered)
        joined = " || ".join(pieces)
        return joined if joined else None

    if isinstance(obj, dict):
        if "text" in obj:
            return extract_text(obj["text"])
        pieces = []
        for key, value in obj.items():
            if key.startswith("@"):
                continue  # JSON-LD keywords such as @type carry no display text
            rendered = extract_text(value)
            if rendered:
                pieces.append(rendered)
        joined = " ".join(pieces)
        return joined if joined else None

    cleaned = str(obj).strip()
    return cleaned if cleaned else None

def find_recipe_jsonld(html):
    """Return the first JSON-LD object describing a recipe, or ``None``.

    Every ``<script>`` whose ``type`` contains ``ld+json`` is parsed. Both
    top-level objects (or arrays of them) and the nodes of an ``@graph``
    container are inspected; an object counts as a recipe when its
    ``@type`` (string or list) contains "recipe", case-insensitively.

    Args:
        html: Page markup as ``str`` or ``bytes``.

    Returns:
        The matching JSON-LD ``dict``, or ``None`` when no script holds one.
    """
    def is_recipe(type_value):
        # @type may be a single string or a list of type names.
        if isinstance(type_value, list):
            type_value = " ".join(str(t) for t in type_value)
        return "recipe" in str(type_value).lower()

    soup = BeautifulSoup(html, "lxml")
    for script in soup.find_all("script", type=lambda v: v and "ld+json" in v.lower()):
        try:
            data = json.loads(script.string or script.get_text())
        except (ValueError, TypeError):
            # Malformed JSON-LD is skipped; catching only decode errors keeps
            # KeyboardInterrupt/SystemExit able to stop a long crawl.
            continue
        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            if is_recipe(obj.get("@type", "")):
                return obj
            # @graph is normally a list of nodes; tolerate a single node or
            # a null value instead of failing on iteration.
            graph = obj.get("@graph")
            nodes = graph if isinstance(graph, list) else [graph]
            for node in nodes:
                if isinstance(node, dict) and is_recipe(node.get("@type", "")):
                    return node
    return None

def parse_recipe(html):
    """Extract the recipe fields from a page's JSON-LD metadata.

    Args:
        html: Page markup as ``str`` or ``bytes``.

    Returns:
        ``{}`` when the page has no recipe JSON-LD; otherwise a dict with the
        keys ``title``, ``description``, ``servings``, ``ingredients``,
        ``total_time_min``, ``method``, ``tags``, ``ratingValue`` and
        ``ratingCount``. Missing values are ``None``.
    """
    rj = find_recipe_jsonld(html)
    if not rj:
        return {}
    data = {
        "title": extract_text(rj.get("name")),
        "description": extract_text(rj.get("description")),
        "servings": extract_text(rj.get("recipeYield")),
        "ingredients": extract_text(rj.get("recipeIngredient")),
    }

    # Total time: use totalTime when present, otherwise prep + cook.
    total_time = rj.get("totalTime")
    if total_time:
        data["total_time_min"] = (
            iso_to_minutes(total_time) if isinstance(total_time, str) else None
        )
    else:
        # Only convert the durations that are actually present as strings;
        # recipes frequently declare just one of prepTime / cookTime.
        minutes = 0
        for key in ("prepTime", "cookTime"):
            value = rj.get(key)
            if isinstance(value, str):
                minutes += iso_to_minutes(value) or 0
        data["total_time_min"] = minutes or None

    # Method: a list of steps, possibly grouped into sections that carry
    # their own itemListElement.
    instr = rj.get("recipeInstructions", [])
    if isinstance(instr, list):
        steps = []
        for item in instr:
            if isinstance(item, dict) and "itemListElement" in item:
                elements = item["itemListElement"]
                if not isinstance(elements, list):
                    elements = [elements]  # single step or null section
                steps.extend(extract_text(x) for x in elements)
            else:
                steps.append(extract_text(item))
        data["method"] = " || ".join(s for s in steps if s) or None
    else:
        data["method"] = extract_text(instr)

    # Tags: comma-separated strings are split, lists are flattened item by item.
    tags = []
    for key in ("keywords", "recipeCategory", "recipeCuisine", "suitableForDiet"):
        val = rj.get(key)
        if isinstance(val, str):
            tags.extend(p.strip() for p in val.split(",") if p.strip())
        elif isinstance(val, list):
            for entry in val:
                text = extract_text(entry)
                if text:
                    tags.append(text)
    data["tags"] = " | ".join(t for t in tags if t) or None

    # aggregateRating can be absent, null or a non-object value.
    agg = rj.get("aggregateRating")
    if not isinstance(agg, dict):
        agg = {}
    data["ratingValue"] = agg.get("ratingValue")
    data["ratingCount"] = agg.get("ratingCount")
    return data

# ---------- Collections from File ----------
def read_collections(file_path):
    """Turn a text file of collection names (one per line) into collection URLs.

    Blank lines are ignored. Each name is lower-cased, its spaces become
    underscores, and the result is percent-quoted before being appended to
    ``{BASE}/food/collections/``.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        names = [raw.strip() for raw in handle]

    collection_urls = []
    for name in filter(None, names):
        slug = parse.quote(name.lower().replace(" ", "_"))
        collection_urls.append(f"{BASE}/food/collections/{slug}")
    return collection_urls

# ---------- Main Crawler ----------
def crawl(max_collections=None, max_recipes_per_collection=None, 
          out_csv="bbc_food_recipes.csv", out_jsonl="bbc_food_recipes.jsonl",
          collections_file="collections.txt"):
    """Crawl the listed collections and write every recipe to CSV and JSONL.

    Args:
        max_collections: Only process the first N collections; a falsy
            value means no limit.
        max_recipes_per_collection: Write at most N recipes per collection;
            a falsy value means no limit.
        out_csv: Path of the CSV output (overwritten).
        out_jsonl: Path of the JSON-lines output (overwritten).
        collections_file: Text file with one collection name per line.

    A recipe URL is written once, under the first collection that lists it.
    A recipe page that cannot be fetched or has no recipe JSON-LD still
    produces a row holding only ``url`` and ``collection``.
    """
    collections = read_collections(collections_file)
    if max_collections:
        collections = collections[:max_collections]
    fields = ["title", "url", "collection", "description", "servings", 
              "total_time_min", "ingredients", "method", "tags", 
              "ratingValue", "ratingCount"]
    seen_urls = set()
    total = 0
    with open(out_csv, "w", newline="", encoding="utf-8") as csv_fp, \
         open(out_jsonl, "w", encoding="utf-8") as jsonl_fp:
        writer = csv.DictWriter(csv_fp, fieldnames=fields)
        writer.writeheader()
        for i, coll_url in enumerate(collections, 1):
            print(f"[{i}/{len(collections)}] {coll_url}")
            page_url = coll_url
            visited_pages = set()
            recipes_count = 0
            # Stop on a repeated page so a self-referencing "next" link
            # cannot loop forever.
            while page_url and page_url not in visited_pages:
                visited_pages.add(page_url)
                html = fetch(page_url)
                if not html:
                    break
                for recipe_url in get_links(html, "/food/recipes/"):
                    if recipe_url in seen_urls:
                        continue
                    if max_recipes_per_collection and recipes_count >= max_recipes_per_collection:
                        break
                    data = parse_recipe(fetch(recipe_url) or "")
                    data["url"] = recipe_url
                    data["collection"] = coll_url
                    writer.writerow({k: data.get(k) for k in fields})
                    jsonl_fp.write(json.dumps(data, ensure_ascii=False) + "\n")
                    seen_urls.add(recipe_url)
                    recipes_count += 1
                    total += 1
                # Once the quota is filled, stop paginating: further pages
                # would be downloaded only to be discarded.
                if max_recipes_per_collection and recipes_count >= max_recipes_per_collection:
                    break
                page_url = get_next_page(html)
    print(f"Done! Wrote {total} recipes to {out_csv} and {out_jsonl}")

# ---------- Main ----------
if __name__ == "__main__":
    # Entry point: print a banner, then run the crawl with the settings below.
    print("BBC Food Recipe Scraper")
    print("-" * 40)
    # Output paths handed to crawl(), which opens both files in write mode.
    csv_file = "bbc_food_recipes.csv"
    jsonl_file = "bbc_food_recipes.jsonl"
    # None is forwarded as max_collections; crawl() treats a falsy value as "no limit".
    max_coll = None  # limit collections for testing
    # Forwarded as max_recipes_per_collection: caps the rows written per collection.
    max_rec = 5   # limit recipes per collection for testing
    crawl(max_collections=max_coll,
          max_recipes_per_collection=max_rec,
          out_csv=csv_file,
          out_jsonl=jsonl_file)


BBC Food Recipe Scraper
----------------------------------------
[1/89] https://www.bbc.co.uk/food/collections/1_best_sausage_recipes
[2/89] https://www.bbc.co.uk/food/collections/african_inspired_food
[3/89] https://www.bbc.co.uk/food/collections/air_fryer_family_food
[4/89] https://www.bbc.co.uk/food/collections/al-desko
[5/89] https://www.bbc.co.uk/food/collections/back_to_basics
[6/89] https://www.bbc.co.uk/food/collections/baked_cheesecakes
[7/89] https://www.bbc.co.uk/food/collections/baking_with_vegetables
[8/89] https://www.bbc.co.uk/food/collections/barbecue_roasts
[9/89] https://www.bbc.co.uk/food/collections/camping_recipes
[10/89] https://www.bbc.co.uk/food/collections/caribbean_barbecue_recipes
[11/89] https://www.bbc.co.uk/food/collections/celebration_cakes
[12/89] https://www.bbc.co.uk/food/collections/cheap_chicken_dinners_for_four
[13/89] https://www.bbc.co.uk/food/collections/dairy-free_baking
[14/89] https://www.bbc.co.uk/food/collections/dark_chocolate_treats
[15/89

### Load Data 

In [4]:
# Scraped data
data = pd.read_csv("bbc_food_recipes.csv")
size_bytes = os.path.getsize("bbc_food_recipes.csv")
size_mb = size_bytes / (1024 * 1024)

print(f"File size: {size_mb:.2f} MB")

# Load food list from Excel (the first sheet row is skipped)
food_list = pd.read_excel("Frida_Dataset_May2025.xlsx", sheet_name="Data_Table", skiprows=1)

# First remaining row, kept separately before it is dropped below
# (presumably the per-column units -- confirm against the workbook).
food_list_units = food_list.iloc[0]

# Remove rows 0 and 1, reset index, and rename first three columns
food_list = (food_list.drop(index=[0, 1])
             .reset_index(drop=True)
             .rename(columns={food_list.columns[0]: "Food_Danish",
                            food_list.columns[1]: "Food_English",
                            food_list.columns[2]: "Food_ID"}))

# Base names: the part of Food_English before the first comma, normalised to
# stripped lower case. Missing names are dropped so .split never sees NaN.
base_foods = {
    item.split(',')[0].strip().lower()
    for item in food_list['Food_English'].dropna().astype(str)
}
base_foods.discard("")

# Additional ingredients not covered by the food table
extra_foods = ['mushrooms', 'white wine', 'red wine', 'caster sugar',
               'chicken stock', 'vegetable stock', 'beef stock', 'basil',
               'oregano', 'thyme', 'rosemary', 'parmesan', 'mozzarella']

# Merge AFTER normalising the table names; previously the list was rebuilt
# from the table alone as the last step, which silently dropped the extras.
# Sorted so the list is identical on every run.
single_food_list = sorted(base_foods.union(extra_foods))


File size: 1.02 MB


### Extract and apply clean list of ingredients

In [None]:
# function to extract and apply clean list of ingredients 
def clean_and_extract(ingredient_string, food_list):
    """Return the known food names mentioned in a raw ingredient string.

    Args:
        ingredient_string: Free-text ingredients; any non-string value
            (e.g. NaN for a missing cell) yields an empty list.
        food_list: Iterable of lower-case food names to match against.

    Returns:
        A sorted list of unique matches. A token ending in "s" matches its
        form without that "s" when that form is known; otherwise the token
        is tried unchanged.

    Note:
        Only single tokens are compared, so multi-word entries in
        ``food_list`` (e.g. "white wine") can never match.
    """
    if not isinstance(ingredient_string, str):
        return []

    # Set membership is O(1); a list would be rescanned for every token.
    known = food_list if isinstance(food_list, (set, frozenset)) else set(food_list)

    # Lowercase and remove numbers + units
    text = ingredient_string.lower()
    text = re.sub(r"\d+[\w/]*", "", text)        # remove 30g, 1oz, 80ml etc.
    text = re.sub(r"[^\w\s]", " ", text)         # remove punctuation

    matches = set()
    for word in text.split():
        # Basic singularization (very simple): try the word without its
        # trailing "s" first, then fall back to the word as written so that
        # names that merely end in "s" (asparagus, couscous) still match.
        if word.endswith("s") and word[:-1] in known:
            matches.add(word[:-1])
        elif word in known:
            matches.add(word)

    return sorted(matches)    # unique, deterministic order


In [None]:
# Match every recipe's raw ingredient text against the lower-cased food names
single_food_lower = [name.lower() for name in single_food_list]

data["clean_ingredients"] = data["ingredients"].apply(
    clean_and_extract, args=(single_food_lower,)
)


In [None]:
## Basic statistics
# "final_ingredients" is not created by the cells above, so a fresh
# Restart & Run All would raise KeyError here. Use it when some other cell
# has produced it, otherwise fall back to the "clean_ingredients" column.
ingredient_col = "final_ingredients" if "final_ingredients" in data.columns else "clean_ingredients"
unique_ingredients = {item for sublist in data[ingredient_col] for item in sublist}

print(f'Total recipes scraped: {len(data)}')
print(f'Total unique ingredients: {len(unique_ingredients)}')

Total recipes scraped: 445
Total unique ingredients: 152
