# In [None]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import requests
import ssl
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import json

# TLS context passed to urlopen() in collect_page_data.
# NOTE(review): hostname checking and certificate verification are both turned
# off below, so HTTPS responses are not authenticated -- confirm this is intended.
ctx = ssl.create_default_context()
# check_hostname has to be disabled first; the ssl module rejects
# verify_mode = CERT_NONE while hostname checking is still enabled.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def extract_text(soup, selector, get_text=True, attribute=None):
    """Look up the first element matching *selector* and pull one value from it.

    Args:
        soup: Object exposing ``select_one(selector)`` (a parsed document).
        selector: CSS selector handed to ``soup.select_one``.
        get_text: When true, return the element's stripped text content.
        attribute: Attribute name to read when ``get_text`` is false.

    Returns:
        The stripped text or the attribute value. ``'N/A'`` is returned when
        nothing matches, when the attribute is absent, or when neither
        ``get_text`` nor ``attribute`` selects a value.
    """
    node = soup.select_one(selector)
    if not node:
        return 'N/A'
    if get_text:
        return node.get_text(strip=True)
    if attribute:
        return node.get(attribute, 'N/A')
    # Element found, but the caller asked for neither text nor an attribute.
    return 'N/A'

def _find_recipe_json_ld(soup):
    """Return the first JSON-LD object whose ``@type`` is ``Recipe``, or None.

    Every ``<script type="application/ld+json">`` element is inspected. A
    script that is empty or does not parse as JSON is skipped so that one bad
    block does not discard the rest of the page. Within each parsed document
    the entries of ``@graph`` are checked first, then the document itself.
    """
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except ValueError:
            # Malformed JSON-LD: fall back to the HTML-derived fields only.
            continue

        # JSON-LD may be a single object or a list of objects.
        documents = payload if isinstance(payload, list) else [payload]
        for document in documents:
            if not isinstance(document, dict):
                continue
            graph = document.get('@graph', [])
            candidates = (graph if isinstance(graph, list) else []) + [document]
            for node in candidates:
                if not isinstance(node, dict):
                    continue
                node_type = node.get('@type')
                # '@type' can be a single string or a list of type names.
                if node_type == 'Recipe' or (
                    isinstance(node_type, list) and 'Recipe' in node_type
                ):
                    return node
    return None


def collect_page_data(url, timeout=30):
    """Fetch one recipe page and return its details as a one-row DataFrame.

    The page is downloaded with ``urlopen`` using the module-level ``ctx`` SSL
    context. Title, hero image and preparation/cooking times come from the
    HTML; ingredients, rating, category and cuisine come from the page's
    JSON-LD ``Recipe`` object when one is present (``'N/A'`` / ``[]``
    otherwise). Diet tags are the lower-cased link texts matched by
    ``div dd a``; ``vegan`` and ``vegetarian`` are reported as separate
    ``'Yes'``/``'No'`` columns and excluded from the ``diet`` string.

    Args:
        url: Address of the recipe page.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        A ``pandas.DataFrame`` with a single row, or ``None`` when the
        response is not ``text/html`` or any error occurs (the error is
        printed, not raised).
    """
    try:
        # The context manager closes the HTTP response even on early return.
        with urlopen(url, context=ctx, timeout=timeout) as document:
            # Check the content type before downloading the body.
            if document.info().get_content_type() != 'text/html':
                print("Ignore non text/html page")
                return None
            html = document.read()

        soup = BeautifulSoup(html, "html.parser")

        title = extract_text(soup, 'h1#main-heading')
        image = extract_text(soup, 'img[data-testid="hero-image"]', get_text=False, attribute='src')
        # ':-soup-contains' is the supported spelling of the deprecated ':contains'.
        prep_time = extract_text(soup, 'dt:-soup-contains("Prepare") + dd')
        cook_time = extract_text(soup, 'dt:-soup-contains("Cook") + dd')
        total_time = f"Prep Time: {prep_time}, Cook Time: {cook_time}"

        # An empty dict makes every lookup below fall back to its default.
        recipe = _find_recipe_json_ld(soup) or {}
        aggregate_rating = recipe.get('aggregateRating')
        if not isinstance(aggregate_rating, dict):
            aggregate_rating = {}

        # Diet tags, plus whether the recipe is vegetarian and/or vegan.
        diet_tags = [a.get_text(strip=True).lower() for a in soup.select('div dd a')]
        diet = ', '.join(
            tag for tag in diet_tags if tag not in ('vegan', 'vegetarian')
        )

        record = {
            'title': title,
            'total_time': total_time,
            'image': image,
            'ingredients': recipe.get('recipeIngredient', []),
            'rating_val': aggregate_rating.get('ratingValue', 'N/A'),
            'rating_count': aggregate_rating.get('ratingCount', 'N/A'),
            'category': recipe.get('recipeCategory', 'N/A'),
            'cuisine': recipe.get('recipeCuisine', 'N/A'),
            'diet': diet,
            'vegan': 'Yes' if 'vegan' in diet_tags else 'No',
            'vegetarian': 'Yes' if 'vegetarian' in diet_tags else 'No',
            'url': url,
        }

        return pd.DataFrame([record])

    except OSError as e:
        # urllib's URLError/HTTPError, socket timeouts and SSL errors are all
        # OSError subclasses; the fetch never raises requests exceptions.
        print(f"Network error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return None


# Recipe pages to scrape; each successful scrape is written to its own CSV file.
recipe_urls = [
    'https://www.bbc.co.uk/food/recipes/easiest_ever_banana_cake_42108',
    'https://www.bbc.co.uk/food/recipes/oven-roasted_chicken_13123',
    'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
]

# Process the URLs one by one; a failure on one page is reported and the loop
# carries on with the next.
for i, url in enumerate(recipe_urls):
    print(f"Scraping: {url}")
    try:
        recipe_data = collect_page_data(url)
        if recipe_data is None:
            print(f"No data for {url} :(")
        else:
            # Output files are numbered by the URL's position in recipe_urls.
            filename = f"recipe_{i}.csv"
            recipe_data.to_csv(filename, index=False)
            print(f"Saved recipe to {filename}")
            # Show the scraped row.
            print(recipe_data.head())
    except Exception as e:
        print(f"Error processing {url}: {e}")
    print("-" * 40)