# In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import os 

def parse_sitemap(file_path):
    """Read a local XML sitemap and return all of its ``<loc>`` elements.

    Args:
        file_path: Path to the sitemap file; it is opened as UTF-8 text.

    Returns:
        The result of ``find_all("loc")`` on the parsed document, i.e. one
        entry per ``<loc>`` tag found anywhere in the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, mode="r", encoding="utf-8") as sitemap_handle:
        # Sitemaps are XML, so request the "xml" parser rather than an HTML one.
        document = BeautifulSoup(sitemap_handle, "xml")
        loc_tags = document.find_all("loc")
    return loc_tags

def get_recipe_urls(locations):
    """Return the URLs from ``locations`` whose path contains "/recipe/".

    Args:
        locations: Iterable of objects exposing a ``text`` attribute holding a
            URL (for example the ``<loc>`` elements of a parsed sitemap).

    Returns:
        A list of URL strings, in input order, with surrounding whitespace
        removed. Pretty-printed sitemaps often put newlines and indentation
        inside ``<loc>``; stripping keeps that out of the request URL and out
        of the output file.
    """
    # Read `.text` once per element instead of once for the test and once for
    # the result.
    stripped_urls = (location.text.strip() for location in locations)
    return [url for url in stripped_urls if "/recipe/" in url]

# Local copy of the XML sitemap that lists the candidate pages
sitemap = "C:/Users/Prana/Documents/CGAS-EATLYST/sitemap_1.xml"

# Setting headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Only opening the sitemap can raise FileNotFoundError, so only that call is guarded
try:
    locations = parse_sitemap(sitemap)
except FileNotFoundError:
    print(f"Error: Sitemap file not found at {sitemap}")
    # exit() is a helper injected by the `site` module for interactive use and
    # reports success (status 0); raise SystemExit directly with a non-zero
    # status so the failure is visible to whatever launched the script.
    raise SystemExit(1)

all_recipe_urls = get_recipe_urls(locations)

# I have limited the sample size to avoid hitting the server too hard and randomly chosen 26 recipes
sample_size = 26
if len(all_recipe_urls) >= sample_size:
    urlArray = random.sample(all_recipe_urls, sample_size)
else:
    # Not enough URLs to sample from: scrape every one that was found
    urlArray = all_recipe_urls
    print(f"Warning: Only {len(all_recipe_urls)} recipe URLs found, fewer than the requested {sample_size}.")

# Making dataset to store scraped data using beautifulsoup and pandas
# (one row per recipe; the column order here is the column order of the CSV)
df = pd.DataFrame(columns=['Recipe_Name', 'Recipe_URL', 'Image_URL', 'Detailed_Ingredients', 'Instructions', 'Prep Time'])

# output file path
output_file = 'recipe_scraping.csv'

# (connect, read) timeouts in seconds. Without a timeout requests waits
# forever, so a single stalled server would hang the whole run.
REQUEST_TIMEOUT = (10, 30)

# Class attribute values that identify the page elements being scraped
INGREDIENT_ITEM_CLASS = 'mm-recipes-structured-ingredients__list-item'
INSTRUCTION_ITEM_CLASS = 'comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI'


def _collapse_whitespace(text):
    """Return ``text`` with each run of whitespace (newlines included) as one space."""
    return ' '.join(text.split())


def _extract_recipe_name(soup):
    """Return the text of the first <h1>, or "N/A" when the page has none."""
    heading = soup.find('h1')
    return heading.get_text(" ", strip=True) if heading is not None else "N/A"


def _extract_image_url(soup):
    """Return the ``src`` of the recipe image, or "N/A" when none is usable."""
    image_tag = soup.find('img', attrs={'class': 'mm-img-block__img'})
    if image_tag is None:
        # Fallback to the first image on the page (may not be the main image)
        image_tag = soup.find('img')
    if image_tag is None:
        return "N/A"
    return image_tag.get('src') or "N/A"


def _extract_ingredients(soup):
    """Return all ingredient lines joined with ", ", or "N/A" when none are found."""
    ingredients = []
    for item in soup.find_all('li', attrs={'class': INGREDIENT_ITEM_CLASS}):
        spans = item.find_all('span')
        if spans:
            # Join the non-empty <span> texts of the list item
            span_texts = (span.get_text(" ", strip=True) for span in spans)
            raw = ' '.join(piece for piece in span_texts if piece)
        else:
            raw = item.get_text(" ", strip=True)

        ingredient = _collapse_whitespace(raw)
        if ingredient:
            ingredients.append(ingredient)
    return ', '.join(ingredients) if ingredients else "N/A"


def _extract_instructions(soup):
    """Return all instruction steps joined with spaces, or "N/A" when none are found."""
    steps = []
    for item in soup.find_all('li', attrs={'class': INSTRUCTION_ITEM_CLASS}):
        paragraph = item.find('p')
        # Get text from <p> or, failing that, from the list item itself
        source = paragraph if paragraph is not None else item
        # Newlines are collapsed to spaces rather than deleted; deleting them
        # fused words that were separated only by a line break.
        step = _collapse_whitespace(source.get_text(" ", strip=True))
        if step:
            steps.append(step)
    return " ".join(steps) if steps else "N/A"


def _extract_total_time(soup):
    """Return the value shown for the 'Total Time:' label, or "N/A" when absent."""
    details = soup.find('div', class_='mm-recipes-details__content')
    if details is None:
        return "N/A"
    label = details.find('div', string='Total Time:')
    if label is None:
        return "N/A"
    # find_next searches forward in document order (not only among siblings)
    value = label.find_next('div', class_='mm-recipes-details__value')
    return value.get_text(" ", strip=True) if value is not None else "N/A"


scraped_rows = []  # one dict per successfully parsed recipe
saved_count = 0    # number of successful writes of the CSV

for currUrl in urlArray:
    # Random pause before every request to avoid hitting the server too hard
    time.sleep(random.uniform(1, 3))

    try:
        r = requests.get(currUrl, headers=headers, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        soup = BeautifulSoup(r.content, 'html5lib')

        image_url = _extract_image_url(soup)
        scraped_rows.append({
            'Recipe_Name': _extract_recipe_name(soup),
            'Recipe_URL': currUrl,
            'Image_URL': image_url,
            'Detailed_Ingredients': _extract_ingredients(soup),
            'Instructions': _extract_instructions(soup),
            # NOTE: this column is named 'Prep Time' but holds the page's
            # 'Total Time:' value; the name is kept so the CSV schema is unchanged.
            'Prep Time': _extract_total_time(soup),
        })

        # Rebuild the frame from the collected rows instead of concatenating
        # onto an initially empty frame (deprecated in recent pandas), and
        # rewrite the CSV after every recipe so an interrupted run keeps
        # everything scraped so far.
        df = pd.DataFrame(scraped_rows, columns=df.columns)
        df.to_csv(output_file, index=False)
        saved_count += 1

        print(f"Processed: {currUrl} | Image URL: {image_url}")

    except requests.exceptions.RequestException as e:
        # If a link fails (including a timeout), move to the next one
        print(f" Error fetching {currUrl}: {e}")
        continue
    except Exception as e:
        # Best-effort scrape: report the problem and keep going with the next URL
        print(f" An unexpected error occurred while processing {currUrl}: {e}")
        continue

print("\n--- Scraping Complete ---")
if saved_count:
    print(f"Data saved to {output_file}")
else:
    # Nothing was ever written, so do not claim the file was saved
    print(f"No recipes were scraped; nothing was written to {output_file}")