In [272]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import re

### Utility functions for scraping recipes from the site

In [277]:
def all_recipes_from_a_page(page_url, timeout=30):
    '''
    Scrape every recipe listed on one listing page.

    page_url: listing url of the format "https://www.archanaskitchen.com/recipes/page-{page_number}"
    timeout: seconds to wait for the listing page before requests gives up
             (without a timeout a stalled connection would block the whole batch forever)

    For each recipe card (a div with class "blogRecipe") the name, image link and recipe link
    are read from the listing, then fetch_recipe_data is called on the recipe link to get the details.
    A card without a recipe link is skipped, because its details cannot be fetched;
    a missing name or image is recorded as an empty string.

    Returns a list of lists, one per recipe:
    [page_no, name, image_link, recipe_link, *fetch_recipe_data(recipe_link)]

    Raises requests.HTTPError when the listing page answers with an error status, so that a
    failed page is not silently recorded as a page with zero recipes.
    '''

    base_url = "https://www.archanaskitchen.com" ## required for image link and recipe link
    page_no = page_url.split('/')[-1].split('-')[-1]

    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'lxml')

    recipes_page = []

    # Fetching each recipe's name, imagelink and recipe link from each page
    for card in page.find_all('div', class_='blogRecipe'):
        name_tag = card.find('span', itemprop='name')
        image_tag = card.find('img', itemprop='image')
        link_tag = card.find('a', itemprop='url')

        recipe_href = link_tag.get('href') if link_tag is not None else None
        if not recipe_href:
            # Nothing to follow for this card, so there are no details to scrape
            continue

        name = name_tag.text.strip() if name_tag is not None else ""
        image_src = image_tag.get('src', '') if image_tag is not None else ""

        # Links on the listing are prefixed with the site root
        image_link = base_url + image_src if image_src else ""
        recipe_link = base_url + recipe_href

        recipe_data = [page_no, name, image_link, recipe_link]
        recipe_data.extend(fetch_recipe_data(recipe_link))
        recipes_page.append(recipe_data)

    return recipes_page

In [278]:
def _stripped_text(tag):
    '''Returns the whitespace-stripped text of a tag, or "" when the tag is None.'''
    return tag.text.strip() if tag is not None else ""


def _nested_text(page, outer_name, outer_class, inner_name, inner_itemprop):
    '''Returns the text of the first inner tag (matched by itemprop) inside the first outer tag (matched by class), or "".'''
    outer = page.find(outer_name, class_=outer_class)
    if outer is None:
        return ""
    return _stripped_text(outer.find(inner_name, itemprop=inner_itemprop))


def _timing_text(timings, itemprop):
    '''Returns the text of the <p> inside the span carrying the given itemprop, or "" when either is missing.'''
    span = timings.find('span', itemprop=itemprop)
    paragraph = span.find('p') if span is not None else None
    return paragraph.get_text().strip() if paragraph is not None else ""


def _format_ingredient(text_parts):
    '''Builds one ingredient string from the non-empty text fragments of an ingredient <li>.'''
    if len(text_parts) == 1:
        return text_parts[0]

    quantity, name, *details = text_parts
    details = ' - ' + ' '.join(details) if details else ''
    if name[0] == ',':
        # Second fragment starts with a comma: keep the fragments in page order
        # (presumably the first fragment is then the ingredient itself - TODO confirm)
        ing = quantity + ' - ' + name + details
    else:
        ing = name + ' - ' + quantity + details
    return ing.replace(',', '').replace('  ', ' ')


def fetch_recipe_data(recipe_url, timeout=30):
    '''
    Scrape the details of a single recipe.

    recipe_url: recipe page url of the format "https://www.archanaskitchen.com/garlic-knot-recipe"
    timeout: seconds to wait for the page before requests gives up

    Returns a list of 17 strings, in this order:
    [recipe_title, image_url, cuisine, course, diet, equipments, rating_value, rating_count,
     best_rating, worst_rating, prep_time, cook_time, total_time, servings,
     recipe_description, ingredients, instructions]

    Any field that is not present on the page is returned as "" (this includes instructions,
    which is always a string). Ingredients are joined with '|', instructions with a single space.
    '''

    response = requests.get(recipe_url, timeout=timeout).text
    page = BeautifulSoup(response, 'lxml')

    # Extracting recipe title
    recipe_title = _stripped_text(page.find('h1', class_='recipe-title'))

    # Extracting image URL (the page gives a site-relative path)
    image_tag = page.find('img', itemprop='image')
    image_url = image_tag.get('src', '') if image_tag is not None else ""
    image_url = "https://www.archanaskitchen.com" + image_url if image_url else ""

    # Extracting other details like cuisine, course, diet, etc.
    cuisine = _nested_text(page, 'div', 'cuisine', 'span', 'recipeCuisine')
    course = _nested_text(page, 'div', 'course', 'span', 'keywords')
    diet = _nested_text(page, 'div', 'diet', 'span', 'keywords')

    products = page.find('div', class_='products')
    # NOTE(review): equipment names are concatenated without any separator - confirm this is intended
    equipments = ''.join([equipment.text.strip() for equipment in products.find_all('a')]) if products is not None else ""

    # Extracting ratings
    rating_value = _stripped_text(page.find("span", itemprop="ratingValue"))
    rating_count = _stripped_text(page.find("span", itemprop="ratingCount"))
    best_rating = _stripped_text(page.find("span", itemprop="bestRating"))
    worst_rating = _stripped_text(page.find("span", itemprop="worstRating"))

    # Extracting prep time, cook time, total time, and servings
    recipe_timings = page.find('div', class_='RecipeServesTime')
    if recipe_timings is not None:
        prep_time = _timing_text(recipe_timings, 'prepTime')
        cook_time = _timing_text(recipe_timings, 'cookTime')
        total_time = _timing_text(recipe_timings, 'totalTime')
        # Only the first whitespace-separated token of the yield text is kept
        servings_words = _timing_text(recipe_timings, 'recipeYield').split()
        servings = servings_words[0] if servings_words else ""
    else:
        prep_time = cook_time = total_time = servings = ""

    # Extracting recipe description
    recipe_description = _nested_text(page, 'div', 'recipedescription', 'span', 'description')

    # Extracting ingredients
    ingredients = []
    for section in page.find_all('div', class_='recipeingredients'):
        ul = section.find_next('ul')
        if ul is None:
            continue
        for li in ul.find_all('li', itemprop='ingredients'):
            text_parts = list(li.stripped_strings)
            if not text_parts:
                # An <li> without any text carries no ingredient
                continue
            ingredients.append(_format_ingredient(text_parts))
    ingredients = '|'.join(ingredients)

    # Extracting instructions (an absent section gives "", never an empty list)
    instruction_section = page.find('div', class_='recipeinstructions')
    if instruction_section is not None:
        steps = [item.text.strip() for item in instruction_section.find_all('li', itemprop='recipeInstructions')]
    else:
        steps = []
    instructions = ' '.join(steps)

    return [recipe_title, image_url, cuisine, course, diet, equipments, rating_value, rating_count, best_rating, worst_rating, prep_time, cook_time, total_time, servings, recipe_description, ingredients, instructions]
    

In [279]:
def process_batch(start_page, end_page):
    '''
    Scrape a contiguous range of listing pages and save the result as one csv file.

    start_page and end_page are the first and the last page numbers (both inclusive) to scrape.
    Page urls of the format "https://www.archanaskitchen.com/recipes/page-{page_number}" are generated
    for every page in that range, and all_recipes_from_a_page is called for each of them.
    The combined rows are written to ../Data/Recipes_{start_page}_to_{end_page}.csv
    (relative to the current working directory). Nothing is returned.
    '''

    all_recipes = []
    base_page_link = "https://www.archanaskitchen.com/recipes/page-"
    output_dir = os.path.join(os.getcwd(), '..', 'Data')
    file_path = os.path.join(output_dir, f'Recipes_{start_page}_to_{end_page}.csv')
    url_list = [base_page_link + str(i) for i in range(start_page, end_page + 1)]

    ## Making sure the output folder exists before scraping starts,
    ## otherwise a missing folder is only noticed after all pages were fetched and the batch is lost
    os.makedirs(output_dir, exist_ok=True)

    ## Traversing each page and all recipes on that page
    for url in url_list:
        print(url)
        recipes_page = all_recipes_from_a_page(url)
        all_recipes.extend(recipes_page)

    ## Creating Dataframe to save scraped data
    ## First 4 columns come from the listing page, the remaining 17 from the recipe page
    columns = ['Page No', 'Recipe Name', 'Image URL', 'Recipe URL', 'Recipe URL Title', 'Image URL Original', 'Cuisine', 'Course', 'Diet', 'Equipments', 'Average Rating', 'Total Ratings', 'Best Rating', 'Worst Rating', 'Prep Time', 'Cook Time', 'Total Time', 'Servings', 'Recipe Description', 'Ingredients', 'Instructions']
    df = pd.DataFrame(all_recipes, columns=columns)

    ## Saving batch data to a file
    df.to_csv(file_path, index=False)

### Running the batches with batch size = 10

In [256]:
# Page range scraped by this run; the trailing comments keep the alternative values
# noted by the author (presumably the full range of the site - TODO confirm)
first_page, last_page = 91, 100  #1, #337
batch_size = 10

# One process_batch call (and so one csv file) per group of at most batch_size pages;
# the last group is cut short so that it never goes past last_page
for start_page in range(first_page, last_page + 1, batch_size):
    end_page = min(start_page + batch_size - 1, last_page)
    process_batch(start_page, end_page)

https://www.archanaskitchen.com/recipes/page-91
https://www.archanaskitchen.com/recipes/page-92
https://www.archanaskitchen.com/recipes/page-93
https://www.archanaskitchen.com/recipes/page-94
https://www.archanaskitchen.com/recipes/page-95
https://www.archanaskitchen.com/recipes/page-96
https://www.archanaskitchen.com/recipes/page-97
https://www.archanaskitchen.com/recipes/page-98
https://www.archanaskitchen.com/recipes/page-99
https://www.archanaskitchen.com/recipes/page-100


### Merging into a single file

In [276]:
directory_path = os.path.join(os.getcwd(), '..', 'Data')
save_path = os.path.join(directory_path, 'Recipes.csv')

# Only the per-batch files "Recipes_<start>_to_<end>.csv" are merged.
# The merged Recipes.csv itself lives in the same folder, so a plain "*.csv" filter would
# pick it up on a re-run (it has no page number to sort by, and would duplicate the data).
batch_file_pattern = re.compile(r'Recipes_(\d+)_to_(\d+)\.csv')
csv_files = sorted(
    (file for file in os.listdir(directory_path) if batch_file_pattern.fullmatch(file)),
    key=lambda filename: int(batch_file_pattern.fullmatch(filename).group(1)),  # order by start page
)

if csv_files:
    # Read each CSV file into a DataFrame and concat them in one go
    df_all = pd.concat(
        [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files],
        ignore_index=True,
    )

    # Save to the file
    df_all.to_csv(save_path, index=False)
else:
    # Nothing to merge: keep an existing Recipes.csv instead of overwriting it with an empty frame
    df_all = pd.DataFrame()
    print(f'No batch files found in {directory_path}, nothing was merged')

In [284]:
directory_path = os.path.join(os.getcwd(), '..', 'Data')
batches_path = os.path.join(directory_path, 'Batches')
os.makedirs(batches_path, exist_ok=True)

# The folder is listed again here instead of reusing a list built by an earlier cell,
# so this cell can be re-run safely: files that were already moved are simply not found anymore.
for file in os.listdir(directory_path):
    if file.startswith('Recipes_') and file.endswith('.csv'):
        cur_path = os.path.join(directory_path, file)
        mov_path = os.path.join(batches_path, file)
        # os.replace overwrites an older copy of the same batch on every platform
        os.replace(cur_path, mov_path)
        