## Scraping the site
This notebook shows how carbonara_recipe.csv came to be. To produce it, I wrote a highly customized function that scrapes the data from the website. Note that I did not generalize it to handle other, similarly structured sites; it is deliberately tailored to this one site so that the demo shows clearly what the page contains. As you can see, every section of the site is partitioned in a way that lets us extract exactly what we need. In this case we need the ingredients, which have their own section with a table-like structure, making it easy to convert them into a tabular format during the transformation step.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:

# Function to convert quantity to float
def convert_to_float(quantity_str):
    """Convert a scraped quantity string to a float.

    Supported forms:
      * plain numbers:  "2", "0.5"
      * fractions:      "1/3"
      * mixed numbers:  "1 1/2"
      * ranges:         "2-4" or "2 - 4", returned as the average of both ends

    Args:
        quantity_str: The quantity text, or None / an empty string.

    Returns:
        The quantity as a float, or None when there is nothing to convert.

    Raises:
        ValueError: If the text is not a recognised quantity.
        ZeroDivisionError: If a fraction has a zero denominator.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from fractions import Fraction

    if not quantity_str:
        return None

    text = str(quantity_str).strip()
    if not text:
        return None

    # Check if the quantity is a range
    match = re.match(r'^([\d.]+)\s*-\s*([\d.]+)$', text)
    if match:
        # If it's a range, calculate the average of the two ends
        low = float(match.group(1))
        high = float(match.group(2))
        return (low + high) / 2

    # The text comes from a web page, so it is parsed rather than passed to
    # eval(): eval would execute arbitrary expressions embedded in the page.
    normalized = re.sub(r'\s*/\s*', '/', text)  # "1 / 3" -> "1/3"
    try:
        # Summing whitespace-separated parts handles mixed numbers ("1 1/2").
        return float(sum(Fraction(part) for part in normalized.split()))
    except ValueError as err:
        raise ValueError(f"Cannot parse quantity: {quantity_str!r}") from err

In [3]:


# Function to scrape ingredients from the given URL
def scrape_ingredients(url, timeout=10):
    """Scrape the ingredient list from a recipe page.

    The page is expected to hold its ingredients in a
    ``div.wprm-recipe-ingredients-container`` element, with one
    ``li.wprm-recipe-ingredient`` per ingredient.

    Args:
        url: Address of the recipe page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A list of dicts with the keys ``'Ingredient'``, ``'Quantity'`` and
        ``'Unit'``, or None when the request fails with an HTTP error status
        or the page has no ingredients section.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not parse an error page as if it were the recipe.
        print(f"Request failed with HTTP status {response.status_code}.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the section containing ingredients
    ingredients_section = soup.find('div', {'class': 'wprm-recipe-ingredients-container'})
    if not ingredients_section:
        print("Ingredients section not found.")
        return None

    # Extract ingredient details
    ingredients_data = []
    for ingredient_item in ingredients_section.find_all('li', {'class': 'wprm-recipe-ingredient'}):
        ingredient = ingredient_item.find('span', {'class': 'wprm-recipe-ingredient-name'})
        if not ingredient:
            # Entries without a name are skipped.
            continue

        quantity = ingredient_item.find('span', {'class': 'wprm-recipe-ingredient-amount'})
        unit = ingredient_item.find('span', {'class': 'wprm-recipe-ingredient-unit'})
        notes = ingredient_item.find('span', {'class': 'wprm-recipe-ingredient-notes-normal'})

        # Use the unit when present; otherwise fall back to the notes text.
        if unit:
            unit_text = unit.text.strip()
        elif notes:
            unit_text = notes.text.strip()
        else:
            unit_text = None

        ingredients_data.append({
            'Ingredient': ingredient.text.strip(),
            'Quantity': convert_to_float(quantity.text.strip()) if quantity else None,
            'Unit': unit_text,
        })

    return ingredients_data



In [4]:
# Recipe page to scrape. scrape_ingredients returns a list of ingredient
# dicts, or None when the page has no ingredients section.
website_url = "https://www.cookingnook.com/recipe/carbonara/"
ingredients_data = scrape_ingredients(website_url)

In [5]:
# Show the raw scraped records.
print(ingredients_data)

[{'Ingredient': 'spaghetti', 'Quantity': 1.0, 'Unit': 'pound'}, {'Ingredient': 'salt and freshly ground pepper', 'Quantity': None, 'Unit': None}, {'Ingredient': 'oil', 'Quantity': 0.5, 'Unit': 'teaspoon'}, {'Ingredient': 'onion', 'Quantity': 1.0, 'Unit': None}, {'Ingredient': 'bacon', 'Quantity': 5.0, 'Unit': 'slices'}, {'Ingredient': 'mushrooms', 'Quantity': 1.0, 'Unit': 'cup'}, {'Ingredient': 'butter', 'Quantity': 4.0, 'Unit': 'tablespoons'}, {'Ingredient': 'Parmesan cheese', 'Quantity': 0.3333333333333333, 'Unit': 'cup'}, {'Ingredient': 'eggs', 'Quantity': 5.0, 'Unit': None}, {'Ingredient': 'parsley', 'Quantity': 1.0, 'Unit': 'tablespoon'}]


In [6]:

# Turn the scraped records into a table
ingredients_df = pd.DataFrame(ingredients_data)

# Render each known quantity with two decimals; missing ones stay None
ingredients_df['Quantity'] = ingredients_df['Quantity'].map(
    lambda amount: None if pd.isna(amount) else f"{amount:.2f}"
)

# Show the resulting table
print(ingredients_df)

                       Ingredient Quantity         Unit
0                       spaghetti     1.00        pound
1  salt and freshly ground pepper     None         None
2                             oil     0.50     teaspoon
3                           onion     1.00         None
4                           bacon     5.00       slices
5                       mushrooms     1.00          cup
6                          butter     4.00  tablespoons
7                 Parmesan cheese     0.33          cup
8                            eggs     5.00         None
9                         parsley     1.00   tablespoon


In [7]:
# Save the table as carbonara_recipe.csv; index=False omits the row index.
ingredients_df.to_csv('carbonara_recipe.csv', index=False)