In [7]:
# Use the %pip magic rather than `! pip`: the magic installs into the
# environment of the running kernel, whereas a shell escape runs whichever
# `pip` happens to be first on PATH.
%pip install seaborn beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.3 soupsieve-2.6


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats as stats
from scipy.stats import gamma, linregress
from bs4 import BeautifulSoup
import re

In [14]:
# Load the scraped recipe table and remember its dimensions as loaded.
df = pd.read_csv("data/scraped.csv")
shape = df.shape

# Treat empty strings as missing values so they are counted as nulls.
df = df.replace("", np.nan)

def summarize(df):
    """Build a per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``unique_count``
        (number of distinct non-null values), ``data_types`` (column dtype),
        ``missing_counts`` (number of nulls) and ``missing_percentage``
        (share of nulls, 0-100).
    """
    is_missing = df.isnull()
    per_column_stats = [
        df.nunique(),
        df.dtypes,
        is_missing.sum(),
        100 * is_missing.mean(),
    ]
    labels = ['unique_count', 'data_types', 'missing_counts', 'missing_percentage']
    # `keys` names the resulting columns directly, one per concatenated Series.
    return pd.concat(per_column_stats, axis=1, keys=labels)

summary_df = summarize(df)

# Columns in which at least 99% of the values are missing.
high_missing_columns = summary_df[summary_df['missing_percentage'] >= 99]
high_missing_column_names = high_missing_columns.index.tolist()

# DataFrame.drop returns a new frame; without assigning the result back the
# near-empty columns would silently remain in `df`.
df = df.drop(columns=high_missing_column_names)

# Keep a single row per recipe name (the first occurrence wins).
df = df.drop_duplicates(subset=['name'])

# Rows whose category is one of these values are discarded
# (presumably scraping artifacts rather than real categories -- confirm).
values_to_drop = ['515', '251', 'ingredients', 'uncategorized']
df = df[~df['category'].isin(values_to_drop)]



In [15]:
# A recipe is kept only when it has at least one of prep/cook time and a
# calorie value.
df = (
    df
    .dropna(subset=['prep', 'cook'], how='all')
    .dropna(subset=['calories'], how='all')
)

# Nutrition fields in which a missing value is replaced by 0.
nutrition_columns = [
    'carbohydrates_g', 'sugars_g', 'fat_g', 'saturated_fat_g',
    'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg',
    'calories_from_fat', 'calcium_mg', 'iron_mg', 'magnesium_mg',
    'potassium_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg',
    'vitamin_c_mg', 'folate_mcg', 'thiamin_mg',
]
df[nutrition_columns] = df[nutrition_columns].fillna(0)

In [19]:
def parse_time(time_str):
    """Convert a free-text duration such as ``"1 hr 30 mins"`` to minutes.

    Recognised units are days (``day``/``days``), hours (``hr``/``hrs``/
    ``hour``/``hours``) and minutes (anything starting with ``min``). For
    each unit the first matching amount in the text is used. Amounts may
    carry a decimal fraction (``"1.5 hrs"``).

    Parameters
    ----------
    time_str : str or null
        Duration text. ``None``/``NaN`` are treated as "no time".

    Returns
    -------
    int
        Total number of minutes, rounded to the nearest integer; 0 when the
        input is null or contains no recognised unit.
    """
    if pd.isnull(time_str):
        return 0

    text = time_str.lower().strip()

    # Strip punctuation, but keep a dot that sits between two digits:
    # removing it would turn "1.5 hrs" into "15 hrs".
    text = re.sub(r'[^0-9a-z. ]', '', text)
    text = re.sub(r'(?<!\d)\.|\.(?!\d)', '', text)

    def _amount(unit_pattern):
        """Return the first number directly preceding the unit, or 0.0."""
        match = re.search(r'(\d+(?:\.\d+)?)\s*(?:' + unit_pattern + r')', text)
        return float(match.group(1)) if match else 0.0

    # Prefix matching covers singular/plural and long spellings alike
    # ("hr", "hrs", "hour", "hours", "min", "mins", "minutes", ...).
    total = _amount('day') * 1440 + _amount('hr|hour') * 60 + _amount('min')
    return int(round(total))

def verify_total_times(df):
    """Flag rows whose stated total time equals prep time plus cook time.

    The ``prep``, ``cook`` and ``total`` columns are each converted to minutes
    with ``parse_time``. A boolean ``is_correct`` column is then written onto
    ``df`` itself (the frame is modified in place) and the same frame is
    returned.
    """
    prep_minutes, cook_minutes, total_minutes = (
        df[column].apply(parse_time) for column in ('prep', 'cook', 'total')
    )
    matches_total = (prep_minutes + cook_minutes) == total_minutes
    df['is_correct'] = matches_total
    return df

def format_time(minutes):
    """Render a number of minutes as text, e.g. ``"45 mins"`` or ``"2 hrs 15 mins"``.

    Values under an hour are shown as ``"<n> mins"``. Otherwise the hour part
    uses ``hr`` for exactly one hour and ``hrs`` for more, and the minute part
    is omitted when it is zero.
    """
    # Guard clause: nothing to split when there is less than a full hour.
    if minutes < 60:
        return f"{minutes} mins"

    hrs, mins = divmod(minutes, 60)
    if hrs == 1:
        return f"{hrs} hr" if mins == 0 else f"{hrs} hr {mins} mins"
    return f"{hrs} hrs" if mins == 0 else f"{hrs} hrs {mins} mins"

def value_counts_with_percentage(df, column_name):
    """Tabulate how often each value of a column occurs.

    Missing values are counted as their own entry.

    Returns a DataFrame indexed by the distinct values of ``df[column_name]``
    with a ``Count`` column (absolute frequency) and a ``Percentage`` column
    (relative frequency scaled to 0-100).
    """
    column = df[column_name]
    return pd.DataFrame({
        'Count': column.value_counts(dropna=False),
        'Percentage': column.value_counts(dropna=False, normalize=True) * 100,
    })

# Replace the original `total` column with one rebuilt from the parsed prep
# and cook durations.
df = df.drop(columns=['total'])
total_minutes = df['prep'].apply(parse_time) + df['cook'].apply(parse_time)
df['total'] = total_minutes.apply(format_time)

# Reorder the columns so the rebuilt `total` sits directly after `cook`.
cols = [column for column in df.columns if column != 'total']
cook_index = cols.index('cook')
cols.insert(cook_index + 1, 'total')
df = df[cols]


In [None]:
def parse_ingredients(ingredients):
    """Split a ``;``-separated ingredient string into a list of ingredients.

    Parameters
    ----------
    ingredients : str or null
        Ingredient text with entries separated by ``;``.

    Returns
    -------
    list of str
        Entries with surrounding whitespace removed. Empty entries (for
        example from a trailing separator) are skipped. Non-string input such
        as ``None`` or ``NaN`` yields an empty list instead of raising.
    """
    # Missing cells arrive as None/NaN (a float), which has no .split().
    if not isinstance(ingredients, str):
        return []
    stripped = (ingredient.strip() for ingredient in ingredients.split(';'))
    return [ingredient for ingredient in stripped if ingredient]

# Store each recipe's ingredient text as a list of separate ingredients.
ingredient_text = df['ingredients']
df['parsed_ingredients'] = ingredient_text.apply(parse_ingredients)

Unnamed: 0,name,url,category,author,summary,rating,rating_count,review_count,ingredients,directions,...,vitamin_k_mcg,biotin_mcg,vitamin_b12_mcg,mono_fat_g,poly_fat_g,trans_fatty_acid_g,omega_3_fatty_acid_g,omega_6_fatty_acid_g,is_correct,parsed_ingredients
0,Simple Macaroni and Cheese,https://www.allrecipes.com/recipe/238691/simpl...,main-dish,g0dluvsugly,A very quick and easy fix to a tasty side-dish...,4.42,834,575,1 (8 ounce) box elbow macaroni ; ¼ cup butter ...,Bring a large pot of lightly salted water to a...,...,,,,,,,,,True,"[1 (8 ounce) box elbow macaroni, ¼ cup butter,..."
1,Gourmet Mushroom Risotto,https://www.allrecipes.com/recipe/85389/gourme...,main-dish,Myleen Sagrado Sjödin,Authentic Italian-style risotto cooked the slo...,4.8,3388,2245,"6 cups chicken broth, divided ; 3 tablespoons ...","In a saucepan, warm the broth over low heat. W...",...,,,,,,,,,True,"[6 cups chicken broth, divided, 3 tablespoons ..."
2,Dessert Crepes,https://www.allrecipes.com/recipe/19037/desser...,breakfast-and-brunch,ANN57,Essential crepe recipe. Sprinkle warm crepes ...,4.8,1156,794,"4 eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...","In large bowl, whisk together eggs, milk, melt...",...,,,,,,,,,True,"[4 eggs, lightly beaten, 1 ⅓ cups milk, 2 tab..."
3,Pork Steaks,https://www.allrecipes.com/recipe/70463/pork-s...,meat-and-poultry,BABYLOVE1222,My mom came up with this recipe when I was a c...,4.57,689,539,¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...,"Melt butter in a skillet, and mix in the soy s...",...,,,,,,,,,True,"[¼ cup butter, ¼ cup soy sauce, 1 bunch green ..."
5,Chicken Parmesan,https://www.allrecipes.com/recipe/223042/chick...,world-cuisine,Chef John,My version of chicken parmesan is a little dif...,4.83,4245,2662,"4 skinless, boneless chicken breast halves ; ...",Preheat an oven to 450 degrees F (230 degrees ...,...,,,,,,,,,False,"[4 skinless, boneless chicken breast halves, ..."
