In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats as stats
from scipy.stats import gamma, linregress
from bs4 import BeautifulSoup
import re
import spacy

In [2]:
# Load the scraped recipes and normalise empty strings to NaN so that the
# missing-value statistics below count them as missing.
df = pd.read_csv("data/scraped.csv").replace("", np.nan)
# (rows, columns) of the raw file; replacing values does not change the shape.
shape = df.shape

def summarize(df):
  """Build a per-column overview of ``df``.

  Returns a DataFrame indexed by the column names of ``df`` with four columns:
  ``unique_count`` (distinct non-null values), ``data_types`` (column dtype),
  ``missing_counts`` (number of nulls) and ``missing_percentage`` (nulls as a
  percentage of the rows).
  """
  is_missing = df.isnull()
  parts = {
      'unique_count': df.nunique(),
      'data_types': df.dtypes,
      'missing_counts': is_missing.sum(),
      'missing_percentage': 100 * is_missing.mean(),
  }
  # Passing a mapping to concat uses its keys as the resulting column labels.
  return pd.concat(parts, axis=1)

summary_df = summarize(df)

# Columns in which at least 99% of the values are missing.
high_missing_columns = summary_df[summary_df['missing_percentage'] >= 99]
high_missing_column_names = high_missing_columns.index.tolist()

# DataFrame.drop returns a new frame rather than modifying `df`; the result
# has to be assigned back, otherwise the sparse columns silently stay in place.
df = df.drop(columns=high_missing_column_names)
# Keep only the first row seen for each recipe name.
df = df.drop_duplicates(subset=['name'])

# Category labels whose rows are excluded from the dataset.
values_to_drop = ['515', '251', 'ingredients','uncategorized']
df = df[~df['category'].isin(values_to_drop)]



In [3]:
# Keep a recipe only if it has at least one of prep/cook time and a calorie value.
df = (
    df
    .dropna(subset=['prep', 'cook'], how='all')
    .dropna(subset=['calories'], how='all')
)

# Nutrient columns in which a missing value is replaced by 0.
nutrition_columns = [
    'carbohydrates_g',
    'sugars_g',
    'fat_g',
    'saturated_fat_g',
    'cholesterol_mg',
    'protein_g',
    'dietary_fiber_g',
    'sodium_mg',
    'calories_from_fat',
    'calcium_mg',
    'iron_mg',
    'magnesium_mg',
    'potassium_mg',
    'vitamin_a_iu_IU',
    'niacin_equivalents_mg',
    'vitamin_c_mg',
    'folate_mcg',
    'thiamin_mg',
]
df[nutrition_columns] = df[nutrition_columns].fillna(0)

In [4]:
# (compiled pattern, minutes per unit). Each pattern captures one integer or
# decimal amount followed by a unit spelled in any of its common forms.
_TIME_UNIT_PATTERNS = (
    (re.compile(r'(\d+(?:\.\d+)?)[\s\-]*(?:days?|d)\b'), 24 * 60),
    (re.compile(r'(\d+(?:\.\d+)?)[\s\-]*(?:hours?|hrs?|h)\b'), 60),
    (re.compile(r'(\d+(?:\.\d+)?)[\s\-]*(?:minutes?|mins?|m)\b'), 1),
)


def parse_time(time_str):
    """Convert a duration such as ``"1 hr 30 mins"`` into whole minutes.

    Parameters
    ----------
    time_str : str or null
        Free-text duration. Days, hours and minutes are recognised in short
        and long spellings ("d"/"day(s)", "h"/"hr(s)"/"hour(s)",
        "m"/"min(s)"/"minute(s)"), case-insensitively. Amounts may be decimal
        ("1.5 hrs").

    Returns
    -------
    int
        Total minutes, rounded to the nearest minute. ``0`` for null input or
        when no amount/unit pair is found.
    """
    if pd.isnull(time_str):
        return 0

    # str() keeps a stray non-string cell from raising inside Series.apply.
    text = str(time_str).lower().strip()

    total = 0.0
    for pattern, minutes_per_unit in _TIME_UNIT_PATTERNS:
        # Only the first occurrence of each unit is used.
        match = pattern.search(text)
        if match:
            total += float(match.group(1)) * minutes_per_unit

    return int(round(total))

def verify_total_times(df):
    """Flag rows whose stated total equals prep time plus cook time.

    Parses the ``prep``, ``cook`` and ``total`` columns with ``parse_time`` and
    stores the comparison in a new boolean column ``is_correct``. ``df`` is
    modified in place and also returned.
    """
    prep_minutes, cook_minutes, total_minutes = (
        df[column].apply(parse_time) for column in ('prep', 'cook', 'total')
    )
    df['is_correct'] = (prep_minutes + cook_minutes) == total_minutes
    return df

def format_time(minutes):
    """Render a minute count as text, e.g. ``90`` -> ``"1 hr 30 mins"``.

    Values under 60 are shown as ``"<n> mins"``. Otherwise the hour part uses
    ``hr`` for exactly one hour and ``hrs`` for more, and the minute part is
    left out when it is zero.
    """
    if minutes < 60:
        return f"{minutes} mins"

    hrs, mins = divmod(minutes, 60)
    if hrs == 1:
        return f"{hrs} hr" if mins == 0 else f"{hrs} hr {mins} mins"
    return f"{hrs} hrs" if mins == 0 else f"{hrs} hrs {mins} mins"

def value_counts_with_percentage(df, column_name):
    """Tabulate how often each value of ``df[column_name]`` occurs.

    Missing values are counted as their own entry. The returned DataFrame is
    indexed by the distinct values and has a ``Count`` column and a
    ``Percentage`` column (share of all rows, scaled to 0-100).
    """
    values = df[column_name]
    return pd.DataFrame({
        'Count': values.value_counts(dropna=False),
        'Percentage': values.value_counts(dropna=False, normalize=True) * 100,
    })

# Throw away the scraped `total` and recompute it as prep + cook so the three
# time columns always agree with each other.
df = df.drop(columns=['total'])

total_minutes = df['prep'].apply(parse_time) + df['cook'].apply(parse_time)

# Target layout: the remaining columns in their current order, with the
# rebuilt `total` placed directly after `cook`.
cols = list(df.columns)
cols.insert(cols.index('cook') + 1, 'total')
df = df.assign(total=total_minutes.apply(format_time))[cols]


In [5]:
def parse_ingredients(ingredients):
  """Split a ``;``-separated ingredient string into a list of trimmed entries.

  Parameters
  ----------
  ingredients : str or missing value
      Raw ingredient text. Anything that is not a string (for example the NaN
      pandas uses for an empty cell) is treated as "no ingredients".

  Returns
  -------
  list of str
      One entry per non-blank segment, surrounding whitespace removed.
  """
  if not isinstance(ingredients, str):
    return []
  # Blank segments (trailing or doubled ';') are not ingredients; skip them.
  return [part.strip() for part in ingredients.split(';') if part.strip()]

# One list of ingredient strings per recipe.
df['parsed_ingredients'] = df['ingredients'].map(parse_ingredients)

In [6]:
# Download the small English spaCy model that spacy.load("en_core_web_sm") needs.
# NOTE(review): this runs whichever `python` is first on PATH, which may not be
# the interpreter backing this kernel — confirm the model lands in the right env.
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 27.0 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Only noun chunks are needed below, and those come from the tagger and
# dependency parser. Named-entity recognition and lemmatisation are disabled
# so they do not add runtime to every processed string; re-enable one with
# nlp.enable_pipe("<name>") if entities or lemmas are needed later.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])

# Regex patterns that clean_text deletes from an ingredient string, applied in
# the order listed.
unwanted = [
    r'\u2009',            # thin space
    r'/',
    # Whole word only: a bare 'inch' would also eat the inside of "pinch".
    r'\binch(?:es)?\b',
    r'â…›', r'â…”',       # mis-decoded ⅛ and ⅔
    # Mis-decoded ® must be removed before the plain ® below, otherwise a
    # stray 'Â' is left behind.
    r'Â®',
    r'®', r'™',
    r'\)', r'\(', r'%',
    r'V8',
    r'\'', r'"',
]

def clean_text(text):
  """Strip ``text`` and then delete every match of each pattern in ``unwanted``.

  The patterns are applied one after another in list order. Whitespace is
  trimmed only before the deletions, so the returned string can still begin or
  end with spaces that a deletion exposed.
  """
  cleaned = text.strip()
  for noise_pattern in unwanted:
    cleaned = re.sub(noise_pattern, '', cleaned)
  return cleaned

# Units, sizes and preparation words that are not part of an ingredient's name.
_INGREDIENT_STOP_WORDS = frozenset([
    'cup', 'cups', 'teaspoon', 'teaspoons', 'tablespoon', 'tablespoons', 'ounce', 'ounces',
    'pound', 'pounds', 'quart', 'quarts', 'pinch', 'dash', 'taste', 'large', 'small', 'medium',
    'divided', 'minced', 'sliced', 'diced', 'chopped', 'ground', 'freshly', 'prepared', 'cut',
    'into', 'strips', 'halves', 'cubes', 'to', 'box', 'spoon', 'spoons', 'optional'
])

# Built once instead of once per ingredient; sorted so the pattern text is stable.
_STOP_WORD_RE = re.compile(
    r'\b(?:' + '|'.join(sorted(_INGREDIENT_STOP_WORDS)) + r')\b',
    flags=re.IGNORECASE,
)
# Leading quantities: digits and vulgar-fraction characters plus surrounding spaces.
_QUANTITY_RE = re.compile(r'\d*\s*[\d¼½¾⅓⅔⅛⅜⅝⅞]+\s*')


def extract_ingredients(ingredients):
  """Reduce raw ingredient lines to a de-duplicated list of ingredient names.

  Each line is cleaned with ``clean_text``, stripped of quantities and stop
  words, and parsed with the spaCy pipeline ``nlp``. Every noun chunk, minus
  stop-word and digit tokens, becomes one name.

  Parameters
  ----------
  ingredients : iterable of str
      Raw ingredient lines of a single recipe.

  Returns
  -------
  list of str
      Unique names in order of first appearance, so repeated runs over the
      same data produce the same output.
  """
  prepared = []
  for ingredient in ingredients:
    text = clean_text(ingredient)
    text = _QUANTITY_RE.sub('', text)
    text = _STOP_WORD_RE.sub('', text)
    # Second pass: removing quantities can expose patterns the first pass missed.
    prepared.append(clean_text(text))

  names = []
  # nlp.pipe processes the texts as a batch, which is faster than calling nlp() on each one.
  for doc in nlp.pipe(prepared):
    for chunk in doc.noun_chunks:
      kept = [
          token.text for token in chunk
          if token.text.lower() not in _INGREDIENT_STOP_WORDS and not token.is_digit
      ]
      name = ' '.join(kept).strip()
      if name:
        names.append(name)

  # dict.fromkeys removes duplicates while keeping first-seen order; a set
  # would yield an order that can change from one run to the next.
  return list(dict.fromkeys(names))

# Extract the ingredient names for every recipe, then record how many each one has.
df['high_level_ingredients'] = df['parsed_ingredients'].map(extract_ingredients)
df['ingredient_count'] = df['high_level_ingredients'].map(len)

KeyboardInterrupt: 

In [None]:
# Write the cleaned dataset to disk without the positional index column.
df.to_csv(path_or_buf='data/final_df.csv', index=False)