In [1]:
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import csv
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. SettingWithCopyWarning,
# FutureWarning) — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw recipes table from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="recipes.csv")

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(522567, 28)

In [4]:
# Preview the first five raw rows to see the column layout and value formats.
df.head(n=5)

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4,,"c(""Mix everything together and bring to a boil..."


In [5]:
# Count missing values per column in the raw data.
df.isna().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                       82544
PrepTime                           0
TotalTime                          1
DatePublished                      1
Description                        6
Images                             5
RecipeCategory                   762
Keywords                       17248
RecipeIngredientQuantities        14
RecipeIngredientParts             14
AggregatedRating              253238
ReviewCount                   247504
Calories                          16
FatContent                        22
SaturatedFatContent               29
CholesterolContent                29
SodiumContent                     29
CarbohydrateContent               43
FiberContent                      45
SugarContent                      46
ProteinContent                    46
RecipeServings                182956
RecipeYield                   348123
R

In [10]:
# Remove the publication timestamp column.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=['DatePublished'], errors='ignore')

In [11]:
def convert_to_minutes(duration):
    """Convert an ISO-8601 style duration string to a whole number of minutes.

    Accepts strings such as ``'PT45M'``, ``'PT24H'``, ``'PT4H25M'``,
    ``'PT0S'``, ``'PT1M30S'`` and ``'P1DT2H'``. The ``P``/``T`` designators are
    optional, so ``'1H30M'`` is accepted as well. Seconds are truncated to
    whole minutes.

    Parameters
    ----------
    duration : str or missing value
        Duration text. Non-string inputs (``None``, ``NaN``) and the empty
        string are treated as "no duration" and yield 0.

    Returns
    -------
    int
        Total duration in minutes.

    Raises
    ------
    ValueError
        If ``duration`` is a non-empty string that is not a recognisable
        day/hour/minute/second duration.
    """
    import re  # local import so the cell does not depend on the imports cell

    # Missing values arrive as float NaN / None from pandas; `'H' in nan`
    # would raise TypeError, so short-circuit them here.
    if not isinstance(duration, str):
        return 0

    text = duration.strip()
    parsed = re.fullmatch(
        r'(?:P(?:(-?\d+)D)?T?)?(?:(-?\d+)H)?(?:(-?\d+)M)?(?:(-?\d+)S)?',
        text,
    )
    if parsed is None:
        raise ValueError(f"Unrecognised duration: {duration!r}")

    days, hours, minutes, seconds = (
        int(component) if component else 0 for component in parsed.groups()
    )
    # int(x / 60) truncates toward zero, so stray seconds never add a minute.
    return days * 1440 + hours * 60 + minutes + int(seconds / 60)

In [12]:
# Keep only rows whose PrepTime AND TotalTime are well-formed "PT..H..M..S" strings.
# The earlier pattern was unanchored with every part optional, so it accepted any
# value that merely contained "PT", and TotalTime was never checked even though it
# is parsed in the next step. str.match anchors at the start; `$` anchors the end.
duration_pattern = r'PT(?:\d+H)?(?:\d+M)?(?:\d+S)?$'
has_valid_times = (
    df['PrepTime'].str.match(duration_pattern, na=False)
    & df['TotalTime'].str.match(duration_pattern, na=False)
)
# .copy() detaches the result from the original frame so that the column
# assignments that follow operate on an independent DataFrame.
df = df[has_valid_times].copy()

In [13]:
# Turn the duration strings into minute counts.
for time_col in ('PrepTime', 'TotalTime'):
    df[time_col] = df[time_col].map(convert_to_minutes)

# CookTime is recomputed as whatever part of TotalTime is not preparation,
# replacing the values originally loaded for that column.
df['CookTime'] = df['TotalTime'].sub(df['PrepTime'])

In [14]:
def format_time(minutes):
    """Render a minute count as a short human-readable string.

    Examples: ``45 -> '45m'``, ``1440 -> '24h'``, ``265 -> '4h 25m'``,
    ``0 -> '0m'``.

    Negative inputs keep their sign (``-5 -> '-5m'``, ``-90 -> '-1h 30m'``).
    Floor division and modulo on a negative number would otherwise yield
    ``hours = -1, mins = 55`` and render ``-5`` as the misleading ``'55m'``.

    Parameters
    ----------
    minutes : int
        Duration in minutes.

    Returns
    -------
    str
        ``'<h>h <m>m'``, ``'<h>h'`` or ``'<m>m'``, prefixed with ``-`` when
        ``minutes`` is negative.
    """
    sign = "-" if minutes < 0 else ""
    # Split the magnitude so the sign cannot leak into the hour/minute parts.
    hours, mins = divmod(abs(minutes), 60)
    if hours > 0 and mins > 0:
        return f"{sign}{hours}h {mins}m"
    elif hours > 0:
        return f"{sign}{hours}h"
    else:
        return f"{sign}{mins}m"

# Swap the numeric minute counts for display strings such as "4h 25m".
for time_col in ('PrepTime', 'CookTime', 'TotalTime'):
    df[time_col] = df[time_col].map(format_time)

In [15]:
# Preview the first five rows to confirm the reformatted time columns.
df.head(n=5)

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,24h,45m,24h 45m,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",Frozen Desserts,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,25m,4h,4h 25m,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",Chicken Breast,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,5m,30m,35m,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",Beverages,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,20m,24h,24h 20m,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",Soy/Tofu,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,30m,20m,50m,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",Vegetable,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4,,"c(""Mix everything together and bring to a boil..."


In [16]:
# Hand-written descriptions, keyed by exact recipe name.
# NOTE(review): presumably these are the recipes whose Description was missing — confirm.
description_overrides = {
    'Herb Pull-Aparts': "Soft, buttery, and packed with cheesy herb flavor, Herb Pull-Aparts are made by coating dough pieces in garlic, herbs, and cheese, then baking until golden and fluffy. Perfect as a side or a savory snack!",
    'Chicken Liver Parfait': "A rich and creamy dish, carefully cooked in a bain-marie for a smooth texture. Chilled and sliced for a clean finish, it pairs perfectly with mixed greens and melba toast—a delicious make-ahead festive starter.",
    'Hot Swiss Chard Salad': "A fresh and flavorful chard salad, lightly tossed in warm garlic-infused dressing with crispy bacon and eggs. Perfectly balanced with a hint of vinegar, it’s a simple yet hearty dish for any meal.",
    'Hidden Valley Mix for Dressing(copycat)': "A versatile homemade dressing mix combining onion, garlic, parsley, and seasonings. Store for up to 6 months and mix with mayo and buttermilk for a classic dressing or use sour cream/yogurt for variations.",
    'Cranberry Cocktail Meatballs': "Savory and tangy cocktail meatballs made with seasoned ground beef and baked in a rich cranberry-chili sauce. Perfect for parties, they can be kept warm in a crockpot for easy serving.",
}

# Overwrite Description on every row whose Name matches one of the keys.
for recipe_name, new_description in description_overrides.items():
    df.loc[df['Name'] == recipe_name, 'Description'] = new_description

In [17]:
# Re-count missing values per column after the description patches.
df.isna().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                           0
PrepTime                           0
TotalTime                          0
Description                        0
Images                             1
RecipeCategory                   752
Keywords                       17238
RecipeIngredientQuantities         4
RecipeIngredientParts              1
AggregatedRating              253223
ReviewCount                   247489
Calories                           1
FatContent                         1
SaturatedFatContent                1
CholesterolContent                 1
SodiumContent                      1
CarbohydrateContent                1
FiberContent                       1
SugarContent                       1
ProteinContent                     1
RecipeServings                182911
RecipeYield                   348072
RecipeInstructions                 1
dtype: int64

In [18]:
# Keep only rows with at most five missing fields: `thresh` is the minimum
# number of non-null values a row must have to survive.
min_non_null = len(df.columns) - 5
df = df.dropna(thresh=min_non_null)

In [19]:
# Missing values per column after dropping the sparsest rows.
df.isna().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                           0
PrepTime                           0
TotalTime                          0
Description                        0
Images                             1
RecipeCategory                   653
Keywords                       17139
RecipeIngredientQuantities         3
RecipeIngredientParts              0
AggregatedRating              253124
ReviewCount                   247390
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182812
RecipeYield                   347973
RecipeInstructions                 0
dtype: int64

In [20]:
# Rows lacking either of these fields are discarded outright.
required_columns = ['RecipeIngredientQuantities', 'Images']
df = df.dropna(axis='index', how='any', subset=required_columns)

In [21]:
# Missing values per column after requiring ingredient quantities and images.
df.isna().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                           0
PrepTime                           0
TotalTime                          0
Description                        0
Images                             0
RecipeCategory                   653
Keywords                       17139
RecipeIngredientQuantities         0
RecipeIngredientParts              0
AggregatedRating              253123
ReviewCount                   247389
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182810
RecipeYield                   347970
RecipeInstructions                 0
dtype: int64

In [30]:
# Replace every remaining missing value with a readable placeholder.
# NOTE(review): this also writes the string into columns that still had gaps
# and look numeric (AggregatedRating, ReviewCount, RecipeServings) — confirm
# that mixed text/number columns are acceptable downstream.
df = df.fillna(value="Not available")

In [31]:
# Final check: every column should now report zero missing values.
df.isna().sum()

RecipeId                      0
Name                          0
AuthorId                      0
AuthorName                    0
CookTime                      0
PrepTime                      0
TotalTime                     0
Description                   0
Images                        0
RecipeCategory                0
Keywords                      0
RecipeIngredientQuantities    0
RecipeIngredientParts         0
AggregatedRating              0
ReviewCount                   0
Calories                      0
FatContent                    0
SaturatedFatContent           0
CholesterolContent            0
SodiumContent                 0
CarbohydrateContent           0
FiberContent                  0
SugarContent                  0
ProteinContent                0
RecipeServings                0
RecipeYield                   0
RecipeInstructions            0
dtype: int64

In [33]:
# Persist the cleaned table next to the notebook, without the row index.
df.to_csv(path_or_buf="processed_recipes.csv", index=False)