# Data Munging
- Cleaning the Data: Handle missing values, duplicates, data types, and errors
- Transform the Data: Normalize, encode, aggregate, filter, new features
- Restructure: change shape or format of data

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re # searches for patterns in a string
import ast
import string

### Load in csv

In [2]:
df = pd.read_csv('../data/all_recipes_combined.csv')
df.head()

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40 mins,20 mins,5 hrs,16.0,"['1/3 cup butter, melted', '1/4 cup white suga...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,"{'Calories': '405', 'Fat': '31g', 'Carbs': '28...",https://www.allrecipes.com/french-silk-pie-bar...
1,Our 15 Best Brownie Recipes of All Time,There's a sweet treat for everyone.,,,,,[],[],,https://www.allrecipes.com/best-brownie-recipe...
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20 mins,5 mins,25 mins,6.0,"['1 cup dark chocolate chips', '20 creme-fille...",['Melt chocolate chips in a microwave-safe bow...,"{'Calories': '1058', 'Fat': '65g', 'Carbs': '1...",https://www.allrecipes.com/no-bake-espresso-ma...
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40 mins,20 mins,2 hrs 15 mins,12.0,"['cooking spray', '2 1/4 cups all-purpose flou...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,"{'Calories': '824', 'Fat': '55g', 'Carbs': '80...",https://www.allrecipes.com/blackout-cake-recip...
4,Sleeping Gingerbread Treats,Shhhh they're sleeping!,15 mins,15 mins,30 mins,9.0,"['1 sheet of prepared puff pastry, thawed', '9...",['Gather the ingredients. Preheat the oven to ...,"{'Calories': '231', 'Fat': '12g', 'Carbs': '29...",https://www.allrecipes.com/sleeping-gingerbrea...


### Shape of data
- 2_304 rows, 10 columns

In [3]:
df.shape

(2304, 10)

### Data types
- All columns are stored as string data types

In [4]:
df.dtypes

title          object
intro          object
prep_time      object
cook_time      object
total_time     object
servings       object
ingredients    object
directions     object
nutrition      object
recipe_url     object
dtype: object

## Cleaning the Data

### 1. Handle Missing Values
- Null values in 'intro', 'prep_time', 'cook_time', 'total_time', 'servings', 'nutrition', 'recipe_url'
- Dropped a total of 161 rows containing null values (64 rows in recipe_urls, 23 rows in intro, 88 rows in nutrition, 4 rows in serving, 9 rows in total_time)
- 14 null rows remaining in column prep_time: The user will have the option to filter recipes based on total time so it's not vital that all prep_time rows are filled
- 187 null rows remaining in column cook_time: Same reasoning used from prep_time

In [5]:
# Check for nulls
df.isnull().sum()

title            0
intro           23
prep_time       90
cook_time      268
total_time      76
servings        71
ingredients      0
directions       0
nutrition       88
recipe_url      64
dtype: int64

#### Look into 'recipe_url'
- 64 recipes with nulls
- Drop these 64 rows b/c end goal is to return a recipes url for the user to have access to

In [6]:
# View null values
df[df['recipe_url'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
384,Cheesy Lasagna Sheet Pasta,This 3 ingredient cheesy lasagna sheet pasta c...,5 mins,15 mins,20 mins,6,"['8 ounces lasagna noodles, broken in half', '...",['Bring a large pot of lightly salted water to...,"{'Calories': '203', 'Fat': '9g', 'Carbs': '19g...",
385,Creamy Beef and Bow Tie Pasta,This creamy beef and bow tie pasta is a fun ta...,15 mins,20 mins,35 mins,4,"['6 ounces bowtie pasta (farfalle)', '1 pound ...",['Bring a large pot of lightly salted water to...,"{'Calories': '498', 'Fat': '26g', 'Carbs': '26...",
386,Reuben Mac,Not your typical mac and cheese. This recipe w...,30 mins,30 mins,1 hr,6,"['8 ounces large elbow macaroni', '¼ cup butte...",['Bring a large pot of lightly salted water to...,"{'Calories': '757', 'Fat': '44g', 'Carbs': '56...",
387,Dump and Bake Meatball Casserole,This super easy dump and bake meatball cassero...,5 mins,40 mins,45 mins,8,"['12 ounces dry rotini pasta', '2 1/2 cups wat...",['Gather all ingredients.\xa0Preheat the oven ...,"{'Calories': '362', 'Fat': '19g', 'Carbs': '28...",
388,Soy Butter Pasta with Chicken,This soy butter pasta with chicken has a simpl...,10 mins,15 mins,25 mins,3,"['8 ounces bucatini or linguine pasta', '4 tab...",['Bring a large pot of salted water to a boil....,"{'Calories': '399', 'Fat': '23g', 'Carbs': '33...",
...,...,...,...,...,...,...,...,...,...,...
443,Creamy Chicken Fajita Pasta,This chicken fajita pasta dish has all the fla...,15 mins,30 mins,45 mins,8,"['2 tablespoons chili powder', '1 tablespoon c...","['Combine chili powder, cumin, salt, onion pow...","{'Calories': '555', 'Fat': '25g', 'Carbs': '34...",
444,Marry Me Shrimp Pasta,This creamy shrimp dish will make any pasta lo...,10 mins,20 mins,30 mins,8,['12 ounces \xa0uncooked calamarata or rigaton...,['Gather all ingredients. Cook pasta in salted...,"{'Calories': '300', 'Fat': '16g', 'Carbs': '20...",
445,7 Pasta Dinners for Every Night of the Week,Even the pickiest of eaters will love these pa...,,,,,[],[],,
446,Lemon Orzo,Serve this lemon orzo as a side with chicken o...,5 mins,20 mins,25 mins,4,"['2 tablespoons unsalted butter, divided', '1 ...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,"{'Calories': '442', 'Fat': '11g', 'Carbs': '76...",


In [7]:
# Drop null values in url column
df.dropna(subset = ['recipe_url'], inplace = True)

In [8]:
# Confirm
df[df['recipe_url'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url


#### Look into 'intro'
- 23 recipes with nulls
- Drop these 23 rows b/c they are not individual recipes, they are scrapped pages that include multiple recipes

In [9]:
# View null values
df[df['intro'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
21,25 Recipes to Make With Brownie Mix (That Tast...,,,,,,[],[],,https://www.allrecipes.com/brownie-mix-recipes...
370,8 Favorite Indian Butter Chicken Recipes,,,,,,[],[],,https://www.allrecipes.com/gallery/butter-chic...
536,11 Thai Red Curry Recipes,,,,,,[],[],,https://www.allrecipes.com/gallery/thai-red-cu...
558,Our Best Thai Green Curry Recipes,,,,,,[],[],,https://www.allrecipes.com/gallery/thai-green-...
563,18 Thai Shrimp Recipes,,,,,,[],[],,https://www.allrecipes.com/gallery/thai-shrimp...
687,12 Refreshing Thai-Inspired Summer Salads,,,,,,[],[],,https://www.allrecipes.com/gallery/refreshing-...
701,12 Recipes to Turn Extra Chicken into Healthy ...,,,,,,[],[],,https://www.allrecipes.com/gallery/turn-leftov...
719,18 Best Savory Recipes With Phyllo Dough,,,,,,[],[],,https://www.allrecipes.com/gallery/the-best-sa...
744,These Greek Lamb Recipes Are a True Taste of G...,,,,,,[],[],,https://www.allrecipes.com/gallery/greek-lamb-...
1038,15 Unbeatable Udon Noodle Recipes,,,,,,[],[],,https://www.allrecipes.com/gallery/udon-noodle...


In [10]:
# Drop null values in intro column
df.dropna(subset = ['intro'], inplace = True)

In [11]:
# Confirm
df[df['recipe_url'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url


#### Look into 'nutrition'
- 88 recipes with nulls
- Drop these 88 rows b/c one goal is to be able to cluster recipes based on user nutrition preference

In [12]:
# View null values
df[df['nutrition'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
1,Our 15 Best Brownie Recipes of All Time,There's a sweet treat for everyone.,,,,,[],[],,https://www.allrecipes.com/best-brownie-recipe...
6,28 Mint Chocolate Recipes You'll Want to Make ...,You're going to want to bookmark this one.,,,,,[],[],,https://www.allrecipes.com/mint-chocolate-reci...
19,15 Chocolate and Strawberry Recipes (That Go W...,Put your strawberry haul to good use.,,,,,[],[],,https://www.allrecipes.com/strawberries-and-ch...
25,Copycat Cosmic Brownies,Little Debbie has some competition!,30 mins,1 hr 5 mins,3 hrs 35 mins,12,"['baking spray', '14 ounces (60% cacao) bitter...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,,https://www.allrecipes.com/copycat-cosmic-brow...
38,"The No-Fail Secret to the Chewiest, Fudgiest B...",It took years to figure this out. You’re welcome.,,,,,[],[],,https://www.allrecipes.com/article/how-to-make...
...,...,...,...,...,...,...,...,...,...,...
1938,Nabisco Discontinued the Beloved Wafer Cookies...,"No chocolate wafers, no problem.",,,,,[],[],,https://www.allrecipes.com/14-icebox-cakes-tha...
1945,13 Easy Frozen Treats You Can Make With 5 Ingr...,Get your freezer ready.,,,,,[],[],,https://www.allrecipes.com/5-ingredient-frozen...
2080,Our 50 Best Breakfast Egg Recipes Make Morning...,Our best breakfast egg recipes ever make break...,,,,,[],[],,https://www.allrecipes.com/breakfast-egg-recip...
2118,Chicken Adobo Tacos,These chicken adobo tacos fuse Mexican and Fil...,20 mins,40 mins,1 hr 20 mins,8.0,"['1/3 cup soy sauce', '1/3 cup white vinegar',...","['Combine soy sauce, vinegar, bay leaves, and ...",,https://www.allrecipes.com/chicken-adobo-tacos...


In [13]:
# Drop null values in intro column
df.dropna(subset = ['nutrition'], inplace = True)

In [14]:
# Confirm
df[df['nutrition'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url


#### Look into 'servings'
- 4 recipes with nulls
- Drop these 4 rows b/c one goal is to be able to filter recipes based on number of servings 

In [15]:
# View null values
df[df['servings'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
906,Nicole’s Salmon Wellington,Nicole's salmon Wellington is much more elegan...,20 mins,35 mins,1 hr,,"['1 tablespoon olive oil', '10 ounces frozen c...",['Heat a large nonstick skillet over medium-hi...,"{'Calories': '5144', 'Fat': '354g', 'Carbs': '...",https://www.allrecipes.com/nicole-s-salmon-wel...
1282,Easy Rotisserie Chicken Enchiladas,These easy rotisserie chicken enchiladas can b...,10 mins,30 mins,40 mins,,"['1 1/2 tablespoons canola oil', '6 corn torti...",['Preheat the oven to 375 degrees F (190 degre...,"{'Calories': '1008', 'Fat': '50g', 'Carbs': '8...",https://www.allrecipes.com/easy-rotisserie-chi...
1747,Coconut Milk Ice Cream,"This coconut milk ice cream, lightly flavored ...",5 mins,,6 hrs 5 mins,,['1 (13.5 ounce) can unsweetened coconut milk'...,"['Whisk together coconut milk, sugar, vanilla ...","{'Calories': '346', 'Fat': '8g', 'Carbs': '68g...",https://www.allrecipes.com/coconut-milk-ice-cr...
2112,Easy Rotisserie Chicken Enchiladas,These easy rotisserie chicken enchiladas can b...,10 mins,30 mins,40 mins,,"['1 1/2 tablespoons canola oil', '6 corn torti...",['Preheat the oven to 375 degrees F (190 degre...,"{'Calories': '1008', 'Fat': '50g', 'Carbs': '8...",https://www.allrecipes.com/easy-rotisserie-chi...


In [16]:
# Drop null values in servings column
df.dropna(subset = ['servings'], inplace = True)

In [17]:
# Confirm
df[df['nutrition'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url


#### Look into 'prep_time'
- 23 recipes with nulls
- Will leave these 23 recipes as is, goal is to filter out recipes based on total_time so prep_time is not vital. I would rather have the extra 23 recipes

In [18]:
# View null values
df[df['prep_time'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
62,Cocoa Apple Cake,This recipe is very moist and rich. I have als...,,,,12,"['3 eggs', '2 cups white sugar', '1 cup butte...","['Cream together until fluffy the eggs, sugar,...","{'Calories': '418', 'Fat': '19g', 'Carbs': '60...",https://www.allrecipes.com/recipe/7266/cocoa-a...
74,Country Ham and Biscuits,Dolly Parton and Rachel Parton George's famous...,,,45 mins,6 servings,"['1/4 cup shortening, plus morefor greasing', ...",['Preheat oven to 425 degrees F (220 degrees C...,"{'Calories': '656', 'Fat': '40g', 'Carbs': '57...",https://www.allrecipes.com/country-ham-and-bis...
94,Pork Schnitzel,"To make German-style pork schnitzel, pork loin...",,10 mins,30 mins,4,"['4 (5 ounce) boneless pork loin chops, trimme...",['Place 1 pork chop between 2 sheets of plasti...,"{'Calories': '599', 'Fat': '39g', 'Carbs': '35...",https://www.allrecipes.com/recipe/8532147/pork...
144,This Viral Pan Pizza Recipe Uses Your 9x13 and...,"Let the store do the work, you just have to as...",,,45 mins,4 servings,"['1 pound fresh\xa0pizza dough,\xa0thawed', 'c...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,"{'Calories': '578', 'Fat': '28g', 'Carbs': '59...",https://www.allrecipes.com/trader-joes-spicy-g...
208,Mama's Banana Pudding,Dolly Parton and her sister Rachel Parton Gero...,,,2 hrs 20 mins,24,"['1 cup plus 1 tablespoon sugar', '1/2 cup flo...","['Stir together 1 cup sugar, the flour, and sa...","{'Calories': '318', 'Fat': '13g', 'Carbs': '44...",https://www.allrecipes.com/mamas-banana-puddin...
217,Raspberry and Strawberry Buckle,I've always called this a buckle (I'm from the...,,,,15,"['½ cup butter, softened', '½ cup white sugar'...",['Preheat oven to 375 degrees F (190 degrees C...,"{'Calories': '266', 'Fat': '13g', 'Carbs': '35...",https://www.allrecipes.com/recipe/7603/raspber...
276,Yaki Udon,Yaki Udon is a classic Japanese stir-fry dish ...,,15 mins,35 mins,4.0,"['¼ cup oyster sauce', '3 tablespoons soy sauc...","['Whisk together oyster sauce, soy sauce, miri...","{'Calories': '212', 'Fat': '11g', 'Carbs': '23...",https://www.allrecipes.com/recipe/8539106/yaki...
378,Biryani with Yogurt Marinated Chicken,"This is, as called by my mom, my signature bir...",,45 mins,1 hr 50 mins,12.0,"['1 (8 ounce) container whole milk yogurt', '6...","['Mix yogurt, garlic, ginger, garam masala, tu...","{'Calories': '507', 'Fat': '9g', 'Carbs': '83g...",https://www.allrecipes.com/recipe/246559/birya...
381,Spicy Indian Dahl,A spicy Indian lentil soup that can be enjoyed...,,,,6.0,"['1 cup red lentils', '2 tablespoons ginger ro...",['Cook the lentils by boiling or pressure cook...,"{'Calories': '209', 'Fat': '6g', 'Carbs': '31g...",https://www.allrecipes.com/recipe/13059/spicy-...
542,Thai Noodles,A spicy Thai noodle dish with chicken and vege...,,,,4.0,"['½ pound dried rice noodles', '1 pound skinle...",['Cook rice noodles according to package direc...,"{'Calories': '571', 'Fat': '19g', 'Carbs': '65...",https://www.allrecipes.com/recipe/11690/thai-n...


#### Look into 'total_time'
- 9 recipes with nulls
- Will drop these 9 recipes b/c on goal is to filter out recipes for the user based on total_time

In [19]:
# View null values
df[df['total_time'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
62,Cocoa Apple Cake,This recipe is very moist and rich. I have als...,,,,12.0,"['3 eggs', '2 cups white sugar', '1 cup butte...","['Cream together until fluffy the eggs, sugar,...","{'Calories': '418', 'Fat': '19g', 'Carbs': '60...",https://www.allrecipes.com/recipe/7266/cocoa-a...
217,Raspberry and Strawberry Buckle,I've always called this a buckle (I'm from the...,,,,15.0,"['½ cup butter, softened', '½ cup white sugar'...",['Preheat oven to 375 degrees F (190 degrees C...,"{'Calories': '266', 'Fat': '13g', 'Carbs': '35...",https://www.allrecipes.com/recipe/7603/raspber...
381,Spicy Indian Dahl,A spicy Indian lentil soup that can be enjoyed...,,,,6.0,"['1 cup red lentils', '2 tablespoons ginger ro...",['Cook the lentils by boiling or pressure cook...,"{'Calories': '209', 'Fat': '6g', 'Carbs': '31g...",https://www.allrecipes.com/recipe/13059/spicy-...
542,Thai Noodles,A spicy Thai noodle dish with chicken and vege...,,,,4.0,"['½ pound dried rice noodles', '1 pound skinle...",['Cook rice noodles according to package direc...,"{'Calories': '571', 'Fat': '19g', 'Carbs': '65...",https://www.allrecipes.com/recipe/11690/thai-n...
554,Amazing Simple Thai Tofu,You'd never guess it's Tofu! Peanut and ginger...,,,,4.0,"['1 (14 ounce) package firm tofu, cut into 3/4...",['Heat olive oil and sesame oil in a skillet o...,"{'Calories': '285', 'Fat': '21g', 'Carbs': '11...",https://www.allrecipes.com/recipe/58097/amazin...
730,My Big Fat Greek Omelet,"If you're low-carbing it, try my spinach omele...",,,,4.0,"['1 cup halved grape tomatoes', '1 teaspoon dr...",['Heat a 12-inch non-stick skillet over low he...,"{'Calories': '253', 'Fat': '18g', 'Carbs': '7g...",https://www.allrecipes.com/recipe/80046/my-big...
1222,Birria de Pollo (Chicken Birria) Tacos,"These Mexican birria de pollo tacos, or chicke...",,,,6.0,"['4 cups chicken broth', '4 dried guajillo ch...",['Add chicken broth to a medium saucepan and b...,"{'Calories': '255', 'Fat': '11g', 'Carbs': '8g...",https://www.allrecipes.com/recipe/8463920/birr...
1642,Pierogi Dough,Ukranian Pierogi ready to be stuffed with your...,,,,30.0,"['4 cups all-purpose flour', '1 teaspoon salt'...","['In a large bowl mix together the flour, salt...","{'Calories': '66', 'Fat': '1g', 'Carbs': '13g'...",https://www.allrecipes.com/recipe/12063/pierog...
2158,Birria de Pollo (Chicken Birria) Tacos,"These Mexican birria de pollo tacos, or chicke...",,,,6.0,"['4 cups chicken broth', '4 dried guajillo ch...",['Add chicken broth to a medium saucepan and b...,"{'Calories': '255', 'Fat': '11g', 'Carbs': '8g...",https://www.allrecipes.com/recipe/8463920/birr...


In [20]:
# Drop null values in total_time column
df.dropna(subset = ['total_time'], inplace = True)

In [21]:
# Confirm
df[df['total_time'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url


#### Look into 'cook_time'
- 187 recipes with nulls
- Will leave these 187 recipes as is, goal is to filter out recipes based on total_time so cook_time is not vital. I would rather have the extra 187 recipes

In [22]:
# View null values
df[df['cook_time'].isnull()]

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,nutrition,recipe_url
5,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,15 mins,,2 hrs 15 mins,6,"['5 Little Debbie® Christmas Tree Brownies', '...","['Cut 4 brownies into small pieces, and set as...","{'Calories': '534', 'Fat': '31g', 'Carbs': '61...",https://www.allrecipes.com/little-debbie-brown...
8,Cottage Cheese Chocolate Chip Cookie Dough,This cottage cheese chocolate chip cookie doug...,10 mins,,10 mins,6,"['1 cup 4% milkfat small curd cottage cheese',...","['Place cottage cheese, maple syrup, oats, alm...","{'Calories': '256', 'Fat': '15g', 'Carbs': '24...",https://www.allrecipes.com/cottage-cheese-choc...
16,Blender Chocolate Mousse,"This blender chocolate mousse, with espresso p...",10 mins,,40 mins,4,"['1 cup heavy cream', '1/4 cup unsweetened coc...","[""Combine heavy cream, cocoa powder, confectio...","{'Calories': '252', 'Fat': '22g', 'Carbs': '12...",https://www.allrecipes.com/blender-chocolate-m...
31,Best No-Bake Chocolate Cheesecake,This no-bake chocolate cheesecake has a delici...,30 mins,,2 hrs,8,"['1 cup graham cracker crumbs', '1/4 cup salt...",['Gather your ingredients.\n\n \n\n\n\n\n\n \n...,"{'Calories': '405', 'Fat': '30g', 'Carbs': '32...",https://www.allrecipes.com/no-bake-chocolate-c...
41,5 Minute Baileys Chocolate Mousse,In a hurry for a quick and delicious dessert? ...,5 mins,,5 mins,4,"['1 cup heavy whipping cream', '¼ cup Baileys®...","[""Combine heavy whipping cream, Bailey's, coco...","{'Calories': '305', 'Fat': '23g', 'Carbs': '19...",https://www.allrecipes.com/recipe/8529769/5-mi...
...,...,...,...,...,...,...,...,...,...,...
2020,Asian Chicken Salad,This Asian chicken salad makes good use of lef...,20 mins,,30 mins,6,"['¼ cup vegetable oil', '3 tablespoons rice vi...","['Gather all ingredients.', 'Make the dressing...","{'Calories': '414', 'Fat': '18g', 'Carbs': '39...",https://www.allrecipes.com/recipe/14252/asian-...
2111,Sheet Pan Breakfast Bake,This breakfast bake starts out with a two ingr...,10 mins,,47 mins,8,"['1 cup Greek yogurt', '1 ¼ cups self-rising f...",['Preheat the oven to 350 degrees F (175 degre...,"{'Calories': '286', 'Fat': '18g', 'Carbs': '19...",https://www.allrecipes.com/recipe/8516222/shee...
2148,Steak Burrito,My husband grills ribeye steaks at least once ...,10 mins,,10 mins,1.0,"['1 (10-inch) flour tortilla', '3/4 cup cubed,...",['Fold a paper towel around tortilla. Microwav...,"{'Calories': '2456', 'Fat': '83g', 'Carbs': '3...",https://www.allrecipes.com/steak-burrito-recip...
2229,Homemade Lox,This homemade lox recipe takes some time to pr...,20 mins,,2 days 50 mins,8,"['1 pound salmon fillet, bones removed', '½ cu...",['Rinse salmon with water; pat dry with paper ...,"{'Calories': '120', 'Fat': '3g', 'Carbs': '13g...",https://www.allrecipes.com/recipe/254896/homem...


In [23]:
# Final null value spread
df.isnull().sum()

title            0
intro            0
prep_time       14
cook_time      187
total_time       0
servings         0
ingredients      0
directions       0
nutrition        0
recipe_url       0
dtype: int64

In [24]:
# Shape after dropped nulls
df.shape

(2143, 10)

### 2. Handle Duplicate Values
- There were 273 duplicated recipes in the data frame
- 273 rows dropped from dataframe
- Left with 1_870 unique recipes

In [25]:
# Check for duplicate recipes
df['title'].value_counts()

title
NYC Deli Bacon, Egg, and Cheese Sandwich    5
Ground Pork Tacos with Pineapple Salsa      4
Copycat McDonald's Filet-o-Fish Sandwich    4
Authentic Saag Paneer                       3
Authentic Mexican Chile Rellenos            3
                                           ..
Lamb Feta Peppers                           1
Feta Cheese Turkey Burgers                  1
Roasted Greek Chicken                       1
Margaret's Keftedes (Greek Meatballs)       1
Beef Bourguignon II                         1
Name: count, Length: 1870, dtype: int64

In [26]:
# How many recipes are duplicated ?
df['title'].duplicated().sum()

273

In [27]:
# Want only unique recipes, dropping duplicates
df.drop_duplicates(subset = ['title'], inplace = True)

In [28]:
# Confirm
df['title'].duplicated().sum()

0

In [29]:
df.shape

(1870, 10)

### 3. Handle Data Types
- All data intially stored as strings (object)

In [30]:
# Check data types
df.dtypes

title          object
intro          object
prep_time      object
cook_time      object
total_time     object
servings       object
ingredients    object
directions     object
nutrition      object
recipe_url     object
dtype: object

### 4. Time Conversion
- Columns prep_time, cook_time, total_time have values stored as differnet units: minutes, hours, & days
- Columns prep_time, cook_time, total_time converted to data type float and all units are in minutes

In [31]:
# Look at values in prep_time column
df['prep_time'].value_counts()

prep_time
15 mins         502
10 mins         434
20 mins         373
5 mins          179
30 mins         159
25 mins          99
40 mins          27
45 mins          26
35 mins          25
1 hr             18
4 mins            3
1 hr 10 mins      2
3 mins            2
50 mins           2
2 mins            2
2 hrs             1
1 hr 30 mins      1
22 mins           1
1 hr 25 mins      1
5 hrs             1
8 mins            1
Name: count, dtype: int64

In [32]:
# Look at values in cook_time column
df['cook_time'].value_counts()

cook_time
15 mins          248
10 mins          236
20 mins          227
30 mins          133
25 mins          118
                ... 
5 hrs 10 mins      1
1 hr 28 mins       1
13 mins            1
14 mins            1
1 hr 22 mins       1
Name: count, Length: 84, dtype: int64

In [33]:
# Look at values in total_time column
df['total_time'].value_counts()

total_time
30 mins          161
40 mins          122
35 mins          120
25 mins          117
45 mins          102
                ... 
4 hrs 55 mins      1
8 hrs 45 mins      1
38 mins            1
5 hrs 30 mins      1
1 hr 47 mins       1
Name: count, Length: 177, dtype: int64

#### Time Conversion Function
- Function that takes in a string and converts to minutes

In [34]:
# Define the function
def convert_to_minutes(time_str): 

    # Return the value as is if it's already NaN or empty
    if pd.isna(time_str) or not time_str:
        return time_str

    # Turn into string to process
    time_str = str(time_str).strip()
    
    # Try to convert to float directly (for 20.0, etc)
    try:
        value = float(time_str)
        return int(value)
    except ValueError:
        pass
        
    # Define regular expression raw string
    # '(\d+)\s*days?)?' accounts for days
    # '(\d+)\s*hrs?\s*(\d+)\s*mins' accounts for hour and min string (ie. 1 hr 40 min)
    # '(\d+)\s*hrs?' accounts for hours (ie. 1 hr)
    # 's*(\d+)\s*mins' accounts for mins (ie. 40 mins)
    pattern = r'(?:(\d+)\s*days?)?\s*(?:(\d+)\s*hrs?)?\s*(?:(\d+)\s*mins?)?'

    # Matches the time_str against the pattern to see if there are any matches
    match = re.match(pattern, str(time_str))

    # If a match was found, if statements will run
    if match:
        # Case 1: days, hours, and minutes (or any subset of them)
        days = int(match.group(1)) if match.group(1) else 0
        hours = int(match.group(2)) if match.group(2) else 0
        minutes = int(match.group(3)) if match.group(3) else 0
        return days * 1440 + hours * 60 + minutes

        # Handling invalid formats
        return 

In [35]:
# Apply function to column prep_time
df['prep_time'] = df['prep_time'].apply(convert_to_minutes)

In [36]:
# Confirm
df['prep_time'].value_counts()

prep_time
15.0     502
10.0     434
20.0     373
5.0      179
30.0     159
25.0      99
40.0      27
45.0      26
35.0      25
60.0      18
4.0        3
70.0       2
3.0        2
50.0       2
2.0        2
120.0      1
90.0       1
22.0       1
85.0       1
300.0      1
8.0        1
Name: count, dtype: int64

In [37]:
# Apply function to column cook_time
df['cook_time'] = df['cook_time'].apply(convert_to_minutes)

In [38]:
# Confirm
df['cook_time'].value_counts()

cook_time
15.0     248
10.0     236
20.0     227
30.0     133
25.0     118
        ... 
310.0      1
88.0       1
13.0       1
14.0       1
82.0       1
Name: count, Length: 84, dtype: int64

In [39]:
# Apply function to column total_time
df['total_time'] = df['total_time'].apply(convert_to_minutes)

In [40]:
# Confirm
df['total_time'].value_counts()

total_time
30     161
40     122
35     120
25     117
45     102
      ... 
295      1
525      1
38       1
330      1
107      1
Name: count, Length: 177, dtype: int64

In [41]:
# Check to make sure everything converted corerctly
df.isnull().sum()

title            0
intro            0
prep_time       11
cook_time      171
total_time       0
servings         0
ingredients      0
directions       0
nutrition        0
recipe_url       0
dtype: int64

### 5. Fix servings column
- 'servings' column contains extra information beyond the serving amount
- Strip and removed all extra characters besides numeric serving
- Convert servings column to numeric data type

In [42]:
# Check how data is stored in servings column
df['servings'].unique()

array(['16', '6', '12', '9', '64', '10', '18', '2', '4',
       '12 large squares', '20', '5', '8', '1', '36', '24', '42', '48',
       '6 servings', '10 to 12', '15', '4 (serving size: 1 sandwich)',
       '4 servings', '8 servings', '3', '4 (serving size: 1 apple)',
       '10 Servings', '12 servings', '21', '4.0', '5.0', '6.0', '1.0',
       '2.0', '8.0', '12.0', '6 to 8', '3.0',
       '4 (serving size: 1 sandwich [1 bun, 1 fillet, 2 Tbsp. tartar sauce, 1 slice cheese)',
       '12 sliders', '10.0', '7.0', '4 to 6',
       '6 (serving size: about 1 1/3 cups pasta, about 2 tenderloins)',
       '8 to 10', '4 (serving size: about 1 1/4 cup oatmeal mixture)',
       '2 to 3', '8 (serving size: 3 slices)',
       '4 (serving size: one puff pastry wrapped salmon fillet)',
       '8 (serving size: 1 Funyun’s bag)', '4 (serving size: 2 toasts)',
       '16.0', '15.0', '18.0', '24.0', '6 (serving size: 2 tacos)', '7',
       '4 (serving size: 1 cutlet, 2 Tbsp. sauce)',
       '10 (serving 

In [43]:
# Clean servings column
df['servings'] = df['servings'].str.extract(r'(\d+\.?\d*)', expand=False)
# Removes extra characters 

In [44]:
# Convert the servings column to numeric (float or int)
df['servings'] = pd.to_numeric(df['servings'], errors='coerce')

In [45]:
# Confirm
df['servings'].value_counts()

servings
4.0     578
6.0     376
8.0     274
2.0     164
12.0    137
1.0      83
10.0     70
5.0      40
16.0     39
3.0      21
24.0     18
20.0     16
18.0     13
15.0      9
9.0       6
36.0      4
7.0       4
32.0      2
48.0      2
21.0      1
42.0      1
13.0      1
64.0      1
72.0      1
25.0      1
35.0      1
28.0      1
33.0      1
45.0      1
22.0      1
44.0      1
26.0      1
14.0      1
Name: count, dtype: int64

## Transforming

### 1. Nutrition Column
- 'nutrition' column contains: Calories, Fat, Carbs, & Protein
- Seperate out calories and different macros into seperate categories

In [46]:
df['nutrition'].head()

0    {'Calories': '405', 'Fat': '31g', 'Carbs': '28...
2    {'Calories': '1058', 'Fat': '65g', 'Carbs': '1...
3    {'Calories': '824', 'Fat': '55g', 'Carbs': '80...
4    {'Calories': '231', 'Fat': '12g', 'Carbs': '29...
5    {'Calories': '534', 'Fat': '31g', 'Carbs': '61...
Name: nutrition, dtype: object

In [47]:
# Function to clean nutrition data and convert it to a dictionary
def clean_and_parse_nutrition(nutrition_str):
    if isinstance(nutrition_str, str):
        # Clean string by removing 'g' and replace single quotes with double quotes
        cleaned_str = re.sub(r'(\d+)(g)', r'\1', nutrition_str)  # Removes 'g'
        cleaned_str = cleaned_str.replace("'", '"')  # Replaces single quotes with double quotes for valid JSON
        print(f"Cleaned string: {cleaned_str}")  # Print the cleaned string for debugging
        try:
            return ast.literal_eval(cleaned_str)  # Convert to dictionary
        except Exception as e:
            print(f"Error parsing string: {nutrition_str}, Error: {e}")
            return {}
    return {}

In [48]:
# Apply the cleaning function to the 'nutrition' column
df['nutrition'] = df['nutrition'].apply(clean_and_parse_nutrition)

Cleaned string: {"Calories": "405", "Fat": "31", "Carbs": "28", "Protein": "5"}
Cleaned string: {"Calories": "1058", "Fat": "65", "Carbs": "113", "Protein": "10"}
Cleaned string: {"Calories": "824", "Fat": "55", "Carbs": "80", "Protein": "9"}
Cleaned string: {"Calories": "231", "Fat": "12", "Carbs": "29", "Protein": "3"}
Cleaned string: {"Calories": "534", "Fat": "31", "Carbs": "61", "Protein": "6"}
Cleaned string: {"Calories": "86", "Fat": "4", "Carbs": "11", "Protein": "1"}
Cleaned string: {"Calories": "256", "Fat": "15", "Carbs": "24", "Protein": "9"}
Cleaned string: {"Calories": "143", "Fat": "6", "Carbs": "20", "Protein": "2"}
Cleaned string: {"Calories": "107", "Fat": "7", "Carbs": "10", "Protein": "1"}
Cleaned string: {"Calories": "260", "Fat": "16", "Carbs": "26", "Protein": "4"}
Cleaned string: {"Calories": "174", "Fat": "8", "Carbs": "25", "Protein": "2"}
Cleaned string: {"Calories": "501", "Fat": "20", "Carbs": "79", "Protein": "5"}
Cleaned string: {"Calories": "889", "Fat":

In [49]:
# Extract the nutritional components into separate columns
df['calories'] = df['nutrition'].apply(lambda x: int(x.get('Calories', 0)) if x else 0)
df['fat'] = df['nutrition'].apply(lambda x: int(x.get('Fat', '0').strip()) if x else 0)
df['carbs'] = df['nutrition'].apply(lambda x: int(x.get('Carbs', '0').strip()) if x else 0)
df['protein'] = df['nutrition'].apply(lambda x: int(x.get('Protein', '0').strip()) if x else 0)

In [50]:
# Drop nutrition column, no longer need
df = df.drop(columns=['nutrition'])

In [51]:
# Confirm
df.head(3)

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,ingredients,directions,recipe_url,calories,fat,carbs,protein
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,"['1/3 cup butter, melted', '1/4 cup white suga...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,"['1 cup dark chocolate chips', '20 creme-fille...",['Melt chocolate chips in a microwave-safe bow...,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,"['cooking spray', '2 1/4 cups all-purpose flou...",['Gather all ingredients.\n\n \n\n\n\n\n\n \n ...,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9


## Clean and Preprocess 'ingredients'

### 1. Convert Column

In [52]:
df['ingredients'].head()

0    ['1/3 cup butter, melted', '1/4 cup white suga...
2    ['1 cup dark chocolate chips', '20 creme-fille...
3    ['cooking spray', '2 1/4 cups all-purpose flou...
4    ['1 sheet of prepared puff pastry, thawed', '9...
5    ['5 Little Debbie® Christmas Tree Brownies', '...
Name: ingredients, dtype: object

In [53]:
df['ingredients'].unique()
# ingredients column contains: quantities, units, and descriptors

array(["['1/3 cup butter, melted', '1/4 cup white sugar', '1 1/4 cups finely crushed chocolate or regular graham crackers (about 9 sheets)', '7 ounces bittersweet chocolate', '4 large eggs, at room temperature', '3/4 cup white sugar', '1/4 cup packed brown sugar', '3 tablespoons water, at room temperature', '1/4 teaspoon salt', '1 tablespoon vanilla extract', '1/2 cup unsalted butter, at room temperature, cut into 8 pieces', '1/2 cup heavy cream', '3 ounces cream cheese, softened', '3 tablespoons white sugar', '1 teaspoon vanilla extract', '1 pinch salt', '1 1/2 cups heavy cream', 'chocolate curls or chocolate sprinkles']",
       "['1 cup dark chocolate chips', '20 creme-filled chocolate sandwich cookies (such as OREO®)', '4 tablespoons butter, melted', '1 tablespoon instant espresso powder', '1 tablespoon hot water', '1 1/2 tablespoons cocoa powder, plus more for sprinkling', '1 tablespoon coffee liqueur (such as Kahlua®)', '8 ounces cream cheese, softened', '1/2 cup white sugar', '1

In [54]:
type(df['ingredients'].iloc[0])
# ingredients are stored as a series of strings, want to convert to a list to be able to loop through and apply transformations

str

In [55]:
# Convert ingredients column for each cell to contain a list of ingredient strings
df['ingredients'] = df['ingredients'].apply(ast.literal_eval)

In [56]:
# Confirm
type(df['ingredients'].iloc[0])

list

### 2. Normalize Text

In [57]:
df['ingredients'][0]
# I want to remove punctuation, parentheses any anything in them, all lowercase, and strip any extra whitespace

['1/3 cup butter, melted',
 '1/4 cup white sugar',
 '1 1/4 cups finely crushed chocolate or regular graham crackers (about 9 sheets)',
 '7 ounces bittersweet chocolate',
 '4 large eggs, at room temperature',
 '3/4 cup white sugar',
 '1/4 cup packed brown sugar',
 '3 tablespoons water, at room temperature',
 '1/4 teaspoon salt',
 '1 tablespoon vanilla extract',
 '1/2 cup unsalted butter, at room temperature, cut into 8 pieces',
 '1/2 cup heavy cream',
 '3 ounces cream cheese, softened',
 '3 tablespoons white sugar',
 '1 teaspoon vanilla extract',
 '1 pinch salt',
 '1 1/2 cups heavy cream',
 'chocolate curls or chocolate sprinkles']

In [58]:
# Write a function to lower(), strip(), remove punctuation, numerics, and anything in parentheses
def first_clean_text(ingredient_list):
    clean_ingredients = [] # Create an empty list where cleaned ingredients will be stored

    number_pattern = r'\d+' # regular expression to remove all numbers
    fraction_pattern = r'[\u00BC-\u00BE\u2150-\u215E]' # unicode fraction types
    
    for item in ingredient_list: # iterate through each ingredient
        item = item.lower().strip() # converts all text to lower case and removes extra spaces
        item = re.sub(r'\(.*?\)', '', item) # regular expression to remove any text within parenteses 
        item = re.sub(fraction_pattern, '', item) # removes unicode fractions
        item = item.translate(str.maketrans('', '', string.punctuation)) # removes any punctuation using a built in python table
        item = re.sub(number_pattern, '', item) # removes all numbers
        item = item.strip() # strip any extra spaces after the rest
        clean_ingredients.append(item) # takes the cleaned ingredient item and add it to the clean_ingredients list
        
    return clean_ingredients

In [59]:
# Apply the funciton to the ingredients column, save as a new column just in case
df['cleaned_ingredients'] = df['ingredients'].apply(first_clean_text)

In [60]:
# Confirm
df['cleaned_ingredients'][0]

['cup butter melted',
 'cup white sugar',
 'cups finely crushed chocolate or regular graham crackers',
 'ounces bittersweet chocolate',
 'large eggs at room temperature',
 'cup white sugar',
 'cup packed brown sugar',
 'tablespoons water at room temperature',
 'teaspoon salt',
 'tablespoon vanilla extract',
 'cup unsalted butter at room temperature cut into  pieces',
 'cup heavy cream',
 'ounces cream cheese softened',
 'tablespoons white sugar',
 'teaspoon vanilla extract',
 'pinch salt',
 'cups heavy cream',
 'chocolate curls or chocolate sprinkles']

In [61]:
# Write a function to remove all numerics and all units of measure
def clean_units(ingredient_list):
    no_units_ingredients = []  # Create an empty list where cleaned ingredients will be stored
    # define the untis of measurement
    quantity_and_units = r'\b(cup|cups|ounce|ounces|oz|tablespoon|tablespoons|tbsp|teaspoon|teaspoons|tsp|pinch|pinches|container|containers|quart|quarts|packet|packets|package|packages|sheetsheets|can|cans|sleeve|sleeves|whole|slice|slices|clove|cloves|pound|pounds|large|medium|small|tiny|jumbo|sheet|sheets|mini)\b'

    for item in ingredient_list:
        item = re.sub(quantity_and_units, '', item) # remove units
        item = item.strip() # remove any extra whitespace after removal
        no_units_ingredients.append(item)

    return no_units_ingredients

In [62]:
# Apply the funciton to the cleaned_ingredients column
df['cleaned_ingredients'] = df['cleaned_ingredients'].apply(clean_units)

In [63]:
# Confirm
df['cleaned_ingredients'][0]

['butter melted',
 'white sugar',
 'finely crushed chocolate or regular graham crackers',
 'bittersweet chocolate',
 'eggs at room temperature',
 'white sugar',
 'packed brown sugar',
 'water at room temperature',
 'salt',
 'vanilla extract',
 'unsalted butter at room temperature cut into  pieces',
 'heavy cream',
 'cream cheese softened',
 'white sugar',
 'vanilla extract',
 'salt',
 'heavy cream',
 'chocolate curls or chocolate sprinkles']

In [64]:
from collections import Counter

In [65]:
all_ingredients = [ingredient for sublist in df['cleaned_ingredients'] for ingredient in sublist]
ingredient_counts = Counter(all_ingredients)

In [66]:
ingredient_counts

Counter({'salt': 534,
         'olive oil': 346,
         'water': 328,
         'garlic minced': 303,
         'white sugar': 272,
         'allpurpose flour': 231,
         'soy sauce': 210,
         'ground black pepper': 204,
         'vegetable oil': 200,
         'eggs': 193,
         'kosher salt': 180,
         'freshly ground black pepper': 176,
         'milk': 175,
         'butter': 170,
         'vanilla extract': 159,
         'garlic powder': 138,
         'onion chopped': 135,
         'cornstarch': 133,
         'ground cumin': 130,
         'salt and ground black pepper to taste': 119,
         'cayenne pepper': 117,
         'salt and freshly ground black pepper to taste': 111,
         'dried oregano': 105,
         'heavy cream': 103,
         'minced garlic': 100,
         'egg': 99,
         'brown sugar': 97,
         'lemon juice': 89,
         'chicken broth': 89,
         'sesame oil': 86,
         'unsalted butter': 83,
         'baking powder': 81,
        

In [99]:
# Now I want to remove all prep words
prep_words = ['minced','freshly','ground','chopped','to taste','dried','shredded','ground','grated','beaten','crushed','extravirgin','crumbled',
              'or','diced','whipping','melted','lean','juiced','divided','skinless','boneless','halves','fresh','dry','plain','finely','roasted',
              'unsweetened','distilled','granulated','packed','cold','softened','light','sliced','at room temperature','for','frying','sweetened',
              'extra virgin','peeled''and deveined','seasoned','rinsed and','drained','as needed','cubes','semisweet','florets','thinly','lowsodium',
              'removed','cut','into','wedges','and rinsed','mix','nonstick','toasted','reducedsodium','stalks','instant','boiling','bunch','zested',
              'warm','peeled','pitted','dark','rolled','uncooked','cooked','creamy','halved','plus','granules','serving','skewers','prepared',
              'smashed','and diced','filets','more','split','lukewarm','bitesized','pieces','thawed','and drained','to','cover','stalk','pure',
              'with flour','fine','dry','jar','inch','of','thickcut','lessodium','low','long grain','prepared','drops','dashes','leaves','chunk',
              'chunks','garnish','full','fat','wooden skewers','about','bonein','cubed','bottle','envelope','refrigerated','raw','filet','and cubed',
              'caps','canned','and squeezed','shortgrain','glutinous','japanese sushistyle','crunchy','sprig','sprigs','natural','fullfat','very',
              'baked','regular','deepdish','fireroasted','sushi','japanese','chinese','scrubbed','pint','thin','seperated','rings','lengthwise',
              'head','seperated','marinated','quartered','cored','seeded','splash','coarsely','wholemilk','coarsely','miniature','dipping','meat'
              'complete','italianstyle','jarred','ball','torn','hardboiled','and','reducedfat','processed','lb','pickled','firm','soaked',
              'coarsely','flaked','flaky','lesssodium','roughly','if needed','cube','pressed','undrained','bittersweet','seperated','snipped',
              'dusting','warmed','sodium','lowfat','with juice','gallons','tenderizer','matchsticks','extrafirm','stick','cleaned','pimentostuffed',
              'longgrain','flavoring','stems','lowfat','rinsed','mixed','well','cutlets','reduced','blend','deveined','pounded','thickness',
              'curd','rainbow jimmie','chilled','pureed','quickcooking','fullycooked','soft','drizzling','squeezed','shelled','diagonally','strips',
              'parts','seperated','bitesize','broken','indian','clarified','cube','new','in water','minutes','debearded','crosswise','from',
              'squares parchement paper','ranchstyle','bibb','lightly','thick','and deveined','in water minutes','drizzling','cracked','thick',
              'fillets','freshlysqueezed','patted','links','piece','ripe','portions','such as','holiday','premium','milkfat','blanched','superfine',
              'xinch parchment paper','heads','extra','sunsulphured','frozen','lowersodium','reserved','jars','stemmed','trimmed','horizontally',
              's’moresized','strong','mashed','pan','matchstick','with','unsulphured','shaved','separated','sea','julienned','lacinato',
              'squares parchment paper','discarded','high','protein','good culture®','round','breaded','fully','chunky','saltfree','roast',
              'red pink white jimmie','canadian','mild','precut','dutch process','wholewheat','java chip flavored','dairy free','virgin','cooled',
              'almond breeze','blue diamond','highquality','crumbs','firmly','fatfree','nondairy','crumb','dash','lowmoisture partskim','oilpacked',
              'stewed','asian','jellied','silver skin','rounds','bag','scoop','brushing','additional greasing','topping','until smooth','morsels',
              'gel','box','rolling','buttery','arils','nonfat','deep','fluid','lump','milkfish','the','other cooking apples eighths','whitefleshed',
              'in shell','any color','complete','using holes on grater','optional toppings','kosher','root','additional','if','desired','aged',
              'dredging','sifted','milk choice','in half','aluminum foil','thirds','up','flakes','raos®','malk®','greasing','rubbed','vertically',
              'concentrate','soybased','loosely','loaf','sticks','genoa','italianseasoned','quarters','spears','in oil','fire','brewed','bulk',
              'turbinado','demerara','matchsticksize','montreal','knorr®','whisked','strained','sandwich','excess','base',
              'caramel pumpkin pie spice candy eyes','bits','waxed paper','pepperidge farm®','spread boursin®','other cooking apples eighths',
              'dots® honey mustard pretzels','centercut','red pink white jimmie','converted','snacksized bags cookies',
              '® teddy grahams® oreos® nilla wafers®','reese’s® peanut butter hershey’s® chocolate bar','stirfry','other temperature oil',
              'morefor','carton','several','thincut','nonpareil','bone in','doublecut','any chile pepper','louisianastyle','seafood seasoning',
              'chile de','cheesy','snaps','in adobo','hawaiianstyle','dice','mostly','sauceless','few hot sauce','spice','rack','membrane','oval',
              'jimmy dean® original hearty','crumbles','goodquality','flatleaf','top','oil cooking','stale italian','cured picnic','hickory chips',
              'readytoeat','country style','italian flat leaf','thickly','racks','chocolate curls','other','thickskinned','thinskinned','english']

In [100]:
# Write a function to remove all the prep words for the ingredients
def remove_prep_words(ingredient_list):
    
    pattern = pattern = r'(?<!\w)(?:' + '|'.join(re.escape(word) for word in prep_words) + r')(?!\w)' # regular expression pattern to remove prep_words
    cleaned = [re.sub(pattern, '', ingredient).strip() for ingredient in ingredient_list] # remove words and clean up extra whitespace
    
    return [re.sub(r'\s+', ' ', ing) for ing in cleaned if ing]

In [101]:
df['cleaned_ingredients'] = df['cleaned_ingredients'].apply(remove_prep_words)

In [102]:
# Confirm
df['cleaned_ingredients'][0]

['butter',
 'white sugar',
 'chocolate graham crackers',
 'chocolate',
 'eggs',
 'white sugar',
 'brown sugar',
 'water',
 'salt',
 'vanilla extract',
 'unsalted butter',
 'heavy cream',
 'cream cheese',
 'white sugar',
 'vanilla extract',
 'salt',
 'heavy cream',
 'chocolate sprinkles']

In [103]:
all_ingredients = [ingredient for sublist in df['cleaned_ingredients'] for ingredient in sublist]
ingredient_counts = Counter(all_ingredients)
ingredient_counts

Counter({'salt': 988,
         'garlic': 738,
         'black pepper': 515,
         'olive oil': 499,
         'water': 451,
         'onion': 432,
         'butter': 324,
         'soy sauce': 302,
         'eggs': 301,
         'white sugar': 299,
         'vegetable oil': 280,
         'allpurpose flour': 263,
         'salt black pepper': 261,
         'milk': 209,
         'ginger': 207,
         'brown sugar': 187,
         'unsalted butter': 176,
         'cilantro': 174,
         'vanilla extract': 168,
         'heavy cream': 168,
         'cayenne pepper': 162,
         'parsley': 155,
         'egg': 154,
         'garlic powder': 151,
         'lemon juice': 149,
         'green onions': 149,
         'cumin': 145,
         'cornstarch': 140,
         'parmesan cheese': 140,
         'oregano': 137,
         'red pepper': 130,
         'chicken broth': 124,
         'sesame oil': 118,
         'chicken breast': 117,
         'tomatoes': 116,
         'red bell pepper': 115

### Create clusters for total_time, calories, carbs, proteins, fats

In [104]:
# total_time

def categorize_time(total_time):
   
    if total_time <= 30:
        return '30 minutes or less!'
    elif 31 <= total_time <= 60:
        return 'Hour of less!'
    elif 61 <= total_time <= 120:
        return 'Long recipes!'
    else:
        return 'Livin in the kitchin!'


# Assuming df has a column 'total_time' which stores the time in minutes
df['time_category'] = df['total_time'].apply(categorize_time)

In [106]:
df.head(3)

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!


In [109]:
# calories column

def categorize_calories(calories):
   
    if calories <= 300:
        return 'Low Cal!'
    elif 301 <= calories <= 600:
        return 'Average Cal!'
    elif 601 <= calories <= 1000:
        return 'High Cal!'
    else:
        return "Don't Look!"


# Assuming df has a column 'total_time' which stores the time in minutes
df['calorie_category'] = df['calories'].apply(categorize_calories)

In [110]:
df.head(3)

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!


In [116]:
# fats column

def categorize_fats(fat):
   
    if fat <= 15:
        return 'Low Fat!'
    elif 16 <= fat <= 45:
        return 'Average Fat!'
    else:
        return "High Fat!"


# Assuming df has a column 'total_time' which stores the time in minutes
df['fat_category'] = df['fat'].apply(categorize_fats)

In [117]:
df.head(3)

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category,fat_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!,Average Fat!
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!,High Fat!
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!,High Fat!


In [118]:
# carbs column

def categorize_carbs(carbs):
   
    if carbs <= 25:
        return 'Low Carbs!'
    elif 26 <= carbs <= 80:
        return 'Average Carbs!'
    else:
        return "High Carbs!"


# Assuming df has a column 'total_time' which stores the time in minutes
df['carbs_category'] = df['carbs'].apply(categorize_carbs)

In [119]:
df.head(3)

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category,fat_category,carbs_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!,High Fat!,High Carbs!
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!


In [122]:
# protein column

def categorize_proteins(proteins):
   
    if proteins <= 25:
        return 'Low Protein!'
    elif 26 <= proteins <= 80:
        return 'Average Protein!'
    else:
        return "High Protein!"


# Assuming df has a column 'total_time' which stores the time in minutes
df['protein_category'] = df['protein'].apply(categorize_proteins)

In [123]:
df.head()

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category,fat_category,carbs_category,protein_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!
2,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!,High Fat!,High Carbs!,Low Protein!
3,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!,Low Protein!
4,Sleeping Gingerbread Treats,Shhhh they're sleeping!,15.0,15.0,30,9.0,https://www.allrecipes.com/sleeping-gingerbrea...,231,12,29,3,"[puff pastry, chocolate squares, gingerbread m...",30 minutes or less!,Low Cal!,Low Fat!,Average Carbs!,Low Protein!
5,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,15.0,,135,6.0,https://www.allrecipes.com/little-debbie-brown...,534,31,61,6,"[little debbie® christmas tree brownies, cream...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!


In [124]:
df.to_csv('../data/recipes_cleaned.csv', index=False)