In [1]:
import pandas as pd
import seaborn as sbn

## Ingest Data

In [2]:
df = pd.read_csv('../data/RAW_recipes.csv')#, index_col='id'

In [3]:
ingredients_df = df[["id", "ingredients"]]
ingredients_df.head()

Unnamed: 0,id,ingredients
0,137739,"['winter squash', 'mexican seasoning', 'mixed ..."
1,31490,"['prepared pizza crust', 'sausage patty', 'egg..."
2,112140,"['ground beef', 'yellow onions', 'diced tomato..."
3,59389,"['spreadable cheese with garlic and herbs', 'n..."
4,44061,"['tomato juice', 'apple cider vinegar', 'sugar..."


In [99]:
ingredients_df.count()

id             231637
ingredients    231637
dtype: int64

## Data Cleaning
### Explode the list of ingredients per recipe

In [5]:
from ast import literal_eval
# Convert to list so explode works
ingredients_df["ingredients"] = ingredients_df["ingredients"].apply(literal_eval)

ingredients_exploded_df = ingredients_df.explode("ingredients", ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredients_df["ingredients"] = ingredients_df["ingredients"].apply(literal_eval)


In [6]:
display(ingredients_exploded_df)

Unnamed: 0,id,ingredients
0,137739,winter squash
1,137739,mexican seasoning
2,137739,mixed spice
3,137739,honey
4,137739,butter
...,...,...
2096577,298509,eggs
2096578,298509,flour
2096579,298509,cream of tartar
2096580,298509,baking soda


In [7]:
ingredients_exploded_df.groupby('ingredients').count()

Unnamed: 0_level_0,id
ingredients,Unnamed: 1_level_1
1% fat buttermilk,24
1% fat cottage cheese,32
1% low-fat chocolate milk,2
1% low-fat milk,472
10 bean soup mix,2
...,...
zoom quick hot cereal,1
zucchini,4591
zucchini with italian-style tomato sauce,3
zwieback toast,5


In [8]:
# display(pd.get_dummies(ingredients_exploded_df["ingredients"]))

## Group by ingredient to get a feel for the least common

This helps to identify areas that perhaps could use some cleaning, such as removing brand names

In [9]:
ingredient_counts = ingredients_exploded_df.groupby("ingredients").count().rename(columns={"id": "count"})

In [10]:
ingredient_counts.sort_values("count", ascending=True).head()

Unnamed: 0_level_0,count
ingredients,Unnamed: 1_level_1
zwieback toast crumbs,1
frozen cranberry-apple juice cocktail,1
frozen corn souffle,1
soyaki,1
frozen chopped mangoes,1


Reset the index so we can split the data even more

In [11]:
ingredient_counts = ingredient_counts.reset_index()

Split the ingredient values so that we can easily remove words that are less necessary to the integrity of the recipe

In [12]:
split_values = ingredient_counts.ingredients.str.split().rename("words")

In [13]:
df_merged = pd.concat([ingredient_counts, split_values], axis=1)

In [14]:
display(df_merged)

Unnamed: 0,ingredients,count,words
0,1% fat buttermilk,24,"[1%, fat, buttermilk]"
1,1% fat cottage cheese,32,"[1%, fat, cottage, cheese]"
2,1% low-fat chocolate milk,2,"[1%, low-fat, chocolate, milk]"
3,1% low-fat milk,472,"[1%, low-fat, milk]"
4,10 bean soup mix,2,"[10, bean, soup, mix]"
...,...,...,...
14937,zoom quick hot cereal,1,"[zoom, quick, hot, cereal]"
14938,zucchini,4591,[zucchini]
14939,zucchini with italian-style tomato sauce,3,"[zucchini, with, italian-style, tomato, sauce]"
14940,zwieback toast,5,"[zwieback, toast]"


In [15]:
all_words = df_merged.explode("words", ignore_index=True)

In [16]:
display(all_words.groupby("words").count().sort_values("count", ascending=False)[:10])

Unnamed: 0_level_0,ingredients,count
words,Unnamed: 1_level_1,Unnamed: 2_level_1
mix,649,649
cheese,562,562
sauce,449,449
chocolate,445,445
and,361,361
chicken,356,356
dried,327,327
cream,323,323
red,305,305
frozen,289,289


In [17]:
all_words[all_words['ingredients'].str.contains("cheese")]

Unnamed: 0,ingredients,count,words
3,1% fat cottage cheese,32,1%
4,1% fat cottage cheese,32,fat
5,1% fat cottage cheese,32,cottage
6,1% fat cottage cheese,32,cheese
122,2% cheddar cheese,68,2%
...,...,...,...
40786,whole milk ricotta cheese,95,cheese
41377,yellow cheese,18,yellow
41378,yellow cheese,18,cheese
41528,yogurt cheese,19,yogurt


### Remove descriptive words

In [83]:
df_merge_exploded = df_merged.explode('words')

adjectives_to_remove = ['kraft', 'low-carb', 'low-fat', 'prepared', 'fresh', 'frozen', 'grated', 'unsweetened',\
                        'reduced', 'fat', 'free', 'fat-free', 'sugar-free', 'boneless', 'and', 'salad', 'wishbone',\
                        'whole', 'dried', 'inch', "in.", "baby", "betty", "crocker", "reduced-fat", "cooked", 'extra',
                       'large', 'extra-large', 'dry', 'duncan', 'hines', 'best', 'unsalted', 'vegan', 'vegetarian', \
                       'whole', 'wheat', 'toasted', 'unbaked', 'unbleached', 'unsifted', 'boneless', 'breaded', 'canned',\
                       'lean', 'half', 'skinless'] 
adjectives_to_remove.extend([f"{i}%" for i in range(0,101)])
adjectives_to_remove.extend(["8-in", "8\"", "9-in", "9\""])
adjectives_to_remove.extend([f"{i}-inch" for i in range(1,24)])
adjectives_to_remove.extend([str(i) for i in range(0, 100)])
adjectives_to_remove.extend(["red", "yellow", "green", "brown"])

simplified_ingredient_list_df = df_merge_exploded[~df_merge_exploded['words'].isin(adjectives_to_remove)]

In [84]:
display(simplified_ingredient_list_df.groupby("words").count().sort_values("count", ascending=False)[:25])

Unnamed: 0_level_0,ingredients,count,new_name
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mix,649,649,649
cheese,562,562,562
sauce,449,449,449
chocolate,445,445,445
chicken,356,356,356
cream,323,323,323
beef,284,284,284
white,265,265,265
pepper,250,250,250
juice,250,250,250


In [85]:
simplified_ingredient_list_df.head()

Unnamed: 0,ingredients,count,words,new_name
0,1% fat buttermilk,24,buttermilk,buttermilk
1,1% fat cottage cheese,32,cottage,cottage cheese
1,1% fat cottage cheese,32,cheese,cottage cheese
2,1% low-fat chocolate milk,2,chocolate,chocolate milk
2,1% low-fat chocolate milk,2,milk,chocolate milk


In [86]:
import inflection as inf

def combine_with_stop_words(x):
    arr = []
    for item in x:
        if item == "with":
            break
        if '\'' in item:
            continue
        arr.append(inf.singularize(item))
            
    return ' '.join(arr)

new_ingredient_series = simplified_ingredient_list_df.groupby(level=0, axis=0)['words'].agg(lambda x: combine_with_stop_words(x))

In [87]:
new_ingredient_series

0                   buttermilk
1               cottage cheese
2               chocolate milk
3                         milk
4                bean soup mix
                 ...          
14937    zoom quick hot cereal
14938                 zucchini
14939                 zucchini
14940           zwieback toast
14941     zwieback toast crumb
Name: words, Length: 14939, dtype: object

In [88]:
df_merged["new_name"] = new_ingredient_series

In [89]:
df_merged

Unnamed: 0,ingredients,count,words,new_name
0,1% fat buttermilk,24,"[1%, fat, buttermilk]",buttermilk
1,1% fat cottage cheese,32,"[1%, fat, cottage, cheese]",cottage cheese
2,1% low-fat chocolate milk,2,"[1%, low-fat, chocolate, milk]",chocolate milk
3,1% low-fat milk,472,"[1%, low-fat, milk]",milk
4,10 bean soup mix,2,"[10, bean, soup, mix]",bean soup mix
...,...,...,...,...
14937,zoom quick hot cereal,1,"[zoom, quick, hot, cereal]",zoom quick hot cereal
14938,zucchini,4591,[zucchini],zucchini
14939,zucchini with italian-style tomato sauce,3,"[zucchini, with, italian-style, tomato, sauce]",zucchini
14940,zwieback toast,5,"[zwieback, toast]",zwieback toast


In [90]:
pd.unique(df_merged.ingredients).shape

(14942,)

In [91]:
pd.unique(df_merged.new_name).shape

(11325,)

In [92]:
pd.unique(df_merged.new_name)[1000:1400]

array(['blueberry', 'blueberry in heavy syrup',
       'blueberry in light syrup', 'blueberry bagel', 'blueberry filling',
       'blueberry gelatin', 'blueberry ice cream', 'blueberry jam',
       'blueberry juice', 'blueberry liqueur', 'blueberry muffin mix',
       'blueberry pie filling', 'blueberry preserf', 'blueberry sauce',
       'blueberry schnapp', 'blueberry sorbet', 'blueberry tea bag',
       'blueberry vinegar', 'blueberry vodka', 'blueberry wine',
       'blueberry yogurt', 'blueberry-flavored syrup', 'bluefish',
       'bluefish fillet', 'blush wine', 'blush wine vinaigrette dressing',
       'boar', 'bob evan sausage',
       'mill gluten-free all-purpose baking flour', 'boboli pizza crust',
       'boboli thin pizza shell', 'boca meatless ground burger',
       'bocconcini', 'bock beer', 'bockwurst', 'boil-in-the-bag rice',
       'boiled ham', 'boiled potato', 'boiling onion', 'boiling potato',
       'boiling water', 'bolillo roll', 'bologna', 'bologna sausage',
  

## One-hot encode

In [96]:
# pd.get_dummies(df_merged['new_name'], prefix='b')

## TODO
* Join w/ ingredients_exploded_df on ingredients
* Re-do the grouping
* one-hot encode