In [9]:
import pandas as pd

In [10]:
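# One-time setup: copy the first 100,001 lines of the full dataset (the header line
# plus up to 100,000 recipe rows) into a smaller working file. Left commented out so
# that re-running the notebook does not regenerate the subset.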
# full = open('full_dataset.tsv', 'r')
# subset = open('recipe_dataset.tsv', 'w')
# for _ in range(100001):
#     subset.write(full.readline())
# full.close()
# subset.close()

In [11]:
# Load the tab-separated recipe subset and reshape it into the working schema:
# keep title / link / NER, add one boolean column per dietary flag (all starting
# as True) plus placeholder cook_time and cuisine_type columns.
def _flatten_ner(raw):
    """Convert one raw NER cell into a lowercase ';'-separated string.

    Drops the first and last two characters and turns every '", "' separator
    into ';'. Assumes the raw cell looks like '["a", "b"]' -- TODO confirm
    against the dataset.
    """
    return raw[2:-2].replace('", "', ';').lower()


recipe_df = pd.read_csv('recipe_dataset.tsv', sep='\t').drop(
    columns=['Unnamed: 0', 'ingredients', 'directions', 'source']
)
# The positional index doubles as the recipe id, so the exported
# 'Unnamed: 0' column is dropped above rather than renamed.
recipe_df.index.name = 'id'

# Every recipe starts out compatible with every diet.
for diet_flag in ('vegan', 'vegetarian', 'lactose', 'gluten', 'halal',
                  'kosher', 'nut', 'shellfish', 'pescatarian'):
    recipe_df[diet_flag] = True

# Placeholders; nothing in this notebook fills them in yet.
recipe_df['cook_time'] = 0
recipe_df['cuisine_type'] = 'none'

recipe_df['NER'] = recipe_df['NER'].map(_flatten_ner)
recipe_df.head()

Unnamed: 0_level_0,title,link,NER,vegan,vegetarian,lactose,gluten,halal,kosher,nut,shellfish,pescatarian,cook_time,cuisine_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,brown sugar;milk;vanilla;nuts;butter;bite size...,True,True,True,True,True,True,True,True,True,0,none
1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,beef;chicken breasts;cream of mushroom soup;so...,True,True,True,True,True,True,True,True,True,0,none
2,Creamy Corn,www.cookbooks.com/Recipe-Details.aspx?id=10570,frozen corn;cream cheese;butter;garlic powder;...,True,True,True,True,True,True,True,True,True,0,none
3,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,chicken;chicken gravy;cream of mushroom soup;s...,True,True,True,True,True,True,True,True,True,0,none
4,Reeses Cups(Candy),www.cookbooks.com/Recipe-Details.aspx?id=659239,peanut butter;graham cracker crumbs;butter;pow...,True,True,True,True,True,True,True,True,True,0,none


In [12]:
# Remove recipes that cannot be screened: those whose cleaned NER string is empty
# and those whose NER string is longer than 1000 characters (the limit is on the
# string length, not on the number of ingredients).
ner_is_empty = recipe_df['NER'].eq("")
ner_is_too_long = recipe_df['NER'].map(len).gt(1000)
rows_to_remove = recipe_df.index[ner_is_empty | ner_is_too_long]
recipe_df.drop(index=rows_to_remove, inplace=True)

# Number of recipes that survive the filter.
len(recipe_df)

99983

In [13]:
# Keyword sets for the dietary screening step. Each set lists ingredient words that
# rule a recipe OUT of the diet of the same name: the flagging loop clears a
# recipe's flag when any of its ingredient words appears in the matching set.
# Matching is on whole words, which is why a few entries are spelled in the plural.

# Animal-derived products other than meat and seafood (those live in `vegetarian`).
vegan = {
    'gelatin', 'jello', 'honey', 'egg', 'cheese', 'milk', 'chocolate',
    'marshmallow', 'yogurt', 'cream', 'butter',
}

# NOTE(review): 'cream' and 'butter' are listed under vegan but not here -- confirm
# whether that is intentional.
lactose = {'cheese', 'milk', 'yogurt'}

# NOTE(review): 'tofu' is listed as a gluten keyword -- confirm.
gluten = {
    'bread', 'pie', 'cake', 'cereal', 'noodle', 'pasta', 'croutons', 'cracker',
    'cookie', 'gravy', 'dressing', 'soup', 'tofu', 'sauce',
}

halal = {
    'pork', 'bacon', 'ham',
    'alcohol', 'beer', 'whiskey', 'vodka', 'rum', 'wine', 'gin', 'sake',
    'tequila', 'bourbon', 'vermouth',
    'gelatin', 'jello',
}

kosher = {
    'pork', 'rabbit', 'bacon', 'ham',
    'shrimp', 'lobster', 'oyster', 'crab', 'catfish', 'clam',
    'gelatin', 'jello',
}

nut = {
    'nut', 'pecan', 'walnut', 'almond', 'peanut', 'cashew', 'pistachio',
    'macadamia', 'chestnut', 'nutter', 'hazelnut', 'brazil', 'pine', 'acorn',
}

shellfish = {
    'scallops', 'crayfish', 'shrimp', 'lobster', 'oyster', 'crab', 'clam',
    'squid', 'snail', 'escargo',
}

# Land-animal meat: excluded for pescatarians (and therefore for vegetarians too).
pescatarian = {
    'pork', 'chicken', 'beef', 'bacon', 'ham', 'gizzard', 'sirloin', 'veal',
    'sausage', 'burger', 'hen', 'rabbit', 'lamb',
}

# Vegetarians avoid everything pescatarians avoid, plus shellfish and fish.
vegetarian = pescatarian | shellfish | {'fish', 'tuna', 'catfish'}

In [14]:
# Collect every distinct ingredient string and clear the dietary flags of recipes
# that contain a restricted keyword.
def _diet_tokens(entry):
    """Return the set of words used to screen one recipe.

    ``entry`` is a ';'-separated ingredient string. Ingredients are additionally
    split on spaces, so a multi-word ingredient contributes each of its words.
    For every word ending in 's' the candidate singular forms (the word without
    's', and without 'es') are added as well; without them a plural such as
    'nuts', 'eggs' or 'pecans' never matches the singular keywords in the
    restriction sets and the recipe wrongly keeps its flag.
    """
    words = set(entry.replace(' ', ';').split(';'))
    singular_candidates = set()
    for word in words:
        if word.endswith('s'):
            singular_candidates.add(word[:-1])
        if word.endswith('es'):
            singular_candidates.add(word[:-2])
    words |= singular_candidates
    # Empty strings come from doubled separators or from stripping a lone 's'.
    words.discard('')
    return words


# Full ingredient vocabulary: whole ingredient strings, not the individual words.
all_ingredients = set()
for entry in recipe_df['NER']:
    all_ingredients.update(entry.split(';'))

# Tokenise each recipe once and reuse the result for every restriction.
ingredient_words = recipe_df['NER'].apply(_diet_tokens)


def _violates(keywords):
    """Return a boolean Series, True where a recipe shares a word with ``keywords``."""
    hits = ingredient_words.apply(lambda words: not words.isdisjoint(keywords))
    # astype keeps the mask boolean even when the frame is empty.
    return hits.astype(bool)


# Flags are only ever cleared, never set back to True, as in the row-by-row version.
# Meat or seafood rules out both the vegetarian and the vegan flag.
recipe_df.loc[_violates(vegetarian), ['vegetarian', 'vegan']] = False
# Other animal products rule out vegan only.
recipe_df.loc[_violates(vegan), 'vegan'] = False

for flag_column, keywords in (
    ('lactose', lactose),
    ('gluten', gluten),
    ('halal', halal),
    ('kosher', kosher),
    ('nut', nut),
    ('shellfish', shellfish),
    ('pescatarian', pescatarian),
):
    recipe_df.loc[_violates(keywords), flag_column] = False

In [15]:
# Write the cleaned frame to CSV; the index is written as the leading 'id' column.
recipe_df.to_csv('cleaned_data.csv', sep=',')

# Preview what was saved. This must be the cell's final expression: a bare
# `recipe_df.head()` placed before another statement is evaluated and discarded,
# so nothing is displayed.
recipe_df.head()

In [16]:
# Write every distinct ingredient string to a text file, one per line.
print(len(all_ingredients))
# `with` closes the file even if a write fails. The encoding is pinned to UTF-8 so
# non-ASCII ingredient names do not depend on the platform default, and the names are
# sorted so the file is identical from run to run (set iteration order is not stable).
with open('ingredient_list.txt', 'w', encoding='utf-8') as ingredients:
    for i in sorted(all_ingredients):
        ingredients.write(f'{i}\n')

15042
