In [46]:
import pandas as pd

In [47]:
# create a subset of the full dataset by copying its first 100,001 lines
# (presumably the header line plus 100,000 rows); only needs to be run once
# full = open('full_dataset.tsv', 'r')
# subset = open('recipe_dataset.tsv', 'w')
# for _ in range(100001):
#     subset.write(full.readline())
# full.close()
# subset.close()

In [48]:
# load the recipe subset, name the leading unnamed column 'id', and discard
# the free-text columns that are not used further down
recipe_df = (
    pd.read_csv('recipe_dataset.tsv', sep='\t')
    .rename(columns={'Unnamed: 0': 'id'})
    .drop(columns=['ingredients', 'directions', 'source'])
)

# every dietary flag starts out True; cook time starts out as 0
for flag_column in ('vegan', 'vegetarian', 'lactose', 'gluten', 'halal', 'kosher'):
    recipe_df[flag_column] = True
recipe_df['cook time'] = 0

# flatten NER into a lowercase comma-separated string: drop the two outer
# characters on each side and turn every '", "' separator into a comma
# (presumably the raw value looks like ["a", "b"] -- TODO confirm)
recipe_df['NER'] = recipe_df['NER'].map(
    lambda raw: raw[2:-2].replace('", "', ',').lower()
)
recipe_df.head()

Unnamed: 0,id,title,link,NER,vegan,vegetarian,lactose,gluten,halal,kosher,cook time
0,0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,"brown sugar,milk,vanilla,nuts,butter,bite size...",True,True,True,True,True,True,0
1,1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,"beef,chicken breasts,cream of mushroom soup,so...",True,True,True,True,True,True,0
2,2,Creamy Corn,www.cookbooks.com/Recipe-Details.aspx?id=10570,"frozen corn,cream cheese,butter,garlic powder,...",True,True,True,True,True,True,0
3,3,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,"chicken,chicken gravy,cream of mushroom soup,s...",True,True,True,True,True,True,0
4,4,Reeses Cups(Candy),www.cookbooks.com/Recipe-Details.aspx?id=659239,"peanut butter,graham cracker crumbs,butter,pow...",True,True,True,True,True,True,0


In [49]:
# keyword sets, one per dietary restriction: a recipe containing any of the
# words in a set gets the matching flag cleared by the checker below
vegan = {
    'gelatin', 'jello', 'honey', 'egg', 'cheese', 'milk',
    'chocolate', 'marshmallow', 'yogurt', 'cream', 'butter',
}
vegetarian = {
    'pork', 'chicken', 'beef', 'bacon', 'fish', 'tuna', 'ham', 'gizzard',
    'sirloin', 'veal', 'scallops', 'sausage', 'burger', 'crayfish', 'hen',
    'rabbit', 'shrimp', 'lobster', 'oyster', 'crab', 'lamb', 'catfish', 'clam',
}
lactose = {'cheese', 'milk', 'yogurt'}
gluten = {
    'bread', 'pie', 'cake', 'cereal', 'noodle', 'pasta', 'croutons',
    'cracker', 'cookie', 'gravy', 'dressing', 'soup', 'tofu', 'sauce',
}
halal = {
    'pork', 'bacon', 'ham', 'alcohol', 'beer', 'whiskey', 'vodka', 'rum',
    'wine', 'gin', 'sake', 'tequila', 'bourbon', 'vermouth', 'gelatin', 'jello',
}
kosher = {
    'pork', 'rabbit', 'bacon', 'ham', 'shrimp', 'lobster',
    'oyster', 'crab', 'catfish', 'clam', 'gelatin', 'jello',
}

In [50]:
# Collect every distinct ingredient name and clear the dietary flags of the
# recipes whose ingredients contain a disqualifying keyword.
#
# Flags are only ever cleared (set to False), never set back to True, so
# running this cell again produces the same result.

# Flag column -> keywords that rule the flag out. A meat/seafood keyword also
# disqualifies 'vegan', so that entry is the union of both keyword sets.
disqualifiers = {
    'vegetarian': vegetarian,
    'vegan': vegetarian | vegan,
    'lactose': lactose,
    'gluten': gluten,
    'halal': halal,
    'kosher': kosher,
}

# Full ingredient names (split on commas only). Empty names, which appear
# when a recipe has an empty NER string, are skipped so that the ingredient
# list does not end up with a blank entry.
all_ingredients = set()
for entry in recipe_df['NER']:
    all_ingredients.update(name for name in entry.split(',') if name)

# Individual words of each recipe (split on commas AND spaces), so that e.g.
# 'chicken breasts' matches the keyword 'chicken'.
# NOTE(review): matching is exact per word, so a plural such as 'eggs' does
# not match the keyword 'egg' -- confirm whether that is intended.
ingredient_words = recipe_df['NER'].apply(
    lambda entry: set(entry.replace(' ', ',').split(','))
)

# One boolean mask per flag instead of per-row scalar writes; the mask shares
# the frame's index, so this does not rely on the index being 0..n-1.
for column, banned in disqualifiers.items():
    violates = ingredient_words.apply(
        lambda words: not words.isdisjoint(banned)
    ).astype(bool)
    recipe_df.loc[violates, column] = False

In [51]:
# write the flagged recipes back out as a tab-separated file; the row index is
# written too (pandas default), as a leading column with an empty header
recipe_df.to_csv('cleaned_data.tsv', sep='\t')

# preview of what was written -- kept as the last expression of the cell,
# because a notebook only displays the value of the final expression
recipe_df.head()

In [52]:
# compile all distinct ingredient names into a text file, one name per line
print(len(all_ingredients))

# - the context manager closes the file even if a write fails
# - sorting makes the line order reproducible (set iteration order is not)
# - explicit UTF-8 matches the pandas default used for the TSV files and
#   avoids locale-dependent encoding errors on non-ASCII ingredient names
with open('ingredient_list.txt', 'w', encoding='utf-8') as ingredients:
    for ingredient_name in sorted(all_ingredients):
        ingredients.write(f'{ingredient_name}\n')