# Some more pruning

In [218]:
import pandas as pd
import numpy as np

In [219]:
df = pd.read_csv('kagglerecipes/RAW_recipes.csv')

In [220]:
df = df[['name', 'id', 'ingredients']].copy()

In [221]:
df

Unnamed: 0,name,id,ingredients
0,arriba baked winter squash mexican style,137739,"['winter squash', 'mexican seasoning', 'mixed ..."
1,a bit different breakfast pizza,31490,"['prepared pizza crust', 'sausage patty', 'egg..."
2,all in the kitchen chili,112140,"['ground beef', 'yellow onions', 'diced tomato..."
3,alouette potatoes,59389,"['spreadable cheese with garlic and herbs', 'n..."
4,amish tomato ketchup for canning,44061,"['tomato juice', 'apple cider vinegar', 'sugar..."
...,...,...,...
231632,zydeco soup,486161,"['celery', 'onion', 'green sweet pepper', 'gar..."
231633,zydeco spice mix,493372,"['paprika', 'salt', 'garlic powder', 'onion po..."
231634,zydeco ya ya deviled eggs,308080,"['hard-cooked eggs', 'mayonnaise', 'dijon must..."
231635,cookies by design cookies on a stick,298512,"['butter', 'eagle brand condensed milk', 'ligh..."


In [222]:
ingredients = []
for row in df['ingredients']:
    row = row.replace('[\'', '\']')
    row = row.replace(']', '') 
    items = row.split(', ')
    for item in items: 
        ingredients.append(item.replace('\'', ''))

In [223]:
len(ingredients)

2103719

In [224]:
len(set(ingredients))

14968

In [225]:
#ingredients[0:100]

#### What would removing sneaky duplicates do?

In [226]:
ingrsets = []
for item in ingredients: 
    ingrsets.append(frozenset(item))

In [227]:
len(ingrsets)

2103719

In [228]:
len(set(ingrsets))

13406

#### Ok, that's something

In [1]:
#ingredients

### I will check how many words each entry has. The more it has, the more likely that it can be reduced

In [32]:
lengths = []
for item in ingredients: 
    itemlist = item.split(' ')
    lengths.append(len(itemlist))

In [37]:
pd.value_counts(lengths)

2     980541
1     805945
3     239079
4      62491
5       9738
6       4895
7        730
8        249
9         33
11         9
10         8
12         1
dtype: int64

I'll clean everything that has 5 and up, saving the shorter ones for (maybe) later

In [38]:
zipper = list(zip(ingredients, lengths))

In [40]:
toclean = []
shorts = []
for item in zipper: 
    if item[1] >= 5:
        toclean.append(item[0])
    else: 
        shorts.append(item[0])
        

In [47]:
toclean2 = (list(set(toclean)))

In [62]:
sortshort = sorted(shorts, key = len)

In [63]:
sortshort.reverse()

In [65]:
sortshort = list(set(sortshort))

In [67]:
sortshort = sorted(sortshort, key = len)

In [72]:
len(sortshort)

13765

A list of 'shorter' ingredients, from long to short. I'll now loop over the long ones and check if the substring is in the list of shorter ones. If so, I drop the long one. I'll see which longer ones I end up with. 

In [108]:
droplist = []
keeplist = []
for item in toclean2: 
    for item2 in sortshort: 
        if item2 in item: 
            droplist.append(item)
            break
        else:
            keeplist.append(item)

In [109]:
keep = set(keeplist) - set(droplist)

In [110]:
keep

{'"jack daniels sizzling smokehouse blend grilling sauce"',
 '"jack daniels tennessee hickory mesquite grilling sauce"',
 '"morningstar farms better n burgers"',
 '"smuckers black cherry flavored topping"',
 'complete caesar salad in a bag',
 'country bob all purpose sauce',
 'dried* and torn into pieces',
 'el torito adobo al pastor sauce',
 'general foods international suisse mocha cafe',
 'hidden valley ranch salad crispins',
 'jack daniels black label mix',
 'knorr parma rosa sauce mix',
 'leaves separated and torn into pieces',
 'martha white wild berry muffin mix',
 'morningstar farms spicy black bean burgers',
 'muscat de beaumes de venise',
 'nestl toll house holiday morsels',
 'nestl toll house premier white morsel',
 'nestl toll house premier white morsels',
 'nestle toll house halloween shapes and morsels',
 'nestle toll house holiday shapes and morsels',
 'pastry for a double-crust 9-inch pie',
 'pastry for double-crust deep dish pie',
 'philadelphia santa fe blend cooking 

No big loss to dump these. So 'shorts' is now the list I will continue to work with.

In [81]:
'ricotta' in shorts

False

In [82]:
'berry' in shorts

False

In [86]:
shorts.append('ricotta')

In [87]:
shorts.append('berry')

In [88]:
len(set(shorts))

13767

In [84]:
'beans' in shorts

True

In [106]:
fours = []
shorts2 = []
for item in zipper: 
    if item[1] > 3:
        fours.append(item[0])
    else: 
        shorts2.append(item[0])
        

In [99]:
len(set(shorts2))

11904

In [100]:
checker = []
for item in shorts2:
    checker.append(frozenset(item))

In [101]:
len(set(checker))

10746

I checked, and items with four words have a lot more sensible stuff in there. Instead of brute-forcing it out, I'll look at the frequencies - items that occur often can (probably) stay, and I'll focus on the bottom of the barrel

In [120]:
newdf = pd.DataFrame(pd.value_counts(fours)).reset_index()

In [122]:
newdf

Unnamed: 0,index,0
0,extra virgin olive oil,7704
1,fresh ground black pepper,7160
2,boneless skinless chicken breasts,3994
3,salt & freshly ground black pepper,3123
4,crushed red pepper flakes,2936
...,...,...
3059,frozen macaroni and cheese,1
3060,cooked lean ground turkey,1
3061,multi-grain flakes cereal with oat clusters cr...,1
3062,skinless boneless pheasant breast halves,1


In [127]:
lows = newdf[newdf[0] < 50]

I'll deal with the ones that occur less than fifty times in the same way as I did with the above. Everything else (200 items) can stay.

In [128]:
shorts3 = lows['index'].to_list()

In [2]:
#shorts3

In [131]:
shorts2_new = sorted(shorts2, key = len)

In [132]:
shorts2_new.reverse() 

In [136]:
droplist2 = []
keeplist2 = []
for item in shorts3:
    for item2 in shorts2_new:
        if item2 in item: 
            droplist2.append(item)
            break
        else: 
            keeplist2.append(item)

In [140]:
len(set(droplist2))

2779

In [143]:
keepers2 = set(shorts3) - set(droplist2)

In [3]:
#keepers2

In [153]:
shorts.append('pastry')

In [154]:
shorts.append('sweet and sour sauce')

In [157]:
len(set(frozenset(item) for item in shorts))

12346

In [None]:
print("Hi")

In [166]:
with open('newmasterlist.txt', 'w') as f:
    for item in set(shorts):
        f.write(f"{item}\n")

# Start here for cleaning

In [5]:
masterlist = []
with open('newmasterlist.txt', 'r') as f: 
    lines = f.readlines()
    for line in lines: 
        masterlist.append(line.replace('\n', ''))

In [4]:
#masterlist

In [230]:
len(masterlist)

13768

In [176]:
df

Unnamed: 0,name,id,ingredients
0,arriba baked winter squash mexican style,137739,"['winter squash', 'mexican seasoning', 'mixed ..."
1,a bit different breakfast pizza,31490,"['prepared pizza crust', 'sausage patty', 'egg..."
2,all in the kitchen chili,112140,"['ground beef', 'yellow onions', 'diced tomato..."
3,alouette potatoes,59389,"['spreadable cheese with garlic and herbs', 'n..."
4,amish tomato ketchup for canning,44061,"['tomato juice', 'apple cider vinegar', 'sugar..."
...,...,...,...
231632,zydeco soup,486161,"['celery', 'onion', 'green sweet pepper', 'gar..."
231633,zydeco spice mix,493372,"['paprika', 'salt', 'garlic powder', 'onion po..."
231634,zydeco ya ya deviled eggs,308080,"['hard-cooked eggs', 'mayonnaise', 'dijon must..."
231635,cookies by design cookies on a stick,298512,"['butter', 'eagle brand condensed milk', 'ligh..."


In [198]:
data = pd.read_csv('dataframe.csv', sep = '|')

In [181]:
len(data)

253712

In [182]:
data.shape

(253712, 4)

In [199]:
data.drop(columns = ['cleaned'], inplace = True)

In [200]:
allrecipes = pd.read_csv('allrecipes.csv', sep = '|')

In [201]:
allrecipes = allrecipes.rename(columns = {'0':'name', '1': 'url', '2': 'ingredients'})

In [202]:
data = pd.concat([data, allrecipes], axis = 0).copy()

In [188]:
len(data)

304328

### Lemmatizing the masterlist

In [210]:
from nltk.stem import WordNetLemmatizer
  
wnl = WordNetLemmatizer()

In [211]:
def lemmatize(masterlist):
    result = []
    ingredientlist_split = []
    for ingredient in masterlist:
        ingredientlist_split.append(ingredient.split(' '))

    for entry in ingredientlist_split: 
        dummy = []
        for item in entry:         
            dummy.append(wnl.lemmatize(item))
        result.append(' '.join(dummy))
    return result

In [213]:
masterlist = lemmatize(masterlist)

In [214]:
# Function, copy-pasted from previous cleaning notebook
def simplify(row):

    rowresults = []
    row = row.replace('[\'', '\']')
    row = row.replace(']', '') 
    items = row.split(', ')
    for item in items: 
        for ingredient in masterlist:
            if ingredient in item: 
                rowresults.append(ingredient)
                break
    return rowresults

In [215]:
data['cleaned']= data['ingredients'].apply(simplify)

In [216]:
data

Unnamed: 0,name,url,ingredients,cleaned
0,arriba baked winter squash mexican style,https://www.food.com/recipe/137739,"['winter squash', 'mexican seasoning', 'mixed ...","[squash, mexican seasoning, mixed spice, honey..."
1,a bit different breakfast pizza,https://www.food.com/recipe/31490,"['prepared pizza crust', 'sausage patty', 'egg...","[pizza crust, sage, egg, milk, salt, cheese]"
2,all in the kitchen chili,https://www.food.com/recipe/112140,"['ground beef', 'yellow onions', 'diced tomato...","[beef, onion, diced tomato, tomato paste, soup..."
3,alouette potatoes,https://www.food.com/recipe/59389,"['spreadable cheese with garlic and herbs', 'n...","[garlic, potato, shallot, parsley, tarragon, o..."
4,amish tomato ketchup for canning,https://www.food.com/recipe/44061,"['tomato juice', 'apple cider vinegar', 'sugar...","[juice, cider vinegar, sugar, salt, pepper, cl..."
...,...,...,...,...
50611,Chocolate-Covered Coffee Beans,https://www.allrecipes.com/recipe/284194/choco...,"['4 ounces milk chocolate, chopped, divided', ...","[chocolate, hop, divided, coffee]"
50612,Bracciole (Flank Steak Rolls),https://www.allrecipes.com/recipe/229505/bracc...,"['2 tablespoons olive oil', '0.5 onion, choppe...","[olive oil, onion, hop, salt, tea, hop, garlic..."
50613,Garlic-Smashed Potatoes,https://www.allrecipes.com/recipe/282326/garli...,"['5 pounds small red potatoes', '0.5 cup olive...","[potato, olive oil, garlic, salt, tea, hop]"
50614,Chewy Cheesecake Cookies,https://www.allrecipes.com/recipe/284693/chewy...,"['0.5 cup unsalted butter, softened', '3 ounce...","[salt, cream cheese, sugar, flour, pecan, hop]"


In [217]:
data.to_csv("data2.csv", sep = '|', index = False)

# A new and improved masterlist

In [231]:
masterlist = []
with open('masterlist.txt', 'r') as file: 
    lines = file.readlines()
    for line in lines: 
        masterlist.append(line.replace('/n', ''))

In [232]:
len(masterlist)

1180

In [233]:
masterlist = lemmatize(masterlist)

In [236]:
len(masterlist)

1180

In [237]:
masterlist = list(set(masterlist))

In [234]:
len(set(masterlist))

988

In [244]:
dummy = []
for line in masterlist:
    dummy.append(line.strip())

In [246]:
masterlist = list(set(dummy))

In [247]:
len(masterlist)

911

In [248]:
with open('masterlist.txt', 'w') as file:
    for line in masterlist: 
        line = line.strip()
        file.write(f"{line}\n")

# Newer and improveder

In [253]:
newdf = pd.read_csv('masterlist_tag.txt')

In [254]:
newdf.head()

Unnamed: 0,ingredient,meat
0,green bean,0
1,mincemeat,1
2,beef,1
3,graham crackers,0
4,leek,0


In [258]:
new_ml = newdf['ingredient'].to_list()

In [6]:
#new_ml

In [260]:
len(new_ml)

904

In [262]:
len(duplicates)

22

In [7]:
#duplicates