# Imports

In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from difflib import SequenceMatcher

sns.set_style("whitegrid")

## Load Training Data Set

In [299]:
train_set = pd.read_json('train.json')
train_set.drop(columns=['id'], inplace=True)
display(train_set.sample(5))
print('number of data points: ', len(train_set))

Unnamed: 0,cuisine,ingredients
7520,southern_us,"[water, heavy cream, carrots, fresh tomatoes, ..."
1715,moroccan,"[eggs, salt, boneless skinless chicken breast ..."
12385,irish,"[club soda, lime, ice cubes, Irish whiskey]"
17797,korean,"[sugar, green onions, dried shiitake mushrooms..."
18904,mexican,"[water, cilantro sprigs, chopped pecans, chipo..."


number of data points:  39774


In [34]:
train_set['ingredients'] = train_set['ingredients'].map(lambda x: list(map(str.lower, x)))
train_set.tail(20)

Unnamed: 0,cuisine,ingredients
39754,vietnamese,"[jasmine rice, bay leaves, sticky rice, rotiss..."
39755,indian,"[mint leaves, cilantro leaves, ghee, tomatoes,..."
39756,mexican,"[vegetable oil, cinnamon sticks, water, all-pu..."
39757,greek,"[red bell pepper, garlic cloves, extra-virgin ..."
39758,greek,"[milk, salt, ground cayenne pepper, ground lam..."
39759,korean,"[red chili peppers, sea salt, onions, water, c..."
39760,southern_us,"[butter, large eggs, cornmeal, baking powder, ..."
39761,chinese,"[honey, chicken breast halves, cilantro leaves..."
39762,indian,"[curry powder, salt, chicken, water, vegetable..."
39763,italian,"[fettuccine pasta, low-fat cream cheese, garli..."


In [35]:
all_ingredients = np.concatenate(tuple(train_set.ingredients.values[i] for i in range(len(train_set.ingredients.values))))
unique_ingredients, counts = np.unique(all_ingredients, return_counts=True)
print('unique number of ingredients in dataset: ', len(unique_ingredients))

unique number of ingredients in dataset:  6703


In [175]:
def contains_print(string, list):
    for word in list:
        if string in word:
            print(word)
def contains(string, list):
    words = []
    for word in list:
        if string in word:
            words.append(word)
    return words
def is_smallest(word, list):
    smallest = len(word)
    for other_word in list:
        if smallest <= len(other_word):
            continue
        else:
            return False
    return True
def find_common_match(list):
    if len(list) == 0:
        return ''
    elif len(list) == 1:
        return list[0]
    else:
        match_str = list[0]
        for i in range (0, len(list) - 1):
            match = SequenceMatcher(None, match_str, list[i+1]).find_longest_match()
            match_str=match_str[match.a:match.a+match.size]
        return match_str
            

In [37]:
contains('onion', unique_ingredients)

bermuda onion
black onion seeds
boiling onions
chopped onion
cipollini onions
condensed french onion soup
cream cheese with chives and onion
diced onions
diced red onions
diced tomatoes with garlic and onion
diced yellow onion
dried minced onion
finely chopped onion
french fried onions
french onion soup
fresh onion
frozen onion rings
frozen peppers and onions
green onion bottoms
green onions
knorr onion minicubes
lipton onion soup mix
lipton recip secret golden onion soup mix
lipton® recipe secrets® onion soup mix
maui onion
minced onion
onion flakes
onion gravy
onion powder
onion rings
onion salt
onion slices
onion soup
onion soup mix
onion tops
onions
pearl onions
pickled onion
purple onion
sliced green onions
small yellow onion
spanish onion
spring onions
sweet onion
vidalia onion
white onion
yellow onion


In [38]:
count_by_cuisine = train_set.groupby(['cuisine']).count()
count_by_cuisine['recipe count'] = count_by_cuisine['ingredients']
cuisines = count_by_cuisine.index.values
count_by_cuisine.drop(columns=['ingredients'], inplace=True)
display(count_by_cuisine)

Unnamed: 0_level_0,recipe count
cuisine,Unnamed: 1_level_1
brazilian,467
british,804
cajun_creole,1546
chinese,2673
filipino,755
french,2646
greek,1175
indian,3003
irish,667
italian,7838


In [39]:
ingredient_count_df = pd.DataFrame(0, index=unique_ingredients, columns=cuisines)
for i in range(len(train_set)):
    for j in range(len(train_set.ingredients[i])):
        ingredient_count_df[train_set["cuisine"][i]][train_set.ingredients[i][j]]=ingredient_count_df[train_set["cuisine"][i]][train_set.ingredients[i][j]]+1

In [49]:
ingredients_needed = np.zeros(5000)
for i in range(len(ingredients_needed)):
    # Ingredients that are present in more than i recipes
    ingredients_needed[i] = len(ingredient_count_df[ingredient_count_df.sum(axis=1) > i])
ingredients_in_1pct = ingredients_needed[round(len(train_set) / 100)]
print(ingredients_in_1pct, ' ingredients that are present in more than 1% of recipes.')

194.0  ingredients that are present in more than 1% of recipes.


In [54]:
most_popular_ingredients_df = ingredient_count_df[ingredient_count_df.sum(axis=1) > ingredients_in_1pct]

In [151]:
most_popular_ingredients = most_popular_ingredients_df.index.values
print(most_popular_ingredients)
print(len(most_popular_ingredients))

['active dry yeast' 'all-purpose flour' 'andouille sausage'
 'apple cider vinegar' 'arborio rice' 'asparagus' 'avocado' 'baby spinach'
 'bacon' 'bacon slices' 'baguette' 'baking potatoes' 'baking powder'
 'baking soda' 'balsamic vinegar' 'bananas' 'basil' 'basil leaves'
 'basmati rice' 'bay leaf' 'bay leaves' 'beansprouts' 'beef' 'beef broth'
 'beer' 'bell pepper' 'black beans' 'black olives' 'black pepper'
 'black peppercorns' 'black-eyed peas' 'boiling water'
 'boneless chicken skinless thigh'
 'boneless skinless chicken breast halves'
 'boneless skinless chicken breasts' 'bourbon whiskey' 'bread crumbs'
 'broccoli' 'brown rice' 'brown sugar' 'butter' 'buttermilk' 'cabbage'
 'cajun seasoning' 'cannellini beans' 'canola oil' 'capers' 'carrots'
 'cashew nuts' 'cauliflower' 'cayenne' 'cayenne pepper' 'celery'
 'celery ribs' 'cheddar cheese' 'cheese' 'cherry tomatoes' 'chicken'
 'chicken breasts' 'chicken broth' 'chicken stock' 'chicken thighs'
 'chickpeas' 'chile pepper' 'chiles' 'chili

In [161]:
words_list = [[] for _ in range(len(most_popular_ingredients))]
singular_words_list = []
i = 0
for word in most_popular_ingredients:
    words_list[i] = contains(word, most_popular_ingredients)
    i+=1
#print(words_list)
words_list_v2 = []
for words in words_list:
    if len(words)>1:
        if is_smallest(words[0], words):
            words_list_v2.append(words)
print(words_list_v2)
print(len(words_list_v2))
words_list_v3 = ['']*len(words_list_v2)
for i in range(len(words_list_v3)):
    words_list_v3[i] = words_list_v2[i][0]
print(words_list_v3)


[['bacon', 'bacon slices'], ['basil', 'basil leaves', 'dried basil', 'fresh basil', 'fresh basil leaves'], ['basil leaves', 'fresh basil leaves'], ['beef', 'beef broth', 'ground beef', 'lean ground beef'], ['bell pepper', 'chopped green bell pepper', 'green bell pepper', 'red bell pepper', 'yellow bell pepper'], ['black pepper', 'black peppercorns', 'cracked black pepper', 'ground black pepper', 'salt and ground black pepper'], ['bread crumbs', 'dry bread crumbs'], ['brown sugar', 'dark brown sugar', 'light brown sugar'], ['butter', 'buttermilk', 'melted butter', 'unsalted butter'], ['cabbage', 'green cabbage', 'napa cabbage'], ['cayenne', 'cayenne pepper'], ['celery', 'celery ribs', 'chopped celery'], ['cheddar cheese', 'shredded cheddar cheese'], ['chicken broth', 'fat free less sodium chicken broth', 'low salt chicken broth', 'low sodium chicken broth'], ['chili powder', 'red chili powder'], ['chives', 'chopped fresh chives'], ['chopped cilantro', 'chopped cilantro fresh'], ['choppe

In [157]:
common_words = ['bacon', 'basil', 'bell pepper', 'black pepper', 'brown sugar', 'bread crumbs', 'brown sugar', 'cayenne', 'celery', 'cheddar cheese', 'chicken breast', 'chicken broth', 'chicken thigh','chili powder', 'cilantro', 'coriander', 'garlic cloves', 'ground beef', 'green onion' 'whipping cream', 'red pepper' , 'feta cheese', 'fresh ginger' , 'orange juice', 'lemon juice', 'lime juice', 'mozzarella cheese', 'mushrooms', 'nutmeg', 'parmesan cheese' , 'sesame oil', 'vegetable oil', 'white rice', 'sea salt', 'soy sauce', 'shrimp',  'thyme', 'cider vinegar', ' water', 'sesame seeds', 'vanilla', 'cumin', 'eggs']
len(common_words)

41

In [158]:
filtered_popular_ingredients = most_popular_ingredients.copy().tolist()
for word_i in most_popular_ingredients:
    for word_j in common_words:
        if word_j in word_i:
            if word_j != word_i:
                filtered_popular_ingredients.remove(word_i)
print(filtered_popular_ingredients)
print(len(filtered_popular_ingredients))

['active dry yeast', 'all-purpose flour', 'andouille sausage', 'arborio rice', 'asparagus', 'avocado', 'baby spinach', 'bacon', 'baguette', 'baking potatoes', 'baking powder', 'baking soda', 'balsamic vinegar', 'bananas', 'basil', 'basmati rice', 'bay leaf', 'bay leaves', 'beansprouts', 'beef', 'beef broth', 'beer', 'bell pepper', 'black beans', 'black olives', 'black pepper', 'black-eyed peas', 'boneless chicken skinless thigh', 'bourbon whiskey', 'bread crumbs', 'broccoli', 'brown rice', 'brown sugar', 'butter', 'buttermilk', 'cabbage', 'cajun seasoning', 'cannellini beans', 'canola oil', 'capers', 'carrots', 'cashew nuts', 'cauliflower', 'cayenne', 'celery', 'celery ribs', 'cheddar cheese', 'cheese', 'cherry tomatoes', 'chicken', 'chicken broth', 'chicken stock', 'chickpeas', 'chile pepper', 'chiles', 'chili powder', 'chinese five-spice powder', 'chives', 'chopped celery', 'chopped fresh chives', 'chopped fresh mint', 'chopped garlic', 'chopped onion', 'chopped parsley', 'chopped pe

In [159]:
filtered_popular_ingredients

['active dry yeast',
 'all-purpose flour',
 'andouille sausage',
 'arborio rice',
 'asparagus',
 'avocado',
 'baby spinach',
 'bacon',
 'baguette',
 'baking potatoes',
 'baking powder',
 'baking soda',
 'balsamic vinegar',
 'bananas',
 'basil',
 'basmati rice',
 'bay leaf',
 'bay leaves',
 'beansprouts',
 'beef',
 'beef broth',
 'beer',
 'bell pepper',
 'black beans',
 'black olives',
 'black pepper',
 'black-eyed peas',
 'boneless chicken skinless thigh',
 'bourbon whiskey',
 'bread crumbs',
 'broccoli',
 'brown rice',
 'brown sugar',
 'butter',
 'buttermilk',
 'cabbage',
 'cajun seasoning',
 'cannellini beans',
 'canola oil',
 'capers',
 'carrots',
 'cashew nuts',
 'cauliflower',
 'cayenne',
 'celery',
 'celery ribs',
 'cheddar cheese',
 'cheese',
 'cherry tomatoes',
 'chicken',
 'chicken broth',
 'chicken stock',
 'chickpeas',
 'chile pepper',
 'chiles',
 'chili powder',
 'chinese five-spice powder',
 'chives',
 'chopped celery',
 'chopped fresh chives',
 'chopped fresh mint',
 'cho

In [None]:
['green onions' 'scallions']
['parmigiano reggiano cheese', 'parmesan cheese']

In [160]:
train_set.replace('light brown sugar', 'brown sugar').tail(20)

Unnamed: 0,cuisine,ingredients
39754,vietnamese,"[jasmine rice, bay leaves, sticky rice, rotiss..."
39755,indian,"[mint leaves, cilantro leaves, ghee, tomatoes,..."
39756,mexican,"[vegetable oil, cinnamon sticks, water, all-pu..."
39757,greek,"[red bell pepper, garlic cloves, extra-virgin ..."
39758,greek,"[milk, salt, ground cayenne pepper, ground lam..."
39759,korean,"[red chili peppers, sea salt, onions, water, c..."
39760,southern_us,"[butter, large eggs, cornmeal, baking powder, ..."
39761,chinese,"[honey, chicken breast halves, cilantro leaves..."
39762,indian,"[curry powder, salt, chicken, water, vegetable..."
39763,italian,"[fettuccine pasta, low-fat cream cheese, garli..."


In [221]:
contains('active', unique_ingredients)

['active dry yeast', 'fast-rising active dry yeast']

In [185]:
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
#fdist = FreqDist()
#word_tokenize(text)

In [292]:
def remove_adj(sentence):
    adjective_tags = ["JJ", "JJR", "JJS"]
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)
    for i in range(len(tags)):
        word = [word for word,pos in tags if (pos not in adjective_tags)]
    return ' '.join(word)

In [191]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\Ronak
[nltk_data]     Desai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ronak Desai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [212]:
remove_adj('green onions')

'onions'

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [216]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(filtered_popular_ingredients)

In [220]:
print(x[0])
print(vectorizer.get_feature_names()[0])

  (0, 277)	0.60635138897985
  (0, 83)	0.5144667007342779
  (0, 0)	0.60635138897985
active


In [226]:
def remove_tm(list):
    new_list = list.copy()
    for word in list:
        if word.contains([])
        

SyntaxError: invalid syntax (2974611492.py, line 4)

In [245]:
def gen_basic_words(all_words):
    words_list = [[] for _ in range(len(all_words))]
    i = 0
    for word in all_words:
        words_list[i] = contains(word, all_words)
        i+=1
    #print(words_list)
    words_list_v2 = []
    for words in words_list:
        if len(words)>1:
            if is_smallest(words[0], words):
                words_list_v2.append(words)
    words_list_v3 = ['']*len(words_list_v2)
    for i in range(len(words_list_v3)):
        words_list_v3[i] = words_list_v2[i][0]
    filtered = all_words.copy().tolist()
    for word_i in all_words:
        for word_j in words_list_v3:
            if word_j in word_i:
                if word_j != word_i:
                    if word_i in filtered:
                        filtered.remove(word_i)
    return filtered

In [246]:
basic_list = gen_basic_words(unique_ingredients)

In [247]:
print(len(basic_list))
print(basic_list)

5242
['(    oz.) tomato sauce', '(   oz.) tomato paste', '(10 oz.) frozen chopped spinach', '(14 oz.) sweetened condensed milk', '(14.5 oz.) diced tomatoes', '(15 oz.) refried beans', '1% low-fat buttermilk', '1% low-fat chocolate milk', '1% low-fat cottage cheese', '1% low-fat milk', '2 1/2 to 3 lb. chicken, cut into serving pieces', '2% low fat cheddar chees', '2% low-fat cottage cheese', '2% lowfat greek yogurt', '2% milk shredded mozzarella cheese', '2% reduced-fat milk', '25% less sodium chicken broth', '33% less sodium cooked deli ham', '33% less sodium cooked ham', '33% less sodium ham', '33% less sodium smoked fully cooked ham', '40% less sodium taco seasoning', '7 up', '8 ounc ziti pasta, cook and drain', '95% lean ground beef', 'a taste of thai rice noodles', 'abalone', 'abbamele', 'absinthe', 'abura age', 'acai juice', 'accent', 'accompaniment', 'achiote', 'acini di pepe', 'ackee', 'acorn squash', 'active dry yeast', 'adobo', 'adzuki beans', 'agar', 'agave nectar', 'agave te

In [233]:
filtered_popular_ingredients = most_popular_ingredients.copy().tolist()
for word_i in most_popular_ingredients:
    for word_j in common_words:
        if word_j in word_i:
            if word_j != word_i:
                filtered_popular_ingredients.remove(word_i)
print(filtered_popular_ingredients)
print(len(filtered_popular_ingredients))

In [234]:
a.copy()

[1, 2]

In [235]:
a.remove(1)

In [236]:
a

[2]

In [237]:
a = [1, 2]

In [242]:
np.array(a).tolist()

[1, 2]

In [249]:
a = ['apple', 'banana']
min(a, key=len)
    

'apple'

In [276]:
def gen_base_words(ingredient_list):
    my_list = []
    for ingredient in ingredient_list:
        current_word = min(contains(ingredient, ingredient_list), key=len)
        i = 0
        flag = True
        while (i < len(my_list) and flag):
            if my_list[i] in current_word:
                flag = False
            i+=1
        if flag:
            my_list.append(current_word)
    return my_list         

In [277]:
base_words = gen_base_words(unique_ingredients)
print(len(base_words))

2644


In [278]:
print(base_words)

['(    oz.) tomato sauce', '(   oz.) tomato paste', '(10 oz.) frozen chopped spinach', '(14 oz.) sweetened condensed milk', '(14.5 oz.) diced tomatoes', '(15 oz.) refried beans', '1% low-fat buttermilk', '1% low-fat chocolate milk', '1% low-fat cottage cheese', '1% low-fat milk', '2 1/2 to 3 lb. chicken, cut into serving pieces', '2% low fat cheddar chees', '2% low-fat cottage cheese', '2% lowfat greek yogurt', '2% milk shredded mozzarella cheese', '2% reduced-fat milk', '25% less sodium chicken broth', '33% less sodium cooked deli ham', '33% less sodium cooked ham', '33% less sodium ham', '33% less sodium smoked fully cooked ham', '40% less sodium taco seasoning', '7 up', '8 ounc ziti pasta, cook and drain', '95% lean ground beef', 'a taste of thai rice noodles', 'abalone', 'abbamele', 'absinthe', 'abura age', 'acai juice', 'accent', 'accompaniment', 'achiote', 'acini di pepe', 'ackee', 'acorn squash', 'active dry yeast', 'adobo', 'adzuki beans', 'agar', 'agave nectar', 'agave tequila

In [280]:
base_base_words = gen_base_words(base_words)
print(base_base_words)
print(len(base_base_words))

['(    oz.) tomato sauce', '(   oz.) tomato paste', '(10 oz.) frozen chopped spinach', '(14 oz.) sweetened condensed milk', '(14.5 oz.) diced tomatoes', '(15 oz.) refried beans', '1% low-fat buttermilk', '1% low-fat chocolate milk', '1% low-fat cottage cheese', '1% low-fat milk', '2 1/2 to 3 lb. chicken, cut into serving pieces', '2% low fat cheddar chees', '2% low-fat cottage cheese', '2% lowfat greek yogurt', '2% milk shredded mozzarella cheese', '2% reduced-fat milk', '25% less sodium chicken broth', '33% less sodium cooked deli ham', '33% less sodium cooked ham', '33% less sodium ham', '33% less sodium smoked fully cooked ham', '40% less sodium taco seasoning', '7 up', '8 ounc ziti pasta, cook and drain', '95% lean ground beef', 'a taste of thai rice noodles', 'abalone', 'abbamele', 'absinthe', 'abura age', 'acai juice', 'accent', 'accompaniment', 'achiote', 'acini di pepe', 'ackee', 'acorn squash', 'active dry yeast', 'adobo', 'adzuki beans', 'agar', 'agave nectar', 'agave tequila

In [282]:
a = ['apple', 'banana']
nltk.word_tokenize('apple juice')

['apple', 'juice']

In [283]:
train_set.head(5) 

Unnamed: 0,cuisine,ingredients
0,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,"[water, vegetable oil, wheat, salt]"
4,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [300]:
train_set['tokenized_ingredients_1'] = train_set['ingredients'].apply(lambda x: ' '.join(x))
train_set.head(5)

Unnamed: 0,cuisine,ingredients,tokenized_ingredients_1
0,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...


In [311]:
train_set.head(5)

Unnamed: 0,cuisine,ingredients,tokenized_ingredients_1,tokenized_ingredients_2
0,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...,romaine lettuce olives grape tomatoes purple o...
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...,plain ground pepper salt tomatoes ground peppe...
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...,eggs pepper salt mayonaise cooking oil chilies...
3,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt,water oil wheat salt
4,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...,pepper shallots cornflour pepper onions paste ...


In [301]:
train_set['tokenized_ingredients_2'] = train_set['tokenized_ingredients_1'].apply(remove_adj)

In [310]:
train_set.tokenized_ingredients_2.values[1]

'plain ground pepper salt tomatoes ground pepper thyme eggs tomatoes corn meal milk oil'

In [320]:
nltk.pos_tag(nltk.word_tokenize('flour plain'))

[('flour', 'NN'), ('plain', 'NN')]

In [321]:
word_tokenize('cold feet')

['cold', 'feet']