In [21]:
import numpy as np
from json import load
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load train, test is not loaded to keep memory usage down.
train_pd = pd.read_json(open("../Data/train.json"))
print(train_pd.head(10))
print(train_pd.dtypes)

       cuisine     id                                        ingredients
0        greek  10259  [romaine lettuce, black olives, grape tomatoes...
1  southern_us  25693  [plain flour, ground pepper, salt, tomatoes, g...
2     filipino  20130  [eggs, pepper, salt, mayonaise, cooking oil, g...
3       indian  22213                [water, vegetable oil, wheat, salt]
4       indian  13162  [black pepper, shallots, cornflour, cayenne pe...
5     jamaican   6602  [plain flour, sugar, butter, eggs, fresh ginge...
6      spanish  42779  [olive oil, salt, medium shrimp, pepper, garli...
7      italian   3735  [sugar, pistachio nuts, white almond bark, flo...
8      mexican  16903  [olive oil, purple onion, fresh pineapple, por...
9      italian  12734  [chopped tomatoes, fresh basil, garlic, extra-...
cuisine        object
id              int64
ingredients    object
dtype: object


In [13]:
# Print the unique cuisine types
print(train_pd.cuisine.unique())

['greek' 'southern_us' 'filipino' 'indian' 'jamaican' 'spanish' 'italian'
 'mexican' 'chinese' 'british' 'thai' 'vietnamese' 'cajun_creole'
 'brazilian' 'french' 'japanese' 'irish' 'korean' 'moroccan' 'russian']


In [43]:
# Obtain all the ingredients listed
all_ingredient = {}
for ingredient_ls in train_pd.ingredients:
    for ingredient in ingredient_ls:
        if ingredient.lower() in all_ingredient.keys():
            all_ingredient[ingredient.lower()] += 1
        else:
            all_ingredient[ingredient.lower()] = 1

In [44]:
ingredient_map = {'ingredient':list(all_ingredient.keys()), 'vals':list(all_ingredient.values())}
ingredient_count_df = pd.DataFrame(ingredient_map)
ingredient_count_df.head(10)

Unnamed: 0,ingredient,vals
0,romaine lettuce,270
1,black olives,229
2,grape tomatoes,228
3,garlic,7380
4,pepper,4438
5,purple onion,1896
6,seasoning,137
7,garbanzo beans,148
8,feta cheese crumbles,358
9,plain flour,154


In [45]:
print("The total number of ingredients: {}".format(ingredient_count_df.ingredient.count()))
print("The number of ingredient's that are listed once are: {}".format(np.sum(ingredient_count_df.vals == 1)))
print("The number of ingredient's that are listed less than 10 times is: {}".format(np.sum(ingredient_count_df.vals < 10)))

The total number of ingredients: 6703
The number of ingredient's that are listed once are: 1756
The number of ingredient's that are listed less than 10 times is: 4302


In [50]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(ingredient_count_df[ingredient_count_df.vals == 1])

                                             ingredient  vals
64                                    white almond bark     1
232               johnsonville andouille dinner sausage     1
286                                     lemon olive oil     1
363                                  taiwanese bok choy     1
378                                           membrillo     1
445                                      chestnut flour     1
499                            cinnamon graham crackers     1
570                                     chorizo spanish     1
619                    instant butterscotch pudding mix     1
808                                      guinness lager     1
831                                    franks hot sauce     1
901                                           pecan pie     1
961                          whole grain thin spaghetti     1
977                                   blueberri preserv     1
986                      wish bone guacamol ranch dress     1
1000    

In [64]:
# Print potential equivalencies
equiv = {}
for not_one_ingred in ingredient_count_df.ingredient[ingredient_count_df.vals > 1]:
    for one_inreg in ingredient_count_df.ingredient[ingredient_count_df.vals == 1]:
        if not_one_ingred in one_inreg:
            if not_one_ingred not in equiv:
                equiv[not_one_ingred] = [one_inreg]
            else:
                equiv[not_one_ingred].append(one_inreg)

In [65]:
def print_pretty(dic):
    for key, value in dic.items():
        print(key)
        for word in value:
            print("\t{}".format(word))
print_pretty(equiv)

black olives
	greek black olives
garlic
	black bean sauce with garlic
	garlic mayonnaise
	knorr italian side   creami garlic shell
	garlic naan
	flowering garlic chives
	garlic olive oil
	garlic herb spreadable cheese
	spice islandsâ® minced garlic
	soy vayâ® hoisin garlic marinade & sauce
	roasted garlic oil
	black garlic
	garlic pepper blend
	spice islands garlic salt
	stonefire tandoori garlic naan
	tomato garlic pasta sauce
pepper
	ground roasted sichuan peppers
	margherita pepperoni
	mccormick ground white pepper
	mini pepperoni slices
	kraft shredded pepper jack cheese with a touch of philadelphia
	pepper leaves
	peppermint schnapps
	blackpepper
	crushed peppermint candy
	roasted bell peppers
	tabascoâ® chipotle pepper sauce
	padron peppers
	dried chipotle pepper
	pointed peppers
	kraft big slice pepper jack cheese slices
	long green pepper
	bird pepper
	sichuan peppercorn oil
	fresno pepper
	pepper cheese
	kraft shredded pepper jack cheese
	chinese pepper
	garlic pepper blend
	c

	jack daniels whiskey
	straight bourbon whiskey
	jameson whiskey
pitas
	roasted pepitas
goat cheese
	low-fat soft goat cheese
cream
	neapolitan ice cream
	low-fat vanilla ice cream
	fat free cream of mushroom soup
	skippy creamy peanut butter
	knudsen light sour cream
	knorr italian side   creami garlic shell
	chocolate chip cookie dough ice cream
	light cream or half and half
	chocolate ice cream mix
	breakstone's sour cream
	nonfat block cream cheese
	reduced sodium cream of mushroom soup
	cream yogurt
	instant banana cream pudding
	vegan coffee creamer
	creamy gravy
	cream of tomato soup
	dairy free coconut ice cream
	non dairy sour cream
	chocolate fudge ice cream
	bertolli vodka sauc made with fresh cream
	sweet cream butter
	reduced fat cream of mushroom soup
	creamed spinach
	low fat cream of celery soup
	cinnamon ice cream
	tomato cream sauce
	reduced sodium condensed cream of chicken soup
	condensed cream of broccoli soup
	hellmann's dijonnaise creamy dijon mustard
	ice cream 

	marshmallow vodka
	vanilla vodka
	lemon vodka
	bertolli vodka sauc made with fresh cream
	vodka sauce
vanilla flavoring
	imitation vanilla flavoring
meatballs
	turkey meatballs
	homemade meatballs
	italian meatballs
turkey sausage
	low fat mild italian turkey sausage
	sweet turkey sausage
	ground turkey sausage
	honeysuckle whiteâ® hot italian turkey sausage links
wafer
	low-fat vanilla wafers
	cooki vanilla wafer
frozen lemonade concentrate
	frozen lemonade concentrate, thawed and undiluted
grapes
	champagne grapes
	purple grapes
	black grapes
cornbread mix
	martha white cornbread mix
candy bar
	heath candy bars
graham crackers
	cinnamon graham crackers
	honey graham crackers
	chocolate graham crackers
stir fry sauce
	sweet & sour stir fry sauce
	black bean stir fry sauce
gelatin
	gelatin sheet
	strawberry gelatin
fusilli
	whole wheat fusilli
almond milk
	unsweetened vanilla almond milk
seafood
	seafood breader
	seafood base
	seafood glaze
	creole seafood seasoning
	mixed frozen seaf

This gives good results, but I need to make sure if I make a substitution, than I need to watch for cases like this:  
eel  
*	wagon wheels
*	candied citron peel
*	pasta wagon wheel
*	darjeeling tea leaves
*	meyer lemon peel
*	citrus peel
*	smoked eel
*	raw peeled prawns
*	peeled canned low sodium tomatoes
*	conger eel
*	peeled diced tomatoes  
I should be able to fix this by ensuring only whole words can match each other, otherwise wagon wheels are going to be taken as wheels

### Summary  
A lot of the ingredients are already listed, but they are listed with another word next to the main ingredient, so things like "rye flour" will come in commonly, but then "dark rye bread" will come in too, and so generate a new ingredient.

I will use common ingredients to reduce the number of one of ingredients. I am not entirely sure what to do about grouped ingredients for instance "garlic naan" comes up under both "garlic", and "naan". I think the most sensible thing to do is count them as both.

In [70]:
# Try making each ingredient a list of words, to remove the eel problem.
equiv_br_by_wrd = {}
for not_one_ingred in ingredient_count_df.ingredient[ingredient_count_df.vals > 1]:
    for one_ingred in ingredient_count_df.ingredient[ingredient_count_df.vals == 1]:
        one_ingred_split = one_ingred.split(" ")
        if all(word in one_ingred_split for word in not_one_ingred.split(" ")):
            if not_one_ingred not in equiv:
                equiv_br_by_wrd[not_one_ingred] = [one_ingred]
            else:
                equiv_br_by_wrd[not_one_ingred].append(one_ingred)

# TODO: Use regular expression with a white space edit.

KeyError: 'black olives'

In [None]:
print_pretty(equiv_br_by_wrd)