# Project: Cuisine Classified
Authors: 
* Ryan Chang - rchan123
* Ryan Chandler - rchan129
* Kunal Mittal - kmitt006
* Fiorello Estuar - festu001
* Kiet Lam - klam073

# Description

This project aims to analyze the relationships between various flavors found in different foods and the cuisines that they are prominent in. 

# Reading/Cleaning Data
This section handles any cleaning/pre-processing of data such that analysis can take place.

## Reading

In [18]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import plotly.express as px
import json, os

flavor_dict = {'piquant': [], 'meaty': [], 'bitter': [], 'sweet': [], 'sour': [], 'salty': []}
FLAVORS = list(flavor_dict.keys())
CUISINES = ['American','Chinese','Cuban','English','French','German','Greek','Hawaiian','Hungarian','Indian','Italian','Japanese','Mexican','Moroccan','Portuguese','Spanish','Swedish','Thai']

In [19]:
# Reading in the data
food_list_folder = 'data/lists/'
food_list_jsons = [food_json for food_json in os.listdir(food_list_folder) if food_json.endswith('.json')]

food_df = pd.DataFrame(columns=['recipeName','ingredients','cuisine','flavors'])

for index, js in enumerate(food_list_jsons):
    with open(os.path.join(food_list_folder, js)) as food_list_file:
        food_json = json.load(food_list_file)
        num_items = len(food_json['matches'])
        for idx in range(0,num_items):
            f_obj = food_json['matches'][idx]
            f_id = f_obj['id']
            f_recipeName = f_obj['recipeName']
            f_ingredients = f_obj['ingredients']
            f_cuisine = f_obj['attributes']['cuisine']
            f_flavors = f_obj['flavors']
            food_df.loc[f_id] =  np.array([f_recipeName,f_ingredients,f_cuisine,f_flavors],dtype=object)

## Cleaning

In [20]:
# dropping all rows that do not have labels
food_df = food_df[food_df['flavors'].notna()]
food_df.head()

Unnamed: 0,recipeName,ingredients,cuisine,flavors
Easy-Seafood-Paella-2077829,Easy Seafood Paella,"[lobster tails, water, olive oil, yellow onion...",[Spanish],"{'piquant': 0.6666666666666666, 'meaty': 0.166..."
Spanish-style-garlic-shrimp-298317,Spanish-style Garlic Shrimp,"[I Can't Believe It's Not Butter!® Spread, unc...",[Spanish],"{'piquant': 0.8333333333333334, 'meaty': 0.833..."
Classic-Paella-898818,Classic Paella,"[extra-virgin olive oil, skinless chicken brea...",[Spanish],"{'piquant': 0.16666666666666666, 'meaty': 0.16..."
One-Pot-Spanish-Chicken-and-Rice-2237989,One Pot Spanish Chicken and Rice,"[olive oil, chicken thighs, salt, pepper, onio...",[Spanish],"{'piquant': 0.16666666666666666, 'meaty': 0.16..."
Portuguese-fish-stew-306395,Portuguese Fish Stew,"[plum tomatoes, savoy cabbage, extra-virgin ol...","[Spanish, Portuguese]","{'piquant': 0.5, 'meaty': 0.16666666666666666,..."


In [21]:
#clean the ingredients
ingredientDict = {}
for ingredeintList in food_df.ingredients :
    for ingredient in ingredeintList :
        if ingredient not in ingredientDict :
            ingredientDict[ingredient] = {}

ingredientdf = pd.DataFrame.from_dict(ingredientDict, orient='index')

for index, row in food_df.iterrows() :
    for ingredient in row.ingredients :
        for flavor in row.flavors :
                if(flavor not in ingredientDict[ingredient]) :
                    ingredientDict[ingredient][flavor] = [row.flavors[flavor]]
                else :
                    ingredientDict[ingredient][flavor].append(row.flavors[flavor])

discardList = []
for ingredient in ingredientDict :
    if len(ingredientDict[ingredient]['piquant']) < 10 :
        discardList.append(ingredient)
        continue
    for flavor in ingredientDict[ingredient] :
        ingredientDict[ingredient][flavor] = sum(ingredientDict[ingredient][flavor])/len(ingredientDict[ingredient][flavor])
#print(ingredientDict)
for ingredient in discardList :
    ingredientDict.pop(ingredient)
ingredient_df = pd.DataFrame.from_dict(ingredientDict, orient='index')
ingredient_df.head()


Unnamed: 0,piquant,meaty,bitter,sweet,sour,salty
water,0.251602,0.313866,0.363069,0.212502,0.405454,0.464843
olive oil,0.262075,0.361599,0.357929,0.198106,0.486413,0.428841
yellow onion,0.302704,0.319814,0.331643,0.192438,0.471483,0.429024
garlic cloves,0.290848,0.351786,0.375818,0.207738,0.492336,0.441964
saffron threads,0.339976,0.289251,0.265097,0.189614,0.515097,0.366546


In [22]:
# clean the flavors
flavor_dict = {'piquant': [], 'meaty': [], 'bitter': [], 'sweet': [], 'sour': [], 'salty': []}
for flavors in food_df.flavors:    
    for specificFlavor in flavors:
        flavor_dict[specificFlavor].append(flavors[specificFlavor])

for flavorKey in flavor_dict:
    try:
        food_df.insert(len(food_df.columns), flavorKey, flavor_dict[flavorKey])
    except:
        pass

food_df.drop(columns = ["flavors"], inplace=True)

In [23]:
# clean the cuisines, give each it's own column and label 0 or 1 based on presence
for cus in CUISINES:
    for key, cus_list in food_df.iterrows():
        food_df.loc[key,cus] = 1 if cus in cus_list['cuisine'] else 0
food_df.head()

Unnamed: 0,recipeName,ingredients,cuisine,piquant,meaty,bitter,sweet,sour,salty,American,...,Hungarian,Indian,Italian,Japanese,Mexican,Moroccan,Portuguese,Spanish,Swedish,Thai
Easy-Seafood-Paella-2077829,Easy Seafood Paella,"[lobster tails, water, olive oil, yellow onion...",[Spanish],0.666667,0.166667,0.333333,0.166667,0.166667,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Spanish-style-garlic-shrimp-298317,Spanish-style Garlic Shrimp,"[I Can't Believe It's Not Butter!® Spread, unc...",[Spanish],0.833333,0.833333,0.833333,0.166667,0.666667,0.833333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Classic-Paella-898818,Classic Paella,"[extra-virgin olive oil, skinless chicken brea...",[Spanish],0.166667,0.166667,0.333333,0.166667,0.666667,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
One-Pot-Spanish-Chicken-and-Rice-2237989,One Pot Spanish Chicken and Rice,"[olive oil, chicken thighs, salt, pepper, onio...",[Spanish],0.166667,0.166667,0.166667,0.166667,0.833333,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Portuguese-fish-stew-306395,Portuguese Fish Stew,"[plum tomatoes, savoy cabbage, extra-virgin ol...","[Spanish, Portuguese]",0.5,0.166667,0.166667,0.166667,0.5,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [24]:
testing_df_kunal = food_df.copy()

def filterCuisines(df):
    newCol = []
    for cuisineList in df["cuisine"]:
        temp = []
        for cuisine in cuisineList:
            if cuisine in CUISINES:
                temp.append(cuisine)
        newCol.append(temp)
    return newCol

testing_df_kunal.assign(CleanedCuisine=filterCuisines, inplace=True)
testing_df_kunal.head()



Unnamed: 0,recipeName,ingredients,cuisine,piquant,meaty,bitter,sweet,sour,salty,American,...,Hungarian,Indian,Italian,Japanese,Mexican,Moroccan,Portuguese,Spanish,Swedish,Thai
Easy-Seafood-Paella-2077829,Easy Seafood Paella,"[lobster tails, water, olive oil, yellow onion...",[Spanish],0.666667,0.166667,0.333333,0.166667,0.166667,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Spanish-style-garlic-shrimp-298317,Spanish-style Garlic Shrimp,"[I Can't Believe It's Not Butter!® Spread, unc...",[Spanish],0.833333,0.833333,0.833333,0.166667,0.666667,0.833333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Classic-Paella-898818,Classic Paella,"[extra-virgin olive oil, skinless chicken brea...",[Spanish],0.166667,0.166667,0.333333,0.166667,0.666667,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
One-Pot-Spanish-Chicken-and-Rice-2237989,One Pot Spanish Chicken and Rice,"[olive oil, chicken thighs, salt, pepper, onio...",[Spanish],0.166667,0.166667,0.166667,0.166667,0.833333,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Portuguese-fish-stew-306395,Portuguese Fish Stew,"[plum tomatoes, savoy cabbage, extra-virgin ol...","[Spanish, Portuguese]",0.5,0.166667,0.166667,0.166667,0.5,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


# Elementary Data Analysis

In [25]:
# flavor
#   v1. MMM - using every recipe together at first (broken down by cuisine)
#       - flavors: broken down into the 6 categories
#   v2. which are the top n cuisines (frequency) in the dataset
#   v3. for each of the cuisines, visualize the flavors - spider graph
#   v4. ??? 

# ingredients
#   v1. MMM
#   v2. 


## Cuisine vs. Flavor

In this section, we aim to analyze the relationship between a recipe's cuisine and the flavors that are associated with each respective cuisine.

In [26]:
cf_df_1 = food_df[CUISINES + FLAVORS]
cuisine_flavor_means = pd.pivot_table(cf_df_1,index=FLAVORS,columns=CUISINES,aggfunc=np.mean)
cuisine_flavor_means.head()

piquant,meaty,bitter,sweet,sour,salty
0.0,0.0,0.166667,0.0,0.0,0.166667
0.0,0.0,0.166667,0.0,0.166667,0.166667
0.0,0.0,0.166667,0.166667,0.0,0.166667
0.0,0.0,0.333333,0.166667,0.166667,0.833333
0.0,0.166667,0.166667,0.0,0.0,0.833333


## Ingredients and Flavor

In [27]:
print(ingredient_df.loc[:, ['salty']].idxmax())
print(ingredient_df.loc[:, ['bitter']].idxmax())
print(ingredient_df.loc[:, ['piquant']].idxmax())
print(ingredient_df.loc[:, ['meaty']].idxmax())
print(ingredient_df.loc[:, ['sour']].idxmax())
print(ingredient_df.loc[:, ['sweet']].idxmax())
ingredient_df.head()

salty    hot dog buns
dtype: object
bitter    diced ham
dtype: object
piquant    worcestershire sauce
dtype: object
meaty    rib eye steaks
dtype: object
sour    yellow peppers
dtype: object
sweet    bbq sauce
dtype: object


Unnamed: 0,piquant,meaty,bitter,sweet,sour,salty
water,0.251602,0.313866,0.363069,0.212502,0.405454,0.464843
olive oil,0.262075,0.361599,0.357929,0.198106,0.486413,0.428841
yellow onion,0.302704,0.319814,0.331643,0.192438,0.471483,0.429024
garlic cloves,0.290848,0.351786,0.375818,0.207738,0.492336,0.441964
saffron threads,0.339976,0.289251,0.265097,0.189614,0.515097,0.366546


# K-Nearest Neighbors Classifier

In [28]:
#ML STUFF
neighbordata = food_df.loc[:, FLAVORS].values.tolist()
def knn(unknown, data, neighbors: int):
    classifier = NearestNeighbors(n_neighbors=neighbors)
    classifier.fit(data)
    return classifier.kneighbors([unknown])

closestpts = knn(random, datastuff, 5)
finalCuisine = dict(zip(CUISINES, [0] * len(CUISINES)))
for food in closestpts:
    for listCuisines in food_df.iloc[food, food_df.columns.get_loc("cuisine")]:
        for cuisine in listCuisines:
             finalCuisine[cuisine] += 1/len(listCuisines)
        

NameError: name 'random' is not defined

{'American': 0,
 'Chinese': 0,
 'Cuban': 0,
 'English': 0,
 'French': 0,
 'German': 0,
 'Greek': 0,
 'Hawaiian': 0,
 'Hungarian': 0,
 'Indian': 0,
 'Italian': 0,
 'Japanese': 0,
 'Mexican': 0,
 'Moroccan': 0,
 'Portuguese': 0,
 'Spanish': 0,
 'Swedish': 0,
 'Thai': 0}

# Linear Regressor

In [None]:
#Ryan Chandler