# Project: Cuisine Classified
Authors: 
* Ryan Chang - rchan123
* Ryan Chandler - rchan129
* Kunal Mittal - kmitt006
* Fiorello Estuar - festu001
* Kiet Lam - klam073

# Description

This project aims to analyze the relationships between various flavors found in different foods and the cuisines that they are prominent in. 

# Reading/Cleaning Data
This section handles any cleaning/pre-processing of data such that analysis can take place.

## Reading

In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json, os

flavor_dict = {'piquant': [], 'meaty': [], 'bitter': [], 'sweet': [], 'sour': [], 'salty': []}
FLAVORS = list(flavor_dict.keys())
CUISINES = ['American','Chinese','Cuban','English','French','German','Greek','Hawaiian','Hungarian','Indian','Italian','Japanese','Mexican','Moroccan','Portuguese','Spanish','Swedish','Thai']
CUISINES

['American',
 'Chinese',
 'Cuban',
 'English',
 'French',
 'German',
 'Greek',
 'Hawaiian',
 'Hungarian',
 'Indian',
 'Italian',
 'Japanese',
 'Mexican',
 'Moroccan',
 'Portuguese',
 'Spanish',
 'Swedish',
 'Thai']

In [2]:
# Reading in the data
food_list_folder = 'data/lists/'
food_list_jsons = [food_json for food_json in os.listdir(food_list_folder) if food_json.endswith('.json')]

food_df = pd.DataFrame(columns=['recipeName','ingredients','cuisine','flavors'])

for index, js in enumerate(food_list_jsons):
    with open(os.path.join(food_list_folder, js)) as food_list_file:
        food_json = json.load(food_list_file)
        num_items = len(food_json['matches'])
        for idx in range(0,num_items):
            f_obj = food_json['matches'][idx]
            f_id = f_obj['id']
            f_recipeName = f_obj['recipeName']
            f_ingredients = f_obj['ingredients']
            f_cuisine = f_obj['attributes']['cuisine']
            f_flavors = f_obj['flavors']
            food_df.loc[f_id] =  np.array([f_recipeName,f_ingredients,f_cuisine,f_flavors],dtype=object)

## Cleaning

In [3]:
# dropping all rows that do not have labels
food_df = food_df[food_df['flavors'].notna()]
food_df.head()

Unnamed: 0,recipeName,ingredients,cuisine,flavors
Revolutionary-Mac-_-Cheese-1048867,Revolutionary Mac & Cheese,"[dried pasta, milk, shredded cheddar cheese, s...","[Kid-Friendly, Italian, American]","{'piquant': 0.16666666666666666, 'meaty': 0.16..."
Chicago-Chicken-822419,Chicago Chicken,"[jalapeno chilies, lemon, dried oregano, olive...","[Barbecue, American]","{'piquant': 0.8333333333333334, 'meaty': 0.666..."
Chicken-Avocado-Burgers-1031197,Chicken Avocado Burgers,"[ground chicken, avocado, chopped garlic, pank...","[Barbecue, Mexican, American]","{'piquant': 0.6666666666666666, 'meaty': 1.0, ..."
Best-Basic-Burger-500667,Best Basic Burger,"[ground beef, eggs, salt, ground black pepper,...","[Barbecue, American]","{'piquant': 0.8333333333333334, 'meaty': 0.833..."
Easy-garlic-and-lemon-shrimp-309257,Easy Garlic and Lemon Shrimp,"[jumbo shrimp, olive oil, butter, minced garli...","[Barbecue, Italian, Asian, American]","{'piquant': 0.0, 'meaty': 0.8333333333333334, ..."


In [4]:
# clean the flavors
flavor_dict = {'piquant': [], 'meaty': [], 'bitter': [], 'sweet': [], 'sour': [], 'salty': []}
for flavors in food_df.flavors:    
    for specificFlavor in flavors:
        flavor_dict[specificFlavor].append(flavors[specificFlavor])

for flavorKey in flavor_dict:
    try:
        food_df.insert(len(food_df.columns), flavorKey, flavor_dict[flavorKey])
    except:
        pass

food_df.drop(columns = ["flavors"], inplace=True)

In [5]:
# clean the cuisines, give each it's own column and label 0 or 1 based on presence
for cus in CUISINES:
    for key, cus_list in food_df.iterrows():
        food_df.loc[key,cus] = 1 if cus in cus_list['cuisine'] else 0
food_df.head()

Unnamed: 0,recipeName,ingredients,cuisine,piquant,meaty,bitter,sweet,sour,salty,American,...,Hungarian,Indian,Italian,Japanese,Mexican,Moroccan,Portuguese,Spanish,Swedish,Thai
Revolutionary-Mac-_-Cheese-1048867,Revolutionary Mac & Cheese,"[dried pasta, milk, shredded cheddar cheese, s...","[Kid-Friendly, Italian, American]",0.166667,0.166667,0.666667,0.166667,0.166667,0.833333,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chicago-Chicken-822419,Chicago Chicken,"[jalapeno chilies, lemon, dried oregano, olive...","[Barbecue, American]",0.833333,0.666667,0.166667,0.0,0.833333,0.166667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chicken-Avocado-Burgers-1031197,Chicken Avocado Burgers,"[ground chicken, avocado, chopped garlic, pank...","[Barbecue, Mexican, American]",0.666667,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Best-Basic-Burger-500667,Best Basic Burger,"[ground beef, eggs, salt, ground black pepper,...","[Barbecue, American]",0.833333,0.833333,0.666667,0.166667,0.166667,0.833333,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Easy-garlic-and-lemon-shrimp-309257,Easy Garlic and Lemon Shrimp,"[jumbo shrimp, olive oil, butter, minced garli...","[Barbecue, Italian, Asian, American]",0.0,0.833333,0.333333,0.166667,0.5,0.666667,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
testing_df_kunal = food_df.copy()

def filterCuisines(df):
    newCol = []
    for cuisineList in df["cuisine"]:
        temp = []
        for cuisine in cuisineList:
            if cuisine in CUISINES:
                temp.append(cuisine)
        newCol.append(temp)
    return newCol

testing_df_kunal.assign(CleanedCuisine=filterCuisines, inplace=True)
testing_df_kunal.head()



Unnamed: 0,recipeName,ingredients,cuisine,piquant,meaty,bitter,sweet,sour,salty,American,...,Hungarian,Indian,Italian,Japanese,Mexican,Moroccan,Portuguese,Spanish,Swedish,Thai
Revolutionary-Mac-_-Cheese-1048867,Revolutionary Mac & Cheese,"[dried pasta, milk, shredded cheddar cheese, s...","[Kid-Friendly, Italian, American]",0.166667,0.166667,0.666667,0.166667,0.166667,0.833333,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chicago-Chicken-822419,Chicago Chicken,"[jalapeno chilies, lemon, dried oregano, olive...","[Barbecue, American]",0.833333,0.666667,0.166667,0.0,0.833333,0.166667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chicken-Avocado-Burgers-1031197,Chicken Avocado Burgers,"[ground chicken, avocado, chopped garlic, pank...","[Barbecue, Mexican, American]",0.666667,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Best-Basic-Burger-500667,Best Basic Burger,"[ground beef, eggs, salt, ground black pepper,...","[Barbecue, American]",0.833333,0.833333,0.666667,0.166667,0.166667,0.833333,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Easy-garlic-and-lemon-shrimp-309257,Easy Garlic and Lemon Shrimp,"[jumbo shrimp, olive oil, butter, minced garli...","[Barbecue, Italian, Asian, American]",0.0,0.833333,0.333333,0.166667,0.5,0.666667,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Elementary Data Analysis

In [7]:
# cuisines
#   v1. which are the top n cuisines (frequency) in the dataset
#   v2. which cuisines have higher amounts of fusion.

# flavor
#   v1. MMM - using every recipe together at first (broken down by cuisine)
#       - flavors: broken down into the 6 categories
#   v2. for each of the cuisines, visualize the flavors - spider graph
#   v3. ??? 

# ingredients
#   v1. MMM
#   v2. 


In [36]:
### Fio's part

### Top cuisines (by frequency) in the dataset
# We create a dictionary where the keys are the names of the cuisines, and the values are the number of recipes which possess that cuisine tag.
dict_cuisine_freq = {}
for cus in CUISINES:
    dict_cuisine_freq[cus] = int(food_df[cus].sum())

# Because each recipe may have more than one cuisine tag, we cannot calculate a proportion off the total number of recipes alone because our numerator
#  would be bigger than the denominator, so we instead calculate our proportions off the total number of tags. This provides the secondary benefit of
#  having non-fusion recipes "count" for more, as they contribute less tags to the total count of recipes.
#  EX: A fusion recipe with 5 (American, Chinese, Hawaiian, Japanese, Indian) tags would only be 1/5 as meaningful to the overall proportion of chinese recipes
#        compared to a recipe with only 1 tag, Chinese.
num_tags = sum(dict_cuisine_freq.values())
# print(dict_cuisine_freq) 
# print(num_tags)


# Calculate the proportion of each cuisine across all recipes, then convert into a dataframe.
dict_proportion = {}
for cus in CUISINES:
    dict_proportion[cus] = dict_cuisine_freq[cus] / num_tags
df_proportions = pd.DataFrame(list(dict_proportion.items()), columns = ['Cuisine', 'Proportion'])

# Sort the dataframe by descending proportion
df_proportions.sort_values(by=['Proportion'], ascending=False)

Unnamed: 0,Cuisine,Proportion
5,German,0.070454
1,Chinese,0.0676
15,Spanish,0.066262
6,Greek,0.066084
13,Moroccan,0.065549
4,French,0.06323
3,English,0.06109
0,American,0.060644
10,Italian,0.060109
12,Mexican,0.05993


In [None]:
# TODO: Visualization for Cuisine proportion

TODO: Written Analysis for proportion of cuisines 

##  Flavor

In this section, we aim to examine flavors of a given recipe, and any patterns that may exist within each flavor, or it's predicitve power over the various cuisines.

In [16]:
# manually creating pivot table. TODO: find out if there's a better way to do this (pivot table? crosstab?)
cuisine_flavor = pd.DataFrame(columns=FLAVORS,index=CUISINES)
for cus in CUISINES:
    for f in FLAVORS:
        cuisine_flavor.loc[cus,f] = food_df.loc[food_df[cus]==1,f].mean()
cuisine_flavor['magnitude'] = cuisine_flavor[FLAVORS].sum(axis=1)/6*100
cuisine_flavor

Unnamed: 0,piquant,meaty,bitter,sweet,sour,salty,magnitude
American,0.308578,0.421569,0.422549,0.22402,0.415686,0.475,37.790033
Chinese,0.228012,0.427221,0.568821,0.274846,0.42942,0.668646,43.282762
Cuban,0.315616,0.411411,0.357958,0.213213,0.575976,0.45976,38.898899
English,0.256934,0.416058,0.366423,0.179075,0.361557,0.43163,33.527981
French,0.122238,0.415609,0.338035,0.176305,0.344147,0.392102,29.80724
German,0.133122,0.405907,0.412025,0.203376,0.394937,0.531224,34.676512
Greek,0.111561,0.369546,0.409132,0.18556,0.496401,0.480882,34.218024
Hawaiian,0.267355,0.328956,0.490306,0.434959,0.652908,0.640088,46.909527
Hungarian,0.364113,0.311722,0.285527,0.185986,0.465946,0.361166,32.907662
Indian,0.458017,0.346485,0.379616,0.197016,0.48002,0.399595,37.679143


In [17]:
# initial work on cuisines/flavors, partition by continent/country?
specs_list = []
for i in range (0,3):
    specs_list.append([])
    for j in range(0,6):
        specs_list[i].append({"type":"polar"})
fig1 = make_subplots(rows=3,cols=6,specs=specs_list)
cus_idx=0
for i in range(0,3):
    for j in range(0,6):
        fig1.append_trace(go.Scatterpolar(
            r=cuisine_flavor.loc[CUISINES[cus_idx]],
            theta=FLAVORS,
            fill='toself',
            name=CUISINES[cus_idx]
        ),row=(i+1)%3+1,col=j+1)
        print(CUISINES[cus_idx], end=" ")
        cus_idx += 1
fig1.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 0.7]
    )),
  showlegend=False
)
fig1.show()

American Chinese Cuban English French German Greek Hawaiian Hungarian Indian Italian Japanese Mexican Moroccan Portuguese Spanish Swedish Thai 

In [18]:
# 3D Scatter Plot, Recipe vs. Cuisine
# TODO: fix legend labels (has to do with order you add countries to the trace), add title
fig2 = make_subplots(rows=1,cols=2,specs=[[{'type':'scene'},{'type':'scene'}]])
fig2.append_trace(go.Scatter3d(
    x=cuisine_flavor['sweet'],
    y=cuisine_flavor['sour'],
    z=cuisine_flavor['salty'],
    text=cuisine_flavor.index,
    mode='markers',
    marker=dict(
        size=10,
        color=px.colors.qualitative.Bold,
    )
    ),row=1,col=1
)
fig2.append_trace(go.Scatter3d(
    x=cuisine_flavor['piquant'],
    y=cuisine_flavor['bitter'],
    z=cuisine_flavor['meaty'],
    text=cuisine_flavor.index,
    mode='markers',
    marker=dict(
        size=10,
        color=px.colors.qualitative.Bold,
    )
    ),row=1,col=2)
fig2.show()

From this vis. we can see that...
* Why is hawaiian food so sweet? Use a visualization

# Statistical Analysis

# K-Nearest Neighbors Classifier

In [19]:
#ML STUFF
neighbordata = food_df.loc[:, FLAVORS].values.tolist()
def knn(unknown, data, neighbors: int):
    classifier = NearestNeighbors(n_neighbors=neighbors)
    classifier.fit(data)
    return classifier.kneighbors([unknown])

closestpts = knn(random, datastuff, 5)
finalCuisine = dict(zip(CUISINES, [0] * len(CUISINES)))
for food in closestpts:
    for listCuisines in food_df.iloc[food, food_df.columns.get_loc("cuisine")]:
        for cuisine in listCuisines:
             finalCuisine[cuisine] += 1/len(listCuisines)
        

NameError: name 'random' is not defined

{'American': 0,
 'Chinese': 0,
 'Cuban': 0,
 'English': 0,
 'French': 0,
 'German': 0,
 'Greek': 0,
 'Hawaiian': 0,
 'Hungarian': 0,
 'Indian': 0,
 'Italian': 0,
 'Japanese': 0,
 'Mexican': 0,
 'Moroccan': 0,
 'Portuguese': 0,
 'Spanish': 0,
 'Swedish': 0,
 'Thai': 0}