In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

sns.set_style("whitegrid")

In [6]:
# Load the raw dataset from the JSON file in the working directory
TRAIN_PATH = 'train.json'
train_set = pd.read_json(TRAIN_PATH)

In [7]:
# Hold out 20% of the rows as a validation split, stratified on the cuisine
# column; the fixed random_state keeps the split reproducible.
split_kwargs = {
    "test_size": .2,
    "shuffle": True,
    "random_state": 41,
    "stratify": train_set['cuisine'],
}
X_train, X_val = train_test_split(train_set, **split_kwargs)

In [8]:
# Re-label the rows of both splits with consecutive integers starting at 0,
# so later cells can address rows as 0 .. len(split) - 1
X_train = X_train.set_index(np.arange(len(X_train)))
X_val = X_val.set_index(np.arange(len(X_val)))


In [9]:
# Count the distinct ingredients that appear in the training split (X_train)
ing_col = X_train["ingredients"].values
# Flatten the per-recipe ingredient lists into one long array
all_ings = np.concatenate([ing_col[pos] for pos in range(len(ing_col))])
unique_ings, counts = np.unique(all_ings, return_counts=True)
print('Unique number of ingredients in dataset: ', len(unique_ings))
# unique_in holds the same distinct names as unique_ings, but in order of
# first appearance rather than sorted
ings = pd.DataFrame({"Ingredients": all_ings})
unique_in = ings["Ingredients"].unique()

Unique number of ingredients in dataset:  6308


In [10]:
# Per-cuisine non-null counts over the full dataset (train_set); the index of
# this frame supplies the cuisine labels used by later cells
count_by_cuisine = train_set.groupby('cuisine').count()

In [11]:
# Build a table with ingredients as index and cuisines as columns, where each
# cell is the number of times the ingredient occurs in X_train recipes of that
# cuisine (repeated ingredients inside one recipe are counted every time).
#
# The table is assembled in one vectorised pass instead of element-wise chained
# assignment (temp[col][row] = ...), which is slow and is not guaranteed to
# write back into the frame under pandas Copy-on-Write.
ingredient_cuisine_pairs = (
    X_train[["cuisine", "ingredients"]]
    .explode("ingredients")      # one row per (recipe, ingredient)
    .reset_index(drop=True)      # exploded rows share recipe labels; make them unique
)
temp = (
    pd.crosstab(ingredient_cuisine_pairs["ingredients"],
                ingredient_cuisine_pairs["cuisine"])
    # Same row order (unique_in) and column order (count_by_cuisine labels)
    # as before; combinations that never occur are filled with 0.
    .reindex(index=unique_in, columns=count_by_cuisine.index.values, fill_value=0)
    .rename_axis(index=None, columns=None)
)

In [12]:
# Show the first 25 rows of the ingredient-by-cuisine count table
temp.iloc[:25]

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
vinaigrette dressing,0,0,1,0,0,0,3,0,0,3,0,0,1,0,0,0,0,0,0,0
ground black pepper,45,59,202,115,106,312,165,167,54,1135,69,55,69,453,121,36,374,122,36,85
cherry tomatoes,1,4,5,5,0,30,38,11,0,135,1,2,0,57,10,0,9,5,27,0
lemon juice,10,17,48,39,8,72,157,222,9,166,5,33,8,74,60,22,135,22,14,7
kale,7,1,4,3,0,2,2,6,3,49,5,8,3,11,3,1,9,8,5,1
kalamata,0,0,1,0,0,20,65,0,0,45,0,0,0,4,16,1,2,3,0,0
feta cheese,0,0,0,0,0,3,156,2,0,27,0,0,0,23,5,2,1,1,1,0
oregano,5,3,35,0,0,6,33,2,1,88,3,0,0,160,0,2,9,7,0,0
garlic powder,4,8,119,61,18,10,28,26,10,193,29,9,9,406,6,4,189,4,15,15
white rice,8,0,36,20,5,0,8,9,0,8,4,21,12,47,0,3,9,7,10,3


In [13]:
# Divide every row by the total number of occurrences of its ingredient, so
# each row of norm_temp sums to 1 across the cuisine columns.
#
# 'Total' is always computed from the cuisine columns only, so re-running this
# cell does not fold a previous 'Total' column into the new sum.
cuisine_cols = temp.columns.drop('Total', errors='ignore')
temp['Total'] = temp[cuisine_cols].sum(axis=1)
# Most frequent ingredients first
a = temp.sort_values(by=['Total'], ascending=False)
norm_temp = a[cuisine_cols].divide(a['Total'], axis=0)
norm_temp

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
salt,0.010756,0.021860,0.041152,0.050520,0.023317,0.066204,0.031714,0.108258,0.020749,0.192366,0.018112,0.022554,0.014434,0.151145,0.022901,0.015684,0.126787,0.026024,0.021305,0.014157
olive oil,0.014313,0.007235,0.037748,0.016357,0.006449,0.065901,0.063699,0.044511,0.006134,0.394306,0.008179,0.006920,0.004718,0.162944,0.050488,0.005977,0.038220,0.046870,0.014470,0.004561
onions,0.016562,0.017981,0.065142,0.037539,0.039905,0.047003,0.021767,0.149685,0.017035,0.153943,0.021767,0.018770,0.025868,0.187382,0.034700,0.017981,0.060252,0.031703,0.021451,0.013565
water,0.012331,0.014664,0.036994,0.103816,0.041326,0.057990,0.019163,0.110148,0.013664,0.142143,0.020330,0.052325,0.031828,0.122480,0.024663,0.012998,0.091318,0.021830,0.040993,0.028995
garlic,0.011483,0.004897,0.048970,0.104188,0.045593,0.029551,0.029044,0.098109,0.006248,0.199595,0.019250,0.021952,0.042047,0.178656,0.019588,0.002026,0.032759,0.016548,0.058595,0.030902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
white baking bar,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
pain au levain,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
seafood breader,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
legumes,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [14]:
# Further normalise norm_temp by how many X_train rows each cuisine has.
# The result keeps one row per ingredient and one column per cuisine.
count_by_cuisine_train = X_train.groupby(['cuisine']).count()
# Non-null 'id' count per cuisine, as a plain array: the division below is
# positional, so it relies on norm_temp's columns being in the same order as
# the groupby index.
rows_per_cuisine = count_by_cuisine_train.id.values
n_norm_temp = norm_temp.divide(rows_per_cuisine, axis='columns')
n_norm_temp

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
salt,0.000029,0.000034,0.000033,0.000024,0.000039,0.000031,0.000034,0.000045,0.000039,0.000031,0.000043,0.000020,0.000022,0.000029,0.000035,0.000040,0.000037,0.000033,0.000017,0.000021
olive oil,0.000038,0.000011,0.000031,0.000008,0.000011,0.000031,0.000068,0.000019,0.000011,0.000063,0.000019,0.000006,0.000007,0.000032,0.000077,0.000015,0.000011,0.000059,0.000012,0.000007
onions,0.000044,0.000028,0.000053,0.000018,0.000066,0.000022,0.000023,0.000062,0.000032,0.000025,0.000052,0.000016,0.000039,0.000036,0.000053,0.000046,0.000017,0.000040,0.000017,0.000021
water,0.000033,0.000023,0.000030,0.000049,0.000068,0.000027,0.000020,0.000046,0.000026,0.000023,0.000048,0.000046,0.000048,0.000024,0.000038,0.000033,0.000026,0.000028,0.000033,0.000044
garlic,0.000031,0.000008,0.000040,0.000049,0.000075,0.000014,0.000031,0.000041,0.000012,0.000032,0.000046,0.000019,0.000063,0.000035,0.000030,0.000005,0.000009,0.000021,0.000048,0.000047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
white baking bar,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000289,0.000000,0.000000,0.000000
pain au levain,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001264,0.000000,0.000000
seafood breader,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000289,0.000000,0.000000,0.000000
legumes,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000416,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Predict a cuisine index for every recipe in the training split.
# A recipe's score vector is the sum of the n_norm_temp rows of its
# ingredients; the prediction is the column position of the largest score.
train_ingredients = X_train["ingredients"].explode()  # one row per (recipe, ingredient)
train_weights = n_norm_temp.reindex(train_ingredients.to_numpy())
train_weights.index = train_ingredients.index  # back to recipe labels for grouping
train_scores = (
    train_weights
    .fillna(0.0)       # an empty ingredient list or an unknown name contributes nothing
    .groupby(level=0)
    .sum()
    .reindex(X_train.index, fill_value=0.0)  # restore the recipe order of X_train
)
# Plain list of ints, one per X_train row; a recipe with an all-zero score
# vector falls back to position 0.
out_train = train_scores.to_numpy().argmax(axis=1).tolist()

In [34]:
# Encode the true training cuisines as integer codes: the position of each
# cuisine name in count_by_cuisine.index.
tr_set = X_train.copy()

# get_indexer looks up only the cuisine column and always returns an integer
# array (-1 would mark a name that is not in count_by_cuisine.index).
y_true = count_by_cuisine.index.get_indexer(tr_set['cuisine'])
assert (y_true >= 0).all(), "X_train contains a cuisine missing from count_by_cuisine"

In [35]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [36]:
# Confusion matrix for the training split, wrapped in a DataFrame whose rows
# and columns are both labelled with the cuisine names
cuisine_names = count_by_cuisine.index.values
n_con_mat = confusion_matrix(y_true, out_train)
n_c_mat = pd.DataFrame(data=n_con_mat, index=cuisine_names, columns=cuisine_names)
n_c_mat

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
brazilian,270,4,28,1,6,3,0,7,0,1,10,0,0,3,7,4,4,6,18,2
british,10,501,6,1,2,6,2,8,40,0,19,1,0,1,6,29,9,0,0,2
cajun_creole,7,18,1072,0,1,8,3,4,4,4,14,0,3,3,10,48,32,4,0,2
chinese,11,28,16,1673,24,4,1,7,2,1,18,38,177,1,10,12,4,2,37,72
filipino,15,5,4,29,434,2,2,5,5,2,11,3,16,1,3,9,3,4,21,30
french,30,179,70,5,15,1160,78,6,88,47,61,7,13,3,81,121,42,96,7,8
greek,3,16,14,0,5,7,749,8,4,4,11,0,4,0,78,24,3,7,0,3
indian,19,26,12,3,4,1,31,1962,19,0,31,0,5,0,190,26,3,2,53,15
irish,6,39,7,3,3,10,6,2,398,1,13,0,1,0,15,22,5,1,2,0
italian,56,243,351,17,38,356,453,17,132,3505,99,12,43,34,271,219,80,297,14,33


In [19]:
# Training set accuracy, step 1: add up the diagonal of the confusion matrix
# (the cells whose row label equals the column label)
acc = sum(n_c_mat.at[name, name] for name in n_c_mat.index.values)

In [20]:
# Training set accuracy, step 2: diagonal sum divided by the sum of all cells
total_train_recipes = n_c_mat.values.sum()
acc_train = acc / total_train_recipes
acc_train

0.6866652000377134

In [37]:
# Macro-averaged precision, F1 and recall on the training split.
# precision_score, f1_score and recall_score come from sklearn.metrics and
# must be imported before this cell runs.
for label, scorer in (
    ("Precision score for training set is ", precision_score),
    ("f1 score for training set is ", f1_score),
    ("Recall score for training set is ", recall_score),
):
    print(label, scorer(y_true, out_train, average="macro"))

Precision score for training set is  0.6112784186575013
f1 score for training set is  0.6363978180696894
Recall score for training set is  0.7436696300097323


In [38]:
# Per-cuisine precision / recall / F1 on the training split.
# classification_report comes from sklearn.metrics and must be imported
# before this cell runs.
train_report = classification_report(
    y_true,
    out_train,
    target_names=count_by_cuisine.index.values,
)
print(train_report)

              precision    recall  f1-score   support

   brazilian       0.37      0.72      0.49       374
     british       0.35      0.78      0.48       643
cajun_creole       0.44      0.87      0.59      1237
     chinese       0.88      0.78      0.83      2138
    filipino       0.63      0.72      0.67       604
      french       0.68      0.55      0.61      2117
       greek       0.52      0.80      0.63       940
      indian       0.87      0.82      0.84      2402
       irish       0.43      0.75      0.55       534
     italian       0.97      0.56      0.71      6270
    jamaican       0.37      0.89      0.52       421
    japanese       0.91      0.72      0.80      1139
      korean       0.59      0.90      0.71       664
     mexican       0.97      0.76      0.85      5150
    moroccan       0.37      0.91      0.53       657
     russian       0.29      0.78      0.42       391
 southern_us       0.86      0.47      0.61      3456
     spanish       0.45    

In [39]:
# Peek at the first ten rows of the validation split
X_val.iloc[:10]

Unnamed: 0,id,cuisine,ingredients
0,13751,thai,"[fish sauce, water chestnuts, purple onion, se..."
1,12652,chinese,"[ground ginger, dry roasted peanuts, dark sesa..."
2,30957,mexican,"[cheddar cheese, fresh cilantro, chili powder,..."
3,3007,mexican,"[black beans, non-fat sour cream, chopped cila..."
4,29422,southern_us,"[cider vinegar, tupelo honey, olive oil, beets..."
5,36624,thai,"[lemon grass, sea salt, white peppercorns, hot..."
6,30504,french,"[cocoa, large egg yolks, heavy cream, fresh le..."
7,30704,japanese,"[fresh ginger, rice noodles, carrots, mirin, w..."
8,39253,southern_us,"[black pepper, cayenne, bay leaves, salt, ham ..."
9,3796,japanese,"[white vinegar, water, all-purpose flour, chic..."


In [40]:
# Predict a cuisine index for every recipe in the validation split.
# A recipe's score vector is the sum of the n_norm_temp rows of its
# ingredients; ingredients that have no row in n_norm_temp are ignored.
val_ingredients = X_val["ingredients"].explode()  # one row per (recipe, ingredient)
# reindex yields an all-NaN row for names missing from n_norm_temp, replacing
# the per-ingredient linear membership scan with a single hashed lookup.
val_weights = n_norm_temp.reindex(val_ingredients.to_numpy())
val_weights.index = val_ingredients.index  # back to recipe labels for grouping
val_scores = (
    val_weights
    .fillna(0.0)       # unknown ingredients / empty recipes contribute nothing
    .groupby(level=0)
    .sum()
    .reindex(X_val.index, fill_value=0.0)  # restore the recipe order of X_val
)
# Plain list of ints, one per X_val row; a recipe with no known ingredient
# has an all-zero score vector and falls back to position 0.
out_val = val_scores.to_numpy().argmax(axis=1).tolist()

In [41]:
# Encode the true validation cuisines as integer codes: the position of each
# cuisine name in count_by_cuisine.index.
v_set = X_val.copy()

# get_indexer looks up only the cuisine column and always returns an integer
# array (-1 would mark a name that is not in count_by_cuisine.index).
y_true = count_by_cuisine.index.get_indexer(v_set['cuisine'])
assert (y_true >= 0).all(), "X_val contains a cuisine missing from count_by_cuisine"

In [42]:
# Confusion matrix for the validation split, wrapped in a DataFrame whose rows
# and columns are both labelled with the cuisine names
cuisine_names = count_by_cuisine.index.values
n_con_val_mat = confusion_matrix(y_true, out_val)
n_c_val_mat = pd.DataFrame(data=n_con_val_mat, index=cuisine_names, columns=cuisine_names)
n_c_val_mat

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
brazilian,44,3,9,1,2,0,1,0,2,0,6,0,1,1,2,4,3,4,9,1
british,6,84,1,0,4,3,0,4,16,0,11,0,1,0,8,13,9,1,0,0
cajun_creole,3,7,257,0,1,3,3,2,2,4,5,0,0,1,2,6,9,2,0,2
chinese,2,8,5,386,8,1,0,1,1,1,4,7,61,1,3,3,1,1,10,31
filipino,8,3,4,11,91,0,0,0,1,0,3,3,7,1,1,2,0,0,6,10
french,5,87,22,1,5,237,18,2,16,16,16,2,1,0,27,39,8,24,1,2
greek,0,9,2,0,2,2,170,1,3,4,6,0,1,0,21,7,0,5,1,1
indian,7,3,3,0,1,0,14,469,4,0,14,0,1,0,51,10,0,2,16,6
irish,4,23,2,0,1,2,1,0,75,0,4,0,0,1,8,7,4,0,1,0
italian,15,59,91,8,10,83,128,4,34,834,35,5,16,7,76,51,14,87,5,6


In [43]:
# Validation set accuracy, step 1: add up the diagonal of the confusion matrix
# (the cells whose row label equals the column label)
acc = sum(n_c_val_mat.at[name, name] for name in n_c_val_mat.index.values)

In [44]:
# Validation set accuracy, step 2: diagonal sum divided by the sum of all cells
total_val_recipes = n_c_val_mat.values.sum()
acc_val = acc / total_val_recipes
acc_val

0.6290383406662476

In [45]:
from sklearn.datasets import make_classification

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [46]:
# Macro-averaged precision, F1 and recall for the out_val predictions.
# NOTE: the printed labels say "test set", but out_val / y_true here are the
# predictions and labels for the validation split (X_val).
for label, scorer in (
    ("Precision score for test set is ", precision_score),
    ("f1 score for test set is ", f1_score),
    ("Recall score for test set is ", recall_score),
):
    print(label, scorer(y_true, out_val, average="macro"))

Precision score for test set is  0.5500735808652528
f1 score for test set is  0.5619481054483884
Recall score for test set is  0.6593421781960019


In [47]:
# Per-cuisine precision / recall / F1 for the validation predictions
val_report = classification_report(
    y_true,
    out_val,
    target_names=count_by_cuisine.index.values,
)
print(val_report)

              precision    recall  f1-score   support

   brazilian       0.24      0.47      0.32        93
     british       0.21      0.52      0.30       161
cajun_creole       0.42      0.83      0.56       309
     chinese       0.84      0.72      0.78       535
    filipino       0.50      0.60      0.55       151
      french       0.64      0.45      0.53       529
       greek       0.44      0.72      0.55       235
      indian       0.88      0.78      0.82       601
       irish       0.34      0.56      0.42       133
     italian       0.95      0.53      0.68      1568
    jamaican       0.30      0.87      0.44       105
    japanese       0.85      0.69      0.76       284
      korean       0.52      0.89      0.65       166
     mexican       0.97      0.73      0.83      1288
    moroccan       0.34      0.90      0.49       164
     russian       0.23      0.62      0.34        98
 southern_us       0.82      0.41      0.54       864
     spanish       0.35    