# Build Classification Models

In [1]:
import pandas as pd
import numpy as np

In [2]:
cuisines_df = pd.read_csv('../data/cleaned_cuisines.csv')

In [3]:
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.svm import SVC

In [5]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df

0       indian
1       indian
2       indian
3       indian
4       indian
         ...  
3990      thai
3991      thai
3992      thai
3993      thai
3994      thai
Name: cuisine, Length: 3995, dtype: object

In [6]:
cuisines_features_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis= 1)
cuisines_features_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
cuisines_df.cuisine.value_counts()

cuisine
indian      799
thai        799
chinese     799
japanese    799
korean      799
Name: count, dtype: int64

Choosing your classifier

In [8]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_features_df, cuisines_label_df, test_size=0.3)

In [9]:
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
model = lr.fit(X_train, np.ravel(y_train))

accuracy = model.score(X_test, y_test)
print('Accuracy is {}'.format(accuracy))

Accuracy is 0.8181818181818182




In [10]:
print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
print(f'cuisine: {y_test.iloc[50]}')


ingredients: Index(['cilantro', 'egg', 'lemon_juice', 'potato', 'shallot', 'vegetable_oil'], dtype='object')
cuisine: indian


In [11]:
test= X_test.iloc[50].values.reshape(-1, 1).T
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()



Unnamed: 0,0
japanese,0.483064
indian,0.21741
chinese,0.187187
korean,0.077022
thai,0.035316


In [12]:
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     chinese       0.76      0.76      0.76       230
      indian       0.91      0.93      0.92       241
    japanese       0.74      0.82      0.78       239
      korean       0.88      0.78      0.82       245
        thai       0.81      0.82      0.81       244

    accuracy                           0.82      1199
   macro avg       0.82      0.82      0.82      1199
weighted avg       0.82      0.82      0.82      1199

