In [68]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from numpy import mean, std

In [47]:
clean_df = pd.read_csv('.../GitHub/FoodReceipeNLP/data/cleaned_data.csv')
clean_df.head()

Unnamed: 0,name_0,name_1,name_2,name_3,name_4,ingre_0,ingre_1,ingre_2,ingre_3,ingre_4,...,ingre_141,ingre_142,ingre_143,ingre_144,ingre_145,ingre_146,ingre_147,ingre_148,ingre_149,Cuisine
0,0.080166,0.048612,0.077307,-0.117698,0.038966,0.043213,-0.026358,0.025549,-0.06129,0.012261,...,-0.049989,-0.166183,0.182986,0.192492,0.081331,-0.096553,-0.181219,0.15965,-0.205233,mexican
1,0.057411,-0.062757,0.066737,0.127286,0.141621,0.027747,-0.016764,0.015889,-0.038428,0.005781,...,-0.02931,-0.102793,0.115859,0.120391,0.050211,-0.059696,-0.112801,0.101228,-0.128654,mexican
2,0.004638,-0.067046,0.033966,0.014586,0.062332,0.046457,-0.026615,0.025139,-0.056627,0.0118,...,-0.048154,-0.164412,0.181197,0.188647,0.080837,-0.095031,-0.178533,0.160942,-0.202676,mexican
3,-0.061102,-0.071237,0.181439,-0.108818,0.163737,0.047029,-0.029167,0.025548,-0.061664,0.011055,...,-0.053704,-0.176375,0.195091,0.201184,0.085895,-0.10104,-0.188453,0.170051,-0.216035,mexican
4,0.00324,0.039615,-0.036836,0.175529,0.026537,0.040022,-0.025716,0.020734,-0.056372,0.010302,...,-0.048729,-0.158852,0.176082,0.18215,0.074182,-0.09088,-0.170465,0.150954,-0.194791,mexican


In [48]:
X = clean_df.iloc[:,:105].values
y = clean_df['Cuisine'].values
# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# PCA dimensionality reduction
pca = PCA(n_components=100)
pca_X = pca.fit_transform(X)
cv = KFold(n_splits=10, random_state=42, shuffle=True)

In [60]:
svc = SVC()
svc.fit(pca_X, y)
y_pred = svc.predict(pca_X)
scores = cross_val_score(svc, pca_X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of SVC : %.3f (std = %.3f)' % (mean(scores), std(scores)))
print('F1 score of SVC : %.3f' % (mean(f1_score(y, y_pred, average='weighted'))))

Accuracy of SVC : 0.459 (std = 0.069)
F1 score of SVC : 0.586


In [63]:
rfc = RandomForestClassifier(max_depth=5)
rfc.fit(pca_X, y)
y_pred = rfc.predict(pca_X)
scores = cross_val_score(rfc, pca_X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of Random Forest : %.3f (std = %.3f)' % (mean(scores), std(scores)))
print('F1 score of Random Frorest : %.3f' % (mean(f1_score(y, y_pred, average='weighted'))))

Accuracy of Random Forest : 0.438 (std = 0.069)
F1 score of Random Frorest : 0.890


In [72]:
catboost = CatBoostClassifier(iterations=10)
catboost.fit(pca_X, y)
y_pred = catboost.predict(pca_X)
scores = cross_val_score(catboost, pca_X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of CatBoost : %.3f (std = %.3f)' % (mean(scores), std(scores)))
print('F1 score of CatBoost : %.3f' % (mean(f1_score(y, y_pred, average='weighted'))))

Learning rate set to 0.5
0:	learn: 2.1043470	total: 42.4ms	remaining: 381ms
1:	learn: 1.9136714	total: 68.1ms	remaining: 272ms
2:	learn: 1.7817036	total: 88.5ms	remaining: 207ms
3:	learn: 1.6823975	total: 108ms	remaining: 162ms
4:	learn: 1.5451465	total: 128ms	remaining: 128ms
5:	learn: 1.4567574	total: 147ms	remaining: 98.3ms
6:	learn: 1.3591815	total: 167ms	remaining: 71.6ms
7:	learn: 1.2789443	total: 187ms	remaining: 46.7ms
8:	learn: 1.2176103	total: 207ms	remaining: 23ms
9:	learn: 1.1406785	total: 228ms	remaining: 0us
Accuracy of CatBoost : 0.342 (std = 0.038)
F1 score of CatBoost : 0.805
Learning rate set to 0.5
0:	learn: 2.1058903	total: 196ms	remaining: 1.76s
1:	learn: 1.9219016	total: 337ms	remaining: 1.35s
2:	learn: 1.7345913	total: 528ms	remaining: 1.23s
3:	learn: 1.6494745	total: 631ms	remaining: 946ms
4:	learn: 1.5480124	total: 780ms	remaining: 780ms
5:	learn: 1.4602963	total: 967ms	remaining: 645ms
6:	learn: 1.3786128	total: 1.05s	remaining: 450ms
7:	learn: 1.3170742	total