# **PCA Model**
Since I could not single out a large set of ingredients by mere exploration of the dataset to eliminate in order to mitigate the dimensionality problem, I decided to apply the Principal Component Analysis (PCA) technique to do so.

### Load Train Dataset

In [140]:
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
rcParams['figure.figsize'] = 12, 10
sb.set_style('whitegrid')
import importlib
from Methods import get_ingredients

In [141]:
df_original = pd.read_json('Dataset/train.json')

df_original.head()
df_original.shape

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


(39774, 3)

## Categorization cuisines

In [142]:
# Categorizes the cuisines Train set
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

encoder_cuisine = LabelEncoder()
cuisine = df_original['cuisine']
cuisine_encoded = encoder_cuisine.fit_transform(cuisine)
cuisine_encoded

array([ 6, 16,  4, ...,  8,  3, 13], dtype=int64)

In [143]:
encoder_cuisine.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)

In [144]:
# Add encoded to Train set

if (df_original.columns).contains('cuisine_encoded'):
    df_original.head()
    df_original.shape
else:
    df_original_left = df_original.iloc[:, 0:1]
    df_original_right = df_original.iloc[:, 1:]

    df_original_left['cuisine_encoded'] = cuisine_encoded
    df_original = df_original_left.join(df_original_right)

    df_original.head()
    df_original.shape

Unnamed: 0,cuisine,cuisine_encoded,id,ingredients
0,greek,6,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,16,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,4,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,7,22213,"[water, vegetable oil, wheat, salt]"
4,indian,7,13162,"[black pepper, shallots, cornflour, cayenne pe..."


(39774, 4)

## Dummification

### Dummifies the ingredients of dataset

In [145]:
# Dummify the original set with all the original unique ingredients
from sklearn.preprocessing import MultiLabelBinarizer

mlb_original = MultiLabelBinarizer()

dummies_original = DataFrame(mlb_original.fit_transform(df_original['ingredients']), columns=mlb_original.classes_, index=df_original.index)
dummies_original.head()
dummies_original.shape

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(39774, 6714)

### Join non-ingredient columns with the dummified ingredient columns to create the new dataset 

In [146]:
# Join Original and Dummfied

df_original_dummified = df_original.loc[:, ['cuisine', 'cuisine_encoded', 'id']].join(dummies_original)

df_original_dummified.head()
df_original_dummified.shape

Unnamed: 0,cuisine,cuisine_encoded,id,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,greek,6,10259,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,16,25693,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,filipino,4,20130,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,7,22213,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,7,13162,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(39774, 6717)

### Partition Creation (Train, Validation, Test)
Because the `Test Dataset` given in this exercise does not have the cuisines of the recipes so the effectiveness of my model can be verified, I also created a validation set to further test the accuracy of the models tested.

In [147]:
from sklearn.model_selection import train_test_split

# Divides dataset in train and set
x_train, x_test, y_train, y_test = train_test_split(dummies_original, cuisine_encoded, test_size=0.2, random_state=123)

# Divides train set and train and validation
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=123)



test_set = df_original_dummified.loc[x_test.index]

#Reset index
x_train.reset_index(drop=True, inplace=True)
x_validation.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)

x_train.shape
x_validation.shape
x_test.shape

(25455, 6714)

(6364, 6714)

(7955, 6714)

### PCA Fitting
To implement the PCA model, the number of dimensions was set to 1000.

In [148]:
# PCA fitting
from sklearn.decomposition import PCA

pca = PCA(n_components=1000, whiten='True')
pca.fit(x_train)

x_train_pca = pca.transform(x_train)
x_validation_pca = pca.transform(x_validation)
x_test_pca = pca.transform(x_test)

PCA(copy=True, iterated_power='auto', n_components=1000, random_state=None,
  svd_solver='auto', tol=0.0, whiten='True')

# Models

### Logistic Regression

In [169]:
# Logistic Regression ModelPCA
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=123)
lr.fit(x_train_pca, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [150]:
lr_score = lr.score(x_validation_pca, y_validation)
lr_score

0.7423004399748586

In [151]:
TEST_lr_score = lr.score(x_test_pca, y_test)
TEST_lr_score

0.7400377121307354

### KNN

In [152]:
# KNN Model PCA
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=13) # Best n from previous tests
knn.fit(x_train_pca, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

In [153]:
# Train-Test
knn_score = knn.score(x_validation_pca, y_validation)
knn_score

0.5688246385920804

In [154]:
# TEST
TEST_knn_score = knn.score(x_test_pca, y_test)
TEST_knn_score

0.5561282212445003

### Decision Tree Classifier

In [155]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=123)
dt.fit(x_train_pca, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [156]:
dt_score = dt.score(x_validation_pca, y_validation)
dt_score

0.3876492771841609

In [157]:
TEST_dt_score = dt.score(x_test_pca, y_test)
TEST_dt_score

0.37611565053425516

### SVM Classifier 

In [158]:
#SVM Model
from sklearn.svm import SVC  

svclassifier = SVC(kernel='linear', random_state=123)  
svclassifier.fit(x_train_pca, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [159]:
svm_score = svclassifier.score(x_validation_pca, y_validation)
svm_score

0.6816467630421119

In [160]:
TEST_svm_score = svclassifier.score(x_test_pca, y_test)
TEST_svm_score

0.6747957259585167

### Random Forest Classifier

In [161]:
# Random Forest PCA
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=460, random_state=123)
rf.fit(x_train_pca, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=460, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [162]:
#RF PCA score
rf_score = rf.score(x_validation_pca, y_validation)
rf_score

0.5906662476429918

In [163]:
TEST_rf_score = rf.score(x_test_pca, y_test)
TEST_rf_score

0.583029541169076

## Scores

### Train

In [164]:
scores_train = Series([lr_score, knn_score, dt_score, svm_score, rf_score], index=['logistic regression', 'knn', 'decision tree classifier', 'svm classifier', 'random forest classifier'])
scores_train.sort_values(ascending=False)

logistic regression         0.742300
svm classifier              0.681647
random forest classifier    0.590666
knn                         0.568825
decision tree classifier    0.387649
dtype: float64

### Test

In [165]:
scores_TEST = Series([TEST_lr_score, TEST_knn_score, TEST_dt_score, TEST_svm_score, TEST_rf_score], index=['logistic regression', 'knn', 'decision tree classifier', 'svm classifier', 'random forest classifier'])
scores_TEST.sort_values(ascending=False)

logistic regression         0.740038
svm classifier              0.674796
random forest classifier    0.583030
knn                         0.556128
decision tree classifier    0.376116
dtype: float64