In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
import statsmodels.api as sm
from scipy.stats import normaltest
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from statsmodels.stats import weightstats as stests
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
sns.set(style="whitegrid", palette="pastel", color_codes=True)

#### Link to Dataset: https://www.kaggle.com/hugodarwood/epirecipes

In [9]:
df = pd.read_csv('../cleaned_epi.csv')

In [10]:
df.head(1)

Unnamed: 0,title,rating,calories,protein,fat,sodium,appetizer,dessert,dinner,low carb,low sugar,meat,vegan,vegetarian,snack,alcoholic,holidays,SqrtCalories,LogProtein,SqrtFat
0,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,20.07486,2.944439,4.795832


In [11]:
df.drop(columns='title',inplace=True)

In [134]:
df.rating.value_counts()

4.375    5419
3.750    3530
5.000    1745
3.125    1019
Name: rating, dtype: int64

In [15]:
df.dtypes

rating          float64
calories        float64
protein         float64
fat             float64
sodium          float64
appetizer       float64
dessert         float64
dinner          float64
low carb        float64
low sugar       float64
meat            float64
vegan           float64
vegetarian      float64
snack           float64
alcoholic       float64
holidays          int64
SqrtCalories    float64
LogProtein      float64
SqrtFat         float64
dtype: object

In [221]:
# create feature and target variables
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

In [222]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

In [223]:
lab_enc = preprocessing.LabelEncoder()
ytrain_enc = lab_enc.fit_transform(y_train)
ytest_enc = lab_enc.fit_transform(y_test)

In [224]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="gini")

# Train Decision Tree Classifer
clf = clf.fit(X_train,ytrain_enc)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [225]:
ypred_enc = lab_enc.fit_transform(y_pred)

In [226]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(ytest_enc, ypred_enc))

Accuracy: 0.43491250533504056


This accuracy score is on the lower end, it could be improved by tuning the hyperparameters.

ROC Evaluation

In [140]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

In [145]:
# Binarize the output
y = label_binarize(y_enc, classes=[0,1,2,3])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

clf = DecisionTreeClassifier(criterion="gini")

clf = clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

In [146]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [147]:
print(roc_auc['micro'])

0.6250533504054632


Classification Report

In [148]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.23      0.26      0.24       206
          1       0.40      0.40      0.40       706
          2       0.52      0.54      0.53      1026
          3       0.39      0.34      0.37       405

avg / total       0.44      0.44      0.44      2343



### Parameter Tuning

In [62]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy")

clf = clf.fit(X_train,ytrain_enc)

y_pred = clf.predict(X_test)

In [63]:
ypred_enc = lab_enc.fit_transform(y_pred)

In [64]:
print("Accuracy:",metrics.accuracy_score(ytest_enc, ypred_enc))

Accuracy: 0.42168160478019634


The accuracy decreased with the criterion change, so we will keep the criterion as "gini."

Max Depth

In [165]:
# redefine X and y
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

In [166]:
lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 
# 80% training and 20% test

In [168]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
scores = []
for max_depth in max_depths:
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    ypred_enc = lab_enc.fit_transform(y_pred)
    scores.append(metrics.accuracy_score(y_test,ypred_enc))

In [183]:
best_max_depth = max_depths[scores.index(max(scores))]
print('Best Maximum Depth: ',best_max_depth)

Best Maximum Depth:  2.0


Minimum Sample Split

In [176]:
# redefine X and y
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

In [177]:
lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 
# 80% training and 20% test

In [181]:
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
scores = []
for min_split in min_samples_splits:
    clf = DecisionTreeClassifier(min_samples_split=min_split)
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    ypred_enc = lab_enc.fit_transform(y_pred)
    scores.append(metrics.accuracy_score(y_test,ypred_enc))

In [185]:
best_min_split = min_samples_splits[scores.index(max(scores))]
print('Best Minimum Sample Split: ',best_min_split)

Best Minimum Sample Split:  0.2


Minimum Samples Leaf

In [None]:
# redefine X and y
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 
# 80% training and 20% test

In [187]:
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
scores = []
for leaf in min_samples_leaf:
    clf = DecisionTreeClassifier(min_samples_leaf=leaf)
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    ypred_enc = lab_enc.fit_transform(y_pred)
    scores.append(metrics.accuracy_score(y_test,ypred_enc))

In [188]:
best_min_leaf = min_samples_leaf[scores.index(max(scores))]
print('Best Minimum Sample Leaf: ',best_min_leaf)

Best Minimum Sample Leaf:  0.1


### Parameter Tuning - CV Grid Search

In [324]:
from sklearn.model_selection import GridSearchCV

# redefine X and y
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 

In [325]:
clf = DecisionTreeClassifier()

In [326]:
# Setup the hyperparameter grid
max_depths = np.linspace(1, 32, 32, endpoint=True)
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leaf = [0.001, 0.1, 1, 10, 100]
criterion = ['gini','entropy']
param_grid = {'criterion': criterion,'max_depth': max_depths,'min_samples_split': min_samples_splits, 
              'min_samples_leaf': min_samples_leaf}

In [327]:
#combine parameters with model
clf_model = GridSearchCV(clf,param_grid,cv=5)

In [328]:
# Fit it to the data
clf_model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32.]), 'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'min_samples_leaf': [0.001, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [329]:
# Print the tuned parameter and score
print("Tuned Logistic Regression Parameters: {}".format(clf_model.best_params_))
print('Test Data Accuracy Score: ',accuracy_score(clf_model.predict(X_test), y_test))

Tuned Logistic Regression Parameters: {'criterion': 'gini', 'max_depth': 4.0, 'min_samples_leaf': 0.001, 'min_samples_split': 0.2}
Test Data Accuracy Score:  0.4379001280409731


In [334]:
y_pred = clf_model.predict(X_test)

In [335]:
ypred_enc = lab_enc.fit_transform(y_pred)

In [336]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       206
          1       0.00      0.00      0.00       706
          2       0.44      1.00      0.61      1026
          3       0.00      0.00      0.00       405

avg / total       0.19      0.44      0.27      2343



  'precision', 'predicted', average, warn_for)


### PCA

Below, we will use PCA to reduce the dimensions to the 2 most important in predicting a rating.

In [276]:
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

In [277]:
# Import PCA
from sklearn.decomposition import PCA

# Create a PCA model with 2 components: pca
pca = PCA(n_components = 2)

# Fit the PCA instance to tX
pca.fit(X)

# Transform the scaled samples: pca_features
X = pca.transform(X)

In [278]:
lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

In [279]:
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 
# 80% training and 20% test

In [280]:
# Create Decision Tree classifer object with best parameter values
clf = DecisionTreeClassifier(max_depth = 4.0, min_samples_leaf = 0.001, min_samples_split = 0.2)

# Train Decision Tree Classifer
clf = clf.fit(X_train, ytrain_enc)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [281]:
ypred_enc = lab_enc.fit_transform(y_pred)

In [282]:
print("Accuracy:",metrics.accuracy_score(y_test, ypred_enc))

Accuracy: 0.3013230900554844


### Random Forest

In [302]:
X = df[['calories','protein','fat','vegetarian','holidays','sodium']]
y = df['rating']

In [303]:
lab_enc = preprocessing.LabelEncoder()
y_enc = lab_enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=1) 

In [304]:
from sklearn.ensemble import RandomForestClassifier

regressor = RandomForestClassifier(n_estimators=100, random_state=1)  
regressor.fit(X_train, y_train)  
y_pred = regressor.predict(X_test) 

In [305]:
ypred_enc = lab_enc.fit_transform(y_pred)

In [306]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(ytest_enc,ypred_enc))  
print(classification_report(ytest_enc,ypred_enc))  
print(accuracy_score(ytest_enc, ypred_enc))  

[[ 34  60 103   9]
 [  5 267 400  34]
 [  6 216 757  47]
 [  5  59 227 114]]
             precision    recall  f1-score   support

          0       0.68      0.17      0.27       206
          1       0.44      0.38      0.41       706
          2       0.51      0.74      0.60      1026
          3       0.56      0.28      0.37       405

avg / total       0.51      0.50      0.47      2343

0.5002134016218523
