# TASK:
Improve test score to 0.8

In [1]:
# https://www.kaggle.com/c/whats-cooking-kernels-only/data
# https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy  

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [3]:
# load data
train_df = pd.read_json('all/train.json')
# train_df = train_df[(train_df['cuisine'] == 'russian') | (train_df['cuisine'] == 'brazilian')]
test_df = pd.read_json('all/test.json')
# train_df.shape, test_df.shape

In [4]:
train_df.columns, test_df.columns

(Index([u'cuisine', u'id', u'ingredients'], dtype='object'),
 Index([u'id', u'ingredients'], dtype='object'))

# Clean Data

In [5]:
# Detect  missing values
print(train_df.isna().sum())
print(test_df.isna().sum())

cuisine        0
id             0
ingredients    0
dtype: int64
id             0
ingredients    0
dtype: int64


In [6]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
# possible issues:
# multiple ID
# spelling mistakes
# spelling case
# ingredients multiple times
# singular / plural
# different names for same ingredient

In [8]:
train_df['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

# Exploratory analysis

# Model Data

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier

# ML pipeline consisting of two steps
pipeline = Pipeline(steps=[
    
    ('tdidf', TfidfVectorizer()),
    ('SVC', SVC(C=100,
                kernel='rbf',
                gamma= 1, 
#                 probability=True, 
                class_weight=None
               )),
#     ('RFC', RandomForestClassifier(n_estimators= 100)),
#     ('LR', LogisticRegression()),
#     ('GB', GradientBoostingClassifier()),
#     ('GBC', KNeighborsClassifier(n_neighbors=100)),
])
pipeline = OneVsRestClassifier(pipeline)

In [10]:
list_ing = ['romaine lettuce', 'black olives', 'grape']

def func(l):
    string = "/".join(l)
    result = string.replace(" ","_")
    result2 = result.replace("/"," ")    
    return result2
print func(list_ing)

romaine_lettuce black_olives grape


In [11]:
from sklearn.model_selection import train_test_split

x_cols = ['ingredients']
y_cols = ['cuisine']

x = train_df.filter(items=x_cols)['ingredients'].apply(func)# turn list of strings into one string
y = train_df.filter(items=y_cols).values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, test_size=0.3)

   


In [12]:
%%time
pipeline.fit(x_train, y_train)
print ("TRAIN SCORE: ")
print(pipeline.score(x_train, y_train))
print ("TEST SCORE: ")
print(pipeline.score(x_test, y_test))

TRAIN SCORE: 
0.99974857224956
TEST SCORE: 
0.7996312746166094
CPU times: user 27min 58s, sys: 10.7 s, total: 28min 9s
Wall time: 28min 44s


In [13]:
# cross_validate(pipeline, x, y, cv=3, return_train_score=False)

In [14]:
# from sklearn.model_selection import learning_curve
# for train_sizes, train_scores, test_scores in learning_curve(pipeline, x, y, cv=5):
#     print (train_sizes, train_scores, test_scores)

In [15]:
pipeline.get_params()

{'estimator': Pipeline(memory=None,
      steps=[('tdidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
         dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=T...,
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))]),
 'estimator__SVC': SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'estimator__SVC__C': 100,
 'estimator__SVC__cache_size': 200,
 'estimator__SVC__class_weight': None,
 'estimator__SVC__coef0': 0.0,
 'estimator__SVC__decision_function_shape': 'ovr',
 'estimator__SVC__degree': 3,
 'estimator__SVC__gamma': 1,
 'estimator__SVC__kernel': 'rbf',
 'estimator__SVC__max_iter'

# Plot

## Validation curves

In [19]:
from sklearn.model_selection import validation_curve

# Function:
def vc(estimator, x, y, param_name, param_range, cv, p_type):
    # calculate train and test scores
    train_scores, test_scores = validation_curve(estimator=estimator,
                                                X=x, 
                                                y=y, 
                                                param_name=param_name,
                                                param_range=param_range,
                                                cv=cv)
    # specify xticklabels by using range when param_range is a list of strings
    p_range = range(len(param_range))
    # switch xtick labels with strings
    if type(param_range[0]) == str:
        plt.xticks(range(len(param_range)), param_range)
        param_range = p_range
        
    mean_train = np.mean(train_scores, axis=1)
    mean_test = np.mean(test_scores, axis=1)
    plt.title("Validation Curve with " + param_name.split("__")[-2]+ " on parameter "+ param_name.split("__")[-1])
    plt.xlabel(param_name.split("__")[-1])
    plt.ylabel("accuracy")
    plt.ylim(0.0, 1.1)
    # plot the train and test scores
    if p_type == 'scatter':
        plt.scatter(param_range, mean_train, label = "Training score",color="darkorange", lw=2)
        plt.scatter(param_range, mean_test, label = "Cross-validation score",color="navy", lw=2)
    elif p_type == 'plot':
        plt.plot(param_range, mean_train,label = "Training score",color="darkorange", lw=2)
        plt.plot(param_range, mean_test, label = "Cross-validation score",color="navy", lw=2)
    elif p_type == 'semilogx': 
        plt.semilogx(param_range, train_scores,label = "Training score",color="darkorange", lw=2)
        plt.semilogx(param_range, test_scores, label = "Cross-validation score",color="navy", lw=2)
    plt.legend(loc="best")
    plt.show()


In [None]:
# Kernel

vc(estimator = pipeline,
   x = x, 
   y= y, 
   param_name = "estimator__SVC__kernel",
   param_range = ['rbf', 'poly', 'linear','sigmoid'], 
   cv = 3,
   p_type = 'scatter')

In [None]:
vc(estimator = pipeline,
   x = x, 
   y= y, 
   param_name = "estimator__SVC__probability",
   param_range = [0, 0.5, 1], 
   cv = 3,
  p_type = 'scatter')

In [None]:
# C 
vc(estimator = pipeline,
   x = x, 
   y= y, 
   param_name = "estimator__SVC__C",
   param_range = [0.1, 1, 10, 100, 1000], 
   cv = 3,
   p_type = 'semilogx')

In [None]:
# Gamma 
vc(estimator = pipeline,
   x = x, 
   y= y, 
   param_name = "estimator__SVC__gamma",
   param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 
   cv = 3,
   p_type = 'semilogx')

In [None]:
# use function on other parameters
# clean up code
# run whole dataset 
## (bonus) make subplot