# TASK:
Improve test score to 0.8

In [1]:
# https://www.kaggle.com/c/whats-cooking-kernels-only/data
# https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy  

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [3]:
# load data
train_df = pd.read_json('all/train.json')
test_df = pd.read_json('all/test.json')
train_df.shape, test_df.shape

((39774, 3), (9944, 2))

In [4]:
train_df.columns, test_df.columns

(Index([u'cuisine', u'id', u'ingredients'], dtype='object'),
 Index([u'id', u'ingredients'], dtype='object'))

# Clean Data

In [5]:
# Detect  missing values
print(train_df.isna().sum())
print(test_df.isna().sum())

cuisine        0
id             0
ingredients    0
dtype: int64
id             0
ingredients    0
dtype: int64


In [6]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
# possible issues:
# multiple ID
# spelling mistakes
# spelling case
# ingredients multiple times
# singular / plural
# different names for same ingredient

In [8]:
train_df['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

# Exploratory analysis

# Model Data

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier

# ML pipeline consisting of two steps
pipeline = Pipeline(steps=[
    
    ('tdidf', TfidfVectorizer()),
    ('SVC', SVC(C=100, # penalty parameter
	 			 kernel='rbf', # kernel type, rbf working fine here
	 			 degree=3, # default value
	 			 gamma=1, # kernel coefficient
	 			 coef0=1, # change to 1 from default value of 0.0
	 			 shrinking=True, # using shrinking heuristics
	 			 tol=0.001, # stopping criterion tolerance 
	      		 probability=False, # no need to enable probability estimates
	      		 cache_size=600, # 200 MB cache size
	      		 class_weight=None, # all classes are treated equally 
# 	      		 verbose=False, # print the logs 
# 	      		 max_iter=-1, # no limit, let it run
#           		 decision_function_shape=None, # will use one vs rest explicitly 
#           		 random_state=None
               )),
#     ('RFC', RandomForestClassifier(n_estimators= 100)),
#     ('LR', LogisticRegression()),
#     ('GB', GradientBoostingClassifier()),
#     ('GBC', KNeighborsClassifier(n_neighbors=100)),
])
pipeline = OneVsRestClassifier(pipeline)

In [20]:
list_ing = ['romaine lettuce', 'black olives', 'grape']

def func(l):
    string = "/".join(l)
    result = string.replace(" ","_")
    result2 = result.replace("/"," ")    
    return result2
print func(list_ing)

romaine_lettuce black_olives grape


In [21]:
from sklearn.model_selection import train_test_split

x_cols = ['ingredients']
y_cols = ['cuisine']

x = train_df.filter(items=x_cols)['ingredients'].apply(func)# turn list of strings into one string
y = train_df.filter(items=y_cols).values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, test_size=0.3)

   


In [22]:
%%time
pipeline.fit(x_train, y_train)
print ("TRAIN SCORE: ")
print(pipeline.score(x_train, y_train))
print ("TEST SCORE: ")
print(pipeline.score(x_test, y_test))

TRAIN SCORE: 
0.9995689809992457
TEST SCORE: 
0.8037375345680047
CPU times: user 17min 32s, sys: 6.67 s, total: 17min 39s
Wall time: 17min 39s


In [13]:
# cross_validate(pipeline, x, y, cv=3, return_train_score=False)

In [14]:
# from sklearn.model_selection import learning_curve
# for train_sizes, train_scores, test_scores in learning_curve(pipeline, x, y, cv=5):
#     print (train_sizes, train_scores, test_scores)