In [68]:
train_file_path = "C:/Users/shabbir.hasan/Documents/Hasan/Anaconda_rep/Kaggle/whats-cooking/train.json"

In [69]:
import json

In [70]:
with open(train_file_path, encoding='utf-8') as data_file:
    data = json.loads(data_file.read())

In [71]:
from pandas.io.json import json_normalize

In [72]:
df = json_normalize(data)

In [73]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


#### Converting to text

In [74]:
from nltk.stem import WordNetLemmatizer
import re

In [75]:
import nltk

In [76]:
def create_long_str(inp):
    temp_str = ''
    for i in inp:
        temp_str += ' ' + (i.replace("''", '')) 
    return temp_str.strip()

In [77]:
df['ingredients_str'] = df.ingredients.apply(create_long_str)

In [78]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
 

In [79]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(re.sub('<.*?>', ' ', token))
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [80]:
df['ingredients_stem'] = df['ingredients_str'].apply(tokenize_and_stem)

In [81]:
df.head()

Unnamed: 0,cuisine,id,ingredients,ingredients_str,ingredients_stem
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...,"[romain, lettuc, black, oliv, grape, tomato, g..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...,"[plain, flour, ground, pepper, salt, tomato, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...,"[egg, pepper, salt, mayonais, cook, oil, green..."
3,indian,22213,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt,"[water, veget, oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...,"[black, pepper, shallot, cornflour, cayenn, pe..."


In [82]:
#df = df.drop(['ingredients', 'ingredients_str'], axis=1)

In [83]:
from sklearn.pipeline import Pipeline

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [85]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

In [86]:
text_clf = Pipeline([
     ('tfidf', TfidfVectorizer(max_df=0.95, min_df=3, 
                                  max_features=1000,
#                                   tokenizer=tokenize_and_stem,
                                  stop_words='english', ngram_range=(1, 3))),
    ('svd', TruncatedSVD()),
#     ('clf', OneVsRestClassifier(SGDClassifier(class_weight='balanced', 
#                                               alpha=1e-3, random_state=42,
#                                               max_iter=5, tol=None)))
    ('clf', RandomForestClassifier())
 
])

In [87]:
from sklearn.model_selection import GridSearchCV
params = {
    'tfidf__max_df': (0.8, 0.95), # 0.95),
    'svd__n_components': (100,200),
    'clf__n_estimators': (100,150),
    'clf__max_depth': (5,10)
#     'clf__estimator__alpha': (1e-2), #, 1e-3)
    }

In [91]:
gridsearch = GridSearchCV(text_clf, params, scoring='accuracy', refit=True, cv=2, verbose=5)

In [92]:
gridsearch.fit(df['ingredients_str'], df['cuisine'])

Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.8 
[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.8, score=0.4839893429849696, total=  11.4s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.7s remaining:    0.0s


[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.8, score=0.49137367335647103, total=  11.3s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.95 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.3s remaining:    0.0s


[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.95, score=0.47871110440858594, total=  11.2s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.95 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   37.9s remaining:    0.0s


[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=100, tfidf__max_df=0.95, score=0.4965544992706604, total=  11.1s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.8 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   50.5s remaining:    0.0s


[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.8, score=0.4585029910018599, total=  15.4s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.8 
[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.8, score=0.4628036819073487, total=  15.3s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.95 
[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.95, score=0.4567435781430654, total=  15.4s
[CV] clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.95 
[CV]  clf__max_depth=5, clf__n_estimators=100, svd__n_components=200, tfidf__max_df=0.95, score=0.46179769629294304, total=  15.3s
[CV] clf__max_depth=5, clf__n_estimators=150, svd__n_components=100, tfidf__max_df=0.8 
[CV]  clf__max_depth=5, clf__n_estimators=150, svd__n_components=100, tfidf__max_df=0.8, score=0.48283315739204746, total=  15.

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 11.7min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidf__max_df': (0.8, 0.95), 'svd__n_components': (100, 200), 'clf__n_estimators': (100, 150), 'clf__max_depth': (5, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=5)

In [96]:
gridsearch.best_estimator_.named_steps

{'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.8, max_features=1000, min_df=3,
         ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words='english', strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 'svd': TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
        random_state=None, tol=0.0),
 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=10, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_st