# What's Cooking - Kaggle Solutions

### Importing modules

In [1]:
import pandas as pd
import json
from pprint import pprint
import numpy as np
import re
from nltk.stem.snowball import SnowballStemmer

### Importing datasets

In [2]:
train_data = pd.read_json('./train.json')
train_data.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39774 entries, 0 to 39773
Data columns (total 3 columns):
cuisine        39774 non-null object
id             39774 non-null int64
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [4]:
train_ingr = train_data['ingredients']
label = train_data['cuisine']

In [5]:
test_data = pd.read_json('./test.json')
test_data.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [6]:
test_ingr = test_data['ingredients']

In [7]:
def listtostr(listring):
    s = ','.join(listring)
    s = s.lower()
    return s

In [8]:
train_text = train_ingr.apply(lambda x: listtostr(x))
test_text = test_ingr.apply(lambda x: listtostr(x))
train_text = train_text.apply(lambda x: re.sub('\(([^)]+)\)','',x))
test_text = test_text.apply(lambda x: re.sub('\(([^)]+)\)','',x))

In [9]:
stemmer = SnowballStemmer('english')
def stemming(stri):
    str_strip = stri.strip()
    str_split = str_strip.split(',')
    new_stri = []
    for s in str_split:
        space_split = s.split()
        stem_split = [stemmer.stem(x) for x in space_split]
        new_split = ' '.join(stem_split)
        new_stri.append(new_split)
    return new_stri    
        

In [10]:
train_text = train_text.apply(lambda x: stemming(x))
test_text = test_text.apply(lambda x: stemming(x))

In [11]:
train_text[0]

[u'romain lettuc',
 u'black oliv',
 u'grape tomato',
 u'garlic',
 u'pepper',
 u'purpl onion',
 u'season',
 u'garbanzo bean',
 u'feta chees crumbl']

In [12]:
test_text.head()

0    [bake powder, egg, all-purpos flour, raisin, m...
1    [sugar, egg yolk, corn starch, cream of tartar...
2    [sausag link, fennel bulb, frond, oliv oil, cu...
3    [meat cut, file powder, smoke sausag, okra, sh...
4    [ground black pepper, salt, sausag case, leek,...
Name: ingredients, dtype: object

In [13]:
train_text.head()

0    [romain lettuc, black oliv, grape tomato, garl...
1    [plain flour, ground pepper, salt, tomato, gro...
2    [egg, pepper, salt, mayonais, cook oil, green ...
3                      [water, veget oil, wheat, salt]
4    [black pepper, shallot, cornflour, cayenn pepp...
Name: ingredients, dtype: object

In [14]:
# testing_train = train_text[:2]
# testing_train
all_together = []
for i in train_text:
    all_together = all_together + i

corpus = set(all_together)

vocab = {}
i = 0
for c in corpus:
    if c not in vocab:
        vocab[c] = i
        i += 1
        
# sorted(vocab.values())

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=u'[a-z]+[\s[a-z]+]*')
print vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'[a-z]+[\\s[a-z]+]*',
        tokenizer=None,
        vocabulary={'': 0, u'cherri pie fill': 2, u'chanterell': 3, u'taco shell': 4, u'cajun season': 5, u'mung bean noodl': 6, u'garlic clove': 7, u'sandwich wrap': 8, u'condens french onion soup': 70, u'roast pork season mix': 10, u'back rib': 11, u'serrano pepper': 12, u'veget soup mix': 3293, u'peel fr...'cinnamon ice cream': 6679, u'tropic fruit': 6680, u'peach slice': 6673, u'tree ear mushroom': 6682})


In [16]:
# tets = test_text[:2]
train_ingredients = [','.join(s) for s in train_text]
test_ingredients = [','.join(s) for s in test_text]
train_features = vectorizer.fit_transform(train_ingredients)
test_features = vectorizer.transform(test_ingredients)

In [17]:
train_features_df = pd.DataFrame(train_features.todense(), columns=vectorizer.get_feature_names())
test_features_df = pd.DataFrame(test_features.todense(), columns=vectorizer.get_feature_names())
# sum_fea_tr = train_features_df.sum(axis=1)
# print sum_fea_tr[:5]
# print train_ingredients[:5]

### train and test data sets

In [18]:
X = train_features_df.iloc[:10000,:]
y = label.iloc[:10000]

### Building the best model

In [19]:
# Trying classification algorithms
# from sklearn.cross_validation import train_test_split
# from sklearn import grid_search
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)



In [20]:
# from sklearn.neighbors import KNeighborsClassifier 

In [21]:
# parameters = {'kernel':('linear', 'rbf','poly','sigmoid'), 'C':[0.1, 1, 10, 100]}
# svr = SVC()
# model = grid_search.GridSearchCV(svr, parameters, n_jobs=4, cv=5)
# model.fit(X_train, y_train)
# print model.best_score_ 
# print model.best_params_ 

# model_knn = KNeighborsClassifier(n_neighbors=5)
# model_knn.fit(X_train, y_train)
# print accuracy_score(y_test,model_knn.predict(X_test))

In [22]:
from sklearn.linear_model import LogisticRegression
# from sklearn import grid_search
# model_lr = LogisticRegression(solver='lbfgs')
# parameters = {'C':[0.01,0.1,1,10]}
# model = grid_search.GridSearchCV(model_lr, parameters, n_jobs=4)
# model.fit(X_train, y_train)
# # print accuracy_score(y_test,model_lr.predict(X_test))
# print model.best_score_ 
# print model.best_params_ 

In [23]:
# from sklearn.ensemble import RandomForestClassifier 
from sklearn.cross_validation import cross_val_score

# model_rf = RandomForestClassifier()
# model_rf.fit(X_train, y_train)
# print accuracy_score(y_test,model_dt.predict(X_test))

# print cross_val_score(model_rf, X, y, cv=5).mean()

In [24]:
# from sklearn.ensemble import GradientBoostingClassifier
# est = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
model_lr = LogisticRegression(solver='lbfgs')
print cross_val_score(model_lr, X, y, cv=5).mean()

KeyboardInterrupt: 

### Predicting

In [None]:
# predictions = pd.DataFrame()
# predictions['id'] = test_data['id']

In [None]:
# Z = test_features_df
# predictions['cuisine'] = model_lr.predict(Z)

In [None]:
# predictions.head(10)

In [None]:
# _ = predictions.to_csv('submission.csv', index=False)