In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation

In [2]:
# read data and transform to X: N*P, y: N*1 format
train_data = pd.read_json("train.json")
print "[INFO] N is %d." % train_data.shape[0]
print train_data.columns

[INFO] N is 39774.
Index([u'cuisine', u'id', u'ingredients'], dtype='object')


In [3]:
# handle labels (y)

# TODO
_labels = train_data.cuisine
label_enc = LabelEncoder()
y = label_enc.fit_transform(_labels)

assert len(label_enc.classes_) == len(set(_labels))
assert y.shape[0] == train_data.shape[0]
print "[INFO] In total %d labels" % len(label_enc.classes_)

print label_enc.classes_

[INFO] In total 20 labels
[u'brazilian' u'british' u'cajun_creole' u'chinese' u'filipino' u'french'
 u'greek' u'indian' u'irish' u'italian' u'jamaican' u'japanese' u'korean'
 u'mexican' u'moroccan' u'russian' u'southern_us' u'spanish' u'thai'
 u'vietnamese']


In [4]:
%%time

# handle ingredients (e.g. X[i, 0:p])

# Q to TA : common operations on csr_matrix
# Q to TA : code too ugly

from itertools import chain
all_igdt = set(chain.from_iterable(train_data.ingredients))
print "[INFO] In total %d ingredients." % len(all_igdt)
        
    
_igdt_str_list = map(lambda r: "sepearate".join(r), train_data.ingredients)
assert len(_igdt_str_list) == train_data.shape[0]

# def tok(x):
#     print "[INFO] %s" % x
#     return x.split('sepearate')

enc = CountVectorizer(vocabulary=all_igdt, 
                      tokenizer=lambda x : x.split('sepearate'))
X = enc.fit_transform(_igdt_str_list)

assert X.shape == (train_data.shape[0], len(all_igdt))
print "\n--show first sample--"
print X[0,:]
print "--"


# print enc.get_feature_names

[INFO] In total 6714 ingredients.

--show first sample--
  (0, 956)	1
  (0, 2548)	1
  (0, 2878)	1
  (0, 2884)	1
  (0, 3033)	1
  (0, 4569)	1
  (0, 4911)	1
  (0, 5222)	1
  (0, 5405)	1
--
CPU times: user 475 ms, sys: 20.3 ms, total: 495 ms
Wall time: 503 ms


In [5]:
print X.shape, y.shape

(39774, 6714) (39774,)


In [7]:
%%time
# LR
cls_lr = LogisticRegression()
cls_lr.fit(X[:30000], y[:30000])
print cls_lr.score(X[30000:], y[30000:])

0.779107837119
CPU times: user 5.59 s, sys: 2.03 ms, total: 5.6 s
Wall time: 5.63 s


In [235]:
print type(y)

<type 'numpy.ndarray'>


In [7]:
%%time

# Q TA why cannot use csr matrix in Gaussian prior NB?

# Naiive Bayes Classifier - Gaussian prior assumption
# cls_gaussian_nb = GaussianNB()
# cls_gaussian_nb.fit(X[:30000].toarray(), y[:30000])
# print cls_gaussian_nb.score(X[30000:].toarray(), y[30000:])

tr = []
te = []
for tr_idx, te_idx in cross_validation.KFold(X.shape[0], n_folds=3):
    tr.append(tr_idx)
    te.append(te_idx)
cls_gaussian_nb = GaussianNB()
cls_gaussian_nb.fit(X[tr[0]].toarray(), y[tr[0]])
score = cls_gaussian_nb.score(X[te[0]].toarray(), y[te[0]])
print score

0.370644139388
CPU times: user 10.6 s, sys: 30min 13s, total: 30min 23s
Wall time: 1h 22min 6s


In [197]:
# Naiive Bayes Classifier - Bernouli prior assumption
cls_bernoulli_nb = BernoulliNB()
cls_bernoulli_nb.fit(X[:30000], y[:30000])
print cls_bernoulli_nb.score(X[30000:], y[30000:])

0.690300798036


In [211]:
# Test phase
test_data = pd.read_json('test.json')
print test_data.columns

Index([u'id', u'ingredients'], dtype='object')


In [212]:
# Encode the test recipes with the already-fitted vectorizer, joining each
# ingredient list with the marker word that its tokenizer splits on.
_test_docs = ["sepearate".join(r) for r in test_data.ingredients]
X_te = enc.transform(_test_docs)

In [215]:
print X_te.shape
mdl = LogisticRegression()
mdl.fit(X, y)

(9944, 6714)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [220]:
# Predict encoded labels for the test recipes, map them back to cuisine names
# and pair each prediction with its recipe id (columns: id, cuisine).
y_te = mdl.predict(X_te)
labels_te = label_enc.inverse_transform(y_te)
ret = np.column_stack([test_data["id"], labels_te])

In [223]:
# Write the predictions as CSV with an "id,cuisine" header row; comments=''
# stops numpy from prefixing the header with its default comment marker.
np.savetxt(
    'testResult_cooking.csv',
    ret,
    fmt='%s',
    delimiter=',',
    header='id,cuisine',
    comments='',
)

0    18009
1    28583
2    41580
3    29752
4    35687
Name: id, dtype: int64


In [149]:
# Interactive cleanup: drop the namespace variables whose names match "X" (asks for confirmation).
%reset_selective X

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [8]:
# Interactive cleanup: clear the whole user namespace (asks for confirmation).
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [6]:
# List the variables currently defined in the interactive namespace.
%whos

Variable             Type               Data/Info
-------------------------------------------------
BernoulliNB          ABCMeta            <class 'sklearn.naive_bayes.BernoulliNB'>
CountVectorizer      type               <class 'sklearn.feature_e<...>on.text.CountVectorizer'>
GaussianNB           ABCMeta            <class 'sklearn.naive_bayes.GaussianNB'>
LabelEncoder         type               <class 'sklearn.preproces<...>sing.label.LabelEncoder'>
LogisticRegression   type               <class 'sklearn.linear_mo<...>stic.LogisticRegression'>
X                    csr_matrix           (0, 956)	1\n  (0, 2548)<...>309)	1\n  (39773, 6519)	1
all_igdt             set                set([u'low-sodium fat-fre<...>ef broth', u'hot water'])
chain                type               <type 'itertools.chain'>
cm                   module             <module 'matplotlib.cm' f<...>kages/matplotlib/cm.pyc'>
cross_validation     module             <module 'sklearn.cross_va<...>rn/cross_validation.pyc'>
