In [5]:
# Load, parse and check data from train and test files

import numpy as np
import json
import pprint
pp = pprint.PrettyPrinter()

# load datasets into script
train_file = open("./whats_cooking/train.json")
test_file  = open("./whats_cooking/test.json")
train_data = json.load(train_file)
test_data  = json.load(test_file)

# sort lists according to id
train_data.sort(key = lambda dish: dish["id"])
test_data.sort(key = lambda dish: dish["id"])
train_labels = []

for i in range(len(train_data)):
    train_labels.append(train_data[i]["cuisine"])
    del train_data[i]["cuisine"]

print(len(train_labels))
print(len(test_data))

pp.pprint(train_data[:3])
pp.pprint(train_labels[:10])


39774
9944
[{'id': 0,
  'ingredients': ['mussels',
                  'ground black pepper',
                  'garlic cloves',
                  'saffron threads',
                  'olive oil',
                  'stewed tomatoes',
                  'arborio rice',
                  'minced onion',
                  'medium shrimp',
                  'fat free less sodium chicken broth',
                  'green peas']},
 {'id': 1,
  'ingredients': ['tomatoes',
                  'diced red onions',
                  'paprika',
                  'salt',
                  'corn tortillas',
                  'fresh cilantro',
                  'cremini',
                  'vegetable broth',
                  'freshly ground pepper',
                  'ground chipotle chile pepper',
                  'bell pepper',
                  'extra-virgin olive oil',
                  'yellow onion',
                  'ground cumin',
                  'poblano peppers',
                  'chili pow

In [6]:
# Exercise 2 b, find out how many unique ingredients and cuisines
# there are in train_dataset

unique_ingredients = {}
unique_cuisines = {}
for i in range(len(train_data)):
    dish = train_data[i]
    if train_labels[i] not in unique_cuisines:
        unique_cuisines[train_labels[i]] = True
    for ingredient in dish["ingredients"]:
        if ingredient not in unique_ingredients:
            unique_ingredients[ingredient] = True

print('# of samples in training set: {}'.format(len(train_labels)))
print('# of categories in training set: {}'.format(len(unique_ingredients)))
print('# of unique ingredients in training set: {}'.format(len(unique_cuisines)))

unique_features = np.array([feature for feature in unique_ingredients])
unique_labels = np.array([cuisine for cuisine in unique_cuisines])

# pp.pprint(unique_features[:10])

# of samples in training set: 39774
# of categories in training set: 6714
# of unique ingredients in training set: 20


In [7]:
# Exercise 2 c, convert all dishes cuisines into codes for model training

# word_labels = np.array([i for i in unique_labels])
# label_codes = {}
label_codes = []

for label in unique_labels:
    label = ( unique_labels == label ).astype(int)
    label_codes.append(label)
#     label_codes[word] = label

label_codes = np.array(label_codes)

pp.pprint(label_codes)
print(len(label_codes))

print(unique_labels[np.argmax(label_codes[0])])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
# Exercise 2 c, convert all dishes ingredients into "bag of words" style
# binary 1x6714 vectors

feature_codes = []

for feature in unique_features:
    feature_code = ( unique_features == feature ).astype(int)
    feature_codes.append(feature_code)

feature_codes = np.array(feature_codes)

pp.pprint(feature_codes[0])
print(len(feature_codes), " ", len(feature_codes[0]))

print(unique_features[np.argmax(feature_codes[0])])

array([1, 0, 0, ..., 0, 0, 0])
6714   6714
mussels


In [13]:
# assemble data matrix X and label matrix label(if applicable) 
# for sklearn use
def MLify(data, targets=None):
    X = []
    if targets != None:
        labels = []

    # for every item in data, 
    # 1) encode ingredients into a row of X
    # 2) encode item label into one vs all format
    for i in range(len(data)):
        X.append(np.array([0 for j in range(len(unique_features))]))
        if targets != None:
            labels.append( (unique_labels == targets[i]).astype(int) )
        for ingredient in data[i]["ingredients"]:
            X[i] = np.bitwise_or(X[i], (unique_features == ingredient).astype(int))
    print("features: ", len(X), " ", len(X[0]))
    if targets != None:
        print("labels: ", len(labels), " ", len(labels[0]))
    
    if targets != None:
        return X,labels
    return X

In [14]:
X_train, y_train = MLify(train_data, train_labels)
X_test = MLify(test_data)

features:  39774   6714
labels:  39774   20
features:  9944   6714


In [15]:
# test to check if X_train and y_train were assembled correctly
print("\n\nNumber of ingredients on list must match number on second print")
pp.pprint(train_data[0])
print(np.sum(X_train[0]))

print("\n\nBoth labels bellow must match")
print(train_labels[0])
print(unique_labels[np.argmax(y_train[0])])



Number of ingredients on list must match number on second print
{'id': 0,
 'ingredients': ['mussels',
                 'ground black pepper',
                 'garlic cloves',
                 'saffron threads',
                 'olive oil',
                 'stewed tomatoes',
                 'arborio rice',
                 'minced onion',
                 'medium shrimp',
                 'fat free less sodium chicken broth',
                 'green peas']}
11


Both labels bellow must match
spanish
spanish


In [16]:
# PROGRAMMING EXERCISES: 2d
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

kfold = model_selection.KFold(n_splits=3)

cv_results = model_selection.cross_val_score(GaussianNB(), X_train, train_labels, cv=kfold)
print('GaussianNB accuracy: {}'.format(cv_results.mean()))

cv_results = model_selection.cross_val_score(BernoulliNB(), X_train, train_labels, cv=kfold)
print('BernoulliNB accuracy: {}'.format(cv_results.mean()))

GaussianNB accuracy: 0.38039925579524314
BernoulliNB accuracy: 0.6829587167496354


In [9]:
# PROGRAMMING EXERCISES: 2f
from sklearn.linear_model import LogisticRegression

cv_results = model_selection.cross_val_score(LogisticRegression(), X_train, train_labels, cv=kfold)
print('LogisticRegression accuracy: {}'.format(cv_results.mean()))

LogisticRegression accuracy: 0.7751294815708755


In [17]:
test_id = []

for i in range(len(test_data)):
    test_id.append(test_data[i]["id"])

In [27]:
# PROGRAMMING EXERCISES: 2g
import pandas as pd
from collections import OrderedDict

logistic = LogisticRegression()
logistic.fit(X_train, train_labels)
logistic_prediction = logistic.predict(X_test)

d = pd.DataFrame(data=OrderedDict([('id', test_id), ('cuisine', logistic_prediction)]))
d.to_csv('submission_whats_cooking.csv', index=False)

# References
* https://docs.scipy.org/doc/numpy/
* https://docs.python.org/3/
* http://scikit-learn.org/stable/
* https://matplotlib.org/api/
* https://www.youtube.com/watch?v=IdsV0RaC9jM
* https://www.youtube.com/watch?v=mBcLRGuAFUk