In [1]:
import numpy as np
from scipy import misc
import imageio
from matplotlib import pylab as plt
import matplotlib.cm as cm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [2]:
with open('/content/train.json') as f:
    train_data = pd.read_json('train.json')

with open('/content/test.json') as f:
    test_data = pd.read_json('test.json')

In [3]:
# !unzip train.json.zip
# !unzip test.json.zip

In [4]:
train_data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
test_data.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [6]:
train_data_id = train_data['id'].values
print('Number of dishes: ', train_data_id.shape[0])

Number of dishes:  39774


In [7]:
train_cuisine = train_data['cuisine'].values
unique_cuisine = list(set(train_cuisine))
print('Number of unique cuisine: ', len(unique_cuisine))

Number of unique cuisine:  20


In [8]:
train_ingredients = train_data['ingredients']
train_ingredients.head()

0    [romaine lettuce, black olives, grape tomatoes...
1    [plain flour, ground pepper, salt, tomatoes, g...
2    [eggs, pepper, salt, mayonaise, cooking oil, g...
3                  [water, vegetable oil, wheat, salt]
4    [black pepper, shallots, cornflour, cayenne pe...
Name: ingredients, dtype: object

In [9]:
from tqdm import tqdm

In [10]:
#intializing four different for loops for faster parallel operations 
ingredients = []
ingredients1 = []
ingredients2 = []
ingredients3 = []
ingredients4 = []
n = len(train_ingredients)//4
for i in tqdm(train_ingredients[:n]):
  ingredients1 = np.concatenate((ingredients1,i))
for i in tqdm(train_ingredients[n:2*n]):
  ingredients2 = np.concatenate((ingredients2,i))
for i in tqdm(train_ingredients[2*n:3*n]):
  ingredients3 = np.concatenate((ingredients3,i))
for i in tqdm(train_ingredients[3*n:]):
  ingredients4 = np.concatenate((ingredients4,i))
ingredients = np.concatenate((ingredients1,ingredients2,ingredients3,ingredients4))
len(ingredients)

100%|██████████| 9943/9943 [01:11<00:00, 138.35it/s]
100%|██████████| 9943/9943 [01:08<00:00, 144.24it/s]
100%|██████████| 9943/9943 [01:19<00:00, 124.82it/s]
100%|██████████| 9945/9945 [01:02<00:00, 158.83it/s]


428275

In [11]:
unique_ingredients = list(set(ingredients))
print('Number of unique ingredients: ', len(unique_ingredients))

Number of unique ingredients:  6714


In [12]:
train_features = np.zeros((len(train_data), len(unique_ingredients)))
train_labels = np.zeros(len(train_data))

for i in tqdm(range(len(train_data))):
    for j in train_ingredients[i]:
        train_features[i][unique_ingredients.index(j)] = 1
    train_labels[i] = unique_cuisine.index(train_cuisine[i])

100%|██████████| 39774/39774 [00:33<00:00, 1202.47it/s]


In [13]:
print("New Shape of Features:", train_features.shape)

New Shape of Features: (39774, 6714)


In [14]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [None]:
three_fold = KFold(n_splits = 3)
gauss_acc = []
bern_acc = []
log_reg_acc = []
classifier_G = GaussianNB()
classifier_B = BernoulliNB()
classifier_L = LogisticRegression()
for train, val in three_fold.split(train_features):
  x_train, train_label = train_features[train], train_labels[train]
  x_test, test_label = train_features[val], train_labels[val]
  classifier_G.fit(x_train, train_label)
  gauss_acc = np.append(gauss_acc, classifier_G.score(x_test, test_label))
  classifier_B.fit(x_train, train_label)
  bern_acc = np.append(bern_acc, classifier_B.score(x_test, test_label))
  classifier_L.fit(x_train, train_label)
  log_reg_acc = np.append(log_reg_acc, classifier_L.score(x_test, test_label))
gauss_avg_acc = np.mean(gauss_acc)
bern_avg_acc = np.mean(bern_acc)
log_avg_acc = np.mean(log_reg_acc)

In [16]:
print('Cross Validation Accuracy for Gaussian distribution: ', gauss_avg_acc)
print('Cross Validation Accuracy for Bernouli distribution: ', bern_avg_acc)
print('Cross Validation Accuracy for Logistic Regression: ', log_avg_acc)

Cross Validation Accuracy for Gaussian distribution:  0.3798461306381053
Cross Validation Accuracy for Bernouli distribution:  0.6835369839593705
Cross Validation Accuracy for Logistic Regression:  0.7729169809423242


Looking at the results above, Gaussian distribution has the least validation score and Bernouli distribution has a score more than the Gaussian Distribution. This makes sense as the Bernouli distribution gives a clear indication of whether a particular ingredient is present in a dish ('1') or not ('0'). Therefore, making it easier for the classifier to understand the feature clearer in lieu of using a pdf from the gaussian distribution.

Additionally, Logistic Regression has the best Validation score - therefore Logistic Regression will be used to submit test labels to Kaggle

In [17]:
classifier = LogisticRegression()
classifier.fit(train_features,train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [18]:
test_features = np.zeros((len(test_data), len(unique_ingredients)))
test_ingredients = test_data['ingredients'].values
for i in tqdm(range(len(test_data))):
    for j in test_ingredients[i]:
      if j in unique_ingredients:
        test_features[i][unique_ingredients.index(j)] = 1

100%|██████████| 9944/9944 [00:18<00:00, 540.55it/s]


In [19]:
test_label_index = classifier.predict(test_features).astype(int)
test_label_index

array([16, 14,  1, ...,  1, 14,  2])

In [20]:
test_label = []
for i,idx in enumerate(test_label_index):
  test_label.append(unique_cuisine[idx])

In [21]:
np.array(test_data['ingredients'].values)

array([list(['baking powder', 'eggs', 'all-purpose flour', 'raisins', 'milk', 'white sugar']),
       list(['sugar', 'egg yolks', 'corn starch', 'cream of tartar', 'bananas', 'vanilla wafers', 'milk', 'vanilla extract', 'toasted pecans', 'egg whites', 'light rum']),
       list(['sausage links', 'fennel bulb', 'fronds', 'olive oil', 'cuban peppers', 'onions']),
       ...,
       list(['black pepper', 'salt', 'parmigiano reggiano cheese', 'ricotta', 'large egg yolks', 'dry bread crumbs', 'genoa salami', 'vegetable oil']),
       list(['cheddar cheese', 'cayenne', 'paprika', 'plum tomatoes', 'green bell pepper', 'water', 'Tabasco Pepper Sauce', 'scallions', 'canned low sodium chicken broth', 'milk', 'butter', 'red bell pepper', 'canned black beans', 'quickcooking grits', 'salt']),
       list(['cold water', 'olive oil', 'lime wedges', 'garlic cloves', 'boiling water', 'boneless chicken skinless thigh', 'guacamole', 'tomato salsa', 'corn tortillas', 'tomatoes', 'radishes', 'queso fresco'

In [22]:
ret_temp = np.vstack((test_data['id'], test_label))
ret = ret_temp.T
ret = pd.DataFrame(ret)

In [23]:
ret.head()

Unnamed: 0,0,1
0,18009,british
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian


In [24]:
ret.to_csv('./output.csv', header=False, index=False)