# What's cooking kernel !

In [1]:
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import MultinomialNB

Load the dataset

In [2]:
df = pd.read_json("F:\\python\\cooking\\train.json")
testset = pd.read_json("F:\\python\\cooking\\test.json")

In [3]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
testset.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


Check for any null values.

In [5]:
df.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [6]:
testset.isnull().sum()

id             0
ingredients    0
dtype: int64

Check different types of cuisines

In [7]:
df.cuisine.unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

# Text Data processing

Convert the ingredients to string.

In [8]:
df.ingredients = df.ingredients.astype('str')
testset.ingredients = testset.ingredients.astype('str')

In [9]:
df.ingredients[0]

"['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']"

In [10]:
testset.ingredients[0]

"['baking powder', 'eggs', 'all-purpose flour', 'raisins', 'milk', 'white sugar']"

Lets remove those unnecessary symbols, which might be problem when tokenizing and lemmatizing

In [11]:
df.ingredients = df.ingredients.str.replace("["," ")
df.ingredients = df.ingredients.str.replace("]"," ")
df.ingredients = df.ingredients.str.replace("'"," ")
df.ingredients = df.ingredients.str.replace(","," ")

In [12]:
testset.ingredients = testset.ingredients.str.replace("["," ")
testset.ingredients = testset.ingredients.str.replace("]"," ")
testset.ingredients = testset.ingredients.str.replace("'"," ")
testset.ingredients = testset.ingredients.str.replace(","," ")

In [13]:
df.ingredients[0]

'  romaine lettuce    black olives    grape tomatoes    garlic    pepper    purple onion    seasoning    garbanzo beans    feta cheese crumbles  '

In [14]:
testset.ingredients[0]

'  baking powder    eggs    all-purpose flour    raisins    milk    white sugar  '

Convert everything to lower ( I think they are already in lower case, but to be on safe side).

In [15]:
df.ingredients = df.ingredients.str.lower()
testset.ingredients = testset.ingredients.str.lower()

Lets TOKENIZE the data now. (the processing of splitting into individual words)

In [16]:
df.ingredients = df.ingredients.apply(lambda x: word_tokenize(x))
testset.ingredients = testset.ingredients.apply(lambda x: word_tokenize(x))

Lets LEMMATIZE the data now (Since i believe that dataset might have different representation of same words, like the olives and olive, tomatoes and tomato, which represent the same word)

In [17]:
lemmatizer = WordNetLemmatizer()

In [18]:
def lemmat(wor):
    l = []
    for i in wor:
        l.append(lemmatizer.lemmatize(i))
    return l

In [19]:
df.ingredients = df.ingredients.apply(lemmat)
testset.ingredients = testset.ingredients.apply(lemmat)

In [20]:
df.ingredients[0]

['romaine',
 'lettuce',
 'black',
 'olive',
 'grape',
 'tomato',
 'garlic',
 'pepper',
 'purple',
 'onion',
 'seasoning',
 'garbanzo',
 'bean',
 'feta',
 'cheese',
 'crumbles']

In [21]:
testset.ingredients[0]

['baking',
 'powder',
 'egg',
 'all-purpose',
 'flour',
 'raisin',
 'milk',
 'white',
 'sugar']

Observe that olives converted to olive, tomatoes to tomato etc, many words are now in their root form.

In [22]:
type(df.ingredients[0])

list

Lemmatization converted it back to list, so change to str again and remove the unncessary words.

In [23]:
df.ingredients = df.ingredients.astype('str')
df.ingredients = df.ingredients.str.replace("["," ")
df.ingredients = df.ingredients.str.replace("]"," ")
df.ingredients = df.ingredients.str.replace("'"," ")
df.ingredients = df.ingredients.str.replace(","," ")

In [24]:
testset.ingredients = testset.ingredients.astype('str')
testset.ingredients = testset.ingredients.str.replace("["," ")
testset.ingredients = testset.ingredients.str.replace("]"," ")
testset.ingredients = testset.ingredients.str.replace("'"," ")
testset.ingredients = testset.ingredients.str.replace(","," ")

In [25]:
type(df.ingredients[0])

str

In [29]:
df.ingredients[0]

'  romaine    lettuce    black    olive    grape    tomato    garlic    pepper    purple    onion    seasoning    garbanzo    bean    feta    cheese    crumbles  '

Now our data looks good for vectorization.

In [30]:
vect = CountVectorizer()

In [31]:
features = vect.fit_transform(df.ingredients)

In [32]:
features

<39774x2826 sparse matrix of type '<class 'numpy.int64'>'
	with 756474 stored elements in Compressed Sparse Row format>

So, now our features has 2826 features, which are created by the process of vectorization.

Lets visualize some random features.

In [33]:
vect.get_feature_names()[1650:1670]

['mora',
 'morcilla',
 'morel',
 'moroccan',
 'morsel',
 'mortadella',
 'morton',
 'moss',
 'mostaccioli',
 'mostarda',
 'moulard',
 'mountain',
 'mousse',
 'mozarella',
 'mozzarella',
 'mrs',
 'msg',
 'muenster',
 'muesli',
 'muffin']

Lets vectorize our testset as well, we only tranform it with already fitted model

In [34]:
testfeatures = vect.transform(testset.ingredients)

In [37]:
testfeatures

<9944x2826 sparse matrix of type '<class 'numpy.int64'>'
	with 189531 stored elements in Compressed Sparse Row format>

Lets create our labels now, which is obviously cuisine column

In [38]:
labels = df.cuisine

Lets split the dataset into training and testing parts

In [39]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

Check the shapes, to make sure.

In [40]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(31819, 2826) (7955, 2826) (31819,) (7955,)


# Data Modeling

In [41]:
logreg = LogisticRegression(C=12)
logreg.fit(X_train,y_train)

LogisticRegression(C=12, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
print("Logistic Regression accuracy",logreg.score(X_test, y_test))

Logistic Regression accuracy 0.7732243871778756


In [44]:
logreg.predict(X_test)

array(['mexican', 'thai', 'mexican', ..., 'indian', 'indian', 'japanese'],
      dtype=object)

In [56]:
from sklearn import linear_model
sgd = linear_model.SGDClassifier()
sgd.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [80]:
print("SGD classifier accuracy",sgd.score(X_test, y_test))

SGD classifier accuracy 0.7417976115650534


In [77]:
from sklearn.svm import LinearSVC
linearsvm = LinearSVC(random_state=0, max_iter = 1500)
linearsvm.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [81]:
print("Linear SVM accuracy", linearsvm.score(X_test, y_test))

Linear SVM accuracy 0.7713387806411062


The accuracy of 77.4 with the logistic regression i s the best we acheived so far with out any tuning of parameters.
Now, lets try our luck with neural networks.

# NEURAL NETWORK'S

I have tried both Keras and tensorflow (Of course the backend is same), but Keras code looks simpler and clear.

For Neural Networks we need to have the dense array's as inputs and preferably one hot encoding for lables.
So, lets create lables.

In [59]:
labelsNN = df.cuisine

Convert it to one hot formatting, there are many ways to do, i prefer to do this way.

In [60]:
labelsNN = pd.get_dummies(labelsNN)

Convert it to arrays, you can do by values method or np.array() both are same

In [61]:
labelsNN = labelsNN.values

Here's how the one hot encoding looks like.

In [62]:
labelsNN[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=uint8)

Our labels are ready, now we need the features, we have already created the features above but it was sparse matrix, which neural network doesnt like, so convert to dense arrays.

In [63]:
from scipy.sparse import csr_matrix
sparse_dataset = csr_matrix(features)
featuresNN = sparse_dataset.todense()

Here's how the features look like.

In [64]:
featuresNN[0]

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Split the dataset.

In [65]:
X_trainNN, X_testNN, y_trainNN, y_testNN = train_test_split(featuresNN, labelsNN, test_size=0.2)

In [66]:
print(X_trainNN.shape, X_testNN.shape, y_trainNN.shape, y_testNN.shape)

(31819, 2826) (7955, 2826) (31819, 20) (7955, 20)


# KERAS

In [67]:
import keras
from keras.layers import *

A sequential NN with 300,500 and 400 nodes in first,second and third layers resp.

The loss is categorical cross entropy and the optimizer is adam with default learning rate.
We can tweak a lot of parameters like the no of nodes, epochs, batchsize etc to improve accuracy.

In [68]:
model = keras.models.Sequential()
model.add(Dense(300,input_dim = 2826,activation = 'relu'))
model.add(Dense(500,activation = 'relu'))
model.add(Dense(400,activation = 'relu'))
model.add(Dense(20,activation='softmax'))
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['categorical_accuracy'])
model.fit(X_trainNN,y_trainNN,epochs=50,shuffle=True, verbose =2,batch_size=500)

Epoch 1/50
 - 3s - loss: 1.5324 - categorical_accuracy: 0.5639
Epoch 2/50
 - 3s - loss: 0.7531 - categorical_accuracy: 0.7771
Epoch 3/50
 - 3s - loss: 0.5700 - categorical_accuracy: 0.8309
Epoch 4/50
 - 3s - loss: 0.4529 - categorical_accuracy: 0.8637
Epoch 5/50
 - 3s - loss: 0.3596 - categorical_accuracy: 0.8933
Epoch 6/50
 - 3s - loss: 0.2790 - categorical_accuracy: 0.9158
Epoch 7/50
 - 3s - loss: 0.2128 - categorical_accuracy: 0.9384
Epoch 8/50
 - 4s - loss: 0.1644 - categorical_accuracy: 0.9520
Epoch 9/50
 - 3s - loss: 0.1191 - categorical_accuracy: 0.9680
Epoch 10/50
 - 3s - loss: 0.0884 - categorical_accuracy: 0.9759
Epoch 11/50
 - 3s - loss: 0.0626 - categorical_accuracy: 0.9849
Epoch 12/50
 - 3s - loss: 0.0459 - categorical_accuracy: 0.9893
Epoch 13/50
 - 3s - loss: 0.0354 - categorical_accuracy: 0.9924
Epoch 14/50
 - 3s - loss: 0.0264 - categorical_accuracy: 0.9948
Epoch 15/50
 - 3s - loss: 0.0228 - categorical_accuracy: 0.9954
Epoch 16/50
 - 3s - loss: 0.0225 - categorical_ac

<keras.callbacks.History at 0x20247f24160>

In [69]:
print("Accuracy with KERAS" ,model.evaluate(X_testNN,y_testNN)[1])

Accuracy with KERAS 0.7864236329502463


I have trained with KERAS on my pc for few times and achieved max accuracy of 0.81.

Now, lets try with tensorflow (technically it same as before, but i just want to show you how it works)

# Tensorflow.

In [105]:
import tensorflow as tf

Define the placeholders.

In [106]:
X = tf.placeholder(tf.float32,[None,2826]) # Since we have 2826 features. 
y = tf.placeholder(tf.float32,[None,20])  # Since we have 20 outut labels.

Let's define the tensorflow graph.

In [107]:
weights1 = tf.get_variable("weights1",shape=[2826,600],initializer = tf.contrib.layers.xavier_initializer())
biases1 = tf.get_variable("biases1",shape = [600],initializer = tf.zeros_initializer)
layer1out = tf.nn.relu(tf.matmul(X,weights1)+biases1)

weights2 = tf.get_variable("weights2",shape=[600,900],initializer = tf.contrib.layers.xavier_initializer())
biases2 = tf.get_variable("biases2",shape = [900],initializer = tf.zeros_initializer)
layer2out = tf.nn.relu(tf.matmul(layer1out,weights2)+biases2)

weights3 = tf.get_variable("weights3",shape=[900,20],initializer = tf.contrib.layers.xavier_initializer())
biases3 = tf.get_variable("biases3",shape = [20],initializer = tf.zeros_initializer)
prediction =tf.matmul(layer2out,weights3)+biases3

The above is NN with 600,900 nodes in first and second layer resp.
Xavier initializer is usually advised over random weights and Zeros initializer over random biases.

In [108]:
cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

In [110]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(101): # no of epochs
        opt,costval = sess.run([optimizer,cost],feed_dict = {X:X_trainNN,y:y_trainNN})
        matches = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(matches, 'float'))
        if (epoch % 10 == 0):
            print("Epoch", epoch, "--" , "Cost",costval)
    print("Accuracy on the test set ->",accuracy.eval({X:X_testNN,y:y_testNN}))
    print("FINISHED !!!")


Epoch 0 -- Cost 96229.016
Epoch 10 -- Cost 56917.8
Epoch 20 -- Cost 35608.418
Epoch 30 -- Cost 25354.799
Epoch 40 -- Cost 19556.654
Epoch 50 -- Cost 15888.187
Epoch 60 -- Cost 13356.782
Epoch 70 -- Cost 11364.489
Epoch 80 -- Cost 9657.299
Epoch 90 -- Cost 8130.496
Epoch 100 -- Cost 6761.1396
Accuracy on the test set -> 0.7791326
FINISHED !!!


Now, we have achieved almost similar accuracies in all the above models, I personally dont prefer NN's on this data as it is computationally very expensive.

# PREDICTION

I prefer just using the logisticRegression for predictions, but linearSVC also has almost same results.
I'm not predict using Keras or Tensorflow, since it needs an extra two steps to convert the labels, which I dont want to waste my time on.

In [70]:
pred = logreg.predict(testfeatures)

In [106]:
sub = pd.DataFrame({'id':testset.id,'cuisine':pred})

In [110]:
output = sub[['id','cuisine']]

In [113]:
output.to_csv("output.csv",index = False)

# END

# NOTES:
1) You can achieve better accuracy by tuning the parameters.
2) Neural Network has even scored an accuracy of 0.81 but the computation is very time taking.
3) I have not used my time on visualizing the dataset.(which is not needed for this submission.
4) Please comment for any questions, doubts or suggestions.

 THANK YOU
 
# please UPVOTE, if you like.