In [1]:
import pandas as pd
import numpy as np
import json
import os
import gc

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import sys


In [2]:
with open('train.json', encoding='utf-8') as f:
    train = json.load(f)
with open('test.json', encoding='utf-8') as f:
    test = json.load(f)


In [3]:
train = pd.DataFrame(train)
test = pd.DataFrame(test)

In [7]:
print(train.head())
print(train.shape)

      id      cuisine                                        ingredients
0  10259        greek  [romaine lettuce, black olives, grape tomatoes...
1  25693  southern_us  [plain flour, ground pepper, salt, tomatoes, g...
2  20130     filipino  [eggs, pepper, salt, mayonaise, cooking oil, g...
3  22213       indian                [water, vegetable oil, wheat, salt]
4  13162       indian  [black pepper, shallots, cornflour, cayenne pe...
(39774, 3)


In [8]:
print(test.head())
print(test.shape)

      id                                        ingredients
0  18009  [baking powder, eggs, all-purpose flour, raisi...
1  28583  [sugar, egg yolks, corn starch, cream of tarta...
2  41580  [sausage links, fennel bulb, fronds, olive oil...
3  29752  [meat cuts, file powder, smoked sausage, okra,...
4  35687  [ground black pepper, salt, sausage casings, l...
(9944, 2)


In [9]:
# getting unique ingredients which are present in both test and train datasets. Discarding ingredients which are present
# only in test or only in train

uniqueIngredients = []
for items in train.ingredients:
    items = [item.lower().replace(' ', '_') for item in items]
    
    uniqueIngredients.extend(items)
    
uniqueIngredientsTrain = set(uniqueIngredients)

uniqueIngredients = []
for items in test.ingredients:
    items = [item.lower().replace(' ', '_') for item in items]
    
    uniqueIngredients.extend(items)
    
uniqueIngredientsTest = set(uniqueIngredients)

print(len(uniqueIngredientsTrain)) 
print(len(uniqueIngredientsTest)) 
print(len(uniqueIngredientsTrain - uniqueIngredientsTest)) 
print(len(uniqueIngredientsTest - uniqueIngredientsTrain )) 
print(len(set(uniqueIngredientsTest) & set(uniqueIngredientsTrain)) )

uniqueIngredients = list(set(uniqueIngredientsTest) & set(uniqueIngredientsTrain))

del uniqueIngredientsTest, uniqueIngredientsTrain
gc.collect

6703
4479
2647
423
4056


<function gc.collect(generation=2)>

In [10]:
train.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [12]:
%%time

# Creating one hot encoding on ingredient name
# Creating a column for each ingredient, updating the column with 0 and making its type as int8

for i in range(len(uniqueIngredients)):
    train[uniqueIngredients[i]] = 0
    test[uniqueIngredients[i]] = 0
    train[uniqueIngredients[i]] = train[uniqueIngredients[i]].astype('int8')
    test[uniqueIngredients[i]] = test[uniqueIngredients[i]].astype('int8')

#print("after for loop")
print(train.shape)
print(test.shape)

(39774, 4059)
(9944, 4058)
Wall time: 3min 30s


In [15]:
%%time
# looping through train and test data and populating the columns for each ingredient.
# if the ingredient is in ingredients column, then the corresponding column get a value 1 (similar to onehot encoding)
train.head(2)

for i in range(train.shape[0]):
    rowIngred = train['ingredients'][i]
    rowIngred = [item.lower().replace(' ', '_') for item in rowIngred]

    size = len(rowIngred)
    inUnique =[]
    for k in range(size):
        if rowIngred[k] in uniqueIngredients:
            train.at[i, rowIngred[k]] = 1

for i in range(test.shape[0]):
    rowIngred = test['ingredients'][i]
    rowIngred = [item.lower().replace(' ', '_') for item in rowIngred]
    size = len(rowIngred)
    for k in range(size):
        if rowIngred[k] in uniqueIngredients:
            test.at[i, rowIngred[k]] = 1 

#print("after for loop")
print(train.shape)
print(test.shape)

del rowIngred, size
gc.collect

(39774, 4059)
(9944, 4058)
Wall time: 20.7 s


<function gc.collect(generation=2)>

In [16]:
%%time

# The ingredients that are appearing for less than 40 items are dropped. This significantly reduces the number of columns,
# without much sacrifice in accuracy. However, the speed of model training and prediction increases greatly

dropped =0
dropList =[]
for i in uniqueIngredients:
    if train[i].sum() <= 40 :
        dropped+=1
        dropList.append(i)

train = train.drop(dropList, axis=1)
test = test.drop(dropList, axis=1)

print("total dropped" + str(dropped))
print(train.shape)
print(test.shape)


total dropped2933
(39774, 1126)
(9944, 1125)
Wall time: 922 ms
Parser   : 110 ms


In [17]:
train.dtypes

id                     int64
cuisine               object
ingredients           object
golden_brown_sugar      int8
quinoa                  int8
                       ...  
refried_beans           int8
chili                   int8
peas                    int8
chorizo_sausage         int8
fontina_cheese          int8
Length: 1126, dtype: object

In [18]:
Y = train['cuisine']
X = train.drop(['id', 'ingredients','cuisine'], axis=1)

XUnseen = test.drop(['id', 'ingredients'], axis=1)

print(X.shape)
print(XUnseen.shape)


(39774, 1123)
(9944, 1123)


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Applying Logistic Regression

In [20]:
classifier = LogisticRegression(random_state = 0, max_iter = 2111) 
classifier.fit(X_train, y_train) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2111,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
y_pred = classifier.predict(X_test) 

In [22]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test, y_pred)) 
cm = confusion_matrix(y_test, y_pred)
#print(cm)

Accuracy :  0.7519483784463253


In [24]:
pred_Test = classifier.predict(XUnseen)

dict = {"id": test.id, "cuisine" : pred_Test}
predictDF = pd.DataFrame(dict)

predictDF.to_csv("Logistic.csv")
print("prediction saved to file")


prediction saved to file
