# Kaggle Playground Prediction Competition
### What's Cooking?
Use recipe ingredients to categorize the cuisine

**Objective -** Predict the category of a dish's cuisine given a list of its ingredients. 

In [None]:
#Loading libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import zipfile
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

**Loading Train and Test Dataset**

In [None]:
# Unpack the zipped train/test JSON files into the working directory
for t in ['train','test']:
    with zipfile.ZipFile("../input/whats-cooking/{}.json.zip".format(t),"r") as z:
        z.extractall(".")

# JSON text is UTF-8 by specification; pin the encoding so parsing does not
# depend on the platform's default locale encoding.
with open('./train.json', encoding='utf-8') as train_file:
    train_df = json.load(train_file)

with open('./test.json', encoding='utf-8') as test_file:
    test_df = json.load(test_file)

In [None]:
# Wrap the records parsed from train.json in a DataFrame and preview the first rows
train_df = pd.DataFrame(train_df)
train_df.head()

In [None]:
# Wrap the records parsed from test.json in a DataFrame and preview the first rows
test_df = pd.DataFrame(test_df)
test_df.head()

In [None]:
# Shape of the train dataframe as (rows, columns)
train_df.shape

In [None]:
# Summary of the train dataframe: columns, non-null counts and dtypes
train_df.info()

In [None]:
# Shape of the test dataframe as (rows, columns)
test_df.shape

In [None]:
# Summary of the test dataframe: columns, non-null counts and dtypes
test_df.info()

**Exploratory Data Analysis**

In [None]:
# Number of missing (null) values in each column of the train dataframe
train_df.isnull().sum()

In [None]:
# Number of missing (null) values in each column of the test dataframe
test_df.isnull().sum()

Count of Unique Cuisines

In [None]:
# Number of training recipes per cuisine, most frequent first
count_cuisine = train_df['cuisine'].value_counts(sort=True)
plt.figure(figsize=(10,10))
# Horizontal bars: cuisine names run along the y-axis, recipe counts along the x-axis
sns.barplot(y = count_cuisine.index, x = count_cuisine.values)
plt.title('Count of Unique Cuisines')
# Axis labels must match the orientation above (they were previously swapped)
plt.ylabel('Cuisines', fontsize=12)
plt.xlabel('Count', fontsize=12)
plt.show()

Count of Most Common Ingredients

In [None]:
# Ten most frequent ingredients across every training recipe
# Flatten the per-recipe ingredient lists into one long Series before counting
count_ingredients = pd.Series(
    [ingredient for recipe in train_df['ingredients'] for ingredient in recipe]
)
c_ingredients = count_ingredients.value_counts(sort=True).head(10)

plt.figure(figsize=(10,10))
sns.barplot(x = c_ingredients.values, y = c_ingredients.index)
plt.title('Count of Most Common Ingredients')
plt.xlabel('Count', fontsize=12)
plt.ylabel('Ingredients', fontsize=12)
plt.show()

Top 20 Ingredients in Each Cuisine

In [None]:
# One bar chart per cuisine showing its 20 most frequent ingredients
cuisines = train_df['cuisine'].unique()
for cuisine_name in cuisines:
    # Ingredient lists of the recipes that belong to this cuisine only
    cuisine_recipes = train_df[train_df['cuisine'] == cuisine_name]['ingredients']
    ingredients = pd.Series(
        [ingredient for recipe in cuisine_recipes for ingredient in recipe]
    )
    c_ingredients = ingredients.value_counts(sort=True).head(20)

    plt.figure(figsize=(10,10))
    sns.barplot(x = c_ingredients.values, y = c_ingredients.index)
    plt.title(cuisine_name)
    plt.xlabel('Count', fontsize=12)
    plt.ylabel('Ingredients', fontsize=12)
    plt.show()

**Data Preprocessing**

In [None]:
#Cleaning the raw ingredient strings
def preprocess(ingredients):
    """Normalize the ingredient strings of every recipe.

    Each ingredient string is lowercased, stripped of digits, punctuation and
    other special characters, and then filtered word by word to drop
    measurement units and preparation descriptors.

    Parameters
    ----------
    ingredients : iterable of iterables of str
        One inner iterable of ingredient strings per recipe.

    Returns
    -------
    list of list of str
        Cleaned ingredients in the same order and with the same nesting as the
        input. An ingredient made up only of removed words becomes ''.
    """
    # Whole words that carry no ingredient information
    units = {'g', 'lb', 's', 'n', 'oz'}
    descriptors = {'crushed', 'crumbles', 'ground', 'minced', 'powder', 'chopped', 'sliced'}
    dropped_words = units | descriptors

    digits = re.compile(r'\d+')
    # These characters usually separate two words, so they become a space
    separators = re.compile(r"[-&'%!()/,.]")
    # Any remaining non-word symbol is deleted outright
    other_symbols = re.compile(r'[^\w\s]')

    def clean(ingredient):
        text = ingredient.lower()
        text = digits.sub('', text)
        text = separators.sub(' ', text)
        text = other_symbols.sub('', text)
        # Filtering whole tokens (not substrings) keeps words such as
        # "powdered" intact; split()/join also collapses repeated whitespace
        # and trims both ends.
        return ' '.join(word for word in text.split() if word not in dropped_words)

    return [[clean(x) for x in y] for y in ingredients]

In [None]:
#CLeaning ingredients column for both train and test dataframe
train_df["ingredients"] = preprocess(train_df["ingredients"])
test_df["ingredients"] = preprocess(test_df["ingredients"])

In [None]:
# Download the NLTK corpora requested for the WordNetLemmatizer used in the next cell
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# One shared lemmatizer instead of constructing a new one for every ingredient
lemmatizer = WordNetLemmatizer()

def lemmatize_recipe(ingredient_list):
    """Join a recipe's ingredients into one space-separated string of lemmatized words.

    The lemmatizer is applied to each word on its own: it looks up single
    words, so passing a whole phrase such as "green onions" would generally
    leave the plural untouched.
    """
    # Already-joined text (cell executed a second time) is returned unchanged
    # rather than being split into single characters.
    if isinstance(ingredient_list, str):
        return ingredient_list
    return ' '.join(
        lemmatizer.lemmatize(word)
        for ingredient in ingredient_list
        for word in ingredient.split()
    )

train_df['ingredients'] = [lemmatize_recipe(lists) for lists in train_df['ingredients']]
test_df['ingredients'] = [lemmatize_recipe(lists) for lists in test_df['ingredients']]

In [None]:
# First rows of the train dataframe after cleaning and lemmatization
train_df.head()

In [None]:
# First rows of the test dataframe after cleaning and lemmatization
test_df.head()

In [None]:
# Model inputs (ingredient text) and labels (cuisine).
# NOTE: `train` and `test` are rebound by later cells (TF-IDF matrix, then dense array).
train = train_df['ingredients']
target = train_df['cuisine']
test = test_df['ingredients']

**TFIDF Vectorization**

In [None]:
# Word-level TF-IDF over the cleaned ingredient text; terms that occur in more
# than 57% of the recipes are ignored (max_df).
tfidf = TfidfVectorizer(stop_words='english', ngram_range = ( 1 , 1 ), analyzer="word", 
                        max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
# Read the text straight from the dataframes instead of from `train`/`test`:
# those names are overwritten below, so reading from them would make this cell
# fail when it is executed a second time.
train = tfidf.fit_transform(train_df['ingredients'])
# The vocabulary and IDF weights come from the training data only
test = tfidf.transform(test_df['ingredients'])

**Model Development**

Artificial Neural Network

In [None]:
#Loading libraries for ANN sequential model
from keras.models import Sequential
from keras.layers import Dense

In [None]:
#Building a model
ann = Sequential()
ann.add(Dense(32, activation='relu'))
ann.add(Dense(30, activation='relu'))
ann.add(Dense(23, activation='softmax'))

In [None]:
#Compiling the model
import tensorflow as tf
import keras
ann.compile(optimizer='rmsprop', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [None]:
#Converting string class labels to integers using label encoder to work with ann model
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_df["target_coded"] = le.fit_transform(train_df["cuisine"])
train_df.head()
target_ann=train_df["target_coded"]

In [None]:
#Converting train csr matrix of tfidf to array to work with ann model
train = train.toarray()

In [None]:
#Fitting the model on train data
ann.fit(train,target_ann,batch_size=32,epochs = 100)

In [None]:
#Predictions on test data
test=test.toarray()
predictions=ann.predict(test)

In [None]:
#Converting predictions to classes
classes = predictions.argmax(axis=-1)

In [None]:
#Converting encoded labels back to original target labels
pred_ann = le.inverse_transform(classes)

In [None]:
#Adding cuisine column to test dataframe with predicted target values
test_df["cuisine"] = pred_ann.tolist()

In [None]:
# Display the test dataframe, now including the predicted cuisine column
test_df

In [None]:
#Creating the final submission CSV file
submission = test_df.drop('ingredients', axis=1)
submission.to_csv('submission.csv', index=False)