In [None]:
# Silence DeprecationWarning messages only; every other warning category is still shown.
import warnings

warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
from keras.preprocessing.text import text_to_word_sequence
import nltk
from nltk.stem import WordNetLemmatizer
import unidecode
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
tqdm.pandas()
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import json



# Dataset Overview

In [None]:
# Read the training and test recipes from their JSON files (train first, then test).
train, test = (
    pd.read_json(json_path)
    for json_path in (
        "../input/whats-cooking-kernels-only/train.json",
        "../input/whats-cooking-kernels-only/test.json",
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Show the (rows, columns) shape of the training frame, then its column index.
print(train.shape, train.columns, sep='\n')

In [None]:
# Show the (rows, columns) shape of the test frame, then its column index.
print(test.shape, test.columns, sep='\n')

In [None]:
# Number of missing values in each training column.
train.isna().sum()

No missing values

* We have 39774 unique recipes from different cuisines in the train set
* We have 9944 unique recipes from different cuisines in the test set
* Each recipe comes with a list of its ingredients

# **Exploratory Data Analysis**

# Target Feature- cuisine

Our target feature is the cuisine

In [None]:
# Bar chart of the number of recipes per cuisine, ordered from most to least frequent.
plt.figure(figsize=(16, 5))
plt.xticks(rotation=60)
cuisine_order = train['cuisine'].value_counts().index
ax = sns.countplot(data=train, x='cuisine', order=cuisine_order)



From the countplot we can observe that most recipes are from italian cuisine, followed by mexican and southern_us.

# Great ingredients make great food

In [None]:
# Length of every recipe's ingredient list, computed once and reused for both extremes.
recipe_sizes = train['ingredients'].str.len()
print('Maximum Number of Ingredients in a recipe: ', recipe_sizes.max())
print('Minimum Number of Ingredients in a recipe: ', recipe_sizes.min())

There seems to be at least one recipe that contains only one ingredient.

**Let's create a feature that stores the number of ingredients**

In [None]:
# New column: how many ingredients each training recipe lists.
ingredient_counts = train['ingredients'].str.len()
train['ing_count'] = ingredient_counts

In [None]:
# Smoothed (kernel density) view of how many ingredients recipes contain.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill`;
# confirm the installed seaborn version before switching the keyword.
ing_count_values = train["ing_count"]
plt.figure(figsize=(10, 5))
sns.kdeplot(data=ing_count_values, shade=True)
plt.title('kdeplot of ingredient count', fontweight="bold")

The distribution of the ingredient count is right-skewed.

In [None]:
# Bar chart: number of training recipes for every distinct ingredient count.
plt.figure(figsize=(16,6))
sns.countplot(x='ing_count', data= train)

We can see recipes with only 1 or 2 ingredients, and there are also recipes with more than 30 ingredients. These are outliers and may adversely affect our model. However, they are genuine recipes: some dishes really do have this number of ingredients.

**Let's look at the recipes with unbelievable ingredient counts**

In [None]:
# Recipes that list 40 or more ingredients.
train.loc[train['ing_count'].ge(40)]

The 65 ingredient recipe is from italian cuisine.

In [None]:
# Recipes that list at most one ingredient.
train.loc[train['ing_count'].le(1)]

We can see a japanese recipe with just water as its only ingredient. What can that be?

**Most common ingredients**

In [None]:
# Flatten the per-recipe ingredient lists into one long list (duplicates kept, recipe order kept).
ingredients_list = []
for recipe_ingredients in train['ingredients']:
    ingredients_list.extend(recipe_ingredients)

In [None]:
from collections import Counter

# Count every ingredient string exactly as it appears in the recipes.
# The previous ','.join(...).split(',') round-trip split any ingredient whose own
# name contains a comma into separate, bogus entries and skewed the counts.
ingredients_count = pd.Series(Counter(ingredients_list)).sort_values(ascending=False)
top20ingredients = ingredients_count.head(20)

# Bar chart of the 20 most frequent ingredients (x: ingredient name, y: occurrences).
plt.figure(figsize=(15,5))
sns.barplot(x= top20ingredients.index, y=top20ingredients)
plt.xticks(rotation=60)
plt.title('20 common ingredients', fontsize=15, fontweight='bold')
plt.xlabel('ingredients')
plt.show()

It is not surprising to find salt as the most common ingredient. We can also find olive oil, onions and water among the common ingredients.

# Preprocessing

**Remove outliers**

In [None]:
# Keep only recipes with more than 1 and fewer than 60 ingredients.
# `.copy()` detaches the filtered frame from the original so that adding columns
# to `train` later does not raise pandas' SettingWithCopyWarning.
ing_count_in_range = (train['ing_count'] > 1) & (train['ing_count'] < 60)
train = train[ing_count_in_range].copy()

**Basic cleaning**

In [None]:
lemmatizer = WordNetLemmatizer()

# Measurement units (plus the filler word "to") that are dropped from the text.
_UNIT_WORDS = frozenset(['oz', 'ounc', 'ounce', 'pound', 'lb', 'inch', 'inches', 'kg', 'to'])
_NON_ASCII_CHAR = re.compile(r'[^\x00-\x7f]')
_PUNCTUATION = re.compile(r'[,.!?:()"]')
_NON_LETTER = re.compile(r'[^a-z]')


def _ascii_fold(match):
    """Transliterate one non-ASCII letter to ASCII; map any other non-ASCII character to a space."""
    char = match.group()
    return unidecode.unidecode(char) if char.isalpha() else ' '


def preprocess(ingredients):
    """Turn one recipe's ingredient list into a single cleaned, space-separated string.

    Steps, in order:
      1. join the ingredient strings with spaces;
      2. transliterate accented / non-ASCII letters to ASCII;
      3. lowercase;
      4. delete the punctuation marks , . ! ? : ( ) ";
      5. replace every remaining non-letter character (digits, hyphens, symbols) with a space;
      6. drop tokens of one or two characters and measurement-unit tokens;
      7. lemmatize the remaining tokens and drop any whose lemma is a unit.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of a single recipe.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    text = ' '.join(ingredients)
    # Transliterate BEFORE stripping non-letters. Previously unidecode ran after the
    # non-letter regex had already replaced accented characters with spaces, so it
    # never had anything to convert and words like "creme" were cut into fragments.
    text = _NON_ASCII_CHAR.sub(_ascii_fold, text)
    text = text.lower()  # also lowercases any capitals produced by transliteration
    text = _PUNCTUATION.sub('', text)
    text = _NON_LETTER.sub(' ', text)

    words = []
    for token in text.split():
        if len(token) <= 2 or token in _UNIT_WORDS:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check the lemma as well so plural units (e.g. "ounces" -> "ounce") are removed too.
        if lemma in _UNIT_WORDS:
            continue
        words.append(lemma)
    return ' '.join(words)


In [None]:
# Add the cleaned-text column 'x' to the training frame first, then to the test frame.
for recipes in (train, test):
    recipes['x'] = recipes['ingredients'].progress_apply(preprocess)
train.head()

# Final Model

Before predictive modelling, we need to convert the words to numeric values. We can use TfidfVectorizer for this.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True) 
# sublinear_tf=True replaces the raw term frequency tf with 1 + log(tf): 20 occurrences
# of a word are probably not 20 times more important than a single occurrence.

In [None]:
# Fit the vectorizer on the training text only, then reuse the fitted vectorizer for the test text.
train_texts, test_texts = train['x'].values, test['x'].values
X_train = vectorizer.fit_transform(train_texts)
X_train.sort_indices()
X_test = vectorizer.transform(test_texts)

In [None]:
# Encode cuisine names as integer class ids; the fitted encoder is kept to decode predictions later.
cuisine_names = train['cuisine'].values
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(cuisine_names)

In [None]:
# RBF-kernel support vector classifier with every constructor argument spelled out.
# decision_function_shape is deliberately not passed: one-vs-rest is applied
# explicitly by wrapping this estimator in OneVsRestClassifier.
svc_params = dict(
    C=100,               # penalty parameter
    kernel='rbf',        # kernel type
    degree=3,            # polynomial degree; has no effect with the rbf kernel
    gamma=1,             # kernel coefficient
    coef0=1,             # only significant for 'poly' and 'sigmoid' kernels
    shrinking=True,      # use the shrinking heuristic
    tol=0.001,           # stopping criterion tolerance
    probability=False,   # probability estimates are not needed
    cache_size=200,      # kernel cache size in MB
    class_weight=None,   # all classes are weighted equally
    verbose=False,       # do not print solver logs
    max_iter=-1,         # no iteration limit
    random_state=None,
)
classifier = SVC(**svc_params)

In [None]:
# Wrap the SVC in a one-vs-rest scheme (n_jobs=4 parallel jobs) and fit it on the training data.
model = OneVsRestClassifier(estimator=classifier, n_jobs=4)
model.fit(X_train, Y_train)

OneVsRest is a heuristic method for using binary classification algorithms for multi-class classification.

In [None]:
# Predict encoded class ids for the test recipes, then decode them to cuisine names.
# Note: despite its name, Y_test holds model predictions, not ground-truth labels.
Y_test = model.predict(X_test)
Y_pred = label_encoder.inverse_transform(Y_test)

In [None]:
# Build the two-column submission table (recipe id, predicted cuisine) and write it as CSV without the index.
test_id = test['id']
submission_data = {'id': test_id, 'cuisine': Y_pred}
submission = pd.DataFrame(data=submission_data, columns=['id', 'cuisine'])
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
submission.head()