In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile
import json 

# Unpack both competition archives into the current working directory.
for split in ('train', 'test'):
    archive_path = "../input/whats-cooking/{}.json.zip".format(split)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(".")

# Read the extracted files. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale (JSON text is UTF-8).
with open('./train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)

with open('./test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [None]:
# Wrap the parsed JSON payloads in DataFrames; the names `train` and `test`
# are rebound from the raw objects to the frames.
train, test = (pd.DataFrame(records) for records in (train, test))

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)

In [None]:
# Preview the first few rows of the training frame.
train.head()

In [None]:
# Print the column summary (dtypes, non-null counts, memory usage) of the training frame.
train.info()

In [None]:
# Number of training rows per value of the 'cuisine' column.
train['cuisine'].value_counts()

In [None]:
# clean data
# Collapse each ingredient list into one string: 'word1 , word2 , ...'
for frame in (train, test):
    frame['ingredients_clean_string'] = [
        ' , '.join(items).strip() for items in frame['ingredients']
    ]

In [None]:
import re
from nltk import WordNetLemmatizer
# further clean data and extract information through word lemmatization
def _lemmatize_ingredients(ingredient_lists, lemmatizer):
    """Turn each list of ingredient strings into one space-joined, cleaned string.

    For every ingredient, each character that is not an ASCII letter is replaced
    by a space and the result is passed to ``lemmatizer.lemmatize``. The cleaned
    ingredients of one row are then joined with single spaces and stripped.

    Parameters
    ----------
    ingredient_lists : iterable of iterables of str
        One collection of ingredient strings per row.
    lemmatizer : object exposing ``lemmatize(str) -> str``
        Shared across all calls so it is built only once.

    Returns
    -------
    list of str
        One cleaned string per input row, in input order.
    """
    # NOTE(review): lemmatize() receives the whole ingredient phrase, not the
    # individual words, so multi-word ingredients are probably returned
    # unchanged -- confirm whether per-word lemmatization was intended.
    return [
        ' '.join(
            lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', ingredient))
            for ingredient in ingredients
        ).strip()
        for ingredients in ingredient_lists
    ]


# Build the lemmatizer once instead of once per ingredient.
_lemmatizer = WordNetLemmatizer()

train['ingredients_string'] = _lemmatize_ingredients(train['ingredients'], _lemmatizer)
test['ingredients_string'] = _lemmatize_ingredients(test['ingredients'], _lemmatizer)

In [None]:
# The corpora are the lemmatized 'ingredients_string' columns of each frame.
train_corpus, test_corpus = train['ingredients_string'], test['ingredients_string']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration used to turn the raw documents into a feature matrix.
tfidf_options = dict(
    stop_words='english',
    ngram_range=(1, 1),
    analyzer="word",
    max_df=.57,
    binary=False,
    token_pattern=r'\w+',
    sublinear_tf=False,
)
train_vectorizer = TfidfVectorizer(**tfidf_options)

# NOTE(review): `test_vectorizer` is not referenced by any later cell visible
# here; the test corpus is transformed with `train_vectorizer`.
test_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
# Learn the TF-IDF vocabulary on the training corpus only, then apply that same
# fitted vectorizer to the test corpus so both matrices share identical columns.
#
# Both results are kept as SciPy sparse matrices. Calling `.todense()` would
# return a `numpy.matrix`, which newer scikit-learn releases reject as estimator
# input, and would allocate the full dense array for no benefit: the splitting
# and liblinear logistic-regression steps downstream accept sparse input.
train_tfidf = train_vectorizer.fit_transform(train_corpus)
test_tfidf = train_vectorizer.transform(test_corpus)

In [None]:
# Name the model inputs: TF-IDF matrices as predictors, 'cuisine' as target.
train_predictor, test_predictor = train_tfidf, test_tfidf
train_target = train['cuisine']

In [None]:
from sklearn.preprocessing import LabelEncoder

le_cuisine = LabelEncoder()

# Encode the cuisine names as integer class ids.
# The encoder is fitted on the original `train['cuisine']` column rather than on
# `train_target`: this cell rebinds `train_target` to the encoded array, so
# fitting on `train_target` would, on a second run of the cell, learn integer
# "classes" and make the later `le_cuisine.inverse_transform` return integers
# instead of cuisine names.
encoded_le_cuisine_new_train = le_cuisine.fit_transform(train['cuisine'])
train_target = encoded_le_cuisine_new_train

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation, stratified on the target
# and seeded so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_predictor,
    train_target,
    test_size=0.2,
    random_state=0,
    stratify=train_target,
)

for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, labels.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline: liblinear logistic regression with default regularisation,
# fitted on the training split (fit() returns the estimator itself).
classifier = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predict the validation split and print per-class precision/recall/F1.
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Disabled experiment: a GridSearchCV over C, wrapped in a bare string literal so
# that none of it is executed (the cell only evaluates to the string itself).
"""
from sklearn.model_selection import GridSearchCV

parameters = {
    "C": [1, 10] }

model = LogisticRegression()
classifier = GridSearchCV(model, parameters)

classifier.fit(X_train, y_train)
classifier.best_params_
"""

In [None]:
# Final model: same solver as the baseline but with C = 7.
# NOTE(review): presumably C was picked by tuning not shown here -- confirm.
best_classifier = LogisticRegression(C=7, solver='liblinear')
best_classifier.fit(X_train, y_train)

# Evaluate on the validation split.
y_pred = best_classifier.predict(X_val)
validation_report = classification_report(y_val, y_pred)
print(validation_report)

In [None]:
# Predict encoded cuisine labels for the test set; they are mapped back to
# cuisine names with le_cuisine.inverse_transform in the following cell.
predicted_cuisine = best_classifier.predict(test_predictor)

#predicted_cuisine

In [None]:
# Map the encoded class ids back to cuisine names.
# The encoded predictions are recomputed here instead of being read from
# `predicted_cuisine`: this cell rebinds that name to the decoded strings, so
# decoding it in place would fail if the cell were run a second time.
predicted_cuisine = le_cuisine.inverse_transform(best_classifier.predict(test_predictor))
predicted_cuisine

In [None]:
# Build the submission table (test ids plus predicted cuisine) and write it
# to CSV without the index column.
submission = test[['id']].assign(cuisine=predicted_cuisine)
submission.to_csv('submission.csv', index=False)