# Импорт библиотек

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# Hook tqdm into pandas; `progress_apply` is called on Series further down.
tqdm.pandas()
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import json

# Загрузка данных

In [None]:
# Read the training and the test recipes from the competition input folder.
train, test = (
    pd.read_json(path)
    for path in (
        '../input/whats-cooking-kernels-only/train.json',
        '../input/whats-cooking-kernels-only/test.json',
    )
)

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [None]:
# Show the dimensions of the training frame, then its column labels.
print(train.shape, train.columns, sep='\n')

In [None]:
# Show the dimensions of the test frame, then its column labels.
print(test.shape, test.columns, sep='\n')

In [None]:
# Count the missing values in every column of the training frame.
train.isna().sum()

# Анализ данных

In [None]:
# Bar chart of how many training recipes belong to each cuisine; the bars
# follow the order returned by value_counts() on the cuisine column.
plt.figure(figsize=(15, 5))
plt.xticks(rotation=60)
ax = sns.countplot(x='cuisine', data=train, order=train['cuisine'].value_counts().index)
# countplot tallies rows (recipes) per cuisine, so the title names recipes
# rather than ingredients.
plt.title('Кол-во рецептов в той или иной кухне', fontweight="bold")

In [None]:
# Store the length of each recipe's ingredient list as a new column.
ingredient_lists = train['ingredients']
train['ing_count'] = ingredient_lists.str.len()

In [None]:
# Density estimate of the number of ingredients per recipe.
plt.figure(figsize=(15, 5))
# `fill` is the current name of the deprecated `shade` argument; it needs
# seaborn 0.11 or newer.
sns.kdeplot(data=train["ing_count"], fill=True)
plt.title('Кол-во ингредиентов на рецепт', fontweight="bold")

# Предварительная обработка данных

Удаляем из списка ингредиентов лишние символы и приводим слова к нормальной форме (лемматизация), чтобы убрать повторения основ

In [None]:
lemmatizer = WordNetLemmatizer()

# Punctuation that is deleted outright, i.e. without leaving a space behind.
_PUNCTUATION_RE = re.compile(r'[,.!?:()"]')
# Every remaining character that is not a lowercase ASCII letter (digits,
# hyphens, accented letters, ...) is turned into a word separator.
_NON_LETTER_RE = re.compile(r'[^a-z]')


def preprocess(ingredients):
    """Normalise a recipe's ingredient list into one space-separated string.

    The ingredient names are joined with spaces and lowercased, the
    punctuation characters ``, . ! ? : ( ) "`` are removed, every other
    character that is not an ASCII letter is replaced by a space, and each
    resulting word is passed through the module-level ``lemmatizer``.

    Args:
        ingredients: Iterable of ingredient name strings of one recipe.

    Returns:
        The lemmatised words joined by single spaces; an empty string when
        no letters are left.
    """
    text = ' '.join(ingredients).lower()
    text = _PUNCTUATION_RE.sub('', text)
    # Hyphens are covered here as well, so no separate replace is needed.
    text = _NON_LETTER_RE.sub(' ', text)
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return ' '.join(lemma for lemma in lemmas if lemma)

In [None]:
# Add the cleaned ingredient text as column 'x' to the training frame first
# and to the test frame second, then preview the training frame.
for frame in (train, test):
    frame['x'] = frame['ingredients'].progress_apply(preprocess)
train.head()

# Финальная модель

Преобразуем списки ингредиентов в числовые значения

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser for the cleaned ingredient strings, with sublinear
# term-frequency scaling switched on.
vectorizer = TfidfVectorizer(sublinear_tf=True) 

In [None]:
# The vectoriser is fitted on the training texts only; the test texts are
# then transformed with that already fitted vectoriser.
train_texts = train['x'].values
test_texts = test['x'].values
X_train = vectorizer.fit_transform(train_texts)
X_train.sort_indices()
X_test = vectorizer.transform(test_texts)

In [None]:
# Encode the cuisine names of the training recipes as integer class labels.
cuisine_names = train['cuisine'].values
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(cuisine_names)

Создаем модель и обучаем её

In [None]:
# Settings of the multi-layer perceptron, kept in one place so that each
# hyper-parameter sits on its own line.
mlp_settings = dict(
    solver='adam',
    activation='relu',
    early_stopping=True,
    random_state=3,
    max_iter=100,
    verbose=True,
    alpha=1e-08,
    beta_1=0.05,
    beta_2=0.4,
    epsilon=1e-09,
)
model = MLPClassifier(**mlp_settings)
# Train on the TF-IDF features and the encoded cuisine labels.
model.fit(X_train, Y_train)

In [None]:
# Predict encoded class labels for the test recipes; despite its name,
# Y_test holds model predictions, not ground-truth labels.
Y_test = model.predict(X_test)
# Map the encoded predictions back to the original cuisine names.
Y_pred = label_encoder.inverse_transform(Y_test)

In [None]:
# Pair every test recipe id with its predicted cuisine and write the result
# to the submission file without the index column.
test_id = test['id']
submission = pd.DataFrame(
    data={'id': test_id, 'cuisine': Y_pred},
    columns=['id', 'cuisine'],
)
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Preview the first five rows of the submission.
submission.head(n=5)