In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import json
import re
import nltk
import zipfile

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Зчитуємо дані

In [None]:
# Unpack both competition archives into the working directory.
for t in ['train','test']:
    with zipfile.ZipFile("../input/whats-cooking/{}.json.zip".format(t),"r") as z:
        z.extractall(".")

# Read the extracted JSON explicitly as UTF-8 so decoding does not depend on
# the platform's default locale encoding.
with open('./train.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

with open('./test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

# Приклад train-датасету

In [None]:
# Turn the parsed JSON records into DataFrames.
df = pd.DataFrame(data)
test_df = pd.DataFrame(test)

# Keep the test ids aside: preprocessing later drops the `id` column,
# and the ids are needed again to build the submission file.
test_ids = test_df['id']

# Preview the first rows of the training frame.
df.head()

# EDA

Відсутні значення у відсотковому співвідношенні

In [None]:
# Percentage of missing values per column of the training frame.
(df.isnull().sum() / len(df))*100 # no missing data in train

In [None]:
# Percentage of missing values per column of the test frame.
(test_df.isnull().sum() / len(test_df))*100 # no missing data in test

# Страв якої країни найбільше

In [None]:
# Horizontal bar chart of recipes per cuisine, each bar annotated with the
# cuisine's share of the training set.
fig, ax = plt.subplots(figsize=(10,10))
cuisine_counts = df["cuisine"].value_counts()
per_vals = round(cuisine_counts / cuisine_counts.sum() * 100, 2)
cuisine_counts.plot.barh(ax = ax)

# The x axis is in recipe counts, so each label is anchored just past the end
# of its bar (count + small offset) rather than at the percentage value.
label_offset = cuisine_counts.max() * 0.01
for i, (count, pct) in enumerate(zip(cuisine_counts, per_vals)):
    ax.text(count + label_offset, i, str(pct)+"%", color='blue', fontweight='bold', va='center')

# Leave room on the right so the longest bar's label is not clipped.
ax.set_xlim(right=cuisine_counts.max() * 1.12)
ax.set(xlabel="Number of recipes", ylabel="Cuisine", title="Recipes per cuisine")
plt.show()

# Які інгредієнти найчастіше застосовуються

In [None]:
fig, ax = plt.subplots(figsize=(22,7))

# Flatten every recipe's ingredient list into one long Series of ingredient names.
extensive_ing_list = pd.Series(
    [ingredient for recipe in df['ingredients'] for ingredient in recipe]
)

# Bar chart of the 30 most frequent ingredients across all recipes.
ingredient_counts = extensive_ing_list.value_counts().sort_values(ascending=False)
ingredient_counts.head(30).plot.bar(ax = ax)

# Перелік усіх cuisine

In [None]:
# Distinct cuisine labels, in order of first appearance in the training frame.
cuisine = df["cuisine"].unique()

# Map each cuisine to the flat list of every ingredient used in its recipes
# (repeats are kept so frequencies can be counted later).
all_cus = {
    cs: [
        ing
        for ing_list in df[df['cuisine']==cs]['ingredients']
        for ing in ing_list
    ]
    for cs in cuisine
}

all_cus.keys()

# 25 найчастіше застосовуваних інгредієнтів для кухні кожної країни

In [None]:
# One bar chart per cuisine showing its 25 most frequent ingredients.
for key, cuisine_ingredients in all_cus.items():
    fig, ax = plt.subplots(figsize=(25,2))
    top_25 = pd.Series(cuisine_ingredients).value_counts().head(25)
    top_25.plot.bar(ax=ax, title=key)
    plt.show()

# Попередня обробка даних

String preprocessing

In [None]:
def preprocess_df(df):
    """Return a cleaned copy of a recipe frame ready for text vectorisation.

    The `id` column is dropped and every recipe's list of ingredient names is
    collapsed into a single lower-cased, space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an `id` column and an `ingredients` column whose cells are
        iterables of ingredient strings.

    Returns
    -------
    pandas.DataFrame
        New frame without `id`; `ingredients` holds one string per recipe.
        The frame passed in is not modified (`drop` returns a copy).

    Raises
    ------
    KeyError
        If `df` has no `id` column (e.g. when called twice on the same frame).
    """
    # Build the lemmatizer and compile the patterns once per call instead of
    # once per word / per ingredient; the produced strings are unchanged.
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): the `.` after `oz` matches any character, and the
    # descriptor words are removed as substrings before lower-casing, so
    # capitalised variants are kept — confirm that this is intended.
    descriptor_re = re.compile(r'\(.*oz.\)|crushed|crumbles|ground|minced|powder|chopped|sliced')
    non_letter_re = re.compile("[^a-zA-Z]")

    def process_string(x):
        """Clean one recipe's ingredient list and join it into one string."""
        cleaned = []
        for ingredient in x:
            # Lemmatize word by word, then strip descriptors and non-letters.
            ingredient = " ".join(lemmatizer.lemmatize(word) for word in ingredient.split())
            ingredient = descriptor_re.sub('', ingredient)
            ingredient = non_letter_re.sub(" ", ingredient)
            cleaned.append(ingredient)
        return " ".join(cleaned).lower()

    df = df.drop('id',axis=1)
    df['ingredients'] = df['ingredients'].apply(process_string)

    return df

In [None]:
def get_cuisine_cumulated_ingredients(df, cuisines=None):
    """Build one "document" per cuisine by concatenating its recipes' ingredients.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `cuisine` column and an `ingredients` column of strings.
    cuisines : iterable, optional
        Cuisine labels to aggregate, in output order. Defaults to the distinct
        labels found in `df` (order of first appearance), so the function no
        longer relies on a notebook-level `cuisine` variable being defined.

    Returns
    -------
    pandas.DataFrame
        Columns `cuisine` and `ingredients`, one row per cuisine. Each
        ingredient string is followed by a single space, so every non-empty
        document ends with a trailing space.
    """
    if cuisines is None:
        cuisines = df['cuisine'].unique()

    # Collect the rows first and build the frame once instead of growing it
    # row by row with `.loc`.
    records = []
    for cus in cuisines:
        recipes = df.loc[df['cuisine'] == cus, 'ingredients']
        records.append({
            'cuisine': cus,
            'ingredients': "".join(recipe + " " for recipe in recipes),
        })

    return pd.DataFrame(records, columns=['cuisine', 'ingredients'])

In [None]:
# Clean both frames: drops `id` and turns each ingredient list into one string.
# NOTE: `df` / `test_df` are rebound to the processed frames, so re-running this
# cell without re-creating them fails (`preprocess_df` drops `id`, which is then gone).
df = preprocess_df(df)
test_df = preprocess_df(test_df)

# One aggregated ingredient "document" per cuisine, used for the clustering below.
cuisine_df = get_cuisine_cumulated_ingredients(df)

In [None]:
# Preview the training frame after preprocessing.
df.head()

In [None]:
# Raw text features and labels for modelling.
train = df['ingredients']
target = df['cuisine']
# NOTE: `test` previously held the parsed test JSON; from here on it is the
# Series of processed test ingredient strings.
test = test_df['ingredients']

# Count Vectorizer

In [None]:
def count_vectorizer(train, test=None):
    """Bag-of-words encode `train` (and optionally `test`) with a CountVectorizer.

    The vectorizer is fitted on `train` only; `test`, when given, is
    transformed with the vocabulary learned from `train`.

    Returns
    -------
    tuple
        `(train_matrix, test_matrix, vectorizer)` when `test` is provided,
        otherwise `(train_matrix, vectorizer)`.
    """
    cv = CountVectorizer()
    train_counts = cv.fit_transform(train)

    if test is None:
        return train_counts, cv

    test_counts = cv.transform(test)
    return train_counts, test_counts, cv

# TF-IDF Vectorizer

In [None]:
def tfidf_vectorizer(train, test=None):
    """TF-IDF encode `train` (and optionally `test`).

    The vectorizer uses word unigrams, English stop words, the token pattern
    `\\w+`, and `max_df=.57`. It is fitted on `train` only; `test`, when
    given, is transformed with the fitted vectorizer.

    Returns
    -------
    tuple
        `(train_matrix, test_matrix, vectorizer)` when `test` is provided,
        otherwise `(train_matrix, vectorizer)`.
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 1),
        analyzer="word",
        max_df=.57,
        binary=False,
        token_pattern=r'\w+',
        sublinear_tf=False,
    )
    train_matrix = tfidf.fit_transform(train)

    if test is None:
        return train_matrix, tfidf

    test_matrix = tfidf.transform(test)
    return train_matrix, test_matrix, tfidf

In [None]:
# TF-IDF features for the recipes: fitted on train, test transformed with the same vectorizer.
train_tfidf, test_tfidf, tfidf = tfidf_vectorizer(train,test)
# A separate vectorizer fitted on the per-cuisine documents (one row per cuisine), used for clustering.
cuisine_data_tfidf, cuisine_tfidf = tfidf_vectorizer(cuisine_df['ingredients'])

# Кластеризація

Серед 20 різних типів кухні спробуємо знайти щось спільне між деякими кухнями

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import KernelPCA,PCA,TruncatedSVD

def get_kmeans_wcss(data, n_limit=15):
    """Fit KMeans for 1 .. n_limit-1 clusters and draw the elbow curve.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_limit : int
        Exclusive upper bound on the number of clusters tried.

    Returns
    -------
    list
        Within-cluster sum of squares (`inertia_`) for each cluster count.
    """
    cluster_counts = range(1, n_limit)

    # Within-cluster sum of squares (WCSS) for every candidate cluster count.
    wcss = [
        KMeans(init='k-means++', n_clusters=k, n_init=10).fit(data).inertia_
        for k in cluster_counts
    ]

    plt.title("Elbow Method")
    plt.plot(cluster_counts, wcss)
    plt.xlabel("Number of clusters")
    plt.ylabel("WCSS")
    return wcss
    
    
def kmeans(data, n):
    """Cluster `data` into `n` groups with KMeans (k-means++ init, 10 restarts).

    Returns
    -------
    tuple
        `(labels, model)`: the predicted cluster index for every row of `data`
        and the fitted KMeans estimator.
    """
    model = KMeans(init='k-means++', n_clusters=n, n_init=10)
    model.fit(data)
    labels = model.predict(data)
    return labels, model


def get_PCA(data, n_components=2):
    """Project `data` onto its first `n_components` principal components.

    Prints the explained-variance ratio of each kept component.

    Returns
    -------
    tuple
        `(reduced_data, fitted_pca, explained_variance_ratio)`.
    """
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)

    variance_ratio = pca.explained_variance_ratio_
    print(variance_ratio)

    return reduced_data, pca, variance_ratio

def get_kernel_PCA(data, n_components=2, kernel='rbf'):
    """Reduce `data` with Kernel PCA and report the variance share per component.

    Parameters
    ----------
    data : array-like
        Samples to project.
    n_components : int
        Number of components to keep (previously ignored and fixed at 2).
    kernel : str
        Kernel name passed to `KernelPCA`.

    Returns
    -------
    tuple
        `(reduced_data, fitted_kpca, explained_variance)`.

    Notes
    -----
    `KernelPCA` does not expose `explained_variance_ratio_`, so the ratio is
    computed from the variance of the projected data. It is therefore relative
    to the *retained* components only (the values sum to 1), not to the total
    variance in the kernel feature space.
    """
    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    reduced_data = kpca.fit_transform(data)

    component_variance = np.var(reduced_data, axis=0)
    total_variance = component_variance.sum()
    if total_variance > 0:
        explained_variance = component_variance / total_variance
    else:
        # Degenerate projection (all components constant): avoid dividing by zero.
        explained_variance = np.zeros_like(component_variance)

    print(explained_variance)
    return reduced_data, kpca, explained_variance

def get_TSVD(data, n_components=2, n_ittr=5, algorithm='randomized'):
    """Reduce `data` with truncated SVD.

    `n_ittr` is forwarded to `TruncatedSVD` as `n_iter`. Prints the
    explained-variance ratio of each kept component.

    Returns
    -------
    tuple
        `(reduced_data, fitted_tsvd, explained_variance_ratio)`.
    """
    tsvd = TruncatedSVD(
        n_components=n_components,
        n_iter=n_ittr,
        algorithm=algorithm,
    )
    reduced_data = tsvd.fit_transform(data)

    variance_ratio = tsvd.explained_variance_ratio_
    print(variance_ratio)

    return reduced_data, tsvd, variance_ratio



def create_pca_graph(cluster_pca, red_pca, n_clus):
    """Scatter-plot 2-D reduced points coloured by cluster label.

    Parameters
    ----------
    cluster_pca : array-like of int
        Cluster index for every point.
    red_pca : array-like with at least two columns
        Reduced coordinates; column 0 is drawn on x and column 1 on y.
    n_clus : int
        Number of clusters to draw (labels `0 .. n_clus-1`).

    Notes
    -----
    Eight base colours are available; with more than eight clusters the
    palette wraps around instead of raising `IndexError`.
    """
    labels = np.asarray(cluster_pca)
    points = np.asarray(red_pca)

    colours = ['red','blue','green','orange','purple','cyan','black','magenta']

    for i in range(n_clus):
        # Boolean mask selects the points assigned to cluster i.
        members = points[labels == i]
        plt.scatter(members[:, 0], members[:, 1], s=30,
                    c=colours[i % len(colours)], label='Cluster {}'.format(i))

    plt.title("Clusters of PCA")
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.legend()
    plt.show()

In [None]:
# Reduce the per-cuisine TF-IDF matrix to 2 principal components
# (converted to a dense array before being handed to PCA).
red_cuisine_pca, cus_pca, var_cus_pca = get_PCA((cuisine_data_tfidf).toarray(),2)

In [None]:
# Elbow curve for 1..19 clusters on the 2-D cuisine projection.
wcss_pca = get_kmeans_wcss(red_cuisine_pca,20)

Отже, оптимальна кількість кластерів — 3

In [None]:
# Cluster the cuisines into 3 groups and show the cluster index assigned to each.
cluster_cus_pca, km_cus_pca = kmeans(red_cuisine_pca,3)
cluster_cus_pca

In [None]:
# Plot the 2-D cuisine projection coloured by the 3 cluster labels.
create_pca_graph(cluster_cus_pca, red_cuisine_pca, 3)

# Моделювання

In [None]:
from scipy import sparse
# Use the TF-IDF features for better results.
# NOTE: `train` / `test` are rebound from the raw text Series to the TF-IDF matrices.
train = train_tfidf
test = test_tfidf

# Linear SVC

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import f1_score

# Hyperparameter grid for LinearSVC, restricted to combinations liblinear supports:
#   * the loss name is 'squared_hinge' (with an underscore);
#   * penalty='l1' only works with loss='squared_hinge' and dual=False;
#   * penalty='l2' works with both losses.
# Unsupported combinations would otherwise fail during fitting and only waste search time.
c_values = [0.001, 0.1, 1, 10, 50, 100, 500, 1000, 5000]

param_grid = [
    {'C': c_values,
     'penalty': ['l2'],
     'loss': ['hinge', 'squared_hinge']},
    {'C': c_values,
     'penalty': ['l1'],
     'loss': ['squared_hinge'],
     'dual': [False]},
]

# Search scored by micro-averaged F1; refit=True retrains the best
# configuration on the full data once the search finishes.
grid = GridSearchCV(LinearSVC(), param_grid, refit = True, verbose = 3, n_jobs=-1, scoring='f1_micro')

In [None]:
# Run the grid search on the training features and cuisine labels.
grid.fit(train, target)

In [None]:
# Best parameter combination found by the grid search.
grid.best_params_

In [None]:
# Cross-validated score of the best combination (scoring is 'f1_micro').
grid.best_score_

In [None]:
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC, SVC

def evalfn(C, gamma):
    """Objective for the hyperparameter search over an RBF-kernel SVC.

    Trains a class-balanced `SVC` with the given `C` and `gamma` using 5-fold
    cross-validation on the notebook-level `train` / `target`.

    Parameters
    ----------
    C : float
        Regularisation strength passed to `SVC`.
    gamma : float
        RBF kernel coefficient passed to `SVC`.

    Returns
    -------
    float
        Mean micro-averaged F1 over the 5 folds. The mean is used rather than
        the best single fold, so the optimiser is not rewarded for one lucky
        split.
    """
    # Imported locally: the notebook only imports cross_val_score in a later
    # cell, so the function would otherwise depend on cell execution order.
    from sklearn.model_selection import cross_val_score

    s = SVC(C=float(C), gamma=float(gamma), kernel='rbf', class_weight='balanced')
    f = cross_val_score(s, train, target, cv=5, scoring='f1_micro')
    return f.mean()

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
# Set up a Bayesian search with `evalfn` as the objective; the dict presumably
# gives the (lower, upper) search bounds for each of its arguments.
# NOTE(review): no `new_opt.maximize(...)` call appears in the cells shown, so
# the search is only configured here, not run — confirm.
new_opt = BayesianOptimization(evalfn, {'C': (0.1, 1000),  
              'gamma': (0.0001, 1)  })

In [None]:
# Hard-coded hyperparameters for the final model.
# NOTE(review): presumably the optimum found by an earlier Bayesian search run — confirm the source.
C = 604.5300203551828
gamma = 0.9656489284085462

# NOTE(review): `evalfn` evaluates SVC with class_weight='balanced', but this
# final model does not set it — confirm which configuration is intended.
clf = SVC(C=float(C), gamma=float(gamma), kernel='rbf')

In [None]:
# Train the final RBF SVC on the full training features.
clf.fit(train, target)

In [None]:
# Predict a cuisine label for every test recipe.
y_pred = clf.predict(test)

In [None]:
# Assemble the submission: one row per test recipe with its predicted cuisine.
my_submission = pd.DataFrame({'id':test_ids})
my_submission['cuisine'] = y_pred

# Fill the `{}` placeholder with a timestamp so each run writes its own file
# instead of a file literally named "submission_{}.csv".
submission_path = 'submission_{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
my_submission.to_csv(submission_path, index=False)
print('Saved file to disk as {}.'.format(submission_path))