In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test sets into DataFrames, then preview the training rows
data = pd.read_json("../input/train.json")
test = pd.read_json("../input/test.json")
data.head()

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn from here on
plt.style.use('ggplot')

In [None]:
# Bar chart of how many training recipes belong to each cuisine
data['cuisine'].value_counts().plot.bar()

In [None]:
from collections import Counter

In [None]:
# Build one Counter of ingredient occurrences per cuisine.
# A single pass over the recipes replaces re-filtering the whole frame with a
# boolean mask once per cuisine. Keys are inserted in order of first
# appearance, which is the same order data['cuisine'].unique() yields.
counters = {}
for cuisine, ingredients in zip(data['cuisine'], data['ingredients']):
    if cuisine not in counters:
        counters[cuisine] = Counter()
    # Counter.update iterates `ingredients` and adds one count per item
    counters[cuisine].update(ingredients)

In [None]:
# The five most frequent ingredients counted for the 'italian' cuisine, as (ingredient, count) pairs
counters['italian'].most_common(5)

In [None]:
# Table with one row per cuisine and its ten most frequent ingredients,
# in columns top1 (most frequent) through top10.
rank_columns = ['top{}'.format(rank) for rank in range(1, 11)]
top_rows = []
for counter in counters.values():
    # most_common yields (ingredient, count) pairs; keep only the ingredient name
    top_rows.append([name for name, _count in counter.most_common(10)])
top10 = pd.DataFrame(top_rows, index=list(counters), columns=rank_columns)
top10

In [None]:
# Collapse each recipe's ingredient list into a single ';'-separated string
data['all_ingredients'] = data['ingredients'].str.join(";")

In [None]:
# Preview the first rows, now including the all_ingredients column
data.head()

In [None]:
# Boolean Series: True where the joined ingredient string contains the text 'garlic cloves'
data['all_ingredients'].str.contains('garlic cloves')

In [None]:
# Count, per cuisine, the recipes whose ingredient string mentions 'garlic cloves'
indices = data['all_ingredients'].str.contains('garlic cloves')
garlic_cuisines = data[indices]['cuisine']
garlic_cuisines.value_counts().plot.bar(title='garlic cloves as found per cuisine')

In [None]:
# Share of each cuisine's recipes matched by `indices`, plotted in ascending order
matched_per_cuisine = data[indices]['cuisine'].value_counts()
recipes_per_cuisine = data['cuisine'].value_counts()
relative_freq = (matched_per_cuisine / recipes_per_cuisine).sort_values()
relative_freq.plot.bar()

In [None]:
import numpy as np
# Sorted, de-duplicated ingredient names drawn from every cell of the top10 table
# (np.unique flattens its input when no axis is given)
unique = np.unique(top10.values)
unique

In [None]:
# One small bar chart per top ingredient: the share of each cuisine's recipes
# whose joined ingredient string contains that ingredient's name.
# NOTE: this is a substring match on the ';'-joined string, so a short name
# also matches longer ingredient names that contain it.
cuisine_totals = data['cuisine'].value_counts()  # loop-invariant, computed once
fig, axes = plt.subplots(8, 8, figsize=(20, 20))
flat_axes = axes.ravel()
# zip stops at the shorter input, so at most 64 ingredients are drawn
for ax, ingredient in zip(flat_axes, unique):
    # regex=False: ingredient names are literal text, not regular expressions
    indices = data['all_ingredients'].str.contains(ingredient, regex=False)
    relative_freq = data[indices]['cuisine'].value_counts() / cuisine_totals
    relative_freq.plot(kind='bar', ax=ax, fontsize=7, title=ingredient)
# Hide any panels left over when there are fewer ingredients than grid cells
for ax in flat_axes[len(unique):]:
    ax.set_visible(False)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Token-count vectorizer with scikit-learn's default settings
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the joined ingredient strings and build the count matrix
ingredient_strings = data['all_ingredients'].values
X = cv.fit_transform(ingredient_strings)

In [None]:
# Dimensions of the count matrix: one row per recipe, one column per vocabulary term
X.shape

In [None]:
# Show the first 100 terms of the learned vocabulary (iterating a dict yields its keys)
vocabulary_terms = list(cv.vocabulary_)
print(vocabulary_terms[:100])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map cuisine names to integer class labels; `enc` keeps the mapping for later decoding
enc = LabelEncoder()
enc.fit(data['cuisine'])
y = enc.transform(data['cuisine'])

In [None]:
# First 100 encoded cuisine labels
y[:100]

In [None]:
# Cuisine names known to the encoder; the name at position i corresponds to encoded label i
enc.classes_

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the recipes for evaluation. A fixed random_state makes the
# split, and every score or plot derived from it, identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier (library defaults) on the training split
logistic = LogisticRegression()
logistic.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the fitted classifier on the held-out split
logistic.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Row-normalised confusion matrix for the held-out split.
# confusion_matrix orders rows/columns by encoded label, i.e. by enc.classes_,
# so the tick labels must come from the encoder. The frequency-sorted
# value_counts() index would attach the wrong cuisine names to the axes.
cuisines = enc.classes_
label_ids = np.arange(len(cuisines))

plt.figure(figsize=(10, 10))
# labels= pins the matrix to one row/column per cuisine even if a cuisine
# happens to be missing from this split
cm = confusion_matrix(y_test, logistic.predict(X_test), labels=label_ids)
row_totals = cm.sum(axis=1)[:, np.newaxis]
# np.maximum(..., 1) avoids dividing by zero for a cuisine with no true samples
cm_normalized = cm.astype('float') / np.maximum(row_totals, 1)
plt.imshow(cm_normalized, interpolation='nearest')
plt.title("confusion matrix")
plt.colorbar(shrink=0.3)
tick_marks = label_ids
plt.xticks(tick_marks, cuisines, rotation=90)
plt.yticks(tick_marks, cuisines)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Called last so the axis labels are included in the layout computation
plt.tight_layout()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-cuisine precision / recall / F1 on the held-out split.
# Names are taken from enc.classes_ so that the row for encoded label i is
# titled with the cuisine that label stands for. labels= keeps the name list
# aligned even if a cuisine is absent from this split.
y_pred = logistic.predict(X_test)
print(classification_report(y_test, y_pred,
                            labels=np.arange(len(enc.classes_)),
                            target_names=list(enc.classes_)))

In [None]:
# Build the submission from the real test set (test.json), not from the
# validation split: vectorise its recipes with the already-fitted vocabulary,
# predict, and decode the integer labels back to cuisine names.
# NOTE(review): assumes test.json has 'id' and 'ingredients' columns, with
# 'ingredients' holding lists of strings as in train.json — confirm.
test_ingredients = test['ingredients'].map(";".join)
X_submission = cv.transform(test_ingredients)
test_pred = enc.inverse_transform(logistic.predict(X_submission))
submission = pd.DataFrame({'id': test['id'], 'cuisine': test_pred}, columns=['id', 'cuisine'])

submission.to_csv('submission.csv', index=False)