# A first look at the dataset and the ML

I'm going to split this notebook into two parts:
* A short study and preprocessing of the training data.
* Cross-validation over a Random Forest.

## Short study of the training data

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm_notebook

%matplotlib inline

In [None]:
# Read the original training JSON and display it.
data_path = '/kaggle/input/whats-cooking-kernels-only/'
json_train_path = os.path.join(data_path, 'train.json')
json_train = pd.read_json(json_train_path)

json_train

### Missing values

First of all, we are going to search for missing values.

In [None]:
# Print the per-column dtype and non-null count summary of the training frame.
json_train.info()

There are no missing values.

### Class Analysis

In [None]:
# List the distinct values of the target column `cuisine`.
json_train['cuisine'].unique()

In [None]:
# Horizontal bar chart: one bar per cuisine, bar length = number of recipes.
sns.countplot(data=json_train, y='cuisine')
plt.show()

### Cleaning the data

In [None]:
# Flatten every recipe's ingredient list into one long list (duplicates kept).
# `list.extend` appends in place; the former `total = total + ing` rebuilt the
# whole accumulated list on every iteration, which is quadratic overall.
total_ingredients_list = []
tq = tqdm_notebook(total=json_train.shape[0])
for ing in json_train['ingredients']:
    total_ingredients_list.extend(ing)
    tq.update(1)
tq.close()

print('Total number of unique ingredients: {}'.format(len(np.unique(total_ingredients_list))))

In [None]:
# Normalise every raw ingredient string, in this order:
# - drop the literal substring "oz" and every run of digits (e.g. "(10 oz.)")
# - drop all punctuation characters
# - trim leading/trailing spaces and lower-case the result
# - drop English stop words
#
# NOTE(review): `replace("oz", "")` removes the substring anywhere, including
# inside longer words. The two encoding cells further down apply the very same
# transformation, so any change to it has to be made in all three places at once.

import string
from nltk.corpus import stopwords

chars = re.escape(string.punctuation)
punctuation_pattern = r'[' + chars + ']'

clean_ingredients_list = []
for raw_name in total_ingredients_list:
    without_oz = raw_name.replace("oz", "")
    without_digits = re.sub('[0-9]+', '', without_oz)
    without_punctuation = re.sub(punctuation_pattern, '', without_digits)
    clean_ingredients_list.append(without_punctuation.strip(' ').lower())

stop_words = set(stopwords.words('english'))
tq = tqdm_notebook(total=len(clean_ingredients_list))
for i, ingredients in enumerate(clean_ingredients_list):
    kept_words = [word for word in ingredients.split(' ') if word not in stop_words]
    clean_ingredients_list[i] = ' '.join(kept_words).strip(' ')
    tq.update(1)
tq.close()

# Sorted vocabulary of distinct cleaned ingredients; names that became empty
# after cleaning are discarded.
clean_unique_ingredients_list = [name for name in np.unique(clean_ingredients_list) if len(name) > 0]
print('Total of unique ingredients: ', len(clean_unique_ingredients_list))

Creating a dataframe with one column per ingredient, holding 1 if the ingredient appears in the recipe and 0 if it does not.

In [None]:
# Integer label assigned to each cuisine (the classification target).
classes_dict = {'greek':0, 'southern_us':1, 'filipino':2, 'indian':3, 'jamaican':4,
                'spanish':5, 'italian':6, 'mexican':7, 'chinese':8, 'british':9, 'thai':10,
                'vietnamese':11, 'cajun_creole':12, 'brazilian':13, 'french':14, 'japanese':15,
                'irish':16, 'korean':17, 'moroccan':18, 'russian':19}

# Row layout: [0] unused placeholder, [1] cuisine label,
# [2:] one 0/1 flag per entry of `clean_unique_ingredients_list`.
# The recipe id used to be written into column 0, but a uint8 cell can only hold
# 0-255, so larger ids could not be stored faithfully. The column is kept (all
# zeros) so that the `[:, 1:]` slice taken when building `data` is unchanged.
ingredients_encoded = np.zeros((json_train.shape[0], len(clean_unique_ingredients_list)+2), dtype=np.uint8)

# name -> position in `clean_unique_ingredients_list`. A dict lookup replaces
# `list.index`, which scanned the whole vocabulary for every ingredient of every
# recipe. `setdefault` keeps the first position, exactly like `list.index`.
ingredient_position = {}
for position, name in enumerate(clean_unique_ingredients_list):
    ingredient_position.setdefault(str(name), position)

tq = tqdm_notebook(total=json_train.shape[0])
for i in range(json_train.shape[0]):
    ingredients_encoded[i,1] = classes_dict[json_train['cuisine'][i]]

    # Same cleaning as the one used to build the vocabulary: drop "oz", digits
    # and punctuation, trim spaces, lower-case.
    clean_recipe = [re.sub(r'['+chars+']', '',
                           re.sub('[0-9]+','', c.replace("oz",""))).strip(' ').lower()
                    for c in json_train['ingredients'][i]]

    # Remove stop words from each ingredient name.
    for k, ingredients in enumerate(clean_recipe):
        cleaned_ingredients = [c for c in ingredients.split(' ') if c not in stop_words]
        clean_recipe[k] = (' '.join(cleaned_ingredients)).strip(' ')

    # Names that became empty after cleaning are not part of the vocabulary.
    clean_recipe = [c for c in clean_recipe if len(c)>0]
    for ingredient in clean_recipe:
        # +2 skips the placeholder and the label columns.
        ingredients_encoded[i, ingredient_position[ingredient]+2] = 1

    tq.update(1)

tq.close()

In [None]:
# Drop column 0 of the encoded matrix; in the resulting frame column 0 is the
# cuisine label and columns 1.. are the per-ingredient 0/1 flags.
data = pd.DataFrame(ingredients_encoded[:, 1:])
data

Check that everything is OK in two ways:
* We are going to create an image from one column. The title shows the name of the ingredient and its number of occurrences in the whole dataset. In the image, each white pixel is an occurrence.
* We are going to create a list with each ingredient and its number of occurrences and look at the top 5.

In [None]:
# Column of `data` to inspect. Column 0 of `data` is the cuisine label, so data
# column k corresponds to clean_unique_ingredients_list[k-1].
column_to_show = 6677

# Lay the per-recipe 0/1 flags out as a 42 x 947 grid; the product must equal
# the number of rows in `data`, otherwise the reshape raises.
image = data[column_to_show].values.reshape((42, 947))

ingredient_name = clean_unique_ingredients_list[column_to_show-1]
occurrence_count = np.sum(data[column_to_show].values)

plt.figure(figsize=(10,30))
plt.title(ingredient_name + ' {}'.format(occurrence_count))
# Scale the flags to 0/255 so each occurrence renders as a white pixel.
plt.imshow(image*255, cmap='gray')
plt.show()

In [None]:
# Occurrences of each ingredient over all recipes (column 0 of `data` is the
# label, hence the [1:] slice). The result is no longer bound to the name `sum`,
# which shadowed the Python builtin for the rest of the session.
ingredient_counts = np.sum(data[data.columns[1:]].values, axis=0)

# Sort (count, name) pairs once, most frequent first; ties are ordered by name,
# descending, exactly as the tuple comparison did before.
ranked = sorted(zip(ingredient_counts, clean_unique_ingredients_list), reverse=True)
Z = [name for _, name in ranked]
Y = [count for count, _ in ranked]

print('Top five: ',list(zip(Z,Y))[:5])
print('Inverse top five: ', list(zip(Z,Y))[-5:])

## Models and other things

Prepare the data for ML.

In [None]:
# Target = first column of `data` (cuisine label); features = all other columns.
y = data[data.columns[0]]
X = data[data.columns[1:]]

# Split data in train-val
percent_train = 80

# Careful: both the train and the validation set should contain every cuisine.
# So the row indices of each cuisine are collected separately and `percent_train`
# percent of them go to train, the remainder to validation.
# NOTE(review): no random seed is set in this cell, so the split changes on
# every run.
idx_train = []
idx_val = []
for cls in classes_dict.values():
    indices = np.array(data.index[y==cls])
    np.random.shuffle(indices)
    cut = int(len(indices)*percent_train/100)
    idx_train.extend(indices[:cut])
    idx_val.extend(indices[cut:])

# Mix the classes again: the lists above were filled one cuisine at a time.
np.random.shuffle(idx_train)
np.random.shuffle(idx_val)

X_values, y_values = X.values, y.values
X_train, y_train = X_values[idx_train,:], y_values[idx_train]
X_val, y_val = X_values[idx_val,:], y_values[idx_val]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with the hyper-parameters recorded below.
# Two arguments of the original call were changed for compatibility with
# current scikit-learn releases, without changing the model:
# - max_features='auto' -> 'sqrt': for classifiers 'auto' meant sqrt(n_features),
#   and the 'auto' spelling has since been removed.
# - min_impurity_split=None was dropped: the parameter was removed from
#   scikit-learn and passing it raises a TypeError.
rf_best = RandomForestClassifier(bootstrap=True,
                                 class_weight='balanced',
                                 criterion='gini',
                                 max_depth=100,
                                 max_features='sqrt',
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=400,
                                 n_jobs=8,
                                 oob_score=False,
                                 random_state=None, verbose=0, warm_start=False)
# Acc CV: 0.7225571464456166

# Fit on the full frame (X, y), not on the X_train / y_train split.
rf_best.fit(X,y)

### Prepare test data:

In [None]:
# Read the test JSON from the same input directory and display it.
json_test_path = os.path.join(data_path, 'test.json')
json_test = pd.read_json(json_test_path)

json_test

In [None]:
# Row layout: [0] unused placeholder, [1:] one 0/1 flag per entry of
# `clean_unique_ingredients_list` (there is no label column for the test set).
# The recipe id used to be written into column 0, but a uint8 cell can only hold
# 0-255, so larger ids could not be stored faithfully. The column is kept (all
# zeros) so that the `[:, 1:]` slice below is unchanged.
ingredients_encoded_test = np.zeros((json_test.shape[0], len(clean_unique_ingredients_list)+1), dtype=np.uint8)

# name -> position in `clean_unique_ingredients_list`. One dict lookup replaces
# the `in` test plus `list.index`, each of which scanned the whole vocabulary.
# `setdefault` keeps the first position, exactly like `list.index`.
ingredient_position = {}
for position, name in enumerate(clean_unique_ingredients_list):
    ingredient_position.setdefault(str(name), position)

tq = tqdm_notebook(total=json_test.shape[0])
for i in range(json_test.shape[0]):
    # Same cleaning as for the training data: drop "oz", digits and punctuation,
    # trim spaces, lower-case.
    clean_recipe = [re.sub(r'['+chars+']', '',
                           re.sub('[0-9]+','', c.replace("oz",""))).strip(' ').lower()
                    for c in json_test['ingredients'][i]]

    # Remove stop words from each ingredient name.
    for k, ingredients in enumerate(clean_recipe):
        cleaned_ingredients = [c for c in ingredients.split(' ') if c not in stop_words]
        clean_recipe[k] = (' '.join(cleaned_ingredients)).strip(' ')

    clean_recipe = [c for c in clean_recipe if len(c)>0]
    for ingredient in clean_recipe:
        position = ingredient_position.get(ingredient)
        if position is None:
            # Ingredient absent from the training vocabulary: no column for it.
            continue
        # +1 skips the placeholder column.
        ingredients_encoded_test[i, position+1] = 1

    tq.update(1)

tq.close()

# Drop the placeholder column: column k of `data_test` is ingredient k.
data_test = pd.DataFrame(data=ingredients_encoded_test[:,1:])
data_test

In [None]:
# Column of `data_test` to inspect. `data_test` has no label column, so data
# column k corresponds directly to clean_unique_ingredients_list[k].
column_to_show = 6676

# Lay the per-recipe 0/1 flags out as an 88 x 113 grid; the product must equal
# the number of rows in `data_test`, otherwise the reshape raises.
image = data_test[column_to_show].values.reshape((88, 113))

ingredient_name = clean_unique_ingredients_list[column_to_show]
occurrence_count = np.sum(data_test[column_to_show].values)

plt.figure(figsize=(10,30))
plt.title(ingredient_name + ' {}'.format(occurrence_count))
# Scale the flags to 0/255 so each occurrence renders as a white pixel.
plt.imshow(image*255, cmap='gray')
plt.show()

In [None]:
# Feature matrix for prediction: every column of `data_test`.
X_test = data_test[data_test.columns]

In [None]:
# Predict the integer cuisine label of every test recipe and display the result.
y_pred = rf_best.predict(X_test)
y_pred

In [None]:
sub = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))

# Inverse of `classes_dict` (integer label -> cuisine name), built once instead
# of rebuilding and scanning the key/value lists for every prediction.
# `setdefault` keeps the first cuisine for a label, as `list.index` did.
label_to_cuisine = {}
for cuisine_name, label in classes_dict.items():
    label_to_cuisine.setdefault(label, cuisine_name)

# int() turns the numpy integer predictions into plain dict keys.
y_pred_str = [label_to_cuisine[int(c)] for c in y_pred]
print(len(y_pred_str), sub.shape)

# Overwrite both columns of the sample submission with the test ids and the
# predicted cuisine names.
sub['id'] = json_test['id']
sub['cuisine'] = y_pred_str

sub.to_csv("submission.csv", index=False)
