In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Explore Dataset

In [None]:
# Read the training JSON into a DataFrame and preview its first rows.
train_json_path = '/kaggle/input/whats-cooking-kernels-only/train.json'
df = pd.read_json(train_json_path)
df.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Bar chart of how many rows fall under each cuisine label.
plt.figure(figsize=(8, 6))
cuisine_counts = df['cuisine'].value_counts()
cuisine_counts.plot.bar(title='Classes Counts')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Count the distinct cuisine labels.
n_classes = len(df['cuisine'].unique())
print("Number of classes", n_classes)

# Number of entries in each row's ingredient list.
df['length'] = df['ingredients'].map(len)

# Encode the cuisine names as integer class ids; `le` is kept so the ids
# can be mapped back to names later.
le = LabelEncoder()
df['categorical_label'] = le.fit_transform(df['cuisine'])
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation, stratified on the cuisine column,
# with a fixed random_state so the split is repeatable.
train_set, valid_set = train_test_split(
    df,
    test_size=0.15,
    stratify=df['cuisine'],
    random_state=42,
)

print(train_set.shape)
print(valid_set.shape)

# Collapse each ingredient list into a single comma-separated string.
train_sentences = train_set['ingredients'].map(','.join).tolist()
valid_sentences = valid_set['ingredients'].map(','.join).tolist()

# Integer class ids for the two splits.
y_train = train_set['categorical_label']
y_valid = valid_set['categorical_label']

train_sentences[:3]

# Text Vectorization

In [None]:
import tensorflow as tf

# The largest ingredient count in the dataset is used as the fixed output
# length of the vectorizer.
sequence_length = int(df['length'].max())


def _lowercase(text):
    """Standardize raw text by lower-casing it."""
    return tf.strings.lower(text)


def _split_on_commas(text):
    """Split on commas, the separator used when the ingredient lists were joined."""
    return tf.strings.split(text, ',')


# Build the vectorization layer and fit its vocabulary on the training sentences only.
vectorization_layer = tf.keras.layers.TextVectorization(
    max_tokens=None,
    output_mode='int',
    output_sequence_length=sequence_length,
    split=_split_on_commas,
    standardize=_lowercase,
)
vectorization_layer.adapt(train_sentences)

# Wrap the adapted layer in a small model so sentences can be vectorized via `predict`.
vectorizer = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorization_layer,
])

# Convert both splits into integer sequences.
train_sequences = vectorizer.predict(train_sentences)
valid_sequences = vectorizer.predict(valid_sentences)

print(train_sentences[:3])
print(train_sequences[:3])

In [None]:
# Show the vocabulary size and its first ten entries.
vocabulary = vectorization_layer.get_vocabulary()
print(len(vocabulary))
print(vocabulary[:10])

# Create Classification Model

In [None]:
embedding_dim = 50
vocab_size = vectorization_layer.vocabulary_size()

# Embedding -> 1D convolution -> global max pooling -> dense softmax classifier
# with one output unit per cuisine class.
# NOTE(review): mask_zero=True attaches a padding mask to the embedding output,
# but the following Conv1D presumably does not consume masks — confirm whether
# the flag has any effect here.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length, mask_zero=True),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() emitted a stray "None" line after the table.
model.summary()

In [None]:
# Stop after 5 epochs without improvement (on the callback's default monitored
# metric) and write the model to disk only when it is the best seen so far.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="cooking_deep.h5",
    save_best_only=True,
)
training_callbacks = [early_stopping_cb, checkpoint_cb]

# Train for at most 30 epochs, evaluating on the validation split each epoch.
history = model.fit(
    x=train_sequences,
    y=y_train,
    epochs=30,
    validation_data=(valid_sequences, y_valid),
    callbacks=training_callbacks,
)

In [None]:
# Per-epoch loss values recorded by model.fit.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(loss))  # zero-based epoch indices

# Plot training and validation loss per epoch. Title and axis labels are set
# so the figure can be read on its own.
plt.figure(figsize=(8, 6))
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title("Training vs. Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Make Predictions

In [None]:
# Reload the checkpoint saved during training and score it on the validation split.
model = tf.keras.models.load_model("cooking_deep.h5")
valid_metrics = model.evaluate(valid_sequences, y_valid)
print(valid_metrics)

In [None]:
import zipfile
# Unpack the sample submission into the temporary directory. The context
# manager closes the archive even if extraction raises, which the previous
# explicit open/close pair did not guarantee.
with zipfile.ZipFile("/kaggle/input/whats-cooking-kernels-only/sample_submission.csv.zip", 'r') as zip_ref:
    zip_ref.extractall('/kaggle/temp')

# Preview the expected submission format.
pd.read_csv('/kaggle/temp/sample_submission.csv').head()

In [None]:
# Load the test data and prepare it the same way as the training sentences:
# join each ingredient list with commas, then vectorize.
test_json_path = '/kaggle/input/whats-cooking-kernels-only/test.json'
test_set = pd.read_json(test_json_path)
test_sentences = test_set['ingredients'].map(','.join).tolist()
test_sequences = vectorizer.predict(test_sentences)

# Model outputs for every test row (softmax over the cuisine classes).
predictions = model.predict(test_sequences)
predictions

In [None]:
# Pick the highest-scoring class per row and map the ids back to cuisine names.
predicted_ids = np.argmax(predictions, axis=1)
test_set["cuisine"] = le.inverse_transform(predicted_ids)

# Write only the id and predicted cuisine columns, without the index.
submission = test_set[['id', 'cuisine']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and preview it as a sanity check.
submission_check = pd.read_csv('submission.csv')
submission_check.head()