<a href="https://colab.research.google.com/github/riyaa14/learning-projects/blob/main/Natural%20Language%20Processing/NLP_0_What's_Cooking%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Text Pre Processing Part I

- removal of unnecessary characters (in this case anything that is not an english alphabet)
- convert case
- convert list, object, etc to one single sentence where each word is seperated by a space

In [101]:
ingredients_data = train_data['ingredients']
ingredients = ingredients_data.copy()

In [104]:
len(ingredients)

39774

In [120]:
def process_text(text):
    text = text.lower()
    text = re.sub('[^a-z]', ' ', text)
    return text

In [103]:
for i in range(len(ingredients)):
    ingredients[i] = [process_text(item) for words in ingredients[i] for item in words.split()]
    ingredients[i] = ' '.join(ingredients[i])

ingredients.head()

Unnamed: 0,ingredients
0,romaine lettuce black olives grape tomatoes ga...
1,plain flour ground pepper salt tomatoes ground...
2,eggs pepper salt mayonaise cooking oil green c...
3,water vegetable oil wheat salt
4,black pepper shallots cornflour cayenne pepper...


In [6]:
def preprocess_features(ingredients):
  for i in range(len(ingredients)):
    ingredients[i] = [process_text(item) for words in ingredients[i] for item in words.split()]
    ingredients[i] = ' '.join(ingredients[i])

  return ingredients

# Text Preprocessing Part II

- tdidf or bag of words
- word2vec
- embeddings

## Word Embeddings + LSTM

- step 1 - one hot encoding by defining vocab size
- step 2 - padding from maximum possible length of embeddings
- step 3 - use these embeddings as embedding layer in NN

In step 1, we convert text to integers
In step 2, we make all sequences equal so that we have a uniform matrix type structure
In step 3, we convert each integer representation to vectors of embeddings that basically capture similarity

In [7]:
# vocab size

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer to get the vocabulary
vectorizer = CountVectorizer()

# Fit the model and transform the text data
X = vectorizer.fit_transform(ingredients)

# Get the vocabulary size
vocab_size = len(vectorizer.get_feature_names_out())

print(f'Vocabulary Size: {vocab_size}')

Vocabulary Size: 3002


In [8]:
vocab_size = 5000
onehot_repr = [one_hot(items, vocab_size) for items in ingredients]
onehot_repr

[[4696,
  937,
  4479,
  292,
  58,
  4663,
  2704,
  2711,
  3642,
  3827,
  2597,
  4362,
  2788,
  1786,
  2282,
  2871],
 [995,
  4915,
  1787,
  2711,
  3939,
  4663,
  1787,
  4479,
  2711,
  3066,
  662,
  3539,
  4663,
  4314,
  4238,
  3518,
  3682,
  2598,
  2813],
 [662,
  2711,
  3939,
  4979,
  504,
  2813,
  3539,
  899,
  1523,
  4916,
  1401,
  2704,
  544,
  4314,
  3827,
  4761,
  4705,
  4225,
  4916,
  1001],
 [1357, 2598, 2813, 4999, 3939],
 [4479,
  2711,
  2725,
  3885,
  603,
  2711,
  2862,
  2704,
  4224,
  3682,
  4225,
  3939,
  3844,
  776,
  1357,
  3464,
  544,
  97,
  2813,
  1787,
  108,
  4671,
  4916,
  2986,
  1479,
  4370,
  4443,
  1454,
  2847,
  1577,
  2943,
  4480,
  1500],
 [995,
  4915,
  2296,
  4225,
  662,
  3296,
  3019,
  3170,
  3939,
  1787,
  2968,
  3682,
  2937,
  1000,
  1787,
  3019,
  1726,
  2296,
  157,
  544],
 [3437,
  2813,
  3939,
  972,
  4333,
  2711,
  2704,
  634,
  2282,
  347,
  899,
  3659,
  1500,
  636,
  1081,
  2

In [9]:
ingredients['length'] = ingredients.apply(lambda x: len(x.split()))
max_length = ingredients['length'].max()
max_length

142

In [10]:
ingredient_list_len = 200
embedded_docs = pad_sequences(onehot_repr,padding='post', maxlen=ingredient_list_len)
embedded_docs

array([[4696,  937, 4479, ...,    0,    0,    0],
       [ 995, 4915, 1787, ...,    0,    0,    0],
       [ 662, 2711, 3939, ...,    0,    0,    0],
       ...,
       [ 662,  476,  854, ...,    0,    0,    0],
       [4671, 4916, 2986, ...,    0,    0,    0],
       [3539, 1566,  347, ...,    0,    0,    0]], dtype=int32)

In [11]:
# given a dataframe for this problem -> converts to embeddings
def get_embeddings(df):

  vocab_size = 5000
  onehot_repr = [one_hot(items, vocab_size) for items in df]

  list_len = 200
  embedded_docs = pad_sequences(onehot_repr,padding='post', maxlen=list_len)

  return embedded_docs

In [82]:
embedded_docs.shape

(39774, 200)

In [21]:
embedded_docs[100]

array([4219,  232,  686, 3978, 2109, 2755,  922, 3565,  405, 2643, 1741,
       3610, 3198,  239, 4775, 4056,  237, 3545,  596, 4381, 1161, 3231,
        237, 1499, 1454, 2600,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [12]:
embedding_features = 40
model = Sequential([
    Embedding(vocab_size,embedding_features, input_length=ingredient_list_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dropout(0.1),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(20, activation="softmax")
    ])

model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])
print(model.summary())



None


In [13]:
import numpy as np
X = np.array(embedded_docs)
y = np.array(train_data['cuisine'])

In [30]:
X.shape

(39774, 200)

In [35]:
test_data.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize a label encoder
label_encoder = LabelEncoder()

# Fit the encoder on the entire target variable to capture all labels
label_encoder.fit(train_data['cuisine'])

# Transform the training and test labels
#y = label_encoder.transform(train_data['cuisine'])
# y_test = label_encoder.transform(np.array(test_data['cuisine']))

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
y_train_encoded = label_encoder.transform(y_train)
y_train_encoded

array([ 3,  9, 13, ...,  3, 16,  5])

In [20]:
model.fit(X_train, label_encoder.transform(y_train), validation_data=(X_test, label_encoder.transform(y_test)), epochs=5, batch_size=64)

Epoch 1/5
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m767s[0m 2s/step - accuracy: 0.3190 - loss: 2.2812 - val_accuracy: 0.5865 - val_loss: 1.3819
Epoch 2/5
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m711s[0m 2s/step - accuracy: 0.5971 - loss: 1.3696 - val_accuracy: 0.6517 - val_loss: 1.1455
Epoch 3/5
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 2s/step - accuracy: 0.6758 - loss: 1.0930 - val_accuracy: 0.6856 - val_loss: 1.0263
Epoch 4/5
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m732s[0m 2s/step - accuracy: 0.7050 - loss: 0.9635 - val_accuracy: 0.6996 - val_loss: 0.9795
Epoch 5/5
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m685s[0m 2s/step - accuracy: 0.7306 - loss: 0.8892 - val_accuracy: 0.7090 - val_loss: 0.9505


<keras.src.callbacks.history.History at 0x7eedb1716ce0>

In [56]:
test_loss, test_acc = model.evaluate(X_test, label_encoder.transform(y_test))
print('Test Accuracy: {}'.format(test_acc))

[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 287ms/step - accuracy: 0.7130 - loss: 0.9510
Test Accuracy: 0.7166692018508911


In [57]:
data = test_data['ingredients'].copy()
data = preprocess_features(data)
data.shape

(9944,)

In [24]:
data_embeddings = get_embeddings(data)
data_embeddings.shape

(9944, 200)

In [25]:
y_predicted = model.predict(data_embeddings)
y_test_predicted = y_predicted.argmax(axis=-1)

[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 293ms/step


In [44]:
y_test_predicted = label_encoder.inverse_transform(y_test_predicted)

In [49]:
data.head()

Unnamed: 0,ingredients
0,baking powder eggs all purpose flour raisins m...
1,sugar egg yolks corn starch cream of tartar ba...
2,sausage links fennel bulb fronds olive oil cub...
3,meat cuts file powder smoked sausage okra shri...
4,ground black pepper salt sausage casings leeks...


In [58]:
data.shape

(9944,)

In [59]:
# Assuming y_test_predicted and label_encoder are properly defined
predicted_cuisines = pd.Series(y_test_predicted)

predicted_cuisines

Unnamed: 0,0
0,irish
1,southern_us
2,italian
3,cajun_creole
4,italian
...,...
9939,italian
9940,southern_us
9941,italian
9942,southern_us


In [64]:
df = data.to_frame()

In [66]:
print(isinstance(df, pd.DataFrame))

True


In [68]:
# Ensure adding column
df['cuisine'] = predicted_cuisines

# data['cuisine'] = label_encoder.inverse_transform(y_test_predicted)
df.head()

Unnamed: 0,ingredients,cuisine
0,baking powder eggs all purpose flour raisins m...,irish
1,sugar egg yolks corn starch cream of tartar ba...,southern_us
2,sausage links fennel bulb fronds olive oil cub...,italian
3,meat cuts file powder smoked sausage okra shri...,cajun_creole
4,ground black pepper salt sausage casings leeks...,italian


# Tokenization instead of One Hot Encoding

In [87]:
# given a dataframe for this problem -> converts to embeddings

from tensorflow.keras.preprocessing.text import Tokenizer

# input should be list of strings
# output is padded sequence of tokenized string (each word is represented by a number)
def tokenize_text(texts):

  tokenizer.fit_on_texts(texts)

  # Convert texts to sequences of integers
  sequences = tokenizer.texts_to_sequences(texts)

  return pad_sequences(sequences, maxlen=200, padding='post')


In [105]:
# Initialize Tokenizer
tokenizer = Tokenizer()

In [106]:
list_of_ingredients = ingredients.astype(str)
list_of_ingredients = list_of_ingredients.tolist()

In [107]:
tokenized_ingredients = tokenize_text(list_of_ingredients)
tokenized_ingredients

array([[ 314,  138,   13, ...,    0,    0,    0],
       [ 165,   16,    5, ...,    0,    0,    0],
       [  26,    1,    2, ...,    0,    0,    0],
       ...,
       [  26, 1477,  533, ...,    0,    0,    0],
       [  67,   11,   85, ...,    0,    0,    0],
       [  19,   81,  113, ...,    0,    0,    0]], dtype=int32)

In [125]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Parameters
vocab_size_tokenizer = len(tokenizer.word_index) + 500  # Vocabulary size (including the reserved index 0)
embedding_dim = 50  # Dimension of the embedding vectors

# Create the model
model = Sequential([
    Embedding(input_dim=vocab_size_tokenizer, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dropout(0.1),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(20, activation="softmax")
    ])

model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Print model summary
model.summary()



In [109]:
X = tokenized_ingredients
y = np.array(train_data['cuisine'])

In [110]:
label_encoder.fit(train_data['cuisine'])

In [95]:
print(len(y))

39774


In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [119]:
model.fit(X_train, label_encoder.transform(y_train), validation_data=(X_test, label_encoder.transform(y_test)), epochs=3, batch_size=64)

Epoch 1/3
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m692s[0m 2s/step - accuracy: 0.4181 - loss: 1.9378 - val_accuracy: 0.6180 - val_loss: 1.2569
Epoch 2/3
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m796s[0m 2s/step - accuracy: 0.6411 - loss: 1.2071 - val_accuracy: 0.6647 - val_loss: 1.1003
Epoch 3/3
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 2s/step - accuracy: 0.6970 - loss: 0.9955 - val_accuracy: 0.6990 - val_loss: 0.9986


<keras.src.callbacks.history.History at 0x7eed999d88b0>

In [121]:
test_loss, test_acc = model.evaluate(X_test, label_encoder.transform(y_test))
print('Test Accuracy: {}'.format(test_acc))

[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 299ms/step - accuracy: 0.7006 - loss: 0.9965
Test Accuracy: 0.6989943385124207


In [122]:
data = test_data['ingredients'].copy()
data = preprocess_features(data)
data.shape

(9944,)

In [123]:
data_embeddings = tokenize_text(data.astype(str).tolist())
data_embeddings.shape

(9944, 200)

In [126]:
y_predicted = model.predict(data_embeddings)
y_test_predicted = y_predicted.argmax(axis=-1)

[1m 31/311[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:17[0m 276ms/step

KeyboardInterrupt: 

In [None]:
y_test_predicted = label_encoder.inverse_transform(y_test_predicted)

In [None]:
# Assuming y_test_predicted and label_encoder are properly defined
predicted_cuisines = pd.Series(y_test_predicted)

predicted_cuisines

In [None]:
df = data.to_frame()

In [None]:
# Ensure adding column
df['cuisine'] = predicted_cuisines

# data['cuisine'] = label_encoder.inverse_transform(y_test_predicted)
df.head()

# Word2Vec with SVM

In [127]:
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [128]:
# here we are taking mean of vector embeddings of all the words in a sentence (words)
# vector embedding of each word is a 1D vector with some d values
# each value is a representation of similarity with some other word
def avg_word2vec(sentence):
  return np.mean([wv[word] for word in sentence if word in wv.index_to_key], axis=0)

In [129]:
from tqdm import tqdm

In [157]:
def get_word2vec_embeddings(ingredients_list):
  X = []
  for i in tqdm(range(len(ingredients_list))):
    X.append(avg_word2vec(ingredients_list[i].split()))
  return X

In [130]:
ingredients_list = ingredients.tolist()

In [141]:
X = get_word2vec_embeddings(ingredients_list)
X_new = np.array(X, dtype=object)
y = np.array(train_data['cuisine'])

In [146]:
y = label_encoder.fit_transform(train_data['cuisine'])
y

array([ 6, 16,  4, ...,  8,  3, 13])

In [147]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=42)

y_train

array([ 3,  9, 13, ...,  3, 16,  5])

In [149]:
clf = LogisticRegression(C=100)
clf.fit(X_new, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [177]:
X_test_data = test_data['ingredients'].copy()
X_test_data = preprocess_features(X_test_data)
X_test_data_text = X_test_data
X_test_data.head()

Unnamed: 0,ingredients
0,baking powder eggs all purpose flour raisins m...
1,sugar egg yolks corn starch cream of tartar ba...
2,sausage links fennel bulb fronds olive oil cub...
3,meat cuts file powder smoked sausage okra shri...
4,ground black pepper salt sausage casings leeks...


In [164]:
X_test_data = get_word2vec_embeddings(X_test_data)

  2%|▏         | 188/9944 [00:06<06:00, 27.08it/s]


KeyboardInterrupt: 

In [160]:
y_test_pred = clf.predict(X_test_data)
y_test_pred = label_encoder.inverse_transform(y_test_pred)

In [162]:
y_test_pred

array(['british', 'southern_us', 'spanish', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [179]:
X_test_data_text = X_test_data_text.to_frame()

In [178]:
isinstance(X_test_data_text, pd.DataFrame)

False

In [180]:
X_test_data_text['cuisine'] = y_test_pred
X_test_data_text.head()

Unnamed: 0,ingredients,cuisine
0,baking powder eggs all purpose flour raisins m...,british
1,sugar egg yolks corn starch cream of tartar ba...,southern_us
2,sausage links fennel bulb fronds olive oil cub...,spanish
3,meat cuts file powder smoked sausage okra shri...,cajun_creole
4,ground black pepper salt sausage casings leeks...,italian


# TFIDF + Ensemble model

In [None]:
# to do