# Word2Vec & LSTM

### Importing Libraries

In [2]:
import numpy as np 
import pandas as pd
import os
import re
import json
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import gensim
import keras
import tensorflow as tf

import matplotlib.colors as mcolors

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import word2vec

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils import pad_sequences
# from keras.preprocessing.sequence import pad_sequences #deprecated
from keras.models import Sequential

from tqdm import tqdm


2023-04-27 00:59:46.528158: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-27 00:59:46.581802: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Loading the dataset

In [4]:
# Load the reviews CSV into a DataFrame and show its row count.
# NOTE(review): the file name suggests cleaned text with stop words removed — confirm.
df = pd.read_csv("Reviews_cleanText_noSW_sageMakerLocal.csv")
len(df)

394052

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(394052, 7)

In [10]:
# Build a two-column frame: the class label and the cleaned review text.
rating_df = pd.DataFrame(df, columns=['Score_class', 'cleaned_text'])
print(rating_df.shape)

(394052, 2)


In [11]:
# Number of rows per Score_class value.
rating_df['Score_class'].astype('category').value_counts()

1     307472
-1     86580
Name: Score_class, dtype: int64

In [12]:
# One-hot encode the label: one column per Score_class value (-1 and 1).
# The dtype is pinned to uint8 so the labels stay numeric 0/1 on every pandas
# version; from pandas 2.0 the default dtype of get_dummies is bool.
dummies = pd.get_dummies(rating_df['Score_class'], dtype=np.uint8)
dummies.head()

Unnamed: 0,-1,1
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1


In [17]:
# Hold out 10% of the rows as the test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although the two classes are
# imbalanced (307472 vs 86580 in the counts above) — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(rating_df['cleaned_text'], 
                                                    dummies, 
                                                    test_size=0.1, random_state = 42
)

## Model Building

In [18]:
# Load the embedding matrix from CSV and convert it to a NumPy array.
# NOTE(review): presumably row i holds the vector for tokenizer index i and the
# CSV was written without an index column — confirm against the code that
# produced embed_matrix.csv.
embedding_matrix = pd.read_csv("embed_matrix.csv").to_numpy()

**The following model was trained multiple times with 3 different SpatialDropout1D(x) values:**
Hyperparameter values tested: [0.2, 0.3, 0.4]

In [21]:
def build_model(embedding_matrix, spatial_dropout=0.4):
    """Build and compile the bidirectional-LSTM classifier on frozen embeddings.

    Layer stack: token ids -> Embedding initialised from ``embedding_matrix``
    (not trainable) -> SpatialDropout1D -> Bidirectional LSTM with 256 units
    returning the full sequence -> concatenation of global max pooling and
    global average pooling -> Dense(512, relu) -> Dense(2, softmax).

    Parameters
    ----------
    embedding_matrix : 2-D array
        Its shape is unpacked into the Embedding layer's positional arguments
        and its values are used as the layer's fixed weights.
    spatial_dropout : float, default 0.4
        Rate passed to SpatialDropout1D. Exposed as a parameter so the
        different values can be tried without editing the function; the
        default keeps the previously hard-coded value.

    Returns
    -------
    keras.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and
        the metrics accuracy, AUC, precision and recall (in that order).
    """
    words = Input(shape=(None,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(spatial_dropout)(x)
    x = Bidirectional(LSTM(256, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = Dense(512, activation='relu')(hidden)

    result = Dense(2, activation='softmax')(hidden)

    model = Model(inputs=words, outputs=result)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        # Explicit metric names keep the history keys ('auc', 'precision',
        # 'recall' and their 'val_' variants) stable when the model is built
        # more than once in the same kernel; metrics created from bare strings
        # can otherwise receive suffixed names such as 'auc_1'.
        metrics=[
            'accuracy',
            keras.metrics.AUC(name='auc'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
        ]
    )

    return model

In [None]:
%%time
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

In [None]:
maxlen = 512

# Turn each text into a sequence of word indices, then bring every sequence
# to a fixed length of `maxlen` with pad_sequences (default padding settings).
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=maxlen)


In [None]:
# keras is already imported in the imports cell at the top of the notebook.
import keras
# Set Keras' global image data format to channels-last.
# NOTE(review): no image or convolutional layers appear in this notebook, so
# this call looks unnecessary — confirm before removing.
keras.backend.set_image_data_format("channels_last")

In [None]:
# Build the network and print its layer summary.
model = build_model(embedding_matrix)
model.summary()

# Save the whole model to model.h5 whenever validation accuracy improves.
# The model is compiled with the metric 'accuracy', so the logged validation
# key is 'val_accuracy' (the key the accuracy plot reads from the history).
# Monitoring the non-existent 'val_acc' would leave save_best_only with
# nothing to compare, so no checkpoint would be written.
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto'
)

# Train for 5 epochs, holding out 10% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    callbacks=[checkpoint],
    epochs=5,
    validation_split=0.1
)

In [None]:
# Accuracy per epoch for the training and validation data
fig, ax = plt.subplots()
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.plot(history.history['accuracy'], label='train_accuracy')
ax.set_title('Model Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_xticks([0, 1, 2, 3, 4])  # one tick per epoch of the 5-epoch run
ax.legend()
fig.savefig('LSTM_v3_Accuracy.png')
plt.show()


In [None]:
# AUC per epoch for the training and validation data
fig, ax = plt.subplots()
ax.plot(history.history['val_auc'], label='val_auc')
ax.plot(history.history['auc'], label='train_auc')
ax.set_title('Model AUC')
ax.set_xlabel('Epoch')
ax.set_ylabel('AUC')
ax.set_xticks([0, 1, 2, 3, 4])  # one tick per epoch of the 5-epoch run
ax.legend()
fig.savefig('LSTM_v3_AUC.png')
plt.show()


In [None]:
# Loss per epoch for the training and validation data
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], label="val_loss")
ax.plot(history.history['loss'], label="training_loss")
ax.set_title('Model Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_xticks([0, 1, 2, 3, 4])  # one tick per epoch of the 5-epoch run
ax.legend()
fig.savefig('LSTM_v3_Loss.png')
plt.show()

In [None]:
# Recall per epoch for the training and validation data
fig, ax = plt.subplots()
ax.plot(history.history['val_recall'], label='val_recall')
ax.plot(history.history['recall'], label='train_recall')
ax.set_title('Model Recall')
ax.set_xlabel('Epoch')
ax.set_ylabel('Recall')
ax.set_xticks([0, 1, 2, 3, 4])  # one tick per epoch of the 5-epoch run
ax.legend()
fig.savefig('LSTM_v3_Recall.png')
plt.show()

In [None]:
# Precision per epoch for the training and validation data
fig, ax = plt.subplots()
ax.plot(history.history['val_precision'], label='val_precision')
ax.plot(history.history['precision'], label='train_precision')
ax.set_title('Model Precision')
ax.set_xlabel('Epoch')
ax.set_ylabel('Precision')
ax.set_xticks([0, 1, 2, 3, 4])  # one tick per epoch of the 5-epoch run
ax.legend()
fig.savefig('LSTM_v3_Precision.png')
plt.show()

# Test Accuracy

In [None]:
# Score the model on the held-out test split. The values unpack as the loss
# followed by the metrics in the order passed to model.compile:
# accuracy, AUC, precision, recall.
loss, accuracy, AUC , Precision, Recall = model.evaluate(x_test, y_test, verbose=2)

In [None]:
# F1 as the harmonic mean of the precision and recall returned by model.evaluate.
# NOTE(review): these metrics are computed on the 2-column one-hot output, so
# this is presumably an F1 pooled over both classes rather than the F1 of a
# single class — confirm this is the quantity intended for the results table.
f1 = 2 * (Precision * Recall) / (Precision + Recall)
f1

In [None]:
import helper_module

# Record the held-out test metrics in Model_results.csv through helper_module
# (its implementation is not shown in this notebook).
name = 'W2V_LSTM_v3'
helper_module.add_model_scores_to_results(
    file_path='Model_results.csv',
    model_name=name,
    datashift='test',
    with_sw=0,
    ROC_AUC=AUC,
    accuracy=accuracy,
    f1=f1,
    recall=Recall,
    cm=np.zeros(4,),  # all-zero array passed in place of a confusion matrix
    first_entry=False,
)

In [None]:
# !mkdir -p saved_model
# Save the trained model to saved_model/lstm_v3.
model.save('saved_model/lstm_v3')

# Commented-out example: reload a saved model and re-evaluate it on the test split.
# NOTE(review): the snippet loads 'lstm_v2' while the line above saves
# 'lstm_v3' — confirm which version is intended.
# new_model = tf.keras.models.load_model('saved_model/lstm_v2')
# loss, accuracy, AUC , Precision, Recall = new_model.evaluate(x_test, y_test, verbose=2)

# Data Shifts

In [None]:
#summary Data shift
# Evaluate the trained model on the cleaned review summaries instead of the
# cleaned review text.

rating_df = pd.DataFrame(df, columns=['Score_class', 'cleaned_summary'])
# dtype pinned so the one-hot labels stay numeric 0/1 on every pandas version.
dummies = pd.get_dummies(rating_df['Score_class'], dtype=np.uint8)

# Same test_size and random_state as the original split. Only the test fold
# is needed here, so the train fold is bound to throwaway names instead of
# overwriting the tokenised x_train / y_train with raw, unused text.
# NOTE(review): missing values in cleaned_summary, if any, would reach the
# tokenizer as non-strings — confirm the column has none.
_unused_x, x_test, _unused_y, y_test = train_test_split(
    rating_df['cleaned_summary'],
    dummies,
    test_size=0.1, random_state=42
)
x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=512)

# evaluate returns the loss followed by the compiled metrics, in order.
loss, accuracy, AUC , Precision, Recall = model.evaluate(x_test, y_test, verbose=2)
f1 = 2 * (Precision * Recall) / (Precision + Recall)

name = 'W2V_LSTM_v3'
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name,
                            datashift = 'summary', with_sw = 0,
                            ROC_AUC = AUC, accuracy = accuracy, 
                            f1 = f1, recall=Recall, cm = np.zeros(4,), first_entry=False)

In [None]:
# dropout 0.1
# Evaluate the trained model on test reviews passed through
# helper_module.random_dropout with p=0.1.

rating_df = pd.DataFrame(df, columns=['Score_class', 'cleaned_text'])
# dtype pinned so the one-hot labels stay numeric 0/1 on every pandas version.
dummies = pd.get_dummies(rating_df['Score_class'], dtype=np.uint8)

# Same test_size and random_state as the original split. Only the test fold
# is needed here, so the train fold is bound to throwaway names instead of
# overwriting the tokenised x_train / y_train with raw, unused text.
_unused_x, x_test, _unused_y, y_test = train_test_split(
    rating_df['cleaned_text'],
    dummies,
    test_size=0.1, random_state=42
)
# NOTE(review): the same random_state is passed for every review; how that
# affects the per-review drop pattern depends on helper_module.random_dropout
# — confirm.
x_test = x_test.apply(helper_module.random_dropout,
                      p=0.1, 
                      random_state=42)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=512)

# evaluate returns the loss followed by the compiled metrics, in order.
loss, accuracy, AUC , Precision, Recall = model.evaluate(x_test, y_test, verbose=2)
f1 = 2 * (Precision * Recall) / (Precision + Recall)

name = 'W2V_LSTM_v3'
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name,
                            datashift = 'dropout_0.1', with_sw = 0,
                            ROC_AUC = AUC, accuracy = accuracy, 
                            f1 = f1, recall=Recall, cm = np.zeros(4,), first_entry=False)

In [None]:
# dropout 0.25
# Evaluate the trained model on test reviews passed through
# helper_module.random_dropout with p=0.25.

rating_df = pd.DataFrame(df, columns=['Score_class', 'cleaned_text'])
# dtype pinned so the one-hot labels stay numeric 0/1 on every pandas version.
dummies = pd.get_dummies(rating_df['Score_class'], dtype=np.uint8)

# Same test_size and random_state as the original split. Only the test fold
# is needed here, so the train fold is bound to throwaway names instead of
# overwriting the tokenised x_train / y_train with raw, unused text.
_unused_x, x_test, _unused_y, y_test = train_test_split(
    rating_df['cleaned_text'],
    dummies,
    test_size=0.1, random_state=42
)
# NOTE(review): the same random_state is passed for every review; how that
# affects the per-review drop pattern depends on helper_module.random_dropout
# — confirm.
x_test = x_test.apply(helper_module.random_dropout,
                      p=0.25, 
                      random_state=42)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=512)

# evaluate returns the loss followed by the compiled metrics, in order.
loss, accuracy, AUC , Precision, Recall = model.evaluate(x_test, y_test, verbose=2)
f1 = 2 * (Precision * Recall) / (Precision + Recall)

name = 'W2V_LSTM_v3'
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name,
                            datashift = 'dropout_0.25', with_sw = 0,
                            ROC_AUC = AUC, accuracy = accuracy, 
                            f1 = f1, recall=Recall, cm = np.zeros(4,), first_entry=False)

In [None]:
# dropout 0.5
# Evaluate the trained model on test reviews passed through
# helper_module.random_dropout with p=0.5.

rating_df = pd.DataFrame(df, columns=['Score_class', 'cleaned_text'])
# dtype pinned so the one-hot labels stay numeric 0/1 on every pandas version.
dummies = pd.get_dummies(rating_df['Score_class'], dtype=np.uint8)

# Same test_size and random_state as the original split. Only the test fold
# is needed here, so the train fold is bound to throwaway names instead of
# overwriting the tokenised x_train / y_train with raw, unused text.
_unused_x, x_test, _unused_y, y_test = train_test_split(
    rating_df['cleaned_text'],
    dummies,
    test_size=0.1, random_state=42
)
# NOTE(review): the same random_state is passed for every review; how that
# affects the per-review drop pattern depends on helper_module.random_dropout
# — confirm.
x_test = x_test.apply(helper_module.random_dropout,
                      p=0.5, 
                      random_state=42)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=512)

# evaluate returns the loss followed by the compiled metrics, in order.
loss, accuracy, AUC , Precision, Recall = model.evaluate(x_test, y_test, verbose=2)
f1 = 2 * (Precision * Recall) / (Precision + Recall)

name = 'W2V_LSTM_v3'
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name,
                            datashift = 'dropout_0.5', with_sw = 0,
                            ROC_AUC = AUC, accuracy = accuracy, 
                            f1 = f1, recall=Recall, cm = np.zeros(4,), first_entry=False)

# Reference: 
**Amazon Fine Food Reviews: Sentiment Analysis.**
This Kaggle notebook provided the basic guidelines for the LSTM model.
https://www.kaggle.com/code/chirag9073/amazon-fine-food-reviews-sentiment-analysis