## Experiment 5: Simple Bidirectional RNN Model with Doc2Vec Embeddings
**(Version 1)**

**1. Import the necessary libraries and modules for this experiment**

In [1]:
import datetime
from packaging import version
from collections import Counter
import numpy as np
import pandas as pd
import time
import os
import re
import string

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
# %pip install tensorflow_datasets
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as k
from sklearn.model_selection import train_test_split

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Set the default precision for numpy
np.set_printoptions(precision=3, suppress=True)

# Enable display of multiple outputs per Jupyter Notebook cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

---

**2. Load in the golf course reviews dataset & create a new label column**


In [3]:
file_path = "top_and_non_golf_course_reviews.csv"
df = pd.read_csv(file_path)

# Create a new label column that indicates whether the review is a top100 course or not
df['top100'] = df['label'].apply(lambda x: 1 if x == 'top100' else 0)

---

**3. Split the dataset into training, validation, and testing sets**

In [4]:
train_df, remaining = train_test_split(df, test_size=0.33, stratify=df['top100'], random_state=42)
val_df, test_df = train_test_split(remaining, test_size=0.5, stratify=remaining['top100'], random_state=42)

# Check the shape of the training, validation, and test sets
print(f"Training Dataset Shape: {train_df.shape}")
print(f"Validation Dataset Shape: {val_df.shape}")
print(f"Test Dataset Shape: {test_df.shape}")

Training Dataset Shape: (80, 11)
Validation Dataset Shape: (20, 11)
Test Dataset Shape: (20, 11)


---
**4. Preprocess the text data from the 'review_text' column**

In [5]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove stopwords (using the NLTK library)
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing function to the 'review_text' column of the df DataFrame
train_df['preprocessed_text'] = train_df['review_text'].apply(preprocess_text)
val_df['preprocessed_text'] = val_df['review_text'].apply(preprocess_text)
test_df['preprocessed_text'] = test_df['review_text'].apply(preprocess_text)

---
**5. Convert the preprocessed text data into a list of tagged documents (Doc2Vec format)**

In [6]:
def create_tagged_documents(df):
    """
    Create a list of tagged documents from the 'preprocessed_text' column of the given DataFrame.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing the 'preprocessed_text' column.

    Returns:
        List[gensim.models.doc2vec.TaggedDocument]: A list of TaggedDocument objects, where each TaggedDocument represents a document with its corresponding tag (index).
    """
    return [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(df['preprocessed_text'])]

tagged_documents = create_tagged_documents(train_df)

---
**6. Train the Doc2Vec model and generate the embeddings**

In [7]:
model = Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Define a function to generate the embeddings for the split dataframes
def get_doc2vec_embeddings(model, df):
    return np.array([model.infer_vector(text.split()) for text in df['preprocessed_text']])

# Generate the embeddings for the training, validation, and test sets
train_embeddings = get_doc2vec_embeddings(model, train_df)
val_embeddings = get_doc2vec_embeddings(model, val_df)
test_embeddings = get_doc2vec_embeddings(model, test_df)

----
**7. Create Two Layer Simple Bidirectional RNN Model with Doc2Vec Embeddings**

- Two Bidirectional RNN layers (64 units and 32 units)
- ReLu activation function
- RMSprop optimizer (learning rate = 0.001)
- Vocabulary size = 10000

In [8]:
# Build the unidirectional RNN model
inputs = tf.keras.Input(shape=(100,), name="input")
x = tf.keras.layers.Reshape((100, 1))(inputs)
x = tf.keras.layers.Bidirectional(layers.SimpleRNN(units=64, activation='relu', return_sequences=True, name="Bidirectional_RNN_1"))(x)
x = tf.keras.layers.Bidirectional(layers.SimpleRNN(units=32, activation='relu', name="Bidirectional_RNN_2"))(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name="Two_Layer_RNN_D2V")

---- 
**8. Compile the model**

In [9]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=["accuracy"])

# Display the model summary
model.summary()

---
**9. Train the model**

In [10]:
# Clear any existing models in memory
tf.keras.backend.clear_session()

# Define the callbacks for the model training
callbacks = [
    tf.keras.callbacks.ModelCheckpoint("EXP_5_RNN_D2V.keras", save_best_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
]

# Train the model
start_time = time.time()
history = model.fit(train_embeddings, train_df['top100'],
                    validation_data=(val_embeddings, val_df['top100']),
                    epochs=20,
                    callbacks=callbacks)
end_time = time.time()
training_time = end_time - start_time

# Print the training time
print(f"Training Time: {training_time} seconds")

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 126ms/step - accuracy: 0.4984 - loss: 0.7537 - val_accuracy: 0.5500 - val_loss: 0.6924
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.6477 - loss: 0.6294 - val_accuracy: 0.5000 - val_loss: 0.6762
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.6977 - loss: 0.5735 - val_accuracy: 0.6000 - val_loss: 0.6601
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8250 - loss: 0.5324 - val_accuracy: 0.5500 - val_loss: 0.6603
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.8047 - loss: 0.4791 - val_accuracy: 0.5500 - val_loss: 0.6407
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.8828 - loss: 0.4203 - val_accuracy: 0.5500 - val_loss: 0.6202
Training Time: 2.0062851905822754 seconds


---
**10. Evaluate the model on the test set**

In [11]:
# Load the best model
model = tf.keras.models.load_model("EXP_5_RNN_D2V.keras")

# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(test_embeddings, test_df['top100'])
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Get the training loss, validation loss, training accuracy, and validation accuracy from the history object
training_loss = history.history['loss'][-1] # the -1 index gets the last epoch
validation_loss = history.history['val_loss'][-1]
training_accuracy = history.history['accuracy'][-1]
validation_accuracy = history.history['val_accuracy'][-1]

"""
1.2.2 Extract the training history and add all evaluation metrics into a history DataFrame
"""
# Extract the training history into a pandas DataFrame
history_df = pd.DataFrame({
    'EXP': [5],
    'Model': ['RNN w/ Doc2Vec Embeddings'],
    'Training Loss': [training_loss],
    'Training Accuracy': [training_accuracy],
    'Validation Loss': [validation_loss],
    'Validation Accuracy': [validation_accuracy],
    'Test Loss': [test_loss],
    'Test Accuracy': [test_accuracy],
    'Training Time': [training_time]
})

# Inspect the history DataFrame
history_df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363ms/step - accuracy: 0.6500 - loss: 0.6495
Test Loss: 0.6494666337966919, Test Accuracy: 0.6499999761581421


'\n1.2.2 Extract the training history and add all evaluation metrics into a history DataFrame\n'

Unnamed: 0,EXP,Model,Training Loss,Training Accuracy,Validation Loss,Validation Accuracy,Test Loss,Test Accuracy,Training Time
0,5,RNN w/ Doc2Vec Embeddings,0.425126,0.875,0.620164,0.55,0.649467,0.65,2.006285
