## Experiment 4: 1-Dimensional Convolutional Neural Network (1D-CNN) Model with TensorFlow TextVectorization Layer Embeddings
**(Version 1)**

**1. Import the necessary libraries and modules for this experiment**

In [1]:
import datetime
from packaging import version
from collections import Counter
import numpy as np
import pandas as pd
import time
import os
import re
import string

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
# %pip install tensorflow_datasets
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as k
from sklearn.model_selection import train_test_split

In [2]:
# Set the default precision for numpy
np.set_printoptions(precision=3, suppress=True)

# Enable display of multiple outputs per Jupyter Notebook cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

---

**2. Load in the golf course reviews dataset & create a new label column**


In [3]:
file_path = "top_and_non_golf_course_reviews.csv"
df = pd.read_csv(file_path)

# Create a new label column that indicates whether the review is a top100 course or not
df['top100'] = df['label'].apply(lambda x: 1 if x == 'top100' else 0)

---

**3. Split the dataset into training, validation, and testing sets**

In [4]:
train_df, remaining = train_test_split(df, test_size=0.33, stratify=df['top100'], random_state=42)
val_df, test_df = train_test_split(remaining, test_size=0.5, stratify=remaining['top100'], random_state=42)

# Check the shape of the training, validation, and test sets
print(f"Training Dataset Shape: {train_df.shape}")
print(f"Validation Dataset Shape: {val_df.shape}")
print(f"Test Dataset Shape: {test_df.shape}")

Training Dataset Shape: (80, 11)
Validation Dataset Shape: (20, 11)
Test Dataset Shape: (20, 11)


----

**4. Convert the split DataFrames into TensorFlow Datasets**

In [5]:
train_ds = tf.data.Dataset.from_tensor_slices(dict(train_df))
val_ds = tf.data.Dataset.from_tensor_slices(dict(val_df))
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_df))

----

**5. Create `custom_stopwords` function and `text_vectorization` layer**

In [6]:
# Define a `custom_stopwords` function to remove stopwords, strip punctuation, and lowercase the text
def custom_stopwords(input_text):
    """
    Removes stopwords, strips punctuation, and lowers the input text.

    Args:
        input_text (tf.Tensor): The input text to be processed.

    Returns:
        tf.Tensor: The processed text with stopwords removed, punctuation stripped, and lowercased.
    """
    lowercase = tf.strings.lower(input_text)
    stripped_punct = tf.strings.regex_replace(lowercase,
                                            '[%s]' % re.escape(string.punctuation),
                                            '')
    return tf.strings.regex_replace(stripped_punct, r'\b(' + r'|'.join(STOPWORDS) + r')\b\s*', "")

# Download stopwords from the NLTK library
nltk.download('stopwords', quiet=True)
STOPWORDS = stopwords.words("english")

# Define the maxium sequence and token length for this experiment
max_length =  3073
max_tokens = 10000

# Create a TextVectorization layer
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
    standardize=custom_stopwords
)

True

----

**6. Adapt the TextVectorization layer to a text_only_dataset of the training dataset**

In [7]:
# Create a text_only_train_dataset which contains only the review_text column of the training dataset
text_only_train_dataset = train_ds.map(lambda x: x['review_text'])

# Adapt the TextVectorization layer to the text_only_train_dataset
text_vectorization.adapt(text_only_train_dataset)

# Create int_train_ds, int_val_ds, and int_test_ds from the train, val, and test datasets respectively using the TextVectorization layer
int_train_ds = train_ds.map(lambda x: (text_vectorization(x['review_text']), x['top100']))
int_val_ds = val_ds.map(lambda x: (text_vectorization(x['review_text']), x['top100']))
int_test_ds = test_ds.map(lambda x: (text_vectorization(x['review_text']), x['top100']))

# Batch and pad the datasets to have a sequence lenfth dimension
batch_size = 32
max_sequence_length = 3073

int_train_ds = int_train_ds.map(lambda x, y: (x, y)).padded_batch(batch_size, padded_shapes=(max_sequence_length, ()))
int_val_ds = int_val_ds.map(lambda x, y: (x, y)).padded_batch(batch_size, padded_shapes=(max_sequence_length, ()))
int_test_ds = int_test_ds.map(lambda x, y: (x, y)).padded_batch(batch_size, padded_shapes=(max_sequence_length, ()))

2024-08-10 12:21:32.569810: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


----
**7. Create Two Layer 1D-CNN Model**

- Two 1D-CNN layers (64 filters and 32 filters | Kernel Size = 5 each)
- MaxPooling1D Layers after each 1D-CNN layer (pool_size = 2)
- GlobalMaxPooling1D Layer after the second MaxPooling1D layer
- Dense Layer with 64 units and ReLU activation function
- RMSprop optimizer (learning rate = 0.001)
- Vocabulary size = 10000

In [16]:
# Define the model constants
vocab_size = 10000
embedding_dim = 356
kernel_size = 5

# Build the 1D-CNN model
inputs = tf.keras.Input(shape=(None,), dtype="int64", name="input")
embedding = layers.Embedding(input_dim=vocab_size,
                             output_dim=embedding_dim,
                             mask_zero=True, name="embedding")(inputs)
x = layers.Conv1D(filters=64, kernel_size=kernel_size, activation='relu', name='1d-cnn_1')(embedding)
x = layers.MaxPooling1D(pool_size=2, name='maxpool_1')(x)
x = layers.Conv1D(filters=32, kernel_size=kernel_size, activation='relu', name='1d-cnn_2')(x)
x = layers.MaxPooling1D(pool_size=2, name='maxpool_2')(x)
x = layers.GlobalMaxPooling1D(name='globalmaxpool')(x)
x = layers.Dense(64, activation='relu', name='dense_1')(x)
outputs = layers.Dense(1, activation="sigmoid", name="output")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name="Two_Layer_1D-CNN_TF-TV")



---- 
**8. Compile the model**

In [17]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=["accuracy"])

# Display the model summary
model.summary()

---
**9. Train the model**

In [18]:
# Clear any existing models in memory
tf.keras.backend.clear_session()

# Define the callbacks for the model training
callbacks = [
    tf.keras.callbacks.ModelCheckpoint("EXP_4_1D-CNN_TF_TV.keras", save_best_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
]

# Train the model
start_time = time.time()
history = model.fit(int_train_ds,
                    validation_data=int_val_ds,
                    epochs=20,
                    callbacks=callbacks)
end_time = time.time()
training_time = end_time - start_time

# Print the training time
print(f"Training Time: {training_time} seconds")

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 411ms/step - accuracy: 0.4922 - loss: 0.6954 - val_accuracy: 0.5500 - val_loss: 0.6926
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 361ms/step - accuracy: 0.9336 - loss: 0.6661 - val_accuracy: 0.5000 - val_loss: 0.6927
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 380ms/step - accuracy: 0.9820 - loss: 0.6309 - val_accuracy: 0.5000 - val_loss: 0.6915
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 436ms/step - accuracy: 1.0000 - loss: 0.5810 - val_accuracy: 0.5000 - val_loss: 0.6909
Training Time: 5.838586091995239 seconds


---
**10. Evaluate the model on the test set**

In [19]:
# Load the best model
model = tf.keras.models.load_model("EXP_4_1D-CNN_TF_TV.keras")

# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(int_test_ds)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Get the training loss, validation loss, training accuracy, and validation accuracy from the history object
training_loss = history.history['loss'][-1] # the -1 index gets the last epoch
validation_loss = history.history['val_loss'][-1]
training_accuracy = history.history['accuracy'][-1]
validation_accuracy = history.history['val_accuracy'][-1]

"""
1.2.2 Extract the training history and add all evaluation metrics into a history DataFrame
"""
# Extract the training history into a pandas DataFrame
history_df = pd.DataFrame({
    'EXP': [4],
    'Model': ['1D-CNN w/ TextVectorization Embeddings'],
    'Training Loss': [training_loss],
    'Training Accuracy': [training_accuracy],
    'Validation Loss': [validation_loss],
    'Validation Accuracy': [validation_accuracy],
    'Test Loss': [test_loss],
    'Test Accuracy': [test_accuracy],
    'Training Time': [training_time]
})

# Inspect the history DataFrame
history_df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step - accuracy: 0.5000 - loss: 0.6889
Test Loss: 0.6889338493347168, Test Accuracy: 0.5


'\n1.2.2 Extract the training history and add all evaluation metrics into a history DataFrame\n'

Unnamed: 0,EXP,Model,Training Loss,Training Accuracy,Validation Loss,Validation Accuracy,Test Loss,Test Accuracy,Training Time
0,4,1D-CNN w/ TextVectorization Embeddings,0.570403,1.0,0.690904,0.5,0.688934,0.5,5.838586
