In [None]:
# External libraries
import pandas as pd
import inflect
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import (
    Dense,
    Input,
    Dropout,
    Flatten,
    LSTM,
    Bidirectional,
    Embedding
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# The project's modules
from data_collection.util_collect import data_collection
from data_management.util_cleaning import data_cleaning
from data_management.util_data_prep_for_modelling import (
    dataframe_summary,
    plot_review_counts,
    process_ratings,
    undersample_majority,
    add_word_count,
    plot_and_save_avg_words_vs_rating,
    text_Preprocessing,
    add_word_count_after_preprocessing
)
from nlp_model_functions.util_lstm_glove import (
    separate_features_and_target,
    tokenize_reviews,
    pad_text_sequences,
    download_glove_embeddings,
    load_glove_embeddings,
    create_embedding_matrix,
    evaluate_model_performance,
    lstm_plot_confusion_matrix,
    calculate_predicted_and_actual,
    plot_loss,
    make_rounded_predictions
)
from nlp_model_functions.util_BERT import (
    split_data,
    tokenize_texts,
    get_dataset,
    get_model,
    compile_and_train,
    evaluate_metrics,
    predict_with_bert,
    plot_confusion_matrix,
    plot_and_save_loss
)

# Special utility imports (if needed in a non-Colab environment, you might not need these)
# from google.colab import files

# Download necessary NLTK data
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Set global settings
%matplotlib inline
p = inflect.engine()
STOPWORDS = set(stopwords.words('english'))

## data_collection Function

**Purpose:**  
This function initiates the process of scraping Amazon product reviews by tying all the smaller functions together.

### Overview:

1. Initializes the Selenium driver and navigates to Amazon's website.
2. Performs a search on Amazon using the provided keyword.
3. Collects product links based on the search results up to the specified number of pages.
4. Scrapes reviews from the collected product links.
5. Exports the scraped reviews to a specified CSV file.

In [None]:
# You can call this function with the search term, maximum pages, and filename.
# Note that data_collection takes a long time to collect Amazon product reviews,
# so the call is left commented out here.
#data_collection("home electronics", max_pages=2)

## data_cleaning Function

**Purpose:**  
Performs end-to-end data cleaning and preparation on multiple CSV files from a specified directory.

### Overview:

1. Merges multiple CSV files from a directory into a single DataFrame.
2. Processes the merged DataFrame (reordering columns, shuffling rows).
3. Removes duplicates and handles missing values.
4. Refines ratings (extraction and filtering).
5. Cleans the review texts.
6. Filters out anomalies in reviews.
7. Separates English and non-English reviews.
8. Translates non-English reviews to English. (NOTE: this step takes a long time to run.)
9. Saves the cleaned DataFrame to a specified CSV file.

In [None]:
# Destination of the cleaned, merged CSV file.
output_file = 'data/cleaned_data/test_last_cleaned_output.csv'

# Directory that holds the original, collected raw data.
input_directory = 'data/data_sample'

# Run the end-to-end cleaning step: data_cleaning receives the raw-data
# directory and the path the cleaned result should be written to.
data_cleaning(input_directory, output_file)

## Data Processing Workflow for the NLP models

### 1. Load and Review Data:
Load the cleaned data from a CSV file and review its structure and content.
### 2. Initial Data Summary:
Generate a summary of the dataframe.
### 3. Plotting Review Counts:
Plot the distribution of review counts across different ratings, showcasing potential class imbalance.
### 4. Process Ratings:
Transform ratings of 4 and 5 into 1 (positive) and ratings of 1 and 2 into 0 (negative) in the ratings column.
### 5. Balance Dataset:
Address the class imbalance issue by undersampling majority classes.
### 6. Re-plot Review Counts:
Showcase the new distribution of review counts after addressing class imbalance.
### 7. Word Count:
Calculate and print the maximum word count across reviews. Also, visualize the average word count vs. rating.
### 8. Text Preprocessing: text_Preprocessing Function
**Purpose:**  
To preprocess a list of text strings for NLP tasks.
#### Steps:
1. **Lowercasing:** Convert each string to lowercase.
2. **Removing emails:** Eliminate email addresses from the strings.
3. **Removing digits:** Get rid of any numerical characters in the strings.
4. **Removing special characters:** Remove characters that are not alphabets, numbers, or whitespaces.
5. **Removing extra spaces:** Trim any leading or trailing spaces.
6. **Removing emojis:** Remove any emojis present in the string.
7. **Stopwords removal:** Eliminate common words (e.g., "and", "the", "is") that typically don't contain significant meaning.
8. **Lemmatization:** Convert words to their base or dictionary form (e.g., "running" -> "run").
### 9. Word Count Post-Processing:
Calculate word count again after preprocessing.
### 10. Last checkpoint for spaces:
Identify and remove rows where the 'Reviews' column contains only spaces.
### 11. Save Data:
Store the processed DataFrame in a new CSV file, ready for modelling.


In [None]:
# Load the cleaned dataset.
# Forward slashes keep the path portable across operating systems; the previous
# backslash-separated literal only resolved on Windows and relied on "\c" and
# "\l" not being recognised escape sequences.
# NOTE(review): the cleaning cell writes 'test_last_cleaned_output.csv' while
# this cell reads 'last_cleaned_output.csv' — confirm that is intended.
data = pd.read_csv("data/cleaned_data/last_cleaned_output.csv")
df = data

# Display a summary of the dataframe
dataframe_summary(df)

# Plot distribution of review counts before balancing
plot_review_counts(df, save_path='data_management/plots', file_name='imbalanced_ratings.png')

# Process ratings in the dataframe
df = process_ratings(df)

# Balance the dataset by undersampling the majority class
df = undersample_majority(df)

# Plot distribution of review counts after balancing
plot_review_counts(df, save_path='data_management/plots', file_name='balanced_ratings.png')

# Add a column with word counts and get the maximum word count
df, max_count = add_word_count(df)
print(f"Maximum word count in the DataFrame: {max_count}")

# Plot average number of words vs. ratings
plot_and_save_avg_words_vs_rating(df, save_path='data_management/plots', filename='average_number_words.png')

# Preprocess the 'Reviews' column for textual data
df['Reviews'] = text_Preprocessing(df['Reviews'])

# Add a column with word counts post-text preprocessing
df = add_word_count_after_preprocessing(df, 'Reviews')

# Convert the 'Reviews' column to string type so the .str accessor below is safe
df['Reviews'] = df['Reviews'].astype(str)

# Identify rows where the 'Reviews' column is empty or only spaces
blank_reviews = df['Reviews'].str.strip().str.len() == 0
rows_with_spaces = df[blank_reviews]

# Remove those rows. A boolean mask is used instead of drop(index) so that
# repeated index labels cannot cause additional rows to be removed.
df = df[~blank_reviews]

# Preview the processed data (a bare `df.head()` in the middle of a cell
# displays nothing, so print it explicitly)
print(df.head())

# Save the processed DataFrame to a new CSV file
df.to_csv('data/data_for_modelling/ready_data_for_modelling.csv', index=False)

In [None]:
# Reload the modelling-ready dataset written by the preprocessing cell
# (presumably so modelling can start here without re-running preprocessing).
df = pd.read_csv("data/data_for_modelling/ready_data_for_modelling.csv")

## Splitting the Data

Once our data is ready, the next step is to split it into training and testing sets. This allows us to train our models on one subset and validate its performance on another unseen subset.

In [None]:
# Hold out 30% of the rows as the test set. A fixed seed makes the split, and
# therefore every metric computed from it, reproducible across notebook runs.
train, test = train_test_split(df, test_size=0.3, random_state=42)

## Checking the Shape of the Datasets

It's essential to know the size of the training and testing sets after splitting. This ensures we have a sufficient amount of data for both training and validation.

In [None]:
# Show the (rows, columns) shape of the training set, then of the test set,
# one per line
print(train.shape, test.shape, sep="\n")

## The `separate_features_and_target` Function

Let's apply the `separate_features_and_target` function to our dataset.

### Step 1: Separate Data

We are separating our train and test data into feature variables (X_train and X_test) and target variables (y_train and y_test), with 'Rating' being the target column.

### Step 2: Displaying the Separated Data
To ensure the data has been separated correctly, let's display the initial rows of each set.

From the printed output, we can see the top rows of the feature datasets (X_train and X_test) and the corresponding target values (y_train and y_test).

In [None]:
# Split both sets into features and the 'Rating' target
X_train, y_train, X_test, y_test = separate_features_and_target(train, test, 'Rating')

# Preview the first rows of each piece; every section after the first is
# preceded by a blank line
print("X_train:")
print(X_train.head())
for _label, _part in (("y_train", y_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"\n{_label}:")
    print(_part.head())

## Setting Up the Tokenizer and Applying Tokenization

To convert the reviews into tokenized sequences, we need to set up and fit a tokenizer. This will enable the model to understand and process the text data in our reviews.

### Step 1: Initializing the Tokenizer

We initialize a Keras tokenizer object.
The `num_words` argument restricts the tokenizer to only consider the top 10,000 most frequent words in the dataset (matching `Tokenizer(num_words=10000)` in the code cell below).

### Step 2: Fitting the Tokenizer on the Training Data

The fit_on_texts method allows the tokenizer to learn the vocabulary present in our X_train['Reviews'] data.

### Step 3: Tokenizing the Reviews

We use our previously defined tokenize_reviews function to tokenize the "Reviews" column in both the training and testing datasets.
The tokenized sequences are stored in the "text_tokenizer" column in each dataset.
By the end of these steps, our datasets are prepared with tokenized sequences that are ready for modeling.

In [None]:
# Create the Keras tokenizer; num_words=10000 limits how many of the most
# frequent words are used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=10000)
# Learn the vocabulary from the training reviews only (the test set is not used for fitting).
tokenizer.fit_on_texts(X_train['Reviews'])
# Tokenize the reviews of both splits with the fitted tokenizer.
X_train, X_test = tokenize_reviews(X_train, X_test, tokenizer)

## Applying Padding and Truncation to our Data

Once we have defined the function to pad and truncate our sequences, the next step is to apply this transformation to our training and testing datasets. 

### Using the `pad_text_sequences` Function:

**By calling this function**:

X_train and X_test, which contain the tokenized sequences in the 'text_tokenizer' column, are transformed.
The sequences are padded (or truncated) to have a uniform length of 1000 tokens, as set by the default maxlen parameter in our function.
The resulting padded sequences for the training data are stored in X_train_pad.
The resulting padded sequences for the testing data are stored in X_test_pad.
Now, both X_train_pad and X_test_pad are ready to be used as input for neural network models that require fixed-length sequences.

In [None]:
# Pad/truncate the tokenized reviews of both splits to a uniform length
# (the helper's default maxlen is used, since none is passed here).
X_train_pad, X_test_pad = pad_text_sequences(X_train, X_test)

## Downloading GloVe Embeddings

[GloVe (Global Vectors for Word Representation)](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm developed by Stanford for generating word embeddings. These embeddings can be used to improve the performance of NLP models, especially for our text classification task. We will discuss GloVe vectors in detail later, in the model construction section.

### Setting up the GloVe Download:

**URL and Local Storage Definitions**:
   - `GLOVE_URL`: The online URL from Stanford's site where the GloVe embeddings (specifically the Twitter dataset version) can be accessed.
   - `GLOVE_DIR`: The local directory where the downloaded embeddings should be stored.

**File Download**:
We first check if the GloVe embeddings are already downloaded to avoid re-downloading.
If the embeddings are not found, we download them from the given URL and save them in the specified directory.

By following these steps, we ensure efficient retrieval of the GloVe embeddings, which can significantly enhance the quality of text representations in our subsequent models.


In [None]:
# Local directory the embeddings are stored in, and the remote archive
# (Twitter GloVe vectors) they are fetched from
GLOVE_DIR = 'data/glove'
GLOVE_URL = 'https://nlp.stanford.edu/data/glove.twitter.27B.zip'

# Download the embeddings via the project helper
download_glove_embeddings(GLOVE_URL, GLOVE_DIR)

## Loading and Inspecting the GloVe Embeddings

We set `glove_zip_path` to the location where our GloVe ZIP file is stored. We then call our previously defined `load_glove_embeddings` function to load the word vectors. Furthermore, we print the total number of unique words present in the loaded GloVe embeddings.

In [None]:
# Path of the GloVe archive (inside the data/glove directory used for the download)
glove_zip_path = "data/glove/glove.twitter.27B.zip"
# Load the word vectors and the collection of words they cover
glove, glove_words = load_glove_embeddings(glove_zip_path)
# Displaying some information about the loaded embeddings
print(f"Number of words in GloVe: {len(glove_words)}")

## Creating the Embedding Matrix

To leverage the power of pre-trained GloVe embeddings for our model, we need an embedding matrix. This matrix serves as a bridge between our dataset's vocabulary and GloVe's pre-trained vectors.

### Function: create_embedding_matrix

`create_embedding_matrix` takes the fitted tokenizer, the GloVe vocabulary, and the GloVe vectors. The dimension of the GloVe word vectors has a default value of 100.
The function returns a NumPy array that represents the embedding matrix suitable for use in a Keras Embedding layer.

#### How our embedding matrix works:

**Determine Vocabulary Size**: The vocabulary size is obtained from the tokenizer's word index.
**Initialize an Embedding Matrix**: A matrix of zeros with a shape (max_vocabulary+1, embedding_dim). The extra "+1" accounts for out-of-vocabulary words or padding tokens.
**Populate the Matrix**: To fill in the embedding matrix we iterate over each word in the tokenizer's word index. For each word, we check if a pre-trained GloVe vector exists. If it exists, the corresponding row in the embedding matrix is populated with the GloVe vector. 

By the end of this process, we'll have an embedding matrix where each row corresponds to the vector representation of a word in our dataset's vocabulary. This matrix can be directly used in our neural network's embedding layer.

In [None]:
# Build the embedding matrix from the tokenizer's vocabulary and the GloVe vectors
embedding_matrix = create_embedding_matrix(tokenizer, glove_words, glove)
# Display the shape of the created embedding matrix
print(f"Shape of embedding matrix: {embedding_matrix.shape}")

### Variables Assignment

In [None]:
# From here on the model code uses the padded sequences as features
# (X_train / X_test are rebound) and the targets under capitalised names
X_train, X_test = X_train_pad, X_test_pad
Y_train, Y_test = y_train, y_test

The model is for text classification based on Amazon reviews. It uses pre-trained word embeddings combined with LSTM layers.

### Function definition:
```python
def LSTM_Glove_model(embedding_matrix, max_vocabulary):
```

- The function is named `LSTM_Glove_model`.
- It takes two parameters:
  - `embedding_matrix`: This is a pre-trained word embedding matrix.
  - `max_vocabulary`: This is the maximum number of unique words in the vocabulary.

### Input layer:
```python
review = Input(shape=(1000,), name='review_input')
```

- This specifies the input to the model. Each review is represented as a sequence of 1000 tokens. These tokens will typically be integer values, each corresponding to a word in a vocabulary.

### Embedding layer:
```python
X = Embedding(output_dim=100,
              input_dim=max_vocabulary + 1,
              input_length=1000,
              weights=[embedding_matrix],
              trainable=False)(review)
```

- The layer converts integer token values into dense vectors of fixed size.
- `output_dim=100`: Each word (token) is represented as a 100-dimensional vector.
- `input_dim=max_vocabulary + 1`: The size of the vocabulary. We add 1 to account for the padding token.
- `input_length=1000`: This sets the sequence length.
- `weights=[embedding_matrix]`: This initializes the embedding layer with the pre-trained word vectors provided by `embedding_matrix`.
- `trainable=False`: This means the word embeddings will not be updated during training.

### Bidirectional LSTM layer:
```python
lstm_review = Bidirectional(LSTM(100))(X)
```

- This layer uses LSTM (Long Short Term Memory) units, which are a type of Recurrent Neural Network (RNN) that can remember past information.
- `Bidirectional` means that the LSTM will process the sequence from both directions (start-to-end and end-to-start). This often helps in capturing patterns effectively.
- It uses 100 LSTM units.

### Dropout layer:
```python
model = Dropout(0.5)(lstm_review)
```

- This layer randomly sets half (50%) of the input units to 0 at each update during training time. This helps in preventing overfitting.

### Flatten layer:
```python
model = Flatten()(model)
```

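- Because the LSTM is created without `return_sequences=True`, the bidirectional layer already outputs a 2-D tensor of shape (batch, 200): one 100-dimensional state per direction, concatenated. `Flatten` therefore leaves the shape unchanged here; it simply guarantees a 2-D input for the Dense layers that follow.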

### Dense layers:
```python
model = Dense(64, activation="relu", kernel_initializer="he_normal", kernel_regularizer=l2(0.001))(model)
model = Dense(8, activation="relu", kernel_initializer="he_normal", kernel_regularizer=l2(0.001))(model)
```

- These are fully connected neural network layers.
- The first Dense layer has 64 units and the second has 8 units.
- They both use the ReLU activation function.
- The `kernel_initializer="he_normal"` means that the initial weights of these layers are drawn from a He-normal distribution, which is commonly used with ReLU activations.
- `kernel_regularizer=l2(0.001)`: This applies L2 regularization to the weights of the neurons in these layers, helping to prevent overfitting.

### Output layer:
```python
output = Dense(1, activation='sigmoid', name='output')(model)
```

- This is the final layer, and it's designed for binary classification. It has a single neuron with a sigmoid activation function, which will output values between 0 and 1.

### Model Compilation:
```python
model = Model(inputs=[review], outputs=[output])
```

- This line consolidates all the layers we defined into a Keras Model.

### Summary:
```python
print(model.summary())
```

- This prints a summary of the model's architecture.

### Return:
Finally, the function returns the constructed model.

In [None]:
def LSTM_Glove_model(embedding_matrix, max_vocabulary, max_len=1000):
    """
    Build the GloVe + bidirectional-LSTM Keras model for binary review classification.

    The returned model is NOT compiled; call ``model.compile`` before training.

    Parameters:
        embedding_matrix (ndarray): Pre-trained embedding weights with one row per
            vocabulary index, i.e. shape (max_vocabulary + 1, embedding_dim).
        max_vocabulary (int): Maximum number of unique words in the vocabulary.
        max_len (int): Number of tokens in each padded input sequence.
            Defaults to 1000, the sequence length used in this notebook.

    Returns:
        model (Model): The Keras Model object.
    """

    # Take the vector size from the matrix itself instead of hard-coding 100,
    # so the layer always agrees with the weights it is initialised with.
    embedding_dim = embedding_matrix.shape[1]

    # Define the shape of the input for the review (max_len tokens)
    review = Input(shape=(max_len,), name='review_input')

    # Embedding layer
    # Uses pre-trained weights (embedding_matrix) and sets it to non-trainable.
    # The "+ 1" on input_dim matches the extra row of the embedding matrix.
    X = Embedding(output_dim=embedding_dim,
                  input_dim=max_vocabulary + 1,
                  input_length=max_len,
                  weights=[embedding_matrix],
                  trainable=False)(review)

    # LSTM layer
    # Uses Bidirectional LSTM with 100 units per direction
    lstm_review = Bidirectional(LSTM(100))(X)

    # Dropout layer to reduce overfitting
    x = Dropout(0.5)(lstm_review)

    # Flatten layer before the dense layers. The LSTM does not return
    # sequences, so its output is already 2-D and this does not change the
    # shape; it is kept so the architecture stays exactly as before.
    x = Flatten()(x)

    # First Dense layer with 64 units, ReLU activation, and L2 regularization
    x = Dense(64, activation="relu", kernel_initializer="he_normal", kernel_regularizer=l2(0.001))(x)

    # Second Dense layer with 8 units, ReLU activation, and L2 regularization
    x = Dense(8, activation="relu", kernel_initializer="he_normal", kernel_regularizer=l2(0.001))(x)

    # Output layer with 1 unit (binary classification) and Sigmoid activation
    output = Dense(1, activation='sigmoid', name='output')(x)

    # Assemble the layers into a Keras Model (compilation happens at the call site)
    model = Model(inputs=[review], outputs=[output])

    # Print a summary of the model's architecture. summary() prints by itself
    # and returns None, so wrapping it in print() would emit a stray "None".
    model.summary()

    return model

## Initializing the Sentiment Analysis Model

After defining our model architecture in the `LSTM_Glove_model` function, it's now time to initialize it using our data's vocabulary and the pre-trained embedding matrix.

### Setting Vocabulary Size:
We first determine the vocabulary size based on our dataset:
```python
max_vocabulary = len(tokenizer.word_index)
```
`tokenizer.word_index` is the word-to-index dictionary learned by the tokenizer, so its length gives us the number of unique words present in the training reviews.

### Model Initialization:
With the vocabulary size and pre-trained embedding matrix in hand, we can proceed to initialize our model:
```python
model = LSTM_Glove_model(embedding_matrix, max_vocabulary)
```
At this point, the model will be ready, and a summary of its architecture will be printed, providing an overview of layers, output shapes, and the number of parameters.

In [None]:
# Vocabulary size = number of entries in the tokenizer's word index
max_vocabulary = len(tokenizer.word_index)
# Build the model (the function also prints its architecture summary)
model = LSTM_Glove_model(embedding_matrix, max_vocabulary)

## Visualizing the Model Architecture

To better understand and showcase the architecture of our model, visualizing its structure can be quite beneficial. This can also serve as documentation for future reference or for presenting the model's design.

In [None]:
# Plot the model architecture (with tensor shapes and layer names) and save it as a PNG
plot_model(model, to_file='model_results/LSTM_Glove_model_architecture_plot.png', show_shapes=True, show_layer_names=True)


## Model Compilation and Training

Before training our sentiment analysis model on the Amazon reviews, we need to specify certain parameters like the optimizer, loss function, and metrics. After the model is compiled, we proceed to train it using our training dataset.

### Model Compilation:

- **Optimizer:** `adam` 
  - The Adam optimizer is a popular choice in deep learning due to its efficient handling of sparse gradients and adaptive learning rates.
  
- **Loss:** `binary_crossentropy`
  - Given that we're tackling a binary classification problem (positive or negative sentiment), the binary crossentropy loss function is suitable.
  
- **Metrics:** `accuracy`
  - Accuracy metric provides a straightforward understanding of how well the model is performing by calculating the proportion of correctly classified reviews.

```python
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
```

### Model Training:

The actual learning happens during this phase:

- **Training Data:** `(X_train, Y_train)`
  
- **Batch Size:** `64`
  - This indicates the number of training examples used in each iteration to update the model's weights. 
  
- **Epochs:** `20`
  - The model will go through the entire training dataset up to 20 times; the `EarlyStopping` callback used in the code cell below can end training sooner.
  
- **Validation Data:** `(X_test, Y_test)`
  - This provides a way to monitor the model's performance on unseen data after each epoch.
  
- **Verbose:** `1`
  - Ensures that the training process displays logs for each epoch.

```python
history = model.fit(X_train, Y_train, batch_size = 64, epochs = 20, verbose = 1, validation_data = (X_test, Y_test))
```

By the end of this training phase, the model will have adjusted its weights based on the provided training data. The returned `history` object will contain details about the training and validation accuracy and loss across epochs, which can be useful for plotting and analysis.

In [None]:
# Configure the optimiser, loss and metric before training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 consecutive epochs and restore
# the best weights seen.
# NOTE(review): the test split doubles as the validation data here, so early
# stopping is tuned on the same rows that the reported metrics are computed
# on — consider carving out a separate validation split.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train for at most 20 epochs in batches of 64, logging every epoch
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=20,
    batch_size=64,
    verbose=1,
    callbacks=[early_stopping],
)

## Saving the Model

After training, it's essential to save the trained model to reuse it later without having to retrain. Saving ensures that both the architecture and the learned weights are preserved.

In [None]:
# Persist the trained model to an HDF5 file so it can be reloaded without retraining
model.save("model_results/LSTM_Glove_model.h5")

## Loading the Model

In [None]:
# Reload the saved LSTM/GloVe model from disk; `model` is rebound to the loaded copy
path = "model_results/LSTM_Glove_model.h5"
model = load_model(path, custom_objects={'Bidirectional': Bidirectional})

## Model Predictions and Evaluation

Once the model has been trained, it's essential to use it to make predictions on unseen data and then evaluate how well it performed.

### Making Predictions:

Using the `predict` method of the trained model, we generate predicted values for the test set:

```python
preds = model.predict(X_test)
```

### Evaluation Results:

After obtaining the predictions, we pass them to our `evaluate_model_performance` function, along with the true labels (`Y_test`), to get a suite of performance metrics:

- **Accuracy Score:** \(0.9030\)
  - Indicates that the model correctly predicted the sentiment of about 90.30% of the reviews in the test set.
  
- **F1 Score:** \(0.9018\)
  - The harmonic mean of Precision and Recall, suggesting a good balance between false positives and false negatives.
  
- **ROC AUC Score:** \(0.9628\)
  - Signifies that the model has a high capability to distinguish between positive and negative sentiments.
  
- **Precision:** \(0.9095\)
  - Out of all the reviews predicted as positive, about 90.95% were actually positive.
  
- **Sensitivity:** \(0.8943\)
  - Out of all the actual positive reviews, about 89.43% were correctly identified by the model.
  
- **Specificity:** \(0.9116\)
  - Out of all the actual negative reviews, about 91.16% were correctly identified by the model.
  
- **Cross-Entropy Loss:** \(0.3042\)
  - A lower value is better, and it indicates how well the probability predictions of the model align with the true labels.

The provided metrics give a holistic view of the model's performance on the test data.

In [None]:
# Raw model outputs for the test set (sigmoid output layer, so values lie between 0 and 1)
preds = model.predict(X_test)
# Report the performance metrics from the true labels and the predictions
evaluate_model_performance(Y_test, preds)

## Plotting the Confusion Matrix

With the `lstm_plot_confusion_matrix` function defined, we can now visualize the confusion matrix for the model predictions on our test set. This will give us a clearer understanding of where our model might be making mistakes.

### Expected Output:

The heatmap will show the actual counts of:
- **True Positives (TP):** Reviews that were actually positive and were also predicted as positive.
- **True Negatives (TN):** Reviews that were actually negative and were also predicted as negative.
- **False Positives (FP):** Reviews that were actually negative but were predicted as positive.
- **False Negatives (FN):** Reviews that were actually positive but were predicted as negative.

By examining this matrix, we can gather insights into the type and frequency of errors made by our model.

In [None]:
# Turn the model outputs for the test set into rounded class predictions
predictions = make_rounded_predictions(model, X_test)
# Plot the confusion matrix for those predictions and save it as a PNG
lstm_plot_confusion_matrix(predictions, Y_test, 'model_results/LSTM_Glove_model_confusion_matrix.png')

## Predicted vs. Actual Labels: Detailed Breakdown

Following the execution of the `calculate_predicted_and_actual` function, here is the breakdown of our model's predictions compared to the actual numbers:

### Predictions:
- **Predicted 'Yes' (Positive Reviews):** 12,360
  - This indicates the total number of reviews the model predicted to be positive.
  
- **Predicted 'No' (Negative Reviews):** 12,402
  - This represents the total number of reviews the model predicted to be negative.

### Actual Data:
- **Actual Positive Reviews:** 12,132
  - The test dataset had this many actual positive reviews.
  
- **Actual Negative Reviews:** 12,630
  - The test dataset had this many actual negative reviews.

### Insights:

- The model predicted a slightly higher number of negative reviews (12,402) than positive ones (12,360). This is fairly balanced.
  
- Compared to the actual distribution in the test set, the model slightly overestimated the number of positive reviews and underestimated the number of negative reviews.

- The difference in actual vs. predicted values for both positive and negative reviews is not drastic, indicating a relatively good performance of the model in terms of class distribution.

In [None]:
# NOTE(review): scikit-learn's signature is confusion_matrix(y_true, y_pred).
# Here the predictions are passed first, so the matrix is transposed relative
# to the usual layout (rows follow the predictions, columns the true labels).
# Confirm that calculate_predicted_and_actual expects this orientation —
# otherwise the "predicted" and "actual" totals it reports are swapped.
cm = confusion_matrix(predictions, Y_test)
calculate_predicted_and_actual(cm)

## Interpretation of the Training and Validation Loss Plot:
**Function Description:**  
The `plot_loss` function is used to visualize the evolution of loss values during the training process. It plots both the training loss and the validation loss against the number of epochs, allowing us to observe how the model's performance changes over time.

**1. Convergence Point at Epoch 5:** 
The fact that the training and validation loss lines touched at epoch 5 indicates that up to this point, the model was generalizing well, and the losses were converging.

**2. Signs of Overfitting:** 
Post epoch 5, the continuous decrease in training loss accompanied by a stagnant validation loss is a typical sign of overfitting. This means that while the model is getting better and better at fitting the training data (hence the decreasing training loss), its performance on unseen data (validation set) is not improving, implying that the model might be memorizing the training data.

**3. Early Stopping is Beneficial:** 
The early stopping callback stopping the training at epoch 8 was apt. This mechanism prevents the model from overfitting further by halting the training when the validation performance fails to improve.

In [None]:
# Plot the training and validation loss per epoch from the `history` object
# returned by model.fit, and save the figure as a PNG
plot_loss(history, 'model_results/LSTM_Glove_loss_plot.png')

# BERT MODEL

## Sampling the Dataset

Our Amazon reviews dataset contains over 82,000 entries. To make it possible to train the model with our available computing power, we down-sample the dataset to 10,000 reviews.

By using this sampled dataset, we can reduce the computational cost and time required for training and testing our model. However, it's essential to remember that downsampling might lead to loss of information, and the model might not perform as well as if trained on the complete dataset.

In [None]:
# Reload the modelling-ready dataset for the BERT section; `df` is rebound to a fresh copy
df = pd.read_csv("data/data_for_modelling/ready_data_for_modelling.csv")

In [None]:
# We run only 10,000 out of over 82,000 because of the limited computing power.
# The sample size is capped at the number of available rows so the cell also
# works on smaller datasets, and the seed is fixed so the same subset (and
# therefore comparable results) is drawn on every run.
df = df.sample(n=min(10000, len(df)), random_state=42)

## Splitting Data for Amazon Sentiment Analysis

When conducting sentiment analysis on Amazon reviews, it's imperative to partition the data into separate training and test sets. The training set is utilized to train the model, while the test set serves to evaluate its performance.

In [None]:
# Split the sampled DataFrame into training/test texts and their labels
# using the project's split_data helper.
train_texts, test_texts, train_labels, test_labels = split_data(df)

## Tokenization with BERT for Amazon Sentiment Analysis

For deep learning models, especially the BERT model, raw text data needs to be converted into a format that the model can understand. This process is known as tokenization. BERT requires a specialized tokenizer because it tokenizes text in a way that's compatible with its pre-trained vocabulary.

### Code Explanation

```python
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
```

- **`BertTokenizer.from_pretrained('bert-base-uncased')`**: This initializes the tokenizer based on the BERT base model that's been trained on uncased (or lowercase) text. "Uncased" means that the pre-training was done on lowercase text, and the model does not differentiate between uppercase and lowercase letters.

  - `'bert-base-uncased'` is the identifier for the base version of the BERT model, trained on English text in lowercase. It's one of the most widely used versions of BERT due to its balance between size (smaller and faster) and performance.

- **`do_lower_case=True`**: This argument ensures that all tokens produced by the tokenizer are in lowercase, aligning with the 'uncased' nature of our chosen BERT model.

With the tokenizer loaded, we can now process the Amazon reviews to transform them into tokenized sequences, making them ready for training and evaluation in the BERT model for sentiment analysis.

In [None]:
# Load the uncased BERT tokenizer, converting all tokens to lowercase.
# NOTE: this rebinds `tokenizer`, which the LSTM section uses for the Keras
# Tokenizer; re-run the Keras tokenizer cell before re-running any LSTM cell
# that reads `tokenizer` (embedding matrix, vocabulary size).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

## Tokenizing Amazon Reviews for Sentiment Analysis with BERT

To prepare the Amazon reviews for sentiment analysis with the BERT model, we need to tokenize the text. BERT tokenization is a bit more involved than typical tokenization, and it involves creating "input IDs" and "attention masks."

**Purpose**: The `tokenize_texts` function processes raw review texts into a format suitable for the BERT model.

After executing these lines of code, the training and test Amazon reviews are transformed into tokenized sequences and attention masks. These can be fed into the BERT model for sentiment analysis training and evaluation.

In [None]:
# Tokenize the training and test texts into input IDs and attention masks.
# NOTE(review): no tokenizer is passed in, so tokenize_texts presumably
# creates or loads its own BERT tokenizer — confirm in util_BERT.
train_input_ids, train_attention_masks = tokenize_texts(train_texts)
test_input_ids, test_attention_masks = tokenize_texts(test_texts)

## Creating TensorFlow Datasets for Model Training and Evaluation

### Code Explanation

```python
train_data = get_dataset(train_input_ids, train_attention_masks, train_labels).shuffle(100).batch(64).repeat(2)
```

- **`get_dataset(...)`**: Using the previously defined `get_dataset` function, we convert our tokenized training reviews and their labels into a TensorFlow dataset.

- **`.shuffle(100)`**: Shuffling helps prevent patterns during training and makes the model more robust. `100` is the size of the shuffle buffer, which determines how many samples from the dataset should be loaded and then a random sample will be chosen from those to ensure randomness. A larger buffer size will have better shuffling at the expense of memory.

- **`.batch(64)`**: Batching groups multiple samples together to process in parallel, speeding up training. Here, we've chosen a batch size of 64, meaning the model will process 64 reviews simultaneously in each step. In our case, we cannot choose higher batch size since we do not have enough computing power.

- **`.repeat(2)`**: The dataset will be repeated twice. This is useful for training over multiple epochs. Essentially, our model will see each review twice during training.

```python
test_data = get_dataset(test_input_ids, test_attention_masks, test_labels).batch(64)
```

- Here, we're doing the same thing as before but for our test data. We only need to batch the test data without the need for shuffling or repeating. This dataset will be used for evaluating the model's performance.

In [None]:
# Training pipeline: build the dataset from the tokenized texts and labels,
# shuffle with a buffer of 100, group into batches of 64, and repeat twice
train_data = (
    get_dataset(train_input_ids, train_attention_masks, train_labels)
    .shuffle(100)
    .batch(64)
    .repeat(2)
)

# Evaluation pipeline: same construction, batched only
test_data = (
    get_dataset(test_input_ids, test_attention_masks, test_labels)
    .batch(64)
)

## Constructing a Customized BERT Model for Amazon Sentiment Analysis

### Here, we try to dive as deep as possible!

BERT (Bidirectional Encoder Representations from Transformers) is a state-of-the-art model for a variety of natural language processing tasks. In this section, we're customizing BERT for sentiment analysis on Amazon reviews, introducing regularization and dropout to enhance performance and prevent overfitting.

### Function Explanation: `get_model`

```python
def get_model():
```

**Purpose**: This function initializes the BERT model for sequence classification, freezes certain layers for fine-tuning, and introduces a modified classifier with dropout and regularization.

**Returns**:
- **`model (TFBertForSequenceClassification)`**: The enhanced BERT model ready for sequence classification tasks, in our case, sentiment analysis.

Inside the function:

```python
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
```

- **`TFBertForSequenceClassification.from_pretrained`**: This loads the pre-trained BERT model tailored for sequence classification tasks. The "bert-base-uncased" version indicates that the model uses the base-sized BERT with all tokens converted to lowercase.

```python
    for layer in model.bert.encoder.layer[:9]:
        layer.trainable = False
```

- Here, we're freezing (making non-trainable) the first 9 layers of the BERT model. This is a form of transfer learning. Instead of retraining the entire model from scratch, we'll leverage the existing knowledge BERT has from pre-training and only fine-tune the later layers on our specific Amazon review dataset.

```python
    classifier_input = tf.keras.Input(...)
    x = tf.keras.layers.Dropout(0.5)(classifier_input)
    classifier_output = tf.keras.layers.Dense(...)(x)
    regularized_classifier_model = tf.keras.Model(inputs=classifier_input, outputs=classifier_output)
```

- We're creating a new classifier to replace the default one in BERT. Key modifications include:
  - **`Dropout`**: Adds a dropout layer with a rate of 0.5, meaning approximately half of the inputs will be randomly set to zero during training. Dropout is a regularization technique to reduce the risk of overfitting. While building the model we ran into overfitting several times, so we kept increasing the dropout rate until we reached 0.5.
  - **`Dense` layer with L2 regularization**: Introduces a penalty on the layer's weights. Regularization can prevent the model from fitting too closely to the training data, making it generalize better to unseen data.

```python
    model.classifier = regularized_classifier_model
```

- This line replaces BERT's original classifier with our newly defined regularized classifier.

In conclusion, the `get_model` function provides an enhanced version of the BERT model for sentiment analysis on Amazon reviews. By freezing certain layers, introducing dropout, and adding regularization, we're better equipping the model to generalize to new, unseen reviews while leveraging the powerful features of BERT.

In [None]:
def get_model():
    """
    Build the BERT sequence classifier used for fine-tuning.

    Loads the pre-trained "bert-base-uncased" checkpoint, marks the first 9
    encoder layers as non-trainable, and swaps the model's `classifier`
    attribute for a small Keras model made of Dropout(0.5) followed by an
    L2-regularized Dense layer.

    Returns:
    - model (TFBertForSequenceClassification): The BERT model whose
      `classifier` attribute is the regularized head built here.
    """

    bert = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

    # Freeze encoder layers 0-8; no other layer is touched here.
    for frozen_block in bert.bert.encoder.layer[:9]:
        frozen_block.trainable = False

    cfg = bert.config

    # Head input: one float32 vector of length `hidden_size` per example.
    head_in = tf.keras.Input(shape=(cfg.hidden_size,), dtype=tf.float32, name="inputs")

    # Dropout first, then a Dense projection with `num_labels` units whose
    # kernel carries an L2 penalty of 0.01.
    dropout = tf.keras.layers.Dropout(0.5)
    projection = tf.keras.layers.Dense(
        cfg.num_labels,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=cfg.initializer_range),
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
    head_out = projection(dropout(head_in))

    # Wrap the head as its own Keras model and install it on the BERT model.
    bert.classifier = tf.keras.Model(inputs=head_in, outputs=head_out)

    return bert

## Initializing the Customized BERT Model for Amazon Sentiment Analysis

In [None]:
# Instantiate the modified BERT model via get_model() and bind it to the
# notebook-level name `model`, which the training, evaluation and saving
# cells below use.
model = get_model()

## Fine-Tuning the BERT Model for Amazon Sentiment Analysis

In this phase, we will compile and train our custom BERT model on the Amazon reviews dataset. Training involves adjusting the model's weights based on our data so that it can accurately perform sentiment analysis on Amazon reviews.

### Function Explanation: `compile_and_train`

```python
def compile_and_train(model, train_data, test_data):
```

**Purpose**: To compile, train, and evaluate the BERT model using the provided training and test datasets.

Inside the function:
```python
initial_learning_rate = 3e-6
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=10000,
        decay_rate=0.9,
        staircase=True)
```
- **Learning Rate Scheduling**: The learning rate determines how drastically the model's weights are adjusted during training. Starting with an initial rate of `3e-6`, the learning rate is scheduled to decrease (decay) over time. Specifically, every 10,000 steps, the learning rate is multiplied by a factor of 0.9. This decay helps stabilize training as the model approaches convergence.
```python
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
```
- **Adam Optimizer**: Adam is a popular optimization algorithm that combines the best properties of the AdaGrad and RMSprop algorithms. It's widely used for deep learning models. Here, it's configured with our learning rate schedule.
```python
model.compile(...)
```
- **Compiling the Model**: This step is about configuring the model for training. The following are set:
  - **Loss Function**: `SparseCategoricalCrossentropy(from_logits=True)`. Given our task is classification, this loss function computes the crossentropy loss between the labels and the predictions. The `from_logits=True` flag indicates that the model's outputs are not probabilities.
  - **Metrics**: Accuracy, to determine the proportion of correctly predicted classifications.
```python
early_stopping = EarlyStopping(...)
```
- **Early Stopping**: It's a regularization method that stops the training process once the model's performance starts degrading on a held-out validation dataset. This mechanism helps in preventing overfitting and saves computational resources. Here, we're monitoring the validation loss (`val_loss`), and if it doesn't improve for 2 consecutive epochs (`patience=2`), training will stop and revert back to the best weights.
```python
history = model.fit(train_data, epochs=20, verbose=1, validation_data=test_data, callbacks=[early_stopping])
```
- **Training the Model**: The BERT model is trained using the `fit` method. We use the `train_data` for training and `test_data` for validation. If the validation performance doesn't improve for two consecutive epochs, thanks to our early stopping callback, training will halt.

In summary, the `compile_and_train` function sets the stage for fine-tuning our BERT model on Amazon reviews for sentiment analysis. By employing learning rate decay, the Adam optimizer, and early stopping, we ensure that the model learns effectively without overfitting. Once the model is trained, it's ready to make predictions on new Amazon reviews and classify their sentiments.

In [None]:
def compile_and_train(
    model,
    train_data,
    test_data,
    *,
    epochs=20,
    initial_learning_rate=3e-6,
    decay_steps=10000,
    decay_rate=0.9,
    patience=2,
):
    """
    Compile the BERT model and fine-tune it, validating on `test_data`.

    Parameters:
    - model (TFBertForSequenceClassification): The BERT model for sequence classification.
    - train_data (tf.data.Dataset): The training dataset.
    - test_data (tf.data.Dataset): The dataset passed to `fit` as `validation_data`.
    - epochs (int): Maximum number of training epochs (default 20).
    - initial_learning_rate (float): Starting learning rate for Adam (default 3e-6).
    - decay_steps (int): Optimizer steps between learning-rate decays (default 10000).
    - decay_rate (float): Factor applied to the learning rate at each decay (default 0.9).
    - patience (int): Epochs without `val_loss` improvement before training stops (default 2).

    Returns:
    - model (TFBertForSequenceClassification): The trained BERT model.
    - history: The History object returned by `model.fit`.
    """

    # Staircase exponential decay: the learning rate is multiplied by
    # `decay_rate` once every `decay_steps` optimizer steps.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
        staircase=True)

    # Use the Adam optimizer with learning rate decay
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    # The loss is configured with from_logits=True, i.e. it treats the
    # model's outputs as raw logits rather than probabilities.
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

    # Stop once `val_loss` has not improved for `patience` epochs and roll the
    # weights back to the best epoch.
    # NOTE(review): `test_data` drives this early-stopping decision, so metrics
    # later computed on the same data are not from a fully held-out set; pass a
    # separate validation split here if one is available.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        verbose=1,
        restore_best_weights=True)

    # Train the model using the training data and validate using test data
    history = model.fit(
        train_data,
        epochs=epochs,
        verbose=1,
        validation_data=test_data,
        callbacks=[early_stopping])

    return model, history

## BERT Model Training

Let's deep dive into the results of the training process for our fine-tuned BERT model on the Amazon reviews dataset:

### Training Summary:

1. **Total Epochs**: `14 out of 20`
   - The model training was early-stopped at the 14th epoch, indicating that further training didn't improve the validation loss.

2. **Batch Size**: `64`
   - Each batch contains 64 reviews that the model processes simultaneously.

### Model Performance Over Epochs:

- **Initial Epoch (Epoch 1)**:
  - Training Loss: `0.6373`, Training Accuracy: `64.24%`
  - Validation Loss: `0.4632`, Validation Accuracy: `82.73%`

- **Best Epoch (Epoch 12)**:
  - Training Loss: `0.1774`, Training Accuracy: `93.46%`
  - Validation Loss: `0.2599`, Validation Accuracy: `90.17%`

- **Last Epoch (Epoch 14)**:
  - Training Loss: `0.1602`, Training Accuracy: `94.21%`
  - Validation Loss: `0.2697`, Validation Accuracy: `90.17%`

### Observations:

1. **Learning Progress**: The training accuracy improved significantly from the initial 64.24% in the first epoch to 94.21% in the 14th epoch. The model has learned well from the training data, as evident from the accuracy improvement.

2. **Overfitting and Early Stopping**: The early stopping was triggered at epoch 14, restoring the weights from epoch 12, which had the best validation loss. This ensures that the model doesn't overfit to the training data and can generalize well on unseen data.

3. **Validation Results**: The validation accuracy rose from `82.73%` in the first epoch to its peak of `90.17%` in epoch 12. This shows that our model is effective in sentiment prediction for unseen Amazon reviews.

### Conclusion:

The fine-tuning process of the BERT model has been successful. With an impressive accuracy on the validation set, we can conclude that the model is likely to perform well in real-world scenarios when given new Amazon reviews for sentiment prediction.

In [None]:
# Compile and train the BERT model on `train_data`, using `test_data` for
# validation. Rebinds `model` to the returned model and keeps the returned
# training history in `history` for the loss plot further down.
model, history = compile_and_train(model, train_data, test_data)

## Model Evaluation Results on the test data

After evaluating our fine-tuned BERT model on the test dataset, we've obtained the following results:

### Model Performance Metrics:

**1. Loss**: `0.2599`
   - This value represents the model's error on the test set. In other words, it's a measure of how well the model's predictions match the true ratings in the test data. A lower loss value is generally better, indicating that the model's predictions are closer to the true values.

**2. Accuracy**: `90.17%`
   - This is a measure of how many of the test reviews the model correctly classified. An accuracy of 90.17% means the model correctly predicted the sentiment of approximately 9 out of every 10 reviews in the test set.

### Conclusion:

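The model exhibits a high accuracy on the test set, suggesting that it is effective at performing sentiment analysis on Amazon reviews. An accuracy of over 90% indicates that our fine-tuning process with the BERT model has been successful in capturing the nuances of sentiment in the Amazon reviews dataset. Note that the same test data was also used as the validation set for early stopping during training, so these figures are likely somewhat optimistic compared with performance on fully held-out reviews.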

In [None]:
# Evaluate on the Test Set: run the compiled model over the batched test
# data. As the last expression in the cell, the returned value is shown as
# the cell output.
model.evaluate(test_data)

## Saving the Fine-tuned BERT Model

In [None]:
# Save the trained model with the Hugging Face `save_pretrained` API into the
# directory named by `model_path`.
model_path = "model_results/BERT_model"
model.save_pretrained(model_path)

In [None]:
# Load the saved model back from the directory written by the save cell.
# NOTE(review): `model.classifier` was replaced in get_model() with a custom
# Dropout + Dense head, while `from_pretrained` rebuilds the stock
# architecture — confirm that the fine-tuned head weights are actually
# restored here (check the loading warnings) before relying on `loaded_model`.
# The cells visible below keep using `model`, not `loaded_model`.
model_path = "model_results/BERT_model"
loaded_model = TFBertForSequenceClassification.from_pretrained(model_path)

## Model Evaluation Insights: Sentiment Analysis with BERT on Amazon Reviews

Following the evaluation of our trained BERT model on the test data, let's dive deep into understanding the results.

### **1. Accuracy: 0.9017**

An accuracy of **~90.17%** is quite high, suggesting that the model has learned a significant amount from the training data and can generalize well to unseen samples.

### **2. F1 Score: 0.9015**

An F1 score of **~90.15%** is very good, especially for NLP tasks such as sentiment analysis, where context matters. This suggests that our model has both good precision and recall.

### **3. ROC AUC Score: 0.9666**

ROC AUC (Receiver Operating Characteristic Area Under Curve) measures the area under the ROC curve, which is a plot of the true positive rate versus the false positive rate. An ROC AUC of **~96.66%** indicates a very good ability of the model to distinguish between the positive and negative classes. This score is especially crucial for imbalanced datasets.

### **4. Precision: 0.9012 & Recall: 0.9020**

- **Precision** denotes how many of the positive classifications were actually correct. A precision of **~90.12%** indicates a high rate of relevant results.
  
- **Recall** refers to the percentage of total relevant results correctly classified by the algorithm. A recall of **~90.20%** is commendable, showing the model has a good hit rate.

### **5. Sensitivity: 0.8951 & Specificity: 0.9090**

These are metrics specific to binary classification:
- **Sensitivity (True Positive Rate)** of **~89.51%** suggests the model correctly identifies positive samples a majority of the time.
  
- **Specificity (True Negative Rate)** of **~90.90%** shows the model's effectiveness in correctly identifying negative samples.

### **6. Cross-Entropy Loss: 0.2553**

A lower value is better, suggesting that the predicted probabilities align closely with the actual labels. The value of **0.2553** is relatively low, indicating that the model's predictions are in good alignment with the true labels.

### **Final Thoughts**:

The BERT model's performance on the sentiment analysis task for Amazon reviews is commendable. With scores consistently around the 90% mark across most metrics, it demonstrates the power of the transformer-based BERT model in understanding and classifying textual data such as Amazon reviews.

In [None]:
# Compute the evaluation metrics for the trained model on the test set using
# the project helper evaluate_metrics (imported from
# nlp_model_functions.util_BERT); it receives both the batched test dataset
# and the raw test labels.
evaluate_metrics(model, test_data, test_labels)

## Model Evaluation Insights: Sentiment Analysis with BERT on Amazon Reviews

Following the evaluation of our trained BERT model on the test data, let's dive deep into understanding the results.

### **1. Accuracy: 0.9017**

An accuracy of **~90.17%** is quite high, suggesting that the model has learned a significant amount from the training data and can generalize well to unseen samples.

### **2. F1 Score: 0.9015**

An F1 score of **~90.15%** is very good, especially for NLP tasks such as sentiment analysis, where context matters. This suggests that our model has both good precision and recall.

### **3. ROC AUC Score: 0.9666**

ROC AUC (Receiver Operating Characteristic Area Under Curve) measures the area under the ROC curve, which is a plot of the true positive rate versus the false positive rate. An ROC AUC of **~96.66%** indicates a very good ability of the model to distinguish between the positive and negative classes. This score is especially crucial for imbalanced datasets.

### **4. Precision: 0.9012 & Recall: 0.9020**

- **Precision** denotes how many of the positive classifications were actually correct. A precision of **~90.12%** indicates a high rate of relevant results.
  
- **Recall** refers to the percentage of total relevant results correctly classified by the algorithm. A recall of **~90.20%** is commendable, showing the model has a good hit rate.

### **5. Sensitivity: 0.8951 & Specificity: 0.9090**

These are metrics specific to binary classification:
- **Sensitivity (True Positive Rate)** of **~89.51%** suggests the model correctly identifies positive samples a majority of the time.
  
- **Specificity (True Negative Rate)** of **~90.90%** shows the model's effectiveness in correctly identifying negative samples.

### **6. Cross-Entropy Loss: 0.2553**

A lower value is better, suggesting that the predicted probabilities align closely with the actual labels. The value of **0.2553** is relatively low, indicating that the model's predictions are in good alignment with the true labels.

### **Final Thoughts**:

The BERT model's performance on the sentiment analysis task for Amazon reviews is commendable. With scores consistently around the 90% mark across most metrics, it demonstrates the power of the transformer-based BERT model in understanding and classifying textual data such as Amazon reviews.

In [None]:
# Confusion matrix for the BERT model on the test inputs; the output path for
# the figure is passed as `save_path`.
plot_confusion_matrix(
    model,
    test_input_ids,
    test_attention_masks,
    test_labels,
    save_path='model_results/BERT_model_confusion_matrix.png',
)

## BERT Model Training & Validation Loss Analysis for Amazon Reviews Sentiment

- **Initial Losses:** Training loss started over 0.6 and decreased to below 0.2 by epoch 12. Validation loss began under 0.5 and fell to about 0.26 by epoch 12.
  
- **Loss Convergence:** Around epochs 4 and 5, training and validation losses intersected, which can be a balance point in learning. It's a juncture to monitor for potential overfitting.

Key Takeaway: The consistent reduction in loss indicates BERT's effective learning on the Amazon reviews sentiment analysis task, but monitoring for overfitting is essential.

In [None]:
# Pass the training history returned by compile_and_train to the project
# helper plot_and_save_loss, together with the output path for the loss figure.
plot_and_save_loss(history, 'model_results/BERT_training_validation_loss.png')