# Setting up

In [None]:
!pip install transformers datasets evaluate stopwords flair nltk swifter
!pip install gensim
!pip install keras
!pip install tensorflow
!pip install --upgrade gensim

In [None]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import torch
from torch import nn

import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, GPT2Model
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
import flair
from flair.data import Sentence
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random as rn
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from PIL import Image

from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer

import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

import logging
transformers.logging.set_verbosity_error()

In [None]:
# Full review dataset, read from ./Reviews.csv
df = pd.read_csv('./Reviews.csv')
# Equalized dataset, read from ./samples.csv (used for the second run of each model)
sample_df = pd.read_csv('./samples.csv')

# Baselines

### Constant Predictions

In [None]:
# Features are the raw review texts; targets are the values of the 'Score' column
X = np.array(df['Text'])
y = np.array(df['Score'])

In [None]:
# Hold out 33% of the rows for testing; random_state=42 keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Constant baseline: predict a score of 5 for every test row
constant_y_pred = np.full(len(y_test), 5)
constant_hits = constant_y_pred == y_test
accuracy = np.sum(constant_hits) / len(y_test)
# average=None yields one precision/recall value per class
recall_constant = recall_score(y_test, constant_y_pred, average=None)
precision_constant = precision_score(y_test, constant_y_pred, average=None)
report_template = "accuracy: {}, precision: {}, recall: {}"
print(report_template.format(accuracy, precision_constant, recall_constant))

### VADER Analysis Logistic Regression

In [None]:
# Fetch the VADER lexicon resource through NLTK's downloader
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance reused for every review below
sia = SentimentIntensityAnalyzer()

In [None]:
# Score every review in df with VADER, keyed by the review's 'Id'
res = {}
id_text_pairs = zip(df['Id'], df['Text'])
for review_id, review_text in tqdm(id_text_pairs, total=len(df)):
    res[review_id] = sia.polarity_scores(review_text)

In [None]:
# res maps Id -> score dict, so transposing gives one row per review
vaders = pd.DataFrame(res).T
# Turn the index (the review Ids) back into an 'Id' column
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
# No `on=` is given, so pandas joins on the columns both frames share
# (presumably only 'Id' — verify against the CSV's columns)
vaders = vaders.merge(df, how='left')

In [None]:
# Features: the four VADER scores per review; target: the 'Score' column
X = np.array(vaders.loc[:,["neg", "neu", "pos", "compound"]].values)
y = np.array(vaders.loc[:,"Score"].values)

In [None]:
# Same 67/33 split settings as the constant baseline (test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression on the VADER features; class_weight="balanced"
# asks scikit-learn to reweight the classes
clf = LogisticRegression(tol=0.0001, max_iter=1000, random_state=42,  class_weight="balanced", multi_class='multinomial')
clf.fit(X_train, y_train)

In [None]:
# Predicted scores for the held-out rows
y_pred = clf.predict(X_test)

In [None]:
# Accuracy, per-class precision and per-class recall of the logistic-regression
# predictions. Accuracy is computed from y_pred (the classifier output), not from
# the constant baseline's predictions.
accuracy = np.sum(np.equal(y_test, y_pred)) / len(y_test)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
print("accuracy: {}, precision: {}, recall: {}".format(accuracy, precision, recall))

# Word2Vec & NN

In [None]:
class Word2VecTrain:
    """Train and evaluate a dense sentiment classifier over frozen word2vec embeddings.

    ``run`` executes the whole pipeline: tokenize the texts, load pre-trained
    word2vec vectors, build an embedding matrix for the tokenizer vocabulary,
    split the data, then build, train and evaluate a Keras ``Sequential`` model.

    Args:
        text_data: Iterable of raw review strings.
        sentiment_labels: Integer labels 1..5 aligned with ``text_data``
            (they are shifted to 0..4 before one-hot encoding).
    """

    def __init__(self, text_data, sentiment_labels):
        self.model = Sequential()
        self.max_len = -1
        self.sentiment_labels = sentiment_labels
        self.tokenizer = Tokenizer()
        self.embedding_matrix = None
        # Keep the raw texts on the instance so tokenize() works on the data
        # passed to this object rather than on a notebook-level variable.
        self.raw_text = text_data
        # Filled by tokenize() with the padded integer sequences.
        self.text_data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.word2vec_model = None
        self.vocab_size = None

    def tokenize(self):
        """Fit the tokenizer on the raw texts and store padded sequences in ``text_data``.

        Raises:
            ValueError: If no texts were supplied.
        """
        self.tokenizer.fit_on_texts(self.raw_text)
        # +1 because Keras word indices start at 1 (index 0 is the padding slot)
        self.vocab_size = len(self.tokenizer.word_index) + 1
        sequences = self.tokenizer.texts_to_sequences(self.raw_text)
        if not sequences:
            raise ValueError("text_data is empty; nothing to tokenize")
        self.max_len = max(len(seq) for seq in sequences)
        self.text_data = pad_sequences(sequences, maxlen=self.max_len)

    def build_word2vec_model(self, filePath='./GoogleNews-vectors-negative300.bin.gz'):
        """Load pre-trained word2vec vectors (binary format) from ``filePath``."""
        self.word2vec_model = KeyedVectors.load_word2vec_format(filePath, binary=True)

    def train_test_split(self):
        """Split the padded sequences 80/20 and one-hot encode the labels."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.text_data, self.sentiment_labels, test_size=0.2, random_state=42
        )

        # Labels are 1..5; shift to 0..4 before one-hot encoding
        self.y_train = to_categorical(self.y_train - 1, num_classes=5)
        self.y_test = to_categorical(self.y_test - 1, num_classes=5)

    def word_embedding_matrix(self, embedding_dim=300):
        """Build a ``(vocab_size, embedding_dim)`` matrix of word2vec vectors.

        Words missing from the word2vec model keep an all-zero row.
        """
        self.embedding_matrix = np.zeros((self.vocab_size, embedding_dim))
        for word, i in self.tokenizer.word_index.items():
            if word in self.word2vec_model:
                self.embedding_matrix[i] = self.word2vec_model[word]

    def build_NN(self, embedding_dim=300, layer1=128, layer2=64, dropout=0.3, optimizer='adam', loss='categorical_crossentropy'):
        """Assemble and compile the network: frozen embedding -> flatten -> two dense layers -> softmax."""
        embedding_layer = Embedding(input_dim=self.vocab_size, output_dim=embedding_dim, input_length=self.max_len)
        self.model.add(embedding_layer)
        self.model.add(Flatten())
        # No input_dim here: this is not the first layer, and the flattened width
        # is max_len * embedding_dim rather than max_len.
        self.model.add(Dense(layer1, activation='relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(layer2, activation='relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(5, activation='softmax'))
        # The embedding layer must own its weight variable before set_weights()
        if not self.model.built:
            self.model.build(input_shape=(None, self.max_len))
        self.model.layers[0].set_weights([self.embedding_matrix])
        self.model.layers[0].trainable = False

        self.model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    def train(self, epochs=4, batch_size=32, verbose=1):
        """Fit the model on the training split using the given settings."""
        self.model.fit(self.X_train, self.y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

    def evaluate(self):
        """Print loss, accuracy, per-class precision/recall and the confusion matrix for the test split."""
        loss, accuracy = self.model.evaluate(self.X_test, self.y_test, batch_size=32)
        # Convert one-hot targets and softmax outputs back to 1..5 labels
        y_true = self.y_test.argmax(axis=1) + 1
        y_pred = np.argmax(self.model.predict(self.X_test), axis=1) + 1
        precision = precision_score(y_true, y_pred, average=None)
        recall = recall_score(y_true, y_pred, average=None)
        print("Test Loss:", loss)
        print("Test Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        # confusion_matrix needs class labels on both sides, not one-hot rows
        cm = confusion_matrix(y_true, y_pred)
        print(cm)

    def get_NN(self):
        """Return the underlying Keras model."""
        return self.model

    def run(self):
        """Run the full pipeline from tokenization to evaluation."""
        print('tokenizing')
        self.tokenize()
        print('importing word2vec model')
        self.build_word2vec_model()
        print('embedding matrix')
        self.word_embedding_matrix()
        self.train_test_split()
        self.build_NN()
        print('training NN')
        self.train()
        print('evaluating NN')
        self.evaluate()

## Normal dataset

In [None]:
# Inputs for the Word2Vec + NN run on the full dataset
text_data = df['Text']
sentiment_labels = df['Score']

In [None]:
# Build the pipeline object and run every step (tokenize -> embed -> train -> evaluate)
word2vec = Word2VecTrain(text_data, sentiment_labels)
word2vec.run()

## Equalized (shifted) dataset

In [None]:
# Inputs for the Word2Vec + NN run on the equalized sample
text_data = sample_df['Text']
sentiment_labels = sample_df['Score']

In [None]:
# Fresh pipeline object for the equalized sample; runs every step
word2vec = Word2VecTrain(text_data, sentiment_labels)
word2vec.run()

# Fine-Tuned BERT Model Classification

### Entire Dataset

In [None]:
# Use this dataset if you want to run on the entire data.
# Run either this cell or the "Equalized Dataset" cell: both assign amazon_reviews_df.
amazon_reviews_df = pd.read_csv('./Reviews.csv')

### Equalized Dataset

In [None]:
# Use this dataset if you want to run on the equalized data.
# Running this cell replaces whatever amazon_reviews_df held before.
amazon_reviews_df = pd.read_csv('./samples.csv')

### Data Preprocessing

In [None]:
# Drop rows with a missing 'Score' or 'Summary' (in place).
# NOTE(review): 'Text' is not in the subset although later cells apply re.sub to it — confirm it has no NaNs.
amazon_reviews_df.dropna(axis=0, subset = ['Score', 'Summary'], inplace=True)

# Quick view of the score distribution as a bar chart
ax = amazon_reviews_df['Score'].value_counts().sort_index().plot(kind='bar',
          title='Count of Reviews by Stars',
          figsize=(10, 5))
ax.set_xlabel('Review Stars')
plt.show()

In [None]:
# Number of data points per distinct 'Score' value (unique values, counts)
np.unique(amazon_reviews_df.Score,return_counts=True)

In [None]:
# Column dtypes and non-null counts
amazon_reviews_df.info()

In [None]:
# Remove HTML tags: every non-greedy '<...>' match is replaced with an empty string
amazon_reviews_df['Text'] = amazon_reviews_df['Text'].apply(lambda row : re.sub('<.*?>', '', row))

# TFBertForSequenceClassification requires labels in the range [0,1,...],
# so 'Score' is shifted down by one. From here on 'Score' is zero-based.
amazon_reviews_df["Score"] = amazon_reviews_df["Score"].subtract(1)

In [None]:
# Table: number of data points per 'Score' value, largest group first
temp = amazon_reviews_df.groupby('Score').count()['Text'].reset_index().sort_values(by='Text',ascending=False)
# Shade the cells by value when the table is displayed
temp.style.background_gradient()

In [None]:
# Histogram of review lengths measured in whitespace-separated words.
# word_count is the same object as amazon_reviews_df, so 'num_words' is added to that frame.
word_count = amazon_reviews_df
word_count['num_words'] = [len(str(review).split()) for review in word_count['Text']]
sns.histplot(x='num_words', data=word_count, bins=30)

### WordClouds

In [None]:
# Divide reviews into positive and negative groups for the word clouds.
# 'Score' was shifted to 0..4 in the preprocessing cell, so 5-star reviews are
# Score == 4 and 1-3 star reviews are Score <= 2; 4-star reviews (Score == 3)
# belong to neither group.
# The groups are selected with boolean masks instead of rewriting 'Score' in
# place, so the five-class labels stay intact for the training cells below.
positive = amazon_reviews_df[amazon_reviews_df['Score'] == 4]
negative = amazon_reviews_df[amazon_reviews_df['Score'] <= 2]

In [None]:
# Alternative: one subset per 'Score' value for per-score word clouds.
# The names count from one while the filter values count from zero (`one` is Score == 0, ..., `five` is Score == 4).

one = amazon_reviews_df[amazon_reviews_df['Score'] == 0]
two = amazon_reviews_df[amazon_reviews_df['Score'] == 1]
three = amazon_reviews_df[amazon_reviews_df['Score'] == 2]
four = amazon_reviews_df[amazon_reviews_df['Score'] == 3]
five = amazon_reviews_df[amazon_reviews_df['Score'] == 4]

In [None]:
# Word cloud built from all positive review texts joined into one string
txt = ' '.join(positive['Text'])
plt.figure(figsize=(15,8))

cloud_options = {
    'background_color': 'black',
    'max_font_size': 100,
    'max_words': 100,
    'width': 1000,
    'height': 600,
}
wordcloud = WordCloud(**cloud_options).generate(txt)

# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from all negative review texts joined into one string
txt = ' '.join(negative['Text'])
plt.figure(figsize=(15,8))

cloud_options = {
    'background_color': 'black',
    'max_font_size': 100,
    'max_words': 100,
    'width': 1000,
    'height': 600,
}
wordcloud = WordCloud(**cloud_options).generate(txt)

# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Training

In [None]:
# Limit input string length: keep only the first 60 characters of each review
amazon_reviews_df['Text'] = amazon_reviews_df['Text'].str[:60]

In [None]:
# Plain Python lists of texts and labels for the tokenizer and tf.data
reviews = amazon_reviews_df["Text"].values.tolist()
scores = amazon_reviews_df["Score"].tolist()

# Split the dataset into train, validation and holdout sets (60-20-20):
# first 60/40, then the 40% is halved.
# NOTE(review): no random_state is passed, so the split differs between runs.
training_sentences, test_sentences, training_labels, test_labels = train_test_split(reviews, scores, test_size=.4)
validation_sentences, holdout_sentences, validation_labels, holdout_labels = train_test_split(test_sentences, test_labels, test_size=.5)

In [None]:
# Tokenizer matching the 'bert-base-cased' checkpoint used for the model below
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Encode each split separately, with truncation and padding enabled.
# Each split is a separate tokenizer call, so its padded length is determined independently.
train_encodings = tokenizer(training_sentences,
                            truncation=True,
                            padding=True)

validation_encodings = tokenizer(validation_sentences,
                            truncation=True,
                            padding=True)

holdout_encodings = tokenizer(holdout_sentences,
                            truncation=True,
                            padding=True)

In [None]:
# Wrap each split's encodings and labels in a tf.data.Dataset of
# (feature dict, label) pairs.

train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, training_labels))

validation_features = dict(validation_encodings)
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_features, validation_labels))

holdout_features = dict(holdout_encodings)
holdout_dataset = tf.data.Dataset.from_tensor_slices((holdout_features, holdout_labels))

In [None]:
# Initialize our pre-trained BERT model with a 5-way classification head

model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(
    # Pass the configured optimizer object; the string "adam" would silently
    # create a default-configured Adam and ignore the learning rate above.
    optimizer=optimizer,
    # The model emits raw logits, so the loss must apply the softmax itself.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train and fine-tune our pre-trained BERT model.
# The datasets are already batched with .batch(8), so no batch_size argument is
# passed to fit(): Keras rejects batch_size for tf.data.Dataset inputs.
# Validation data is not shuffled because order does not affect the metrics.

history = model.fit(train_dataset.shuffle(30).batch(8),
          epochs=1,
          validation_data=validation_dataset.batch(8), verbose=1)

### Evaluate

In [None]:
# Load the saved model and evaluate that same model on the holdout set, so the
# metrics here and the predictions in the next cell come from one model.
# NOTE(review): "./output_model" must already exist (e.g. written with
# save_pretrained); no cell in this notebook creates it — confirm.

loaded_model = TFBertForSequenceClassification.from_pretrained("./output_model")
# A freshly loaded model is uncompiled; attach the loss and metric for evaluate()
loaded_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
result = loaded_model.evaluate(holdout_dataset.batch(8))
dict(zip(loaded_model.metrics_names, result))

In [None]:
# Predict the class of every holdout example: logits -> softmax -> argmax

tf_output = loaded_model.predict(holdout_dataset.batch(8))
holdout_probs = tf.nn.softmax(tf_output["logits"], axis=1)
pred_label = tf.argmax(holdout_probs.numpy(), 1).numpy()

In [None]:
def VisualizeModelCompetence(labels, true_labels, pred_labels):
    """Plot a row-normalized confusion matrix and return per-class precision and recall.

    Args:
        labels: Display names for the classes; class ids are taken to be
            0..len(labels)-1, in this order.
        true_labels: Ground-truth class ids.
        pred_labels: Predicted class ids.

    Returns:
        Tuple ``(precision, recall)`` of arrays with one entry per class id,
        in the same order as ``labels``.
    """
    class_ids = list(range(len(labels)))

    # Confusion matrix plot (normalize='true' scales each true-class row to sum to 1)
    fig, ax = plt.subplots(figsize=(8, 8))
    cm = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=class_ids, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(ax=ax)

    # Pin the scores to the same class ids as the matrix, so a class that is
    # absent from both inputs still gets an entry and positions line up with labels.
    precision = precision_score(true_labels, pred_labels, labels=class_ids, average=None)
    recall = recall_score(true_labels, pred_labels, labels=class_ids, average=None)
    return precision, recall

In [None]:
# Plot the confusion matrix and get per-class precision and recall for the holdout set.
# Labels were shifted down by one for TFBertForSequenceClassification, so scores [1,2,3,4,5] appear as [0,1,2,3,4].
labels = [0,1,2,3,4]
precision, recall = VisualizeModelCompetence(labels, holdout_labels, pred_label)

# Fine-Tuned GPT-2 Model Classification

### Define Custom Dataset

In [None]:
class SentimentDataset(Dataset):
    """Torch dataset pairing pre-tokenized texts with their labels.

    Every text is tokenized once at construction time, padded/truncated to
    ``max_length`` and requested as PyTorch tensors.
    """

    def __init__(self, texts, labels, max_length, tokenizer):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = labels
        # Options shared by every tokenizer call
        encode_options = dict(
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        self.texts = [self.tokenizer(sample, **encode_options) for sample in texts]

    def classes(self):
        """Return the labels exactly as they were passed in."""
        return self.labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for position ``idx``; the label is wrapped in a NumPy array."""
        encoding = self.texts[idx]
        label = np.array(self.labels[idx])
        return encoding, label

### Define Classifier nn.Module class

In [None]:
class GPT2Classifier(nn.Module):
    """GPT-2 backbone followed by a single linear classification layer.

    The hidden states of all ``max_len`` positions are flattened and fed to the
    linear layer, so inputs must always be padded/truncated to ``max_len`` tokens.

    Args:
        max_len: Fixed token length of every input sequence.
        num_classes: Number of output classes (defaults to 5).
    """

    def __init__(self, max_len: int, num_classes: int = 5):
        super().__init__()

        self.gpt2 = GPT2Model.from_pretrained("gpt2")
        # Read the hidden width from the checkpoint's config instead of hard-coding it
        hidden_size = self.gpt2.config.hidden_size
        self.out = nn.Linear(hidden_size * max_len, num_classes)

    def forward(self, input_id, mask):
        """Return class logits for a batch of token ids and attention masks."""
        # With return_dict=False the model returns a tuple whose first item is the
        # last hidden state; index it rather than unpacking a fixed number of items.
        hidden_states = self.gpt2(input_ids=input_id, attention_mask=mask, return_dict=False)[0]
        # flatten() also handles non-contiguous tensors, unlike view()
        return self.out(hidden_states.flatten(start_dim=1))

### Training Loop Function

In [None]:
def GPT2Train(model, train_dataloader, train_len, val_dataloader, val_len, epochs, lr):
    """Train ``model`` with cross-entropy loss and Adam, validating after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns class logits.
        train_dataloader: Iterable of ``(encoding, label)`` batches, where the encoding
            provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        train_len: Number of training examples (denominator for the averages).
        val_dataloader: Iterable of validation batches in the same format.
        val_len: Number of validation examples.
        epochs: Number of passes over the training data.
        lr: Learning rate for Adam.

    Prints per-epoch train/validation loss and accuracy; returns nothing.
    The model is left in eval mode when the function returns.
    """
    # Use the device the model already lives on rather than a notebook-level global
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    criterion = nn.CrossEntropyLoss().to(device)
    # Qualified name on purpose: this notebook imports `Adam` from both torch and
    # Keras, and the Keras import comes last.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0.0

        model.train()  # enable dropout etc. for the training pass
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # The per-sample encodings carry a leading dim of size 1; drop it from
            # both tensors so they share the same (batch, seq) shape.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input["input_ids"].squeeze(1).to(device)

            model.zero_grad()

            output = model(input_id, mask)  # class logits, one row per example

            batch_loss = criterion(output, train_label)
            # criterion returns the batch mean; weight by batch size so that
            # dividing by the example count gives a true per-example average
            total_loss_train += batch_loss.item() * train_label.size(0)

            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            batch_loss.backward()
            optimizer.step()

        train_loss = total_loss_train / train_len
        train_acc = total_acc_train / train_len

        total_acc_val = 0
        total_loss_val = 0.0

        model.eval()  # disable dropout for validation
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)

                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        val_loss = total_loss_val / val_len
        val_acc = total_acc_val / val_len

        print(f"Epochs: {epoch_num + 1}\n"
              f"Train Loss: {train_loss} | Train Accuracy: {train_acc} | Val Loss: {val_loss} | Val Accuracy: {val_acc}")

### Evaluate Function

In [None]:
def GPT2Evaluate(model, test_dataloader, test_len):
    """Run ``model`` over the test batches and collect labels, predictions and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns class logits.
        test_dataloader: Iterable of ``(encoding, label)`` batches, where the encoding
            provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        test_len: Number of test examples (denominator for the accuracy).

    Returns:
        Tuple ``(true_labels, pred_labels, test_acc)``: two flat Python lists of
        class ids in iteration order, and the accuracy as a float.
    """
    # Use the device the model already lives on rather than a notebook-level global
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    pred_labels = []
    true_labels = []

    total_acc_test = 0
    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            test_label = test_label.to(device)
            # Drop the size-1 leading dim from both tensors so their shapes match
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            predicted = model(input_id, mask).argmax(dim=1)

            total_acc_test += (predicted == test_label).sum().item()

            true_labels += test_label.cpu().numpy().flatten().tolist()
            pred_labels += predicted.cpu().numpy().flatten().tolist()

    test_acc = total_acc_test / test_len

    print(f'Test Accuracy: {test_acc}')
    return true_labels, pred_labels, test_acc

### Evaluation Metrics and Visualization: confusion matrix, precision, recall

In [None]:
def VisualizeModelCompetence(labels, true_labels, pred_labels):
    """Plot a row-normalized confusion matrix and return per-class precision and recall.

    Args:
        labels: Display names for the classes; class ids are taken to be
            0..len(labels)-1, in this order.
        true_labels: Ground-truth class ids.
        pred_labels: Predicted class ids.

    Returns:
        Tuple ``(precision, recall)`` of arrays with one entry per class id,
        in the same order as ``labels``.
    """
    class_ids = list(range(len(labels)))

    # Confusion matrix plot (normalize='true' scales each true-class row to sum to 1)
    fig, ax = plt.subplots(figsize=(8, 8))
    cm = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=class_ids, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(ax=ax)

    # Pin the scores to the same class ids as the matrix, so a class that is
    # absent from both inputs still gets an entry and positions line up with labels.
    precision = precision_score(true_labels, pred_labels, labels=class_ids, average=None)
    recall = recall_score(true_labels, pred_labels, labels=class_ids, average=None)
    return precision, recall

### Run with dataset

In [None]:
# Set random seeds. NumPy alone is not enough here: the classifier head
# initialization and DataLoader shuffling draw from torch's generator.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Load dataset
data = np.array(df['Text'])  # Array of input texts
labels = np.array(df['Score'] - 1)  # Array of labels (0-4)
# 80/20 train/test, then 75/25 train/validation on the remainder (60/20/20 overall)
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=seed)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.25, random_state=seed)

In [None]:
# Define the tokenizer: pad on the left, and use the end-of-sequence token as
# the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Define hyperparameters and model
max_len = 32
epochs = 3
lr = 1e-5
# `device` is used by the training/evaluation code but was never defined;
# pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2Classifier(max_len=max_len).to(device)

In [None]:
# Create train dataset and dataloader; batches of 64, reshuffled by the DataLoader
train_dataset = SentimentDataset(train_data, train_labels, max_length=max_len, tokenizer=tokenizer) # takes a while: every text is tokenized up front
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_len = len(train_data)

In [None]:
# Create validation dataset and dataloader (batches of 64, no shuffling)
val_dataset = SentimentDataset(val_data, val_labels, max_length=max_len, tokenizer=tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=64)
val_len = len(val_data)

In [None]:
# Train the model; prints train/validation loss and accuracy after each epoch
GPT2Train(model, train_dataloader, train_len, val_dataloader, val_len, epochs, lr)

In [None]:
# Create test dataset and dataloader (batches of 64, no shuffling)
test_dataset = SentimentDataset(test_data, test_labels, max_length=max_len, tokenizer=tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=64)
test_len = len(test_data)

In [None]:
# Evaluate the model on the test split; keeps labels, predictions and accuracy
true_labels, pred_labels, test_acc = GPT2Evaluate(model, test_dataloader, test_len)

In [None]:
# Plot the confusion matrix and get per-class precision and recall.
# `labels` are display names only; the class ids in true/pred labels are 0-4.
labels = [1,2,3,4,5]
precision, recall = VisualizeModelCompetence(labels, true_labels, pred_labels)

In [None]:
# Summary of the test metrics (test_acc is returned by GPT2Evaluate)
print(
    f'Test Accuracy: {test_acc}\n'
    f'Test Precision: {precision}\n'
    f'Test Recall: {recall}'
)

### Testing Dataset Shift

In [None]:
# Load the equalized sample and split it 60/20/20 with the same seed as above
sample_data = np.array(sample_df['Text'])  # Array of input texts
sample_labels = np.array(sample_df['Score'] - 1)  # Array of labels (0-4)
s_train_data, s_test_data, s_train_labels, s_test_labels = train_test_split(sample_data, sample_labels, test_size=0.2, random_state=seed)
s_train_data, s_val_data, s_train_labels, s_val_labels = train_test_split(s_train_data, s_train_labels, test_size=0.25, random_state=seed)

In [None]:
# Define hyperparameters and a fresh model for the equalized sample
epochs = 1
lr = 1e-5
# `device` was never defined; pick the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
s_model = GPT2Classifier(max_len=max_len).to(device)

In [None]:
# Define train dataset and dataloader for the equalized sample (batches of 64, shuffled)
s_train_dataset = SentimentDataset(s_train_data, s_train_labels, max_length=max_len, tokenizer=tokenizer)
s_train_dataloader = DataLoader(s_train_dataset, batch_size=64, shuffle=True)
s_train_len = len(s_train_data)

In [None]:
# Define validation dataset and dataloader for the equalized sample
# (shuffle=True here, unlike the validation loader of the full-data run)
s_val_dataset = SentimentDataset(s_val_data, s_val_labels, max_length=max_len, tokenizer=tokenizer)
s_val_dataloader = DataLoader(s_val_dataset, batch_size=64, shuffle=True)
s_val_len = len(s_val_data)

In [None]:
# Train the second model on the equalized sample
GPT2Train(s_model, s_train_dataloader, s_train_len, s_val_dataloader, s_val_len, epochs, lr)

In [None]:
# Define test dataset and dataloader for the equalized sample.
# Uses s_test_labels and s_test_dataset, the names actually created above.
s_test_dataset = SentimentDataset(s_test_data, s_test_labels, max_length=max_len, tokenizer=tokenizer)
s_test_dataloader = DataLoader(s_test_dataset, batch_size=64, shuffle=True)
s_test_len = len(s_test_data)

In [None]:
# Evaluate the second model; this overwrites true_labels, pred_labels and test_acc from the first run
true_labels, pred_labels, test_acc = GPT2Evaluate(s_model, s_test_dataloader, s_test_len)

In [None]:
# Plot the confusion matrix and get per-class precision and recall.
# `labels` are display names only; the class ids in true/pred labels are 0-4.
labels = [1,2,3,4,5]
precision, recall = VisualizeModelCompetence(labels, true_labels, pred_labels)

In [None]:
# Summary of the test metrics (test_acc is returned by GPT2Evaluate)
print(
    f'Test Accuracy: {test_acc}\n'
    f'Test Precision: {precision}\n'
    f'Test Recall: {recall}'
)