<a href="https://colab.research.google.com/github/satankita/AmazonReviewsSentimentAnalysis/blob/main/AmazonReviewsSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
from google.colab import drive
import os

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Force a fresh mount of Google Drive at the mount point below.
MOUNT_POINT = "/content/drive/"
drive.mount(MOUNT_POINT, force_remount=True)

# Root folder of the mounted Drive. Built from the absolute mount point so the
# data paths derived from it do not depend on the kernel's current working
# directory. The trailing slash is required: later cells build paths from
# `mypath` by plain string concatenation.
mypath = MOUNT_POINT + "MyDrive/"
# os.listdir(mypath)

Mounted at /content/drive/


In [None]:
# Project data folder, built on top of `mypath` by string concatenation
# (so it keeps the trailing slash).
DATA_DIR = mypath + "TIS Project/Data/"

# CSV file names; they are appended to DATA_DIR when a file is loaded.
AMAZON_DATA_TRAIN = "train.csv"
AMAZON_DATA_TEST = "test.csv"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Build the English stopword lookup once. `stopwords.words('english')` returns
# a list, so evaluating it inside the per-token filter rebuilt that list and
# scanned it linearly for every single token of every review.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean(text):
    """Tokenize a piece of text for bag-of-words style features.

    Steps:
      1. Remove every character found in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Drop tokens whose lowercase form is an NLTK English stopword.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing
        (lowercasing is used only for the stopword comparison).
    """
    without_punctuation = ''.join(ch for ch in text if ch not in string.punctuation)
    return [
        token
        for token in without_punctuation.split()
        if token.lower() not in _ENGLISH_STOPWORDS
    ]

# Raw Data Check #

In [None]:
# Read the full training CSV, then work on a reproducible 1% random sample.
large_data = pd.read_csv(DATA_DIR + AMAZON_DATA_TRAIN)

data = large_data.sample(frac=0.01, random_state=101)
data.columns = ['Sentiment', 'Review Title', 'Text']
# The review title is not used as a feature; keep only the label and the body.
data = data.drop(columns=['Review Title'])

# Two simple hand-crafted features: character count and whitespace-token count.
data['Length'] = data['Text'].apply(len)
data['Word Count'] = data['Text'].apply(lambda review: len(review.split()))

In [None]:
# Preview the first rows of the sampled frame, including the two derived columns.
data.head()

Unnamed: 0,Sentiment,Text,Length,Word Count
750068,2,"These splices are easy to pinch shut, but do n...",300,55
3084052,2,my prayers answered! a great bra! it is hard t...,113,23
1782585,1,I ordered this watch from Africa and had it co...,270,55
1315001,1,Sound quality for Widescreen Dolby 5.1 version...,404,74
2648609,2,My very first hand of real money Texas Holdem ...,778,148


In [None]:
# Count reviews per sentiment label to check class balance in the sample.
data['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
2,18116
1,17884


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the sampled reviews for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(data['Text'],data['Sentiment'], test_size=0.3, random_state=101)

In [None]:
# Label counts in the training split.
y_train.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
2,12629
1,12571


In [None]:
# Label counts in the held-out test split.
y_test.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
2,5487
1,5313


# VADER Features

VADER is an existing rule-based sentiment analysis model that outputs four scores for each review text: Positive, Negative, Neutral and Compound. Positive, Negative, and Neutral each range from 0 to 1, while Compound is a single normalized aggregate score that ranges from -1 (the strongest negative sentiment) to +1 (the strongest positive sentiment). These provide 4 additional predictors to the dataset.


In [None]:
from scipy.sparse import hstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC


#large_data = pd.read_csv(DATA_DIR + AMAZON_DATA_TRAIN)
#data = large_data.sample(frac=0.01, random_state=101)
#data.columns = ['Sentiment', 'Review Title', 'Text']
#data.drop('Review Title', axis=1, inplace=True)

#data['Length'] = data['Text'].apply(len)
#data['Word Count'] = data['Text'].apply(lambda x: len(x.split()))

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def get_vader_sentiment(text):
    """Return the VADER polarity scores for `text`.

    The result is the dict produced by `sia.polarity_scores`; the keys read
    from it below are 'pos', 'neu', 'neg' and 'compound'.
    """
    return sia.polarity_scores(text)


# Score every review once, then fan the four scores out into their own
# columns. The score dicts live in a local Series, so no temporary column has
# to be added to `data` and dropped again.
vader_scores = data['Text'].apply(get_vader_sentiment)
data['VADER_Positive'] = vader_scores.apply(lambda scores: scores['pos'])
data['VADER_Neutral'] = vader_scores.apply(lambda scores: scores['neu'])
data['VADER_Negative'] = vader_scores.apply(lambda scores: scores['neg'])
data['VADER_Compound'] = vader_scores.apply(lambda scores: scores['compound'])

# Hand-crafted features: the two length features plus the four VADER scores.
custom_features = data[['Length', 'Word Count', 'VADER_Positive', 'VADER_Neutral', 'VADER_Negative', 'VADER_Compound']].values

# `roberta_features` is built by get_roberta_embeddings in the
# "RoBERTa Embedding Features" section further down the notebook. Fail with an
# actionable message instead of a bare NameError when this cell is run first.
if 'roberta_features' not in globals():
    raise NameError(
        "roberta_features is not defined: run the 'RoBERTa Embedding Features' "
        "cells (which build it with get_roberta_embeddings) before this cell."
    )

# Column-wise concatenation: each row is one review, so `roberta_features`
# must have been computed from this same `data` frame, in the same row order.
combined_features = np.hstack([roberta_features, custom_features])

# Train-test split with the new combined feature set
x_train, x_test, y_train, y_test = train_test_split(combined_features, data['Sentiment'], test_size=0.3, random_state=101)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
#TF-IDF testing - removed for final model

# x_train, x_test, y_train, y_test = train_test_split(
#     data[['Text', 'VADER_Positive', 'VADER_Neutral', 'VADER_Negative', 'VADER_Compound']],
#     data['Sentiment'],
#     test_size=0.3,
#     random_state=101
# )

# cv = CountVectorizer(analyzer=clean)
# x_train_text = cv.fit_transform(x_train['Text'])
# x_test_text = cv.transform(x_test['Text'])

# x_train_vader = x_train[['VADER_Positive', 'VADER_Neutral', 'VADER_Negative', 'VADER_Compound']].values
# x_test_vader = x_test[['VADER_Positive', 'VADER_Neutral', 'VADER_Negative', 'VADER_Compound']].values

# x_train_combined = hstack([x_train_text, x_train_vader])
# x_test_combined = hstack([x_test_text, x_test_vader])

# tfidf = TfidfTransformer()
# x_train_combined = tfidf.fit_transform(x_train_combined)
# x_test_combined = tfidf.transform(x_test_combined)

In [None]:
# Sanity check: run `clean` on the first review only and print the resulting token list.
checkCV=data['Text'].head(1).apply(clean)
print(checkCV)

750068    [splices, easy, pinch, shut, place, much, pres...
Name: Text, dtype: object


In [None]:
# Shape check for the TF-IDF experiment. x_train_text, x_train_vader and
# x_train_combined are created only by the TF-IDF cell, which is commented out
# ("removed for final model"), so this line raises NameError on a fresh kernel.
# It is disabled along with that experiment; re-enable both together.
# x_train_text.shape, x_train_vader.shape, x_train_combined.shape

((25200, 89348), (25200, 4), (25200, 89352))

**Support Vector Machine**

Support vector machine (SVM) is a supervised machine learning algorithm for classification. It aims to find the optimal hyperplane that best separates data into different classes in a high-dimensional space. In our project we used the LinearSVC class in scikit-learn, which uses a linear kernel and is specific to classification. We used the default parameters, which use an L2 penalty and squared-hinge loss. Some key advantages of SVM are that it handles high-dimensional data well (which our data is after embedding), that it is inherently less prone to overfitting, and that it is versatile enough to work with different pre-processing pipelines (including TF-IDF, bag-of-words, etc.).


In [None]:
# Linear SVM (default settings) wrapped in CalibratedClassifierCV, fitted on
# whatever x_train / y_train currently hold.
svm = LinearSVC()
clf = CalibratedClassifierCV(svm)
clf.fit(x_train, y_train)

y_predict = clf.predict(x_test)

# Print each metric between its banner and the closing rule.
for banner, metric in (
    ("############# Classification Report ##############", classification_report),
    ("############# Confusion Matrix ##############", confusion_matrix),
):
    print(banner)
    print(metric(y_test, y_predict))
    print("##################################################")

############# Classification Report ##############
              precision    recall  f1-score   support

           1       0.91      0.91      0.91      5313
           2       0.92      0.91      0.91      5487

    accuracy                           0.91     10800
   macro avg       0.91      0.91      0.91     10800
weighted avg       0.91      0.91      0.91     10800

##################################################
############# Confusion Matrix ##############
[[4856  457]
 [ 481 5006]]
##################################################


# BERT Embedding Features

Text embeddings were generated using both BERT-base and RoBERTa. BERT (Bidirectional Encoder Representations from Transformers) is a pretrained model that captures contextual
word semantics in a high-dimensional vector space. Text data was tokenized using BertTokenizer,
and for each review, the [CLS] token’s representation was extracted as the sentence embedding.
For this to work you have to change the runtime type to GPU. The cells below are commented out because the BERT embeddings are not used in the final model.

In [None]:
# import numpy as np
# import pandas as pd
# from transformers import BertTokenizer, BertModel
# import torch
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.svm import LinearSVC
# from sklearn.calibration import CalibratedClassifierCV
# from scipy.sparse import hstack

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

In [None]:
# large_data = pd.read_csv(DATA_DIR + AMAZON_DATA_TRAIN)
# data = large_data.sample(frac=0.01, random_state=101)
# data.columns = ['Sentiment', 'Review Title', 'Text']
# data.drop('Review Title', axis=1, inplace=True)

# data['Length'] = data['Text'].apply(len)
# data['Word Count'] = data['Text'].apply(lambda x: len(x.split()))

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [None]:
# def get_bert_embeddings(texts, batch_size=32):
#     bert_model.eval()
#     all_embeddings = []

#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i + batch_size]

#         inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
#         inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to GPU if available

#         with torch.no_grad():
#             outputs = bert_model(**inputs)

#         cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Move back to CPU
#         all_embeddings.append(cls_embeddings)

#     return np.vstack(all_embeddings)

# texts = data['Text'].tolist()
# bert_features = get_bert_embeddings(texts, batch_size=32)

# custom_features = data[['Length', 'Word Count']].values

# combined_features = np.hstack([bert_features, custom_features])

# x_train, x_test, y_train, y_test = train_test_split(combined_features, data['Sentiment'], test_size=0.3, random_state=101)

# svm = LinearSVC()
# clf = CalibratedClassifierCV(svm)
# clf.fit(x_train, y_train)

# y_predict = clf.predict(x_test)

# print("############# Classification Report ##############")
# print(classification_report(y_test, y_predict))
# print("##################################################")

# print("############# Confusion Matrix ##############")
# print(confusion_matrix(y_test, y_predict))
# print("##################################################")

############# Confusion Matrix ##############
[[4856  457]
 [ 481 5006]]
##################################################


# RoBERTa Embedding Features

RoBERTa is an optimized version of BERT. As with BERT, we tokenized the text with RobertaTokenizer and processed it with RobertaModel; the [CLS] token representation was extracted from each review to act as our features. RoBERTa embeddings were shown to outperform BERT embeddings by approximately 5 percentage points, demonstrating their ability to capture richer contextual representations. To augment these embeddings, we used features like text length and word count.


In [None]:
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from scipy.sparse import hstack

# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Re-read the training CSV and rebuild the same seeded 1% sample.
# Note: this reassigns `data`, so any columns added to the previous `data`
# frame by earlier cells are not carried over.
large_data = pd.read_csv(DATA_DIR + AMAZON_DATA_TRAIN)

data = large_data.sample(frac=0.01, random_state=101)
data.columns = ['Sentiment', 'Review Title', 'Text']
data = data.drop(columns=['Review Title'])

# Character count and whitespace-token count of each review.
data['Length'] = data['Text'].apply(len)
data['Word Count'] = data['Text'].apply(lambda review: len(review.split()))

# Pretrained RoBERTa tokenizer and encoder; the encoder is moved to `device`.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base').to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_roberta_embeddings(texts, batch_size=32):
    """Embed `texts` with the module-level `roberta_model`, one batch at a time.

    Each batch is tokenized with the module-level `tokenizer` (truncated and
    padded to exactly 128 tokens), moved to `device`, and run through the model
    with gradients disabled. The first-position vector of the last hidden state
    is kept as the embedding of each text.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed; must support `len()` and slicing (a list is passed in
        this notebook).
    batch_size : int, default 32
        Number of texts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-batch embedding arrays stacked vertically, one row per text,
        in input order.
    """
    roberta_model.eval()
    batch_outputs = []

    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        # Put every input tensor on the same device as the model.
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            model_output = roberta_model(**encoded)

        # Keep position 0 of each sequence, then bring it back to the CPU as NumPy.
        first_token_states = model_output.last_hidden_state[:, 0, :]
        batch_outputs.append(first_token_states.cpu().numpy())

    return np.vstack(batch_outputs)

# Embed every sampled review with RoBERTa.
texts = data['Text'].tolist()
roberta_features = get_roberta_embeddings(texts, batch_size=32)

# Append the two length features as extra columns next to the embeddings.
custom_features = data[['Length', 'Word Count']].values
combined_features = np.hstack([roberta_features, custom_features])

# 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    combined_features,
    data['Sentiment'],
    test_size=0.3,
    random_state=101,
)

# Calibrated linear SVM with default settings.
svm = LinearSVC()
clf = CalibratedClassifierCV(svm)
clf.fit(x_train, y_train)

y_predict = clf.predict(x_test)

# Print each metric between its banner and the closing rule.
for banner, metric in (
    ("############# Classification Report ##############", classification_report),
    ("############# Confusion Matrix ##############", confusion_matrix),
):
    print(banner)
    print(metric(y_test, y_predict))
    print("##################################################")

############# Classification Report ##############
              precision    recall  f1-score   support

           1       0.91      0.91      0.91      5313
           2       0.92      0.91      0.91      5487

    accuracy                           0.91     10800
   macro avg       0.91      0.91      0.91     10800
weighted avg       0.91      0.91      0.91     10800

##################################################
############# Confusion Matrix ##############
[[4861  452]
 [ 482 5005]]
##################################################


**XGBoost**

XGBoost (Extreme Gradient Boosting) is a supervised machine learning algorithm that extends gradient boosting methods for classification. Gradient boosting builds ensembles of weak learners (usually decision trees) in a sequential manner, and each new tree attempts to correct the errors of the previous ones. "Gradient" refers to gradient descent, the method of updating model parameters to minimize loss. In addition, XGBoost includes regularization terms to help prevent overfitting when training the model, and because it splits the data into decision trees, missing or noisy data can be handled better without relying heavily on pre-processing. In comparison to standard gradient boosting, XGBoost has some key advantages, namely improved scalability, faster training through parallelization, handling of imbalanced data, and built-in features for customization of regularization and tree pruning. In our project we used the XGBClassifier class from the xgboost library (through its scikit-learn-compatible interface) with default parameters and objective = ‘binary:logistic’, which supports two-class classification.



In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Re-encode the sentiment labels as consecutive integers starting at 0.
# The encoder is fitted on the training labels ONLY and then reused for the
# test labels, so both splits share one label-to-integer mapping. Refitting on
# the test labels could silently produce a different mapping whenever the two
# splits do not contain the same set of labels.
le = LabelEncoder()
y_train_XGBC = le.fit_transform(y_train)
y_test_XGBC = le.transform(y_test)


# Gradient-boosted trees with a binary logistic objective, wrapped in
# CalibratedClassifierCV like the SVM above.
XGBC = XGBClassifier(objective="binary:logistic", random_state=42)
clf_XGBC = CalibratedClassifierCV(XGBC)
clf_XGBC.fit(x_train, y_train_XGBC)

y_predict_XGBC = clf_XGBC.predict(x_test)

# Predictions and y_test_XGBC are both in the encoded label space.
print("############# Classification Report ##############")
print(classification_report(y_test_XGBC, y_predict_XGBC))
print("##################################################")

print("############# Confusion Matrix ##############")
print(confusion_matrix(y_test_XGBC, y_predict_XGBC))
print("##################################################")

############# Classification Report ##############
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      5313
           1       0.90      0.89      0.90      5487

    accuracy                           0.90     10800
   macro avg       0.90      0.90      0.90     10800
weighted avg       0.90      0.90      0.90     10800

##################################################
############# Confusion Matrix ##############
[[4799  514]
 [ 600 4887]]
##################################################


**Random Forest**

Random Forest is a supervised machine learning algorithm for classification that uses multiple decision trees to improve the robustness and accuracy of predictions, as opposed to a single decision tree. Random forests are generated using bootstrap aggregation (bagging): several decision trees are trained on subsamples of the data, and their predictions are aggregated at the end to produce a final result. Predictors at each level for each subsample are randomly selected to avoid repeated use of the same predictors. Random forest has advantages over decision trees in that it reduces overfitting by averaging the predictions of many decision trees, handles high-dimensional data by using a subset of the features for each tree, and is generally more accurate. In our project we use the RandomForestClassifier class in scikit-learn with default parameters, which set no limit on the tree depth and use Gini impurity to measure classification improvement at each split.


In [None]:
# from sklearn.ensemble import RandomForestClassifier
# RF = RandomForestClassifier()
# clf_RF = CalibratedClassifierCV(RF)
# clf_RF.fit(x_train, y_train)

# y_predict_RF = clf_RF.predict(x_test)

# print("############# Classification Report ##############")
# print(classification_report(y_test, y_predict_RF))
# print("##################################################")

# print("############# Confusion Matrix ##############")
# print(confusion_matrix(y_test, y_predict_RF))
# print("##################################################")

**K-Nearest Neighbors**

K-Nearest Neighbors (KNN) is a supervised machine learning algorithm for classification that assigns a data point to the majority class among its k nearest points. It requires a distance measure and a choice of how many neighbors to use when making a classification. The advantages of KNN are that it is simple to understand and use, and that it is nonparametric (it makes no assumptions about the underlying variables). However, it is sensitive to irrelevant features and performs poorly in high dimensions. In high-dimensional feature spaces, we would need to introduce feature reduction techniques (PCA, etc.) to improve the accuracy/effectiveness of the model, at the cost of adding pre-processing complexity. In addition, the choice of k can heavily affect the model’s performance, as smaller k values make the model more sensitive to noise and larger k values may dilute the importance of certain features/data. In our project we used the KNeighborsClassifier class in the scikit-learn library with default parameters, which set k=5 and use Euclidean distance as the distance measure.


In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# KNN = KNeighborsClassifier()
# clf_KNN = CalibratedClassifierCV(KNN)
# clf_KNN.fit(x_train, y_train)

# y_predict_KNN = clf_KNN.predict(x_test)

# print("############# Classification Report ##############")
# print(classification_report(y_test, y_predict_KNN))
# print("##################################################")

# print("############# Confusion Matrix ##############")
# print(confusion_matrix(y_test, y_predict_KNN))
# print("##################################################")

**Final Model**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners for the stack: the calibrated linear SVM and the XGBoost
# classifier defined in earlier cells.
# NOTE(review): StackingClassifier is documented to fit its own copies of the
# estimators it is given, so the earlier fits of `clf` / `XGBC` are presumably
# not reused here - confirm against the scikit-learn docs.
base_models = [
    ('svm', clf),
    ('xgb', XGBC),
]

# Logistic regression combines the base learners' cross-validated (5-fold) predictions.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
)

# Fit on the current training features, then score the held-out split.
stacking_model.fit(x_train, y_train)
y_pred_stacked = stacking_model.predict(x_test)

for banner, metric in (
    ("############# Stacked Model Classification Report ##############", classification_report),
    ("############# Stacked Model Confusion Matrix ##############", confusion_matrix),
):
    print(banner)
    print(metric(y_test, y_pred_stacked))


############# Stacked Model Classification Report ##############
              precision    recall  f1-score   support

           1       0.91      0.91      0.91      5313
           2       0.92      0.91      0.91      5487

    accuracy                           0.91     10800
   macro avg       0.91      0.91      0.91     10800
weighted avg       0.91      0.91      0.91     10800

############# Stacked Model Confusion Matrix ##############
[[4851  462]
 [ 476 5011]]
