# Sentiment analysis


This notebook runs all models for sentiment analysis and shows the results in terms of performance metrics.

In [None]:
# run this if you run into "ModuleNotFoundError: No module named 'src'"
# Puts the sibling `src` directory (relative to the working directory) first on
# the module search path so the project modules below can be imported.
import sys

# Guarded so that re-running this cell does not stack duplicate entries.
if '../src' not in sys.path:
    sys.path.insert(0, '../src')

In [None]:
import os
import pandas as pd
import numpy as np

# importing modules
import transformations as c
from sentiment_analysis.train.train_vader import train_vader
from sentiment_analysis.train.train_textblob import train_textblob

from sentiment_analysis.train.train_bert import *
from sentiment_analysis.train.train_xgboost import train_xgboost

In [None]:
# loading data
# The reviews file is expected at <project root>/data/reviews.csv, where the
# project root is the parent of the directory the notebook is run from.
current_path = os.getcwd()
root_path = os.path.dirname(current_path)
# os.path.join assembles the path with the platform's separator instead of a
# hard-coded '/'.
df = pd.read_csv(os.path.join(root_path, 'data', 'reviews.csv'), encoding='unicode_escape')

In [None]:
# clean data & add labels
# Run every review through the project's text cleaner, then encode the
# sentiment label numerically (positive -> 1, negative -> 0). Labels missing
# from the mapping become NaN.
df['clean_text'] = df['Text'].apply(c.get_cleantext)
df['Sentiment_num'] = df['Sentiment'].map({"positive": 1, "negative": 0})

In [None]:
# split train-test data
# 80/20 split, stratified on the label, with a fixed seed so the split is
# reproducible across runs.
X = df['clean_text'].tolist()
y = df['Sentiment_num'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=4263,
)

In [None]:
# feature engineering
# Build two feature representations of the training texts; both are passed to
# train_xgboost later in the notebook.
# NOTE(review): tf_idf and word2vec are not imported explicitly in this
# notebook; presumably they arrive via the wildcard import from train_bert — confirm.
# NOTE(review): only X_train is transformed here; no matching features are
# built for X_test in this cell.
X_train_tf = tf_idf(X_train)
X_train_word2vec = word2vec(X_train)


## VADER & TextBlob

In [None]:
# run vader and textblob
# Each helper receives the full review DataFrame and returns a pair that is
# unpacked as (first element, accuracy). What the first element holds is not
# visible from this notebook.
# NOTE(review): both are given all of df rather than the train/test split —
# presumably because these scorers need no fitting; confirm.
vader, vader_accuracy = train_vader(df)
textblob, textblob_accuracy = train_textblob(df)

In [None]:
# Collect the accuracies in a single table, one row per model.
results = pd.DataFrame(
    {"Accuracy": [vader_accuracy, textblob_accuracy]},
    index=['VADER', 'TextBlob'],
)

In [None]:
# Index label (i.e. model name) of the row with the highest accuracy.
results['Accuracy'].idxmax()

## BERT

In [None]:
X_train, X_test, y_train, y_test = preprocess_bert(df)

In [None]:
train_dataset, test_dataset = initialise_bert(X_train, X_test, y_train, y_test)

In [None]:
# Train BERT on the datasets built above. The returned object is read through
# its `.hyperparameters` and `.metrics` attributes in the next cell.
# NOTE(review): presumably this runs a hyperparameter search and returns the
# best trial — confirm in train_bert.
best_trial = train_bert(train_dataset, test_dataset)

In [None]:
print(best_trial.hyperparameters)
# `res_dict` is never created in this notebook, so assigning into it fails on
# a fresh kernel. Record the score in the `results` table instead, next to the
# VADER and TextBlob accuracies (.loc with a new label appends a row).
bert_accuracy = best_trial.metrics['accuracy']
results.loc['BERT', 'Accuracy'] = bert_accuracy

## XGBoost

In [None]:
# Tune XGBoost using accuracy as the selection metric.
# X_train_tf / X_train_word2vec were built from the X_train of the train-test
# split cell, so y_train has to be the labels from that same split.
xgb_best_estimator = train_xgboost(
    X_train_tf,
    X_train_word2vec,
    y_train,
    metric="accuracy",
)


# Model comparison

In [None]:
# performance metrics for test data
def model_comparison(best_train_estimator, X_test, y_test):
    """Print accuracy, recall, precision and F1 of an estimator on test data.

    Args:
        best_train_estimator: fitted object exposing ``predict(X_test)``.
        X_test: test features, passed unchanged to ``predict``.
        y_test: true labels that the predictions are scored against.

    Returns:
        None. The four scores are printed.
    """
    # NOTE(review): the metric functions are not imported explicitly in this
    # notebook; presumably they come in through a wildcard import — confirm.
    y_pred = best_train_estimator.predict(X_test)
    # evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # The arguments follow the order of the labels in the template
    # (accuracy, recall, precision, f1) so each value sits under its own name.
    print("""Test accuracy is {}.
             Test Recall is {}.
             Test Precision is {}.
             Test f1 score is {}""".format(acc, recall, pre, f1))

In [None]:
# model_comparison needs the test features and labels as well as the estimator.
# NOTE(review): X_test must be in the same feature representation that
# train_xgboost fitted on — confirm before trusting the reported scores.
model_comparison(xgb_best_estimator, X_test, y_test)

Justification for selecting the best model