# Requirements

In [None]:
!git clone https://github.com/pr-Mais/mm_sentiment_analysis.git

In [None]:
%cd mm_sentiment_analysis

In [None]:
!pip install -r requirements.txt

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

%matplotlib inline
# pd.set_option('precision', 2)

In [None]:
!mkdir data
!wget -O data/reviews_makkah_raw.csv ".."
!wget -O data/reviews_medina_raw.csv ".." 

# Data exploration

In [None]:
import src.data as data

# Raw review exports downloaded above, one CSV per city.
makkah_reviews_path = './data/reviews_makkah_raw.csv'
medina_reviews_path = './data/reviews_medina_raw.csv'
paths = [makkah_reviews_path, medina_reviews_path]

# Hand both CSV paths to the project's loader; the result is kept in `df`.
df = data.import_data(paths)

In [None]:
df.head()

# Data preparation

This step prepares the data set by exploring it, applying normalization where needed, and checking for missing and duplicate values.

The data was scraped from **booking.com** and consists of English-language hotel reviews by visitors to Madinah and Makkah, written both before and after the COVID-19 pandemic.

## Exploring the data

In [None]:
data.check_data(df)

In [None]:
# some keys in the data
negative_key = 'review_negative'
positive_key = 'review_positive'
date_key = 'date_reviewed'

df = data.prepare_data(df, positive_key, negative_key, date_key)

# TODO answer the question: what does NaN represent in this dataset?

In [None]:
df = data.clean_data(df)

In [None]:
df.head()

**TODO:** clean numbers and duplicate reviews.

# Making features ready

In [None]:
df = data.encode_labels(df)

# 1 = positive, 0 = negative

In [None]:
df.head()

In [None]:
df[df.target == 1].shape

In [None]:
df[df.target == 0].shape

In [None]:
import pandas as pd

# The positive and negative reviews are unbalanced, so the positive class is
# down-sampled: keep a random 12% of the positive reviews and every negative one.
# The fixed random_state makes the sample (and the shuffle below) reproducible.
# NOTE: this cell overwrites `df`; re-running it without re-running the cells
# above would down-sample the already down-sampled positives again.
reviews_pos = df[df.target == 1].sample(frac=0.12, random_state=42)
reviews_neg = df[df.target == 0]
# sample(frac=1) shuffles the rows so the two classes are interleaved.
df = pd.concat([reviews_pos, reviews_neg]).sample(frac=1, random_state=42)

# Assign features and targets
features = df.review.to_numpy()
target = df.target

In [None]:
reviews_neg.shape

In [None]:
reviews_pos.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. stratify=target keeps the class ratio
# the same in both splits, and the fixed random_state makes the split (and every
# score computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=42)

In [None]:
X_train.shape

In [None]:
y_train.shape

# Machine Learning Models

## Bag of Words

In [None]:
# Use every n-gram size from 1 to 3 words (unigrams, bigrams and trigrams);
# this (min_n, max_n) pair is passed to CountVectorizer below.
ngram_range = (1, 3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer over word n-grams of the sizes given by `ngram_range`.
count_vect = CountVectorizer(ngram_range=ngram_range, analyzer='word')

# Learn the vocabulary from the training reviews and count its n-grams in them.
X_train_counts = count_vect.fit_transform(X_train)
train_features = count_vect.get_feature_names_out()

# Count the already-fitted vocabulary in the test reviews (no re-fitting).
X_test_counts = count_vect.transform(X_test)
# Read from the same fitted vectorizer, so this is the vocabulary learned above.
test_features = count_vect.get_feature_names_out()

In [None]:
# Checking what it looks like
pd.DataFrame(train_features, columns=['gram'])

In [None]:
pd.DataFrame(test_features, columns=['gram'])

In [None]:
X_train_counts.shape

## TF-IDF Model

In [None]:
import numpy as np

def normalize_data(data):
    """Min-max scale ``data`` so its smallest value maps to 0 and its largest to 1.

    Args:
        data: Numeric array-like that supports element-wise arithmetic
            (e.g. a NumPy array or a pandas Series).

    Returns:
        An object of the same kind as ``data - scalar`` holding the scaled values.
        When every value is identical the range is zero; all-zero floats are
        returned in that case instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    if span == 0:
        # Constant input: avoid dividing by zero, keep the input's container type.
        return (data - lowest) * 0.0
    return (data - lowest) / span

def visualize_features(tf_idf, feats: np.ndarray):
  """Display the vocabulary ranked by summed, min-max-normalized TF-IDF weight.

  Args:
    tf_idf: 2-D document-term matrix (SciPy sparse matrix or dense array) with
      one column per entry of ``feats``.
    feats: The n-gram strings, in the same order as the columns of ``tf_idf``.

  Returns:
    None. The ranked table is shown with ``display``; it has one column with the
    terms and a ``rank`` column, sorted from highest to lowest rank.
  """
  # Column totals as a flat 1-D array. Summing a sparse matrix over axis 0 yields
  # a 1xN np.matrix while a dense array yields shape (N,); this handles both.
  sums = np.asarray(tf_idf.sum(axis=0)).ravel()

  # Name the term column after the n-gram size only when every term has the same
  # size; a mixed vocabulary (e.g. ngram_range=(1, 3)) gets the neutral 'ngram'.
  sizes = {len(term.split(' ')) for term in feats}
  size_names = {1: 'unigram', 2: 'bigram', 3: 'trigram'}
  gram = size_names.get(next(iter(sizes)), 'ngram') if len(sizes) == 1 else 'ngram'

  ranking = pd.DataFrame({gram: list(feats), 'rank': sums})
  ranking['rank'] = normalize_data(ranking['rank'])
  words = ranking.sort_values('rank', ascending=False)

  display(words)

In [None]:
# Fit the TF-IDF weighting on the training counts only, then apply that same
# fitted weighting to both the training and the test counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts, y_train)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [None]:
visualize_features(X_train_tfidf, train_features)

In [None]:
from sklearn.model_selection import GridSearchCV

def train_cv(X, y, classifier, param_grid, folds):
  """Grid-search ``classifier`` over ``param_grid`` and return the best parameters.

  Args:
    X: Training features passed to ``GridSearchCV.fit``.
    y: Training targets passed to ``GridSearchCV.fit``.
    classifier: The estimator to tune.
    param_grid: Parameter grid handed to ``GridSearchCV``.
    folds: Value used as the ``cv`` argument of ``GridSearchCV``.

  Returns:
    The ``best_params_`` dict of the fitted search. The dict and the search's
    ``best_score_`` (accuracy) are also printed.
  """
  search = GridSearchCV(
      classifier,
      param_grid,
      scoring='accuracy',
      cv=folds,
      return_train_score=False,
      verbose=1,
  )
  search.fit(X, y)

  best_params = search.best_params_
  print(best_params)
  print(search.best_score_)

  return best_params

## kNN Model

In [None]:
# Candidate neighbourhood sizes: k = 1 .. 30.
k_range = [*range(1, 31)]
param_grid = {'n_neighbors': k_range}
classifier = KNeighborsClassifier(n_jobs=-1)
# 3-fold grid search on the TF-IDF training features; `k` is the best-parameter
# dict returned by train_cv, read below as k['n_neighbors'].
k = train_cv(X_train_tfidf, y_train, classifier, param_grid, 3)

In [None]:
knn = KNeighborsClassifier(n_neighbors=k['n_neighbors'])
knn.fit(X_train_tfidf, y_train)

In [None]:
y_pred = knn.predict(X_test_tfidf)

In [None]:
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Rows are the actual classes, columns the predicted ones. labels=[0, 1] pins the
# order so it always lines up with `labelsStr` (0 = negative, 1 = positive).
confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])

matrix_df = pd.DataFrame(confusion_matrix)
labelsStr = ['negative', 'positive']

# Apply the seaborn settings first, then create the figure and its axes together
# so that figsize applies to the very axes the heatmap is drawn on.
sns.set(font_scale=1.3)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="crest", linewidth=2)

# y_pred was produced by the kNN model fitted above.
ax.set_title('Confusion Matrix - kNN')
ax.set_xlabel("Predicted", fontsize=15)
ax.set_xticklabels(labelsStr)
ax.set_ylabel("Actual", fontsize=15)
ax.set_yticklabels(list(labelsStr), rotation=0)
plt.show()

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Pick up to 10 distinct test rows; the fixed seed keeps the sample reproducible.
n_samples = min(10, X_test_tfidf.shape[0])
sample_idx = np.random.default_rng(42).choice(
    X_test_tfidf.shape[0], n_samples, replace=False)

# Predict the sampled rows once, instead of re-predicting the whole test set on
# every loop iteration.
sample_pred = knn.predict(X_test_tfidf[sample_idx])

# Build the table in one go; the index holds each review's position in X_test.
test_sentiment = pd.DataFrame(
    {
        'Review': X_test[sample_idx],
        'Sentiment': np.where(sample_pred == 0, 'negative', 'positive'),
    },
    index=sample_idx,
)

display(test_sentiment)

In [None]:
test_sentiment.to_csv('test_sample.csv')

## RF Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this grid is defined but never passed to a search - TODO confirm
# whether a train_cv run with 1000 trees was intended.
param_grid = dict(n_estimators=[1000])
# The target is a binary class label (1 = positive, 0 = negative), so fit a
# classifier rather than a regressor. predict() then returns 0/1 labels, which
# the `> 0.5` thresholding applied further down leaves unchanged.
# random_state makes the fitted forest reproducible.
classifier = RandomForestClassifier(n_jobs=-1, random_state=42)
classifier.fit(X_train_tfidf, y_train)

In [None]:
y_pred = classifier.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score
y_pred = np.where(y_pred > 0.5, 1, 0)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
from src import utils
from src import experiment

utils.check_device()

In [None]:
# First experiment with BERT
# Tokenizer and model both come from the same 'bert-base-uncased' checkpoint;
# the model gets a 2-label sequence-classification head.
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# The experiment is given the training split only.
bert = experiment.TransformerExperiment(
    name='BERT',
    tokenizer=bert_tokenizer,
    model=bert_model,
    X=X_train,
    y=y_train,
    batch_size=32,
    epochs=4,
)

# Second experiment with RoBERTa
# NOTE(review): do_lower_case looks like a BERT-specific tokenizer option -
# TODO confirm it has any effect on RobertaTokenizer.
roberta = experiment.TransformerExperiment(name='RoBERTa',
                                           tokenizer=RobertaTokenizer.from_pretrained(
                                               'roberta-base', do_lower_case=True),
                                           model=RobertaForSequenceClassification.from_pretrained(
                                               'cardiffnlp/twitter-roberta-base-emotion',
                                               num_labels=2,
                                               # The checkpoint ships a classification head
                                               # trained for its own emotion label set, not
                                               # for 2 labels; without this flag loading
                                               # fails on the size mismatch. With it, the
                                               # head is re-initialised for 2 labels and
                                               # learned during fine-tuning.
                                               ignore_mismatched_sizes=True,
                                               output_attentions=False,
                                               output_hidden_states=False,
                                           ),
                                           X=X_train,
                                           y=y_train,
                                           batch_size=32,
                                           epochs=4)


In [None]:
bert.tokenize()
bert.create_dataset()
bert.train()
bert.print_stats()

In [None]:
roberta.tokenize(max_len=512)
roberta.create_dataset()
roberta.train()
roberta.print_stats()