# 1. Libraries

In [28]:
import pandas as pd
import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# 2. Data loading

In [29]:
# Training reviews. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the stray output below this cell looks like the tail of a
# DtypeWarning for this read — confirm the warning is gone after this change.
data = pd.read_csv("data/1429_1.csv", low_memory=False)
# Second review export, loaded separately from the training file.
data_test = pd.read_csv("data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")
# data_test = pd.read_csv("data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")

  data = pd.read_csv("data/1429_1.csv")


In [None]:
# DataFrame.info() prints its summary and returns None, so call it on its own
# rather than handing its return value to display() (which renders "None").
data.info()
display(data.sample(5))

# 3. Preprocessing

### 3.1 Dataset preprocessing

In [None]:
# Keep only the three columns used below: product categories, rating and review text.
data = data[["categories","reviews.rating","reviews.text"]]
# Print the column dtypes and non-null counts of the reduced frame.
data.info()

In [None]:
# Drop rows with NaN values and renumber the rows from 0.
# drop=True discards the old row labels instead of inserting them as an
# extra "index" column in the frame.
data = data.dropna().reset_index(drop=True)
data.info()

In [None]:
# Same thing for the test data: keep the three relevant columns, drop rows
# with NaN values and renumber the rows.
data_test = data_test[["categories","reviews.rating","reviews.text"]]
# drop=True discards the old row labels instead of inserting them as an
# extra "index" column in the frame.
data_test = data_test.dropna().reset_index(drop=True)
data_test.info()

### 3.2 Features preprocessing

In [23]:
# Clean text data
def get_wordnet_pos(word):
    """
    Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with nltk.pos_tag(); the first letter of
    the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    leading_letter = pos_tag[0]

    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if leading_letter in wordnet_by_letter:
        return wordnet_by_letter[leading_letter]
    return wordnet.NOUN

def data_cleaning(text):
    """
    Normalize a raw review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace every run of characters that
    are neither letters nor whitespace with a space, remove single-character
    words, collapse repeated spaces, tokenize, drop English stopwords and
    lemmatize each remaining token with the part of speech returned by
    get_wordnet_pos().

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]+', ' ', text) # Regex to remove all the special characters and numbers
    text = re.sub(r'\b\w\b', '', text) # Regex to remove all single characters
    text = re.sub(r' {2,}', ' ', text).strip() # Regex to substitute multiple spaces with single space

    tokenized_text = nltk.word_tokenize(text)

    # Build these once per call rather than once per token: loading the
    # stopword list and constructing a lemmatizer inside the comprehension
    # repeats the same work for every word. A set also gives O(1) membership tests.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokenized_text
        if word not in stop_words
    ]

    return " ".join(lemmas) # Transforms the list of words back into a single string

In [None]:
# Clean every review in place (tokenizing and lemmatizing each row is slow on the full dataset).
data["reviews.text"] = data["reviews.text"].apply(data_cleaning)
# Preview a few cleaned rows instead of rendering the whole frame.
data.head()

In [11]:
# # SAVE CLEAN DATA BACKUP
# data.to_csv("data_backup.csv")

# data = pd.read_csv("data_backup.csv").dropna()
# data

### 3.3 Classes preprocessing

In [27]:
# Collapse the rating values 1-5 into three sentiment classes.
# NOTE(review): rating 3 is mapped to 'Negative' and rating 4 to 'Neutral' —
# confirm this grouping is intentional.
data["reviews.rating"] = data["reviews.rating"].replace(
    {
        1: 'Negative',
        2: 'Negative',
        3: 'Negative',
        4: 'Neutral',
        5: 'Positive',
    }
)
# Show how many reviews fall into each class.
data["reviews.rating"].value_counts()

reviews.rating
Positive    23775
Neutral      8541
Negative     2311
Name: count, dtype: int64

### 3.4 Train-Test-Split

In [14]:
# X = data["reviews.text"]
# y = data["reviews.rating"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [24]:
# Training features/labels come straight from the main frame.
X_train, y_train = data["reviews.text"], data["reviews.rating"]

# The test frame gets the same text cleaning and the same rating-to-class
# mapping applied here, at split time.
X_test = data_test["reviews.text"].apply(data_cleaning)
y_test = data_test["reviews.rating"].replace(
    {
        1: 'Negative',
        2: 'Negative',
        3: 'Negative',
        4: 'Neutral',
        5: 'Positive',
    }
)

### 3.5 Vectorization

In [None]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit on the training text only and transform it in the same step.
X_train_vec = vectorizer.fit_transform(X_train)
# Reuse the fitted vectorizer for the test text (transform only, no refit).
X_test_vec = vectorizer.transform(X_test)

In [21]:
# Save the fitted vectorizer. The context manager closes the file handle as
# soon as the dump finishes instead of leaving it open.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# 4. Model selection

### 4.1 Multinomial Naive Bayes

In [None]:
# Grid-Search
param_grid = {
    'alpha': [0.05, 0.1, 0.2, 0.3],  # Regularization parameter
    'fit_prior': [True, False],  # Whether to learn class priors
}

# 5-fold cross-validated search over the grid above, scored on accuracy.
model = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
model.fit(X_train_vec, y_train)

# Print the best parameters and best score
print("Best Parameters: ", model.best_params_)
print("Best Score: ", model.best_score_)

In [39]:
# Save the trained model. The context manager closes the file handle as soon
# as the dump finishes instead of leaving it open.
with open("model_MultinomialNB.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

### 4.2 Support Vector Classifier

In [None]:
# Support vector classifier; class_weight="balanced" reweights classes
# inversely to their frequency in y_train.
model = SVC(class_weight="balanced")

# # Grid-Search
# param_grid = {
#     'kernel': ['linear', 'rbf', 'sigmoid']
# }

# model = GridSearchCV(estimator=model, param_grid=param_grid, 
#                            cv=5, scoring='accuracy')

# Fit the classifier. Without this call the cell leaves `model` unfitted, so
# the pickle written in the next cell cannot be used for predict().
model.fit(X_train_vec, y_train)

# # Print the best parameters and best score
# print("Best Parameters: ", model.best_params_)
# print("Best Score: ", model.best_score_)

In [20]:
# Save the trained model. The context manager closes the file handle as soon
# as the dump finishes instead of leaving it open.
with open("model_SVC.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

### 4.3 Transfer Learning

In [None]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')

# truncation=True is forwarded to the tokenizer so reviews longer than the
# model's maximum sequence length are cut off instead of raising an error.
# NOTE(review): X_test holds the cleaned (stopword-free, lemmatized) text;
# confirm that is the intended input for a pretrained transformer.
ratings = classifier(X_test.tolist(), truncation=True)

# Lookup from the pipeline's lowercased labels to the class names used by y_test.
label_names = {
    'label_0': 'Negative',
    'label_1': 'Neutral',
    'label_2': 'Positive',
}

# Extract the label values into a list
predicted_labels_raw = [result['label'].lower() for result in ratings]
# Labels missing from the lookup are passed through unchanged.
predicted_labels = [label_names.get(label, label) for label in predicted_labels_raw]

In [60]:
# Save the pipeline's raw predictions (the `ratings` list, not a model object).
# The file name is unchanged; the context manager closes the file handle as
# soon as the dump finishes.
with open("model_TransferLearning.pkl", "wb") as ratings_file:
    pickle.dump(ratings, ratings_file)

# 5. Model evaluation

In [17]:
# Load the chosen model. The context manager closes the file handle once the
# model is read.
# pickle.load executes arbitrary code from the file: only load pickles written
# by this notebook, never files from an untrusted source.
with open("model_SVC.pkl", "rb") as model_file:
    best_model = pickle.load(model_file)

In [None]:
# Predict a class for every vectorized test review, then make sure the
# result is a NumPy array.
y_pred = best_model.predict(X_test_vec)
y_pred = np.asarray(y_pred)

In [None]:
# Per-class precision/recall/F1 plus the aggregate rows, as a nested dict.
acc = classification_report(y_test, y_pred, output_dict=True)
# Render the report as a table (one row per class/aggregate) rather than
# printing the raw nested dict, which is hard to read.
display(pd.DataFrame(acc).T)

In [None]:
# Confusion matrix
class_labels = ["Negative", "Neutral", "Positive"]

# Pass the label order explicitly so the matrix rows/columns always match the
# tick labels below, even if a class is missing from y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10,8))
sns.heatmap(conf_matrix,
            annot=True, 
            fmt="d",
            cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels,
            )
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Compute the per-class metrics from the actual predictions instead of
# hard-coding them, so the chart always reflects the model evaluated above.
class_names = ["Negative", "Neutral", "Positive"]
metric_names = ["precision", "recall", "f1-score"]
computed_report = classification_report(y_test, y_pred, labels=class_names, output_dict=True)

# Same shape as before: {class name: {metric name: value}}.
report_dict = {
    class_name: {metric: computed_report[class_name][metric] for metric in metric_names}
    for class_name in class_names
}

# Convert the dictionary to a DataFrame
report = pd.DataFrame(report_dict).T  # Transpose to make classes rows and metrics columns

# Plot
report.plot(kind="bar", figsize=(10, 8))
# plt.title("Precision, Recall, and F1-Score by Class")
# plt.xlabel("Class")
plt.xticks(rotation=0)  # Horizontal tick labels (equivalent to the previous 360 degrees)
plt.ylabel("Score")
plt.ylim(0, 1)  # To keep the y-axis within 0-1 range
plt.legend()
plt.show()