In [None]:
import pandas as pd

# Load the raw Amazon reviews table from the local datasets folder.
data = pd.read_csv(filepath_or_buffer="datasets/amazon_reviews.csv")

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Count how many rows fall into each class_index value.
data.loc[:, 'class_index'].value_counts()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Replace every missing value with an empty string, then re-check that no
# nulls remain. Rebinding (rather than inplace=True) keeps the cell chainable.
data = data.fillna('')
data.isna().sum()

In [None]:
# Derive a zero-based label column by shifting class_index down by one.
data['label'] = data['class_index'].sub(1)
data.head(n=5)


In [None]:
# Join title and body into a single text column, separated by one space.
data['review'] = data['review_title'].add(' ').add(data['review_text'])
data.head(n=5)

In [None]:
# Work on a fixed-seed random subset of 5000 rows.
downsized = data.sample(5000, random_state=42)

In [None]:
#Code to clean review text data
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from langdetect import detect

# Make sure the NLTK corpora used below are available locally
nltk.download("stopwords")
nltk.download("wordnet")

# Build the shared NLP helpers that cleanReview relies on
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
stop_words = {word for word in stopwords.words("english")}

# Patterns are compiled once here instead of being rebuilt on every call.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_FANCY_QUOTES_RE = re.compile(r'[‘’“”…]')


def cleanReview(text):
    """Normalize one raw review string into a cleaned, space-joined token string.

    Pipeline: lowercase -> strip HTML -> drop [bracketed] spans -> replace
    punctuation with spaces -> drop tokens containing digits -> replace curly
    quotes/ellipsis with spaces -> remove stop words -> TextBlob spell
    correction -> Porter stemming -> WordNet lemmatization.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned text as a single string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove HTML tags. A space separator keeps words on either side of a
    #    tag (e.g. "good<br>bad") from being fused into one token.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # 3. Remove square-bracketed spans
    text = _BRACKETED_RE.sub('', text)

    # 4. Replace punctuation with a space rather than deleting it, so that
    #    "great.Highly" does not become "greathighly" and contractions split
    #    into pieces that the stop-word filter below can match individually.
    text = _PUNCTUATION_RE.sub(' ', text)

    # 5. Remove any token that contains a digit
    text = _DIGIT_TOKEN_RE.sub('', text)

    # 6. Replace curly quotes / ellipsis with a space (same reasoning as step 4).
    #    Newlines need no special handling: split() below treats them as whitespace.
    text = _FANCY_QUOTES_RE.sub(' ', text)

    # 7. Remove stopwords
    text = " ".join(word for word in text.split() if word not in stop_words)

    # 8. Spell correction
    text = str(TextBlob(text).correct())

    # 9. Stemming
    text = " ".join(stemmer.stem(word) for word in text.split())

    # 10. Lemmatization
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())

    return text

In [None]:
# Run the cleaning pipeline over every combined review and keep the result
# in a new "text" column.
downsized["text"] = downsized['review'].map(cleanReview)
downsized.head(n=5)


In [None]:
from datasets import Dataset

# Build the Hugging Face dataset with the DataFrame-specific constructor
# (from_dict is documented for plain dicts). preserve_index=False keeps the
# pandas index from being added as an extra column.
dataset = Dataset.from_pandas(downsized, preserve_index=False)

# 80/20 train/test split; the fixed seed makes the partition reproducible
# across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
import torch
import optuna
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments)

# Use CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer matching the "bert-base-uncased" checkpoint loaded in model_init
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize the "text" field of `example` with padding="max_length" and truncation on."""
    encoded = tokenizer(example["text"], truncation=True, padding="max_length")
    return encoded

# Tokenize every split in batches, then expose only the model inputs and the
# label as torch tensors.
dataset = dataset.map(function=tokenize_function, batched=True)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Model Initialization
def model_init():
    """Return a new two-label "bert-base-uncased" classifier moved to `device`."""
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
    )
    return model.to(device)

# Hyperparameter tuning function
def objective(trial):
    """Train one Optuna trial and return its validation loss.

    Samples learning rate, batch size and epoch count from `trial`, fine-tunes
    a fresh model from `model_init` on a subset of the training split, and
    scores it on a validation subset that is also carved out of the training
    split. The test split is deliberately not touched here so it stays
    unseen until the final evaluation.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The "eval_loss" reported by `trainer.evaluate()` (lower is better).
    """
    learning_rate = trial.suggest_float("learning_rate", 2e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)

    # Fixed-seed shuffle so every trial sees the same tuning subsets. Up to
    # 2000 rows are used for training and up to 500 disjoint rows for
    # validation; the caps shrink automatically on a smaller training split.
    shuffled_train = dataset["train"].shuffle(seed=42)
    n_train = min(2000, int(len(shuffled_train) * 0.8))
    n_val = min(500, len(shuffled_train) - n_train)
    tuning_train = shuffled_train.select(range(n_train))
    tuning_val = shuffled_train.select(range(n_train, n_train + n_val))

    # NOTE(review): newer transformers releases spell `evaluation_strategy`
    # as `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        # One directory per trial so checkpoints from different trials do not
        # overwrite or mix with each other.
        output_dir=f"./results/trial_{trial.number}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Cap retained checkpoints so repeated trials do not fill the disk.
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=tuning_train,
        eval_dataset=tuning_val,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_loss"]  # Minimize loss

# Run hyperparameter tuning. The sampler is seeded so the sequence of
# suggested hyperparameters is the same on every Restart & Run All.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)

In [None]:
# Train final model with best hyperparameters
best_params = study.best_params

# Hold out 10% of the training split for per-epoch evaluation and best-
# checkpoint selection, so the test split is only used for the final score.
final_split = dataset["train"].train_test_split(test_size=0.1, seed=42)

final_training_args = TrainingArguments(
    output_dir="./final_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    learning_rate=best_params["learning_rate"],
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=final_split["train"],
    eval_dataset=final_split["test"],
)

trainer.train()

# Report performance on the held-out test split, which took no part in
# training or checkpoint selection in this cell.
eval_results = trainer.evaluate(eval_dataset=dataset["test"])
print("Final Model Evaluation:", eval_results)