In [1]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
os.environ['KAGGLE_CONFIG_DIR'] = '/root/.kaggle'

import shutil
shutil.move('/content/kaggle(5)(1)(4).json', '/root/.kaggle/kaggle.json')

from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate using the Kaggle API
api = KaggleApi()
api.authenticate()

# Download the dataset
api.dataset_download_files('arhamrumi/amazon-product-reviews', path='.', unzip=True)


print("Dataset downloaded and extracted!")


Dataset URL: https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews
Dataset downloaded and extracted!


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW  # Updated import
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import random
from tqdm import tqdm

In [3]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import random
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast
import nlpaug.augmenter.word as naw
import nlpaug.model.word_embs as nmw

In [5]:
# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [6]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Set random seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [8]:
#Function to evaluate model
def evaluate_model(model, test_loader, device='cuda'):
    model.to(device)
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            with autocast('cuda'):
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    return accuracy, precision, recall, f1

In [9]:
import pandas as pd
df= pd.read_csv("Reviews.csv")

In [10]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [11]:
# Step 1: Handle missing values
print("Handling missing values...")
df['Text'] = df['Text'].fillna('')  # Fill missing text with empty string
df['Score'] = df['Score'].interpolate(method='linear')  # Interpolate missing scores
print("Missing values per column before dropping:")
print(df.isnull().sum())
df = df.dropna()  # Drop rows with any remaining missing values
print("Missing values after handling and dropping:", df.isnull().sum().sum())
print("Dataset shape after handling missing values:", df.shape)

Handling missing values...
Missing values per column before dropping:
Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64
Missing values after handling and dropping: 0
Dataset shape after handling missing values: (568401, 10)


In [12]:
print(df.isnull().sum())

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


In [13]:
print("Removing duplicates...")
df = df.drop_duplicates(subset=['Text', 'Score'])
print("Dataset shape after deduplication:", df.shape)

Removing duplicates...
Dataset shape after deduplication: (393656, 10)


In [14]:
# Step 3: Transform sentiment scores
print("Transforming sentiment scores...")
def map_sentiment(score):
    if score in [1, 2]:
        return 0  # Negative
    elif score == 3:
        return 1  # Neutral
    elif score in [4, 5]:
        return 2  # Positive
df['Sentiment'] = df['Score'].apply(map_sentiment)

Transforming sentiment scores...


In [15]:
# Step 3.5: Select only Text, Score, and Sentiment columns
print("Selecting only Text, Score, and Sentiment columns...")
df = df[['Text', 'Sentiment']]
print("Dataset shape after column selection:", df.shape)
print("Columns in dataset:", df.columns.tolist())

Selecting only Text, Score, and Sentiment columns...
Dataset shape after column selection: (393656, 2)
Columns in dataset: ['Text', 'Sentiment']


In [16]:
# Step 4: Balance dataset (1000 samples per class)
print("Balancing dataset...")
negative_df = df[df['Sentiment'] == 0].sample(n=1000, random_state=42)
neutral_df = df[df['Sentiment'] == 1].sample(n=1000, random_state=42)
positive_df = df[df['Sentiment'] == 2].sample(n=1000, random_state=42)
balanced_df = pd.concat([negative_df, neutral_df, positive_df]).reset_index(drop=True)
print("Balanced dataset shape:", balanced_df.shape)

Balancing dataset...
Balanced dataset shape: (3000, 2)


In [17]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
# Step 5: Text preprocessing
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers and special characters
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back
    return ' '.join(tokens)

print("Preprocessing text...")
balanced_df['Processed_Text'] = balanced_df['Text'].apply(preprocess_text)

Preprocessing text...


In [19]:
# Step 6: Prepare data for BERT and RoBERTa
def prepare_data(df, tokenizer, max_length=128):
    encodings = tokenizer(
        df['Processed_Text'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    labels = torch.tensor(df['Sentiment'].values)
    dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],  # Corrected line
        labels
    )
    return dataset

In [20]:
# Split data
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42, stratify=balanced_df['Sentiment'])
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Train set shape: (2400, 3)
Test set shape: (600, 3)


In [21]:
# Initialize tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [22]:
# Prepare datasets
train_dataset_bert = prepare_data(train_df, bert_tokenizer)
test_dataset_bert = prepare_data(test_df, bert_tokenizer)
train_dataset_roberta = prepare_data(train_df, roberta_tokenizer)
test_dataset_roberta = prepare_data(test_df, roberta_tokenizer)

In [23]:
# Create data loaders
batch_size = 32
train_loader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=batch_size)
train_loader_roberta = DataLoader(train_dataset_roberta, batch_size=batch_size, shuffle=True)
test_loader_roberta = DataLoader(test_dataset_roberta, batch_size=batch_size)

In [24]:
# Step 7: BERT Model Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def train_model(model, train_loader, optimizer, epochs=5):
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


Using device: cuda


In [25]:
def evaluate_model(model, test_loader):
    model.to(device)
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    return accuracy, precision, recall, f1

print("Training BERT model...")
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
optimizer_bert = AdamW(bert_model.parameters(), lr=2e-5)
train_model(bert_model, train_loader_bert, optimizer_bert, epochs=5)
bert_metrics = evaluate_model(bert_model, test_loader_bert)
print("BERT Evaluation Metrics:")
print(f"Accuracy: {bert_metrics[0]:.4f}, Precision: {bert_metrics[1]:.4f}, Recall: {bert_metrics[2]:.4f}, F1-Score: {bert_metrics[3]:.4f}")

Training BERT model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 75/75 [00:43<00:00,  1.74it/s]


Epoch 1/5, Loss: 1.0482


Epoch 2/5: 100%|██████████| 75/75 [00:44<00:00,  1.70it/s]


Epoch 2/5, Loss: 0.7638


Epoch 3/5: 100%|██████████| 75/75 [00:46<00:00,  1.60it/s]


Epoch 3/5, Loss: 0.5508


Epoch 4/5: 100%|██████████| 75/75 [00:46<00:00,  1.60it/s]


Epoch 4/5, Loss: 0.3608


Epoch 5/5: 100%|██████████| 75/75 [00:46<00:00,  1.60it/s]


Epoch 5/5, Loss: 0.2251
BERT Evaluation Metrics:
Accuracy: 0.6933, Precision: 0.7078, Recall: 0.6933, F1-Score: 0.6951


In [27]:
# Step 8: RoBERTa Model Training
print("Training RoBERTa model...")
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
optimizer_roberta = AdamW(roberta_model.parameters(), lr=2e-5)
train_model(roberta_model, train_loader_roberta, optimizer_roberta, epochs=5)
roberta_metrics = evaluate_model(roberta_model, test_loader_roberta)
print("RoBERTa Evaluation Metrics:")
print(f"Accuracy: {roberta_metrics[0]:.4f}, Precision: {roberta_metrics[1]:.4f}, Recall: {roberta_metrics[2]:.4f}, F1-Score: {roberta_metrics[3]:.4f}")

Training RoBERTa model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 75/75 [00:46<00:00,  1.63it/s]


Epoch 1/5, Loss: 0.9746


Epoch 2/5: 100%|██████████| 75/75 [00:46<00:00,  1.63it/s]


Epoch 2/5, Loss: 0.7470


Epoch 3/5: 100%|██████████| 75/75 [00:47<00:00,  1.57it/s]


Epoch 3/5, Loss: 0.5969


Epoch 4/5: 100%|██████████| 75/75 [00:47<00:00,  1.60it/s]


Epoch 4/5, Loss: 0.4645


Epoch 5/5: 100%|██████████| 75/75 [00:47<00:00,  1.58it/s]


Epoch 5/5, Loss: 0.3553
RoBERTa Evaluation Metrics:
Accuracy: 0.7033, Precision: 0.6996, Recall: 0.7033, F1-Score: 0.6986


In [28]:
# Step 9: Compare models
print("\nModel Comparison:")
print(f"BERT - Accuracy: {bert_metrics[0]:.4f}, F1-Score: {bert_metrics[3]:.4f}")
print(f"RoBERTa - Accuracy: {roberta_metrics[0]:.4f}, F1-Score: {roberta_metrics[3]:.4f}")


Model Comparison:
BERT - Accuracy: 0.6933, F1-Score: 0.6951
RoBERTa - Accuracy: 0.7033, F1-Score: 0.6986
