# Binary Classification of Amazon Food Review Dataset

In [None]:
!pip list | grep "transformers"

sentence-transformers              3.3.1
transformers                       4.47.0


In [20]:
import re
import string
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification , Trainer, TrainingArguments

## Read the data

In [2]:
# Note: Can optionally use this if required
# df = pd.read_csv("hf://datasets/jhan21/amazon-food-reviews-dataset/Reviews.csv")

df = pd.read_csv("hf://datasets/jhan21/amazon-food-reviews-dataset/Reviews.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.shape

(568454, 10)

In [4]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [5]:
df.Score.value_counts()

Unnamed: 0_level_0,count
Score,Unnamed: 1_level_1
5,363122
4,80655
1,52268
3,42640
2,29769


In [6]:
df = df[df['Score'] != 3]

In [7]:
df.shape

(525814, 10)

In [8]:
df.Score.value_counts()

Unnamed: 0_level_0,count
Score,Unnamed: 1_level_1
5,363122
4,80655
1,52268
2,29769


## Convert the target variable to 0 and 1: 0 for Negative Reviews (1-2) and Positive Reviews (4-5)

In [9]:
df['target'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)
print(df['target'].value_counts())

target
1    443777
0     82037
Name: count, dtype: int64


## Handle Class Imbalance

In [10]:
# Split by class
positive = df[df['target'] == 1]
negative = df[df['target'] == 0]
positive_undersampled = positive.sample(n=len(negative), random_state=42)

# Combine and Shuffle
balanced_df = pd.concat([positive_undersampled, negative], axis=0)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_df['target'].value_counts())

target
1    82037
0    82037
Name: count, dtype: int64


## Preprocessing the Data

In [11]:
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^\w ]+','', text)
    text = re.sub(r'(http|https)?://\S+|www\.\S+','', text)
    text = ''.join(word for word in text if ord(word) < 128)
    text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'[\d]+','', text)
    text = ' '.join(word for word in text.split() if len(word)>1)
    text = ' '.join(text.split())
    # stopword and punct removal
    text = ' '.join([i for i in nltk.word_tokenize(text) if i not in
    english_stopwords and i not in string.punctuation])
    # removal of anything other than English letters
    text = re.sub('[^a-z]+', ' ', text)
    text = ' '.join([ps.stem(i) for i in nltk.word_tokenize(text)]) #stemming
    return text

In [13]:
balanced_df['cleaned_text'] = balanced_df['Text'].apply(lambda x: preprocess_text(x))

In [None]:
# Optional: Save the dataset
balanced_df.to_csv("pre_processed_amazon_reviews.csv", index=False)

In [None]:
balanced_df['cleaned_text'].head()

0    abc hair read ton materi product best hair typ...
1    realli enjoy coffe skeptic packag think wouldn...
2    chocol delight actual tast like desert describ...
3    sinc product launch cat hook get best best fan...
4    kid like lactos intoler may fine boy cup tea s...
Name: cleaned_text, dtype: object

## Do a Train Test Split: 70% Training, 15% Val and 15% Test

In [25]:
from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(balanced_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 114851, Val: 24611, Test: 24612


In [26]:
for x in [train_df, test_df, val_df]:
    print(x['target'].value_counts())

target
0    57520
1    57331
Name: count, dtype: int64
target
1    12316
0    12296
Name: count, dtype: int64
target
1    12390
0    12221
Name: count, dtype: int64


## Implement SVM

In [None]:
# SVM Pipeline
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', SVC(class_weight='balanced', kernel='rbf'))  # Default to RBF kernel
])

# Hyperparameter tuning
param_grid = {
    'clf__C': [0.1, 1, 10],
    'tfidf__ngram_range': [(1,1), (2,2)]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=3, n_jobs=-1)
gs_svm.fit(train_df['cleaned_text'], train_df['target'])

# Evaluation
svm_preds = gs_svm.predict(val_df['cleaned_text'])
print(classification_report(val_df['target'], svm_preds))

## Implement LSTM



In [None]:

# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        numericalized = [self.vocab.get(token, self.vocab['<unk>']) for token in text.split()]
        padded = numericalized[:self.max_length] + [self.vocab['<pad>']]*(self.max_length - len(numericalized))
        return torch.tensor(padded), torch.tensor(label)

# Build vocabulary
def build_vocab(texts, max_vocab=20000):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {'<pad>': 0, '<unk>': 1}
    for idx, (word, count) in enumerate(counter.most_common(max_vocab)):
        vocab[word] = idx + 2
    return vocab

# Load balanced dataset
texts = balanced_df['cleaned_text'].values
labels = balanced_df['target'].values

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Build vocabulary from training data
vocab = build_vocab(X_train)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Create datasets
max_length = 128  # Sequence length
train_dataset = TextDataset(X_train, y_train, vocab, max_length)
val_dataset = TextDataset(X_val, y_val, vocab, max_length)
test_dataset = TextDataset(X_test, y_test, vocab, max_length)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

Vocabulary size: 20002


In [None]:
# Define LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden.squeeze(0))

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    output_dim=2  # Binary classification
).to(device)

# Handle class imbalance
class_counts = torch.bincount(torch.tensor(labels))
class_weights = 1. / class_counts.float()
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    for texts, labels in loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation function
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts)
            loss = criterion(predictions, labels)
            total_loss += loss.item()
            all_preds.extend(predictions.argmax(dim=1).cpu())
            all_labels.extend(labels.cpu())
    return total_loss / len(loader), all_preds, all_labels

# Train for 5 epochs
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer)
    val_loss, val_preds, val_labels = evaluate(model, val_loader, criterion)
    print(f'Epoch {epoch+1}:')
    print(f'\tTrain Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')
    print(classification_report(val_labels, val_preds))

# Final test evaluation
_, test_preds, test_labels = evaluate(model, test_loader, criterion)
print("Test Performance:")
print(classification_report(test_labels, test_preds))

Epoch 1:
	Train Loss: 0.692 | Val Loss: 0.691
              precision    recall  f1-score   support

           0       0.50      0.99      0.66     12221
           1       0.59      0.02      0.04     12390

    accuracy                           0.50     24611
   macro avg       0.54      0.50      0.35     24611
weighted avg       0.54      0.50      0.35     24611

Epoch 2:
	Train Loss: 0.685 | Val Loss: 0.689
              precision    recall  f1-score   support

           0       0.70      0.04      0.07     12221
           1       0.51      0.98      0.67     12390

    accuracy                           0.51     24611
   macro avg       0.60      0.51      0.37     24611
weighted avg       0.60      0.51      0.37     24611

Epoch 3:
	Train Loss: 0.527 | Val Loss: 0.328
              precision    recall  f1-score   support

           0       0.89      0.83      0.86     12221
           1       0.84      0.90      0.87     12390

    accuracy                           0.86 

## Finetuning a RoBERTa Model

In [None]:
balanced_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,target,cleaned_text
0,281700,B007RTR9DS,A305XM71TF83CY,Keleigh Crigler Hadley,4,5,5,1342656000,Thought I would hate it,I have 4a/b/c hair and have read tons of mater...,1,abc hair read ton materi product best hair typ...
1,83621,B005ZBZLT4,AG2ERLQUGESI7,Adriana,0,1,5,1342396800,Fits in my Keurig and tastes great!,I really enjoy this coffee! I was skeptical ab...,1,realli enjoy coffe skeptic packag think wouldn...
2,299614,B0034YP1P8,A259W8AGBOT0G4,arietta night sparkel,0,0,5,1342051200,Very Unique,These Chocolates are so delightful. The actual...,1,chocol delight actual tast like desert describ...
3,367287,B001STX0RW,A3DG144UALEOJX,Hopelessly Caffeinated in Phoenix,3,3,5,1284940800,Phenomenal Product,Since this product's launch my cat has been ho...,1,sinc product launch cat hook get best best fan...
4,566650,B001EQ4J1K,A2BD6JNQZAFL97,Daivd Nooney,1,8,2,1203984000,thumbs down from the boys,Kids did not like it --- lactose intolerant my...,0,kid like lactos intoler may fine boy cup tea s...


In [None]:
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [None]:
device="cuda"

In [14]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [15]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load balanced dataset
texts = balanced_df['cleaned_text'].values
labels = balanced_df['target'].values

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert to DataFrames
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
val_df = pd.DataFrame({'text': X_val, 'label': y_val})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(x):
    return tokenizer(
        x['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )

# Prepare datasets
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True, num_proc=4)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True, num_proc=4)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True, num_proc=4)

# Define metric
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/114851 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/24611 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/24612 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'device' is not defined

In [21]:
# Model Fine-Tuning
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Training arguments
training_args = TrainingArguments(
    output_dir='/results',
    num_train_epochs=4,          # Adjusted epochs
    per_device_train_batch_size=64, # Adjusted batch size for memory
    per_device_eval_batch_size=128,
    evaluation_strategy="steps",
    logging_steps=100,
    fp16=True,
    gradient_accumulation_steps=4,
    save_strategy="steps",
    save_steps = 1000,
    learning_rate=5e-5,      # Fine-tuned learning rate
    weight_decay=0.01,
    load_best_model_at_end=True, # Load the best model based on val_loss
    metric_for_best_model="f1", # Use F1 score as the metric to select the best model
    greater_is_better=True,
    save_total_limit=1, # Save only the best model
    dataloader_num_workers=8 # Use multiple workers for data loading
)

from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks = [early_stopping]
)

print("Started Training")

# Start training
trainer.train()

# Final Evaluation on Test Set
trainer.evaluate(test_dataset)

# Predictions on the test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

print("\nFinal Test Performance:")
print(classification_report(labels, preds))



Started Training




Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4142,0.289847,0.878713,0.883703
200,0.2987,0.246415,0.898907,0.901113
300,0.2638,0.234336,0.908496,0.906517
400,0.236,0.219729,0.919386,0.9191
500,0.2215,0.200872,0.922149,0.921042
600,0.1862,0.202485,0.926537,0.927847
700,0.1805,0.185751,0.930031,0.929276
800,0.1798,0.182774,0.933119,0.933527
900,0.1695,0.186812,0.933891,0.93486
1000,0.1296,0.190517,0.934988,0.934394


Could not locate the best model at /kaggle/output/results/checkpoint-1600/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/output/results/checkpoint-1600'

In [23]:
# Final Evaluation on Test Set
trainer.evaluate(test_dataset)

# Predictions on the test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

print("\nFinal Test Performance:")
print(classification_report(labels, preds))



Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4142,0.289847,0.878713,0.883703
200,0.2987,0.246415,0.898907,0.901113
300,0.2638,0.234336,0.908496,0.906517
400,0.236,0.219729,0.919386,0.9191
500,0.2215,0.200872,0.922149,0.921042
600,0.1862,0.202485,0.926537,0.927847
700,0.1805,0.185751,0.930031,0.929276
800,0.1798,0.182774,0.933119,0.933527
900,0.1695,0.186812,0.933891,0.93486
1000,0.1296,0.190517,0.934988,0.934394





Final Test Performance:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93     12296
           1       0.93      0.94      0.94     12316

    accuracy                           0.94     24612
   macro avg       0.94      0.94      0.94     24612
weighted avg       0.94      0.94      0.94     24612



In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Test set predictions
test_texts = test_df['reviewText'].tolist()
test_labels = test_df['sentiment'].values

# SVM
svm_preds = gs_svm.predict(test_df['clean_text'])

# LSTM
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['clean_text']), maxlen=200)
lstm_preds = (model.predict(X_test) > 0.5).astype(int)

# RoBERTa
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
roberta_preds = trainer.predict(test_encodings).predictions.argmax(axis=1)

# Metrics comparison
from sklearn.metrics import accuracy_score, f1_score

results = {
    'SVM': {
        'Accuracy': accuracy_score(test_labels, svm_preds),
        'F1': f1_score(test_labels, svm_preds)
    },
    'LSTM': {
        'Accuracy': accuracy_score(test_labels, lstm_preds),
        'F1': f1_score(test_labels, lstm_preds)
    },
    'RoBERTa': {
        'Accuracy': accuracy_score(test_labels, roberta_preds),
        'F1': f1_score(test_labels, roberta_preds)
    }
}

pd.DataFrame(results).T


In [None]:
# Training curves comparison
plt.figure(figsize=(12, 6))

# LSTM Training
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='LSTM Train')
plt.plot(history.history['val_accuracy'], label='LSTM Val')
plt.title('LSTM Training Progress')
plt.ylabel('Accuracy')
plt.legend()

# RoBERTa Training
plt.subplot(1, 2, 2)
roberta_history = trainer.state.log_history
train_loss = [x['loss'] for x in roberta_history if 'loss' in x]
val_loss = [x['eval_loss'] for x in roberta_history if 'eval_loss' in x]
plt.plot(train_loss, label='RoBERTa Train')
plt.plot(val_loss, label='RoBERTa Val')
plt.title('RoBERTa Training Progress')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()
