In [4]:
# Install necessary packages (only needed once)
!pip install -q transformers datasets scikit-learn nltk

# Imports
import pandas as pd
import numpy as np
import re
import nltk
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
from nltk.corpus import stopwords
from google.colab import files
import matplotlib.pyplot as plt

nltk.download('stopwords')

# Upload CSV file
uploaded = files.upload()

# Load into DataFrame
df = pd.read_csv(next(iter(uploaded)))
df = df[['reviews.text', 'reviews.rating']].dropna()

# Binary sentiment: rating >= 4 is positive, else negative
df['label'] = df['reviews.rating'].apply(lambda x: 1 if x >= 4 else 0)

# Clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove punctuation and numbers
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['cleaned_review'] = df['reviews.text'].apply(clean_text)

min_count = df['label'].value_counts().min()
df_balanced = df.groupby('label').sample(min_count, random_state=42)

# Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_balanced['cleaned_review'].tolist(),
    df_balanced['label'].tolist(),
    test_size=0.2,
    stratify=df_balanced['label'],
    random_state=42
)

# Load tokenizer and encode data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)

# Prepare Dataset objects
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'label': train_labels
})
val_dataset = Dataset.from_dict({
    'text': val_texts,
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'label': val_labels
})



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving 1429_1.csv to 1429_1 (4).csv


  df = pd.read_csv(next(iter(uploaded)))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Define model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training args
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss'
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train
trainer.train()

# Evaluate
preds_output = trainer.predict(val_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

print("\nClassification Report:")
print(classification_report(val_labels, preds))

print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, preds))

print("\nROC AUC Score:")
print(roc_auc_score(val_labels, preds))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mqaapostolico[0m ([33mqaapostolico-virginia-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.463803


Epoch,Training Loss,Validation Loss
1,No log,0.463803
2,No log,0.421047
3,0.376000,0.477174



Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       463
           1       0.82      0.88      0.85       462

    accuracy                           0.84       925
   macro avg       0.85      0.84      0.84       925
weighted avg       0.85      0.84      0.84       925


Confusion Matrix:
[[374  89]
 [ 55 407]]

ROC AUC Score:
0.8443638794610716
