In [None]:
!pip install transformers datasets

import inspect

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import load_metric as lm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer as BT, BertForSequenceClassification as BSC
from transformers import Trainer as TNR, TrainingArguments as TA


# Mount Google Drive at /content/drive so files stored there (such as the
# reviews CSV read in the next cell) are reachable from the notebook.
# This import is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the Amazon reviews CSV from the mounted Drive into a DataFrame
reviews_csv = '/content/drive/MyDrive/amazon_reviews.csv'
amazon_reviews = pd.read_csv(reviews_csv)

In [None]:
# Binary sentiment target: a rating of 4 or more is positive (1), anything else negative (0)
is_positive = amazon_reviews['overall'].ge(4)
amazon_reviews['Sentiment'] = is_positive.astype('int64')

# Keep a reproducible random 50% sample of the rows
subset_size = int(0.5 * len(amazon_reviews))
amazon_reviews = amazon_reviews.sample(n=subset_size, random_state=42)

# 60/40 train/test partition with a fixed seed
train_data, test_data = train_test_split(
    amazon_reviews,
    test_size=0.4,
    random_state=42,
)


In [None]:
# Fetch the tokenizer and the sequence-classification model from the same
# pre-trained uncased BERT checkpoint. The classification head is not part of
# the checkpoint and starts from fresh weights (see the load message below).
checkpoint_name = 'google/bert_uncased_L-12_H-768_A-12'
tokenizer = BT.from_pretrained(checkpoint_name)
model = BSC.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the data
def tokenize_fn(texts, tok=None, max_length=128):
    """Tokenize a batch of texts into fixed-length model inputs.

    Missing entries (None / NaN) are replaced with an empty string instead of
    being dropped. Dropping them would make the encodings shorter than a label
    list built from the same rows, shifting every later label onto the wrong
    text; keeping one output row per input row preserves index alignment.

    Args:
        texts: Iterable of raw texts. Non-missing values are converted with str().
        tok: Callable used to tokenize the batch. Defaults to the notebook-level
            `tokenizer`.
        max_length: Length each sequence is padded / truncated to.

    Returns:
        The value returned by `tok` for the cleaned batch, with exactly one
        entry per element of `texts`.
    """
    if tok is None:
        tok = tokenizer
    cleaned = [str(text) if pd.notna(text) else "" for text in texts]
    return tok(cleaned, padding="max_length", truncation=True, max_length=max_length)

# Encode the review text of the training split first, then the test split
train_encodings, test_encodings = (
    tokenize_fn(split['reviewText'].tolist())
    for split in (train_data, test_data)
)

In [None]:
# Pull the binary sentiment targets out of each split as plain Python lists
train_labels = train_data.loc[:, 'Sentiment'].to_list()
test_labels = test_data.loc[:, 'Sentiment'].to_list()

In [None]:
# Convert to torch dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample.
        labels: Sequence of labels, one per sample.

    Raises:
        ValueError: If any encoding field does not have exactly as many entries
            as there are labels. Without this check a mismatch would pair texts
            with the wrong labels or fail later with an IndexError mid-training.
    """

    def __init__(self, encodings, labels):
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, index):
        """Return one sample as a dict of tensors, with the target under 'labels'."""
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[index])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

# Wrap each split's encodings and labels in a CustomDataset
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Training arguments
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; pass whichever keyword the installed TrainingArguments accepts.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TA.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TA(
    output_dir='./results_amazon',
    logging_dir='./logs_amazon',
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch so the best epoch (by accuracy)
    # can be restored when training finishes.
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    seed=42,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_kwarg: 'epoch'},
)



In [None]:
# Load accuracy metric
# Binds the "accuracy" metric object returned by datasets.load_metric
# (imported at the top of the notebook as `lm`) to `accuracy_metric`.
# NOTE(review): load_metric appears to be deprecated upstream in favor of the
# separate `evaluate` package — confirm against the installed datasets version.
accuracy_metric = lm("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 from a (logits, labels) evaluation pair.

    Accuracy is calculated directly with NumPy so this callback does not depend
    on a metric object created through the deprecated `datasets.load_metric`.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class is taken as the argmax
            over the last axis of ``logits``.

    Returns:
        dict with ``'accuracy'`` (plain float) and ``'f1'`` (F1 of the positive
        class, as returned by precision_recall_fscore_support with
        ``average='binary'``).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Fraction of predictions that equal the reference label.
    accuracy = float(np.mean(preds == np.asarray(labels)))
    _, _, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': accuracy, 'f1': f1}

# Assemble the Trainer from the model, its arguments, both datasets and the
# metric callback.
# NOTE(review): the test split doubles as the per-epoch evaluation set, and the
# training arguments select the best checkpoint on it — consider a separate
# validation split if an unbiased test score is needed.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = TNR(**trainer_kwargs)

  accuracy_metric = lm("accuracy")


In [None]:
# Train and evaluate the model
# trainer.train() runs fine-tuning with the configured arguments;
# trainer.evaluate() then scores the Trainer's eval_dataset and returns a dict
# of metrics, which is kept in `eval_results`.
trainer.train()
eval_results = trainer.evaluate()
# Print the metrics dict (the recorded run shows keys such as eval_loss,
# eval_accuracy and eval_f1).
print(f"Amazon Reviews Evaluation results: {eval_results}")

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.326,0.213808,0.928789,0.962487
2,0.1494,0.165126,0.93591,0.96563


Amazon Reviews Evaluation results: {'eval_loss': 0.16512608528137207, 'eval_accuracy': 0.9359104781281791, 'eval_f1': 0.9656301145662848, 'eval_runtime': 367.2397, 'eval_samples_per_second': 2.677, 'eval_steps_per_second': 0.084, 'epoch': 2.0}
