In [None]:
# Install the required packages by running the following commands in Jupyter Notebook or Python script
!pip install transformers
!pip install accelerate>=0.20.1

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
Ins

In [None]:
# Copy and paste the following imports at the beginning of your script
import pandas as pd
import torch
from transformers import BertForSequenceClassification, AutoTokenizer, BertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report

In [None]:
# The CSV lives on Google Drive, which is only visible under /content/drive after it has been
# mounted. Mount it here so this cell works on a fresh kernel (Restart & Run All); the
# notebook's dedicated mount cell comes after this one. Mounting an already-mounted Drive
# only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# Replace the file path in the pd.read_csv function with the path to your training data CSV file.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IE4483/data/processed_train_data.csv')

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive can be read and written.
# NOTE(review): any cell that touches a /content/drive path depends on this mount having run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Name of the pretrained checkpoint; the same name is reused later when the model is loaded,
# so the tokenizer and the model always come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
features, targets = data['reviews'], data['sentiments']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Coerce every review to str and convert each text split to a plain Python list.
X_train, X_test = (texts.astype(str).tolist() for texts in (X_train, X_test))

In [None]:
# Convert the label splits to plain Python lists.
# Wrapping in pd.Series first keeps this cell safe to re-run: after the first execution the
# names already hold lists, and a bare list has no .tolist() method.
y_train = pd.Series(y_train).tolist()
y_test = pd.Series(y_test).tolist()

In [None]:
# Custom torch Dataset that tokenizes the text once and serves model-ready tensors
class SentimentTextDataset(Dataset):
    """Map-style dataset pairing tokenized text with classification labels.

    All texts are tokenized eagerly in ``__init__`` (truncated and padded to
    ``max_length``), so ``__getitem__`` only indexes into the stored tensors.

    Args:
        text: Batch of input texts, passed to ``tokenizer`` in a single call.
        labels: Labels, one per entry in ``text``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_length, return_tensors='pt')``.
            It must return a mapping from field name to a tensor whose first
            dimension indexes the examples.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``text`` and ``labels`` do not have the same length.
    """

    def __init__(self, text, labels, tokenizer, max_length):
        # Fail early: a mismatch would otherwise surface later as an IndexError
        # (or as silently dropped examples) during training.
        if len(text) != len(labels):
            raise ValueError(
                f"text and labels must have the same length, got {len(text)} and {len(labels)}"
            )
        self.encodings = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every tokenizer field for example ``idx`` plus its label tensor."""
        # Forward whatever fields the tokenizer produced instead of a fixed list of keys.
        # BERT tokenizers emit token_type_ids (segment ids, which BERT does consume), but
        # other tokenizers do not, and a hard-coded lookup would raise KeyError for them.
        item = {name: values[idx] for name, values in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [None]:
# Build the training and evaluation datasets from the two splits.
# max_length is the token length every review is truncated/padded to by SentimentTextDataset.
max_length = 128
train_dataset = SentimentTextDataset(X_train, y_train, tokenizer, max_length)
eval_dataset = SentimentTextDataset(X_test, y_test, tokenizer, max_length)


In [None]:
# The training arguments
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation; load_best_model_at_end requires the
    # save and evaluation strategies to match.
    save_strategy="epoch",
    save_total_limit=2,
    # The recorded training log shows validation loss rising after the first epoch
    # (0.246 -> 0.337 -> 0.422) while training loss keeps falling, i.e. overfitting.
    # Restore the checkpoint with the lowest validation loss when training finishes so the
    # model that gets saved is the best one, not simply the last one.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# The classification head is not part of the pretrained checkpoint and is initialized
# randomly when the model is created. Seed torch first, with the same seed the Trainer uses,
# so that this initialization (and therefore the whole run) is repeatable.
torch.manual_seed(training_args.seed)

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # num_labels=2 for binary classification
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune; evaluation on eval_dataset follows the schedule set in training_args.
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3185,0.246293
2,0.2287,0.336878
3,0.0724,0.422252


In [None]:
# Persist the fine-tuned model to Google Drive so it outlives the Colab session
model_output_dir = "/content/drive/MyDrive/Colab Notebooks/IE4483/model/BERT"
trainer.save_model(model_output_dir)

Evaluate the model

In [None]:
# Evaluate the saved model on the held-out split and print a per-class classification report.
# Replace the file path with the correct path to the trained model
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/IE4483/model/BERT")

# from_pretrained places the model on the CPU; move it to the GPU when one is available so
# inference is not needlessly CPU-bound.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # disable dropout for deterministic predictions

eval_dataloader = DataLoader(eval_dataset, batch_size=8)

predicted_labels = []
true_labels = []

# No gradients are needed for inference; skipping them saves memory and time.
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit for each example.
        predicted_labels.extend(outputs.logits.argmax(dim=1).tolist())
        true_labels.extend(batch['labels'].tolist())

# Generate a classification report.
# Passing labels explicitly keeps target_names aligned with class ids 0 and 1 even if one
# class happens to be absent from the evaluated data (otherwise the call raises).
report = classification_report(
    true_labels,
    predicted_labels,
    labels=[0, 1],
    target_names=["Class 0", "Class 1"],
)
print(report)


              precision    recall  f1-score   support

     Class 0       0.79      0.66      0.72       219
     Class 1       0.94      0.97      0.95      1217

    accuracy                           0.92      1436
   macro avg       0.87      0.82      0.84      1436
weighted avg       0.92      0.92      0.92      1436

