# Transfer Learning with BERT for Text Classification

This notebook demonstrates how to leverage the pre-trained BERT model for a text classification task using transfer learning. 
BERT (Bidirectional Encoder Representations from Transformers) is a state-of-the-art model designed for natural language understanding.
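We fine-tune BERT on a tiny, hand-written binary sentiment dataset (six example sentences), train it for three epochs, and evaluate it on a held-out split. With so little data the reported metrics are illustrative only; swap in a real dataset for meaningful results.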

In [None]:
# Importing necessary libraries
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [None]:
# Select the compute device
# Prefer CUDA when PyTorch can see a GPU; otherwise run everything on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# Define a small toy dataset (can be replaced with any other text classification dataset)
data = {
    'text': [
        "I love this product! It's amazing.",  # Positive sentiment
        "Terrible experience, would not recommend.",  # Negative sentiment
        "Great value for the money.",  # Positive sentiment
        "The worst item I have ever purchased.",  # Negative sentiment
        "Good quality and fast shipping.",  # Positive sentiment
        "Awful customer service, very disappointed."  # Negative sentiment
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

# Create a DataFrame from the dataset
df = pd.DataFrame(data)

# Split the data into train and validation sets.
# test_size=0.2 is rounded up to whole samples, so with six rows this gives
# 4 training and 2 validation examples.
# stratify keeps the class ratio in both splits, so the validation set always
# contains one example of each class instead of depending on the shuffle.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Sanity checks: nothing was lost and both classes can be evaluated
assert len(train_texts) + len(val_texts) == len(df)
assert set(val_labels) == {0, 1}, "validation split is missing a class"

In [None]:
# Instantiate the tokenizer that matches the pre-trained checkpoint
# It turns raw strings into the token IDs and attention masks passed to the model
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

# Define a custom dataset class to handle tokenization and data loading
class TextDataset(Dataset):
    """Text-classification dataset that tokenizes all texts once, up front.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class labels, aligned with ``texts``.
        text_tokenizer: Callable used to encode the texts. It is called as
            ``text_tokenizer(list_of_str, truncation=..., padding=...,
            max_length=..., return_tensors="pt")`` and must return a mapping
            whose values can be indexed by sample position. When ``None``
            (default) the module-level ``tokenizer`` is used.
        max_length: Maximum number of tokens per text; longer texts are
            truncated.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=128):
        # Materialize both inputs as plain lists so that indexing in
        # __getitem__ is always positional, even when a pandas Series with a
        # shuffled index (e.g. straight from train_test_split) is passed in.
        self.texts = list(texts)  # Input texts
        self.labels = list(labels)  # Corresponding labels
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

        # Fall back to the notebook-level tokenizer only when none is supplied
        encode = tokenizer if text_tokenizer is None else text_tokenizer

        # Tokenize the texts
        self.encodings = encode(
            self.texts,
            truncation=True,  # Truncate texts longer than max_length
            padding=True,  # Pad shorter texts to the longest sequence in this set
            max_length=max_length,  # Maximum token length
            return_tensors="pt"  # Return PyTorch tensors
        )

    def __len__(self):
        """Return the total number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Integer (long) class index, stored under the key 'labels'
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Build one TextDataset per split (labels are handed over as plain Python lists)
train_dataset, val_dataset = (
    TextDataset(split_texts, split_labels.tolist())
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# Wrap the datasets in DataLoaders for batching
# Training batches are reshuffled on every pass; validation keeps its order
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)

In [None]:
# Seed PyTorch's global RNG before building the model: the classification head
# is freshly initialized and the training DataLoader shuffles, so without a
# seed every run produces different results.
torch.manual_seed(42)

# Load the pre-trained BERT model for sequence classification
# num_labels=2 indicates a binary classification task
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)  # Move the model to the selected device

# Define the optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW (the original
# call had to silence its deprecation warning). eps and weight_decay are set
# explicitly to the values the transformers implementation used by default,
# so the optimization hyperparameters are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
epochs = 3  # Number of epochs
model.train()  # Set the model to training mode

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    running_loss, num_batches = 0.0, 0  # Accumulators for the epoch's mean loss

    for batch in train_loader:
        optimizer.zero_grad()  # Reset gradients from the previous step
        input_ids = batch['input_ids'].to(device)  # Input token IDs
        attention_mask = batch['attention_mask'].to(device)  # Attention mask
        labels = batch['labels'].to(device)  # True labels

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model weights

        running_loss += loss.item()
        num_batches += 1

    # Report once per epoch on its own line. A per-batch print ending in "\r"
    # is overwritten by the next epoch header, so no loss stays visible.
    if num_batches:
        print(f"Average loss: {running_loss / num_batches:.4f}")


# Evaluation

In [None]:
model.eval()  # Set the model to evaluation mode
predictions, true_labels = [], []

with torch.no_grad():  # Disable gradient computation for evaluation
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)  # Input token IDs
        attention_mask = batch['attention_mask'].to(device)  # Attention mask

        # Forward pass (no labels passed, so only logits are needed)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Raw predictions
        preds = torch.argmax(logits, dim=1)  # Get the predicted class

        # Collect predictions and true labels as plain Python ints.
        # The labels are only used on the host, so they never go to the device.
        predictions.extend(preds.cpu().tolist())
        true_labels.extend(batch['labels'].tolist())

# Generate and print the classification report
# This includes precision, recall, F1-score, and accuracy.
# labels=[0, 1] pins the report to both classes: without it, a validation set
# where only one class occurs in the targets and predictions makes the two
# target_names mismatch the detected classes and raises ValueError.
# zero_division=0 reports 0 (instead of warning) for a class with no samples
# or no predictions.
print("\nClassification Report:")
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['Negative', 'Positive'],
    zero_division=0,
))
