In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from transformers import RobertaTokenizerFast
import torch
from utils import TransformerDataset, TransformerModel
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


In [2]:
# Read the pre-split features and labels from the pickle files in the
# working directory (train first, then validation).
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
X_train, y_train, X_val, y_val = map(
    pd.read_pickle,
    ('X_train.pkl', 'y_train.pkl', 'X_val.pkl', 'y_val.pkl'),
)

In [3]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Both splits are encoded with exactly the same tokenizer settings, so the
# options are declared once and reused for the two calls below.
encode_kwargs = dict(
    padding=True,
    truncation=True,
    return_token_type_ids=False,
    max_length=256,
    return_tensors='pt',
)

# Tokenize the 'cleaned_body' text column of each split.
train_encodings = tokenizer(X_train['cleaned_body'].tolist(), **encode_kwargs)
val_encodings = tokenizer(X_val['cleaned_body'].tolist(), **encode_kwargs)

In [4]:
# The raw text column is no longer needed once it has been tokenized.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
X_train.drop(columns=['cleaned_body'], inplace=True, errors='ignore')
X_val.drop(columns=['cleaned_body'], inplace=True, errors='ignore')

In [5]:
# Use the first CUDA device when one is available; otherwise fall back to CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [7]:
# Hyperparameters: mini-batch size for the data loaders and the number of
# target classes passed to the model constructor.
batch_size, num_labels = 16, 2

# Select the compute device: first CUDA GPU if present, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [8]:
# Pair each split's token encodings with its labels.
train_dataset = TransformerDataset(train_encodings, y_train)
val_dataset = TransformerDataset(val_encodings, y_val)

# Batch both datasets with the shared batch size; only the training batches
# are shuffled, validation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Training configuration: epoch budget, model, loss, optimizer and LR schedule.
num_epochs = 20

model = TransformerModel(num_labels)
criterion = CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# The scheduler is stepped once per batch in the training loop, so its
# horizon is (batches per epoch) x (number of epochs). No warmup steps.
total_training_steps = len(train_dataloader) * num_epochs
learning_rate_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Training loop: fine-tune `model` for `num_epochs` epochs and evaluate on the
# validation loader after every epoch.

# Per-epoch metric history (one entry appended per epoch); read by the
# plotting cell.
train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []
train_recalls, val_recalls = [], []

model.to(device)

for epoch in tqdm(range(num_epochs)):

    ######## TRAINING #########
    train_loss, val_loss = 0, 0
    # Labels and predictions are gathered over the whole epoch so accuracy and
    # recall are computed once over all samples. Averaging per-batch scores is
    # biased by a smaller final batch and is ill-defined for recall whenever a
    # batch contains no positive labels.
    train_targets, train_preds = [], []
    val_targets, val_preds = [], []

    model.train()
    for batch in train_dataloader:
        # NOTE(review): squeeze(1) assumes the dataset yields input_ids with an
        # extra singleton dimension at index 1 — confirm against TransformerDataset.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Reset gradients for every batch; otherwise they accumulate across
        # all batches of the epoch and each step applies a stale, summed gradient.
        optimizer.zero_grad()

        # Forward pass
        logits, predictions = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        batch_loss = criterion(logits, labels)
        train_loss += batch_loss.item()
        train_targets.extend(labels.cpu().tolist())
        train_preds.extend(predictions.cpu().tolist())

        # Backward pass
        batch_loss.backward()
        optimizer.step()
        learning_rate_scheduler.step()

    # Loss is the mean of the per-batch losses; metrics cover the full epoch.
    train_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(train_targets, train_preds)
    train_recall = recall_score(train_targets, train_preds)

    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    train_recalls.append(train_recall)

    ######## VALIDATION #########
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and reduces memory use.
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            logits, predictions = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            batch_loss = criterion(logits, labels)
            val_loss += batch_loss.item()
            val_targets.extend(labels.cpu().tolist())
            val_preds.extend(predictions.cpu().tolist())

    val_loss = val_loss / len(val_dataloader)
    val_accuracy = accuracy_score(val_targets, val_preds)
    val_recall = recall_score(val_targets, val_preds)

    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    val_recalls.append(val_recall)

    # Report every epoch (previously this ran once, after the loop, and
    # raised NameError if the loop body never executed).
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss}')
    print(f'Train Accuracy: {train_accuracy}')
    print(f'Train Recall: {train_recall}')

    print(f'Validation Loss: {val_loss}')
    print(f'Validation Accuracy: {val_accuracy}')
    print(f'Validation Recall: {val_recall}')
    print('\n')

In [None]:
# Plot train and validation curves, one figure per metric.
# NOTE(review): this cell was annotated "EPOCHS: 4", but the setup cell sets
# num_epochs = 20 — confirm which run these plots are meant to describe.
metric_curves = (
    ('Loss', train_losses, 'Train Loss', val_losses, 'Validation Loss'),
    ('Accuracy', train_accuracies, 'Train Accuracy', val_accuracies, 'Validation Accuracy'),
    ('Recall', train_recalls, 'Train Recall', val_recalls, 'Validation Recall'),
)

for title, train_curve, train_label, val_curve, val_label in metric_curves:
    plt.figure(figsize=(12,6))
    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.legend()
    plt.title(title)
    plt.show()
 

In [9]:
from transformers import RobertaForSequenceClassification

# Rebind `model` to a pretrained roberta-base with a 2-class sequence
# classification head; this replaces the TransformerModel instance created
# in the earlier setup cell.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [10]:
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_prediction):
    """Compute accuracy and recall for a Trainer evaluation pass.

    Mirrors the metrics tracked by the manual training loop so both training
    paths report comparable numbers.

    Args:
        eval_prediction: object supplied by ``Trainer`` exposing
            ``predictions`` (model logits) and ``label_ids`` (gold labels).

    Returns:
        dict with keys ``'accuracy'`` and ``'recall'``.
    """
    # Predicted class = index of the largest logit along the last axis.
    predicted_labels = eval_prediction.predictions.argmax(axis=-1)
    gold_labels = eval_prediction.label_ids
    return {
        'accuracy': accuracy_score(gold_labels, predicted_labels),
        'recall': recall_score(gold_labels, predicted_labels),
    }


training_args = TrainingArguments(
    output_dir='./outputs',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Evaluate and checkpoint once per epoch. With the step-based strategy and
    # its default interval (500 steps), a run of this size (70 optimization
    # steps in the recorded log) never reaches an evaluation or a checkpoint,
    # so load_best_model_at_end had nothing to select from.
    evaluation_strategy='epoch',
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy.
    save_strategy='epoch',
    # Bound disk usage now that a checkpoint is written every epoch.
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [11]:
# Start fine-tuning with the Trainer configured in the previous cell; the
# return value is left as the cell's last expression so the notebook displays it.
# NOTE(review): the recorded run of this cell aborted with
# "RuntimeError: CUDA error: out of memory" — the cause is not visible from
# this notebook; check free GPU memory before re-running.
trainer.train()

***** Running training *****
  Num examples = 207
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 70


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.