In [1]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix
from datasets import load_metric


In [2]:
# Load data
# Folder containing the two CSV exports. Set the ROBERTA_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's local folder is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'ROBERTA_DATA_DIR',
    r'C:\Users\sagar\OneDrive\Desktop\Sem 3\Deep Learning\Project Roberta',
)
df_train = pd.read_csv(os.path.join(DATA_DIR, 'to_Transformer_Train.csv'))
df_val = pd.read_csv(os.path.join(DATA_DIR, 'to_Transformer_val.csv'))

# Reset index so both frames are addressed by a clean 0..n-1 positional index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [3]:
# Show how many rows carry each label value in the two splits
for heading, frame in (
    ("Training label counts:", df_train),
    ("Validation label counts:", df_val),
):
    print(heading)
    print(frame['label'].value_counts())

Training label counts:
label
0    10000
1    10000
Name: count, dtype: int64
Validation label counts:
label
0    500
1    500
Name: count, dtype: int64


In [4]:
# Pull the 'sentences' and 'label' columns out of each split as plain Python lists
train_texts, train_labels = (df_train[column].tolist() for column in ('sentences', 'label'))
val_texts, val_labels = (df_val[column].tolist() for column in ('sentences', 'label'))


In [5]:

# Print the first two raw texts of each split as a quick sanity check
for caption, head in (
    ("Train texts sample:", train_texts[:2]),
    ("Val texts sample:", val_texts[:2]),
):
    print(caption, head)


Train texts sample: ['[\'the 28-year-old was a free agent after leaving blackpool, where he played 10 times last season as the tangerines were promoted to league one\', \' the ex-blackburn and preston man made the majority of his career appearances at scunthorpe, featuring 137 times\', " nolan has become crewe\'s fourth signing of the summer", " jordan bowery, michael raynes and chris porter have all moved to david artell\'s side this summer", \' find all the latest football transfers on our dedicated page\', \'\']', "['relationship: communication between family members essay\\n\\ntable of contents\\n 1', ' introduction\\n 2', ' relationship\\n 3', ' communication\\n 4', ' conclusion\\n 5', ' works cited\\n\\nintroduction\\n\\nrelations and communications with parents and relatives are one of the most important and fundamental phenomena necessary for the development of an individual', ' thus, this process of communication and interaction between several relatives can be characterized f

In [6]:


# Load the tokenizer published with the 'roberta-base' checkpoint; it is
# shared by the training and validation datasets built later.
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a dict with three tensors: ``input_ids`` and
    ``attention_mask`` (the tokenizer output flattened to 1-D) and ``labels``
    (a ``torch.long`` scalar built from the matching entry of ``labels``).

    Args:
        texts: Indexable collection of inputs handed to the tokenizer.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and expected to return a mapping holding ``input_ids`` and
            ``attention_mask`` tensors (presumably a Hugging Face tokenizer).
        max_len: Value forwarded as ``max_length`` to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is defined by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [8]:

# Training split: tokenized on access, shuffled, served in batches of 8
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Validation split: same batch size, original order kept
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)


In [9]:

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 'roberta-base' with a 2-way classification head, moved to the chosen device
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2).to(device)

# The optimizer is handed every model parameter with a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# List every named parameter with its current requires_grad state
for name, param in model.named_parameters():
    status_line = f"{name} is {'trainable' if param.requires_grad else 'frozen'}"
    print(status_line)

roberta.embeddings.word_embeddings.weight is trainable
roberta.embeddings.position_embeddings.weight is trainable
roberta.embeddings.token_type_embeddings.weight is trainable
roberta.embeddings.LayerNorm.weight is trainable
roberta.embeddings.LayerNorm.bias is trainable
roberta.encoder.layer.0.attention.self.query.weight is trainable
roberta.encoder.layer.0.attention.self.query.bias is trainable
roberta.encoder.layer.0.attention.self.key.weight is trainable
roberta.encoder.layer.0.attention.self.key.bias is trainable
roberta.encoder.layer.0.attention.self.value.weight is trainable
roberta.encoder.layer.0.attention.self.value.bias is trainable
roberta.encoder.layer.0.attention.output.dense.weight is trainable
roberta.encoder.layer.0.attention.output.dense.bias is trainable
roberta.encoder.layer.0.attention.output.LayerNorm.weight is trainable
roberta.encoder.layer.0.attention.output.LayerNorm.bias is trainable
roberta.encoder.layer.0.intermediate.dense.weight is trainable
roberta.encode

In [14]:

# Step 1: switch off gradients for the whole network
for weight in model.parameters():
    weight.requires_grad = False

# Step 2: choose what gets fine-tuned: the encoder blocks at these
# positions (the last two layers) plus the classification head
layer_indices = [10, 11]  # The last two layers

modules_to_unfreeze = [
    block
    for position, block in enumerate(model.roberta.encoder.layer)
    if position in layer_indices
]
modules_to_unfreeze.append(model.classifier)

# Step 3: switch gradients back on for the selected modules only
for module in modules_to_unfreeze:
    for weight in module.parameters():
        weight.requires_grad = True

# Step 4: report the resulting state of every parameter
for name, param in model.named_parameters():
    status_line = f"{name} is {'trainable' if param.requires_grad else 'frozen'}"
    print(status_line)

roberta.embeddings.word_embeddings.weight is frozen
roberta.embeddings.position_embeddings.weight is frozen
roberta.embeddings.token_type_embeddings.weight is frozen
roberta.embeddings.LayerNorm.weight is frozen
roberta.embeddings.LayerNorm.bias is frozen
roberta.encoder.layer.0.attention.self.query.weight is frozen
roberta.encoder.layer.0.attention.self.query.bias is frozen
roberta.encoder.layer.0.attention.self.key.weight is frozen
roberta.encoder.layer.0.attention.self.key.bias is frozen
roberta.encoder.layer.0.attention.self.value.weight is frozen
roberta.encoder.layer.0.attention.self.value.bias is frozen
roberta.encoder.layer.0.attention.output.dense.weight is frozen
roberta.encoder.layer.0.attention.output.dense.bias is frozen
roberta.encoder.layer.0.attention.output.LayerNorm.weight is frozen
roberta.encoder.layer.0.attention.output.LayerNorm.bias is frozen
roberta.encoder.layer.0.intermediate.dense.weight is frozen
roberta.encoder.layer.0.intermediate.dense.bias is frozen
robe

In [27]:
from tqdm import tqdm
def train(model, dataloader, optimizer, device):
    """Run one training pass over ``dataloader`` and report loss and accuracy.

    Args:
        model: Module called as ``model(**batch)``; its result must expose
            ``.loss`` (a scalar tensor) and ``.logits``.
        dataloader: Sized iterable of dict batches whose values support
            ``.to(device)`` and which contain a ``'labels'`` entry.
        optimizer: Optimizer whose gradients are zeroed before, and stepped
            after, every batch.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(average_loss, accuracy)``: the summed per-batch loss divided
        by the number of batches, and correct predictions divided by the
        number of samples seen.

    Raises:
        ZeroDivisionError: If ``dataloader`` contains no batches.
    """
    model.train()
    total_loss, total_correct, total_samples = 0, 0, 0

    # Wrapping the loader lets tqdm advance once per batch and close the bar
    # when the pass ends. The label says "Training" (it used to read
    # "Evaluating", which mislabelled the training logs).
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # The predicted class is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=-1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    average_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    return average_loss, accuracy


In [28]:
def validate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    The model is put in eval mode and every batch is processed under
    ``torch.no_grad()``.

    Args:
        model: Module called as ``model(**batch)``; its result must expose
            ``.loss`` and ``.logits``.
        dataloader: Sized iterable of dict batches whose values support
            ``.to(device)`` and which contain a ``'labels'`` entry.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(average_loss, accuracy)``: the summed per-batch loss divided
        by the number of batches, and correct predictions divided by the
        number of samples seen.
    """
    model.eval()

    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for raw_batch in dataloader:
            on_device = {key: tensor.to(device) for key, tensor in raw_batch.items()}
            result = model(**on_device)
            loss_sum += result.loss.item()

            # Compare the arg-max class of each row against its target label.
            targets = on_device['labels']
            predicted = result.logits.argmax(dim=-1)
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)

    return loss_sum / len(dataloader), hits / seen


In [29]:
# Checkpoint directory: created on demand; exist_ok makes re-running the cell
# safe without a separate existence check.
checkpoint_dir = './model_checkpoints_Roberta_Venkat'
os.makedirs(checkpoint_dir, exist_ok=True)

# torch.device expects the lowercase type string "cpu". The previous "CPU"
# spelling is not a valid device type, so the fallback failed on machines
# without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Left as the last expression so the notebook shows the model repr.
model.to(device)

cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [30]:
import os
import torch

# Make sure the checkpoint folder exists before the first save
checkpoint_dir = './model_checkpoints_Roberta_Venkat'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

for epoch in range(3):  # Number of epochs
    # One pass over the training loader, then one over the validation loader
    train_loss, train_accuracy = train(model, train_loader, optimizer, device)
    val_loss, val_accuracy = validate(model, val_loader, device)
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Accuracy={train_accuracy:.4f}, Val Loss={val_loss:.4f}, Val Accuracy={val_accuracy:.4f}")

    # Bundle the weights, optimizer state and this epoch's metrics,
    # then write them to a per-epoch file (epochs are numbered from 1)
    checkpoint_state = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'train_accuracy': train_accuracy,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy
    }
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pth')
    torch.save(checkpoint_state, checkpoint_path)


Evaluating: 100%|██████████████████████████████████████████████████████████████████| 2500/2500 [07:41<00:00,  5.42it/s]


Epoch 1: Train Loss=0.0391, Train Accuracy=0.9868, Val Loss=0.2017, Val Accuracy=0.9390


Evaluating: 100%|██████████████████████████████████████████████████████████████████| 2500/2500 [07:40<00:00,  5.43it/s]


Epoch 2: Train Loss=0.0310, Train Accuracy=0.9890, Val Loss=0.1322, Val Accuracy=0.9650


Evaluating: 100%|██████████████████████████████████████████████████████████████████| 2500/2500 [07:41<00:00,  5.41it/s]


Epoch 3: Train Loss=0.0214, Train Accuracy=0.9928, Val Loss=0.0700, Val Accuracy=0.9760


In [31]:
# Write the tokenizer files into the fine-tuned model folder; the call is the
# last expression so the notebook shows the tuple of written paths.
tokenizer.save_pretrained(save_directory='./model_checkpoints_finetune')


('./model_checkpoints_finetune\\tokenizer_config.json',
 './model_checkpoints_finetune\\special_tokens_map.json',
 './model_checkpoints_finetune\\vocab.json',
 './model_checkpoints_finetune\\merges.txt',
 './model_checkpoints_finetune\\added_tokens.json')

In [32]:
# Write the model into the same folder the tokenizer was saved to
model.save_pretrained(save_directory='./model_checkpoints_finetune')