In [None]:
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch.cuda import empty_cache
from torch.nn import Linear, CrossEntropyLoss
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import StepLR 
from torch.nn.utils.rnn import pad_sequence

from data_utils import label2id, id2label
from datasets import load_from_disk
from model import get_model

In [None]:
# Read the pre-processed dataset from disk and pull out the two splits used below.
data = load_from_disk('./data/processed/dataset_3/')
train = data['train']
val = data['val']

# Quick sanity check on the split sizes.
print(len(train), len(val))

## PyTorch Modelling

In [None]:
# Load the model and its tokenizer through the project's `get_model` helper.
# NOTE(review): later cells access `model.deberta`, `model.classifier` and per-token logits,
# so this presumably returns a DeBERTa token-classification model — confirm in model.py.
model, tokenizer = get_model('sileod/deberta-v3-large-tasksource-nli')

In [None]:
# Point the model config at this task's label set. The label count is derived
# from `label2id` instead of being hard-coded, so it cannot drift from the mapping.
model.config.num_labels = len(label2id)
model.config.id2label = id2label
model.config.label2id = label2id

In [None]:
# Replace the existing classification head with a freshly initialised linear
# layer sized for this task's labels (bfloat16, placed on the GPU).
num_classes = len(label2id)
classifier_layer = Linear(
    model.classifier.in_features,
    num_classes,
    dtype=torch.bfloat16,
).to('cuda')

model.num_labels = num_classes
model.classifier = classifier_layer

In [None]:
# Start from a fully frozen model ...
model.requires_grad_(False)

# ... then re-enable gradients for the last 6 encoder layers ...
model.deberta.encoder.layer[-6:].requires_grad_(True)

# ... and for the classification head.
model.classifier.requires_grad_(True)

# List every trainable parameter together with its dtype.
for param_name, param in model.named_parameters():
    if param.requires_grad:
        print(param_name, param.dtype)

In [None]:
# Training hyper-parameters.
num_epochs = 50
learning_rate = 1e-4
batch_size=16
device = 'cuda'

# Frozen parameters are passed too; they never receive gradients, so Adam leaves them untouched.
optimizer = Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.1 every 5 scheduler steps.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm against the installed version.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1, verbose=True)
# Optimizer steps over the whole run: batches per epoch (rounded up) times epochs.
total_steps = -(-len(train) // batch_size) * num_epochs

# Class weights: label id 0 gets weight 1 and every other label gets weight 100.
# NOTE(review): presumably id 0 is the dominant "outside" class — confirm in data_utils.
class_weights = [1] + [100] * (len(label2id) - 1)
loss_fn = CrossEntropyLoss(
    weight=torch.tensor(class_weights).to(device, dtype=torch.bfloat16),
    label_smoothing=0.05
)

In [None]:
def stack(x, p=0): return pad_sequence([torch.tensor(t) for t in x], True, padding_value=p)
def get_tensor_size(tensor):
    """Return the number of bytes occupied by the tensor's elements."""
    bytes_per_element = tensor.element_size()
    return bytes_per_element * tensor.nelement()

In [None]:
# Fine-tuning loop: one pass over `train` per epoch in fixed-size slices,
# recording the loss of every batch in `all_losses`.
all_losses = []
model.train()

# Number of batches per epoch, rounded up so a final partial batch is counted.
num_batches = (len(train) + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    with tqdm(total=num_batches, desc=f'Epoch {epoch+1}/{num_epochs}') as pbar:
        for s in range(0, len(train), batch_size):
            optimizer.zero_grad()
            batch = train[s:s+batch_size]

            input_ids = stack(batch['input_ids']).to(device)
            attention_mask = stack(batch['attention_mask']).to(device)
            # Pad labels with -100 (CrossEntropyLoss's default ignore_index) so that
            # padding positions are excluded from the loss instead of counting as class 0.
            labels = stack(batch['labels'], -100).to(device)

            # The loss is computed below with the weighted `loss_fn`, so the model
            # is not asked to compute its own (unused) loss.
            outputs = model(input_ids, attention_mask=attention_mask)

            # Logits are (batch, seq, classes). Flatten batch and sequence together so
            # each row holds the class scores of one token; a plain reshape to
            # (batch, classes, seq) would scramble the class and position axes.
            logits = outputs.logits
            loss = loss_fn(
                logits.reshape(-1, logits.shape[-1]),
                labels.reshape(-1)
            )

            # Store a detached copy so the history does not keep autograd references.
            all_losses.append(loss.detach())
            loss.backward()

            optimizer.step()

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
            pbar.update(1)

            empty_cache()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

In [None]:
# Debug: attention masks of `batch`, i.e. the last batch left over from the
# training loop (assuming the cells were run in order).
batch['attention_mask']

In [None]:
# Debug: shape of the padded label tensor for the current `batch`.
stack(batch['labels']).shape

In [None]:
# Debug: number of labels in the first example of the current `batch`.
len(batch['labels'][0])

In [None]:
# Debug: shape of the logits produced for the most recent forward pass stored in `outputs`.
outputs.logits.shape

In [None]:
# Convert the recorded losses to plain Python floats. `float()` accepts tensors as
# well as already-converted values, so this cell can safely be re-run.
all_losses = [float(a) for a in all_losses]
plt.plot(all_losses)
plt.xlabel('Training step')
plt.ylabel('Loss')
plt.title('Training loss per batch')
plt.show()

In [None]:
def eval_model(trained_model, eval_dataset, bs):
    """Collect per-label prediction counts for ``trained_model`` on ``eval_dataset``.

    The model is switched to eval mode (and left in eval mode) and run under
    ``torch.no_grad()``. The predicted label of each position is the argmax over
    the last logits axis. Positions whose gold label is -100 are skipped.

    Args:
        trained_model: model called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        eval_dataset: dataset supporting ``len()`` and slicing into a dict with
            'input_ids', 'attention_mask' and 'labels' entries.
        bs: number of examples per evaluation batch.

    Returns:
        pandas.DataFrame with one row per label id from ``label2id.values()`` and
        the columns 'total_samples', 'total_predicted' and 'correct_predictions'.
    """
    label_metrics = {
        label_id: {'total_samples': 0, 'total_predicted': 0, 'correct_predictions': 0}
        for label_id in label2id.values()
    }

    trained_model.eval()

    with torch.no_grad():
        for s in tqdm(range(0, len(eval_dataset), bs)):
            batch = eval_dataset[s:s+bs]

            input_ids = stack(batch['input_ids']).to(device)
            attention_mask = stack(batch['attention_mask']).to(device)
            labels = stack(batch['labels'], -100).to(device)

            # Evaluate the model that was passed in, not the notebook-global `model`.
            outputs = trained_model(input_ids, attention_mask=attention_mask)

            _, predicted_labels = torch.max(outputs.logits, -1)

            # Copy predictions and labels to the host once per batch instead of
            # synchronising with the device for every single token.
            flat_predictions = predicted_labels.flatten().tolist()
            flat_labels = labels.flatten().tolist()

            for p, l in zip(flat_predictions, flat_labels):
                # -100 marks padding / ignored positions.
                if l == -100:
                    continue

                if p == l:
                    label_metrics[l]['correct_predictions'] += 1

                label_metrics[l]['total_samples'] += 1
                label_metrics[p]['total_predicted'] += 1

    return pd.DataFrame.from_records(label_metrics).T

In [None]:
%%time
# Compute per-label prediction counts on both splits with an evaluation batch size of 256.
train_metrics = eval_model(model, train, 256)
val_metrics = eval_model(model, val, 256)

In [None]:
# Display the per-label counts computed on the training split.
train_metrics

In [None]:
# Alias the validation metrics as `df`; the precision/recall cell reads its counts from `df`.
df = val_metrics

In [None]:
# Per-label precision and recall, in percent.
precision = 100*df.correct_predictions/df.total_predicted
recall = 100*df.correct_predictions/df.total_samples

# F-beta = (1 + beta^2) * P * R / (beta^2 * P + R). With beta = 5 the factors
# are 26 and 25 (recall is weighted 25x as heavily as precision).
beta = 5
f5 = (1 + beta**2) * precision*recall/(beta**2*precision+recall)

## Rough work (scratch cells)

In [None]:
# Smoke test: tokenize one word, run it through the model on the GPU and show
# the predicted class index for every token.
temp = tokenizer('romit', return_tensors='pt')
tempop = model(temp['input_ids'].to('cuda'))
torch.max(tempop.logits, -1).indices

In [None]:
# Label names of the first training example; positions labelled -100 are shown as None.
[id2label[t] if t != -100 else None for t in train[0]['labels']]

In [None]:
# `op` was not defined anywhere in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumed intent is the model output for train[0], to compare with that
# example's gold labels — confirm.
sample = train[0]
with torch.no_grad():
    op = model(
        torch.tensor([sample['input_ids']]).to('cuda'),
        attention_mask=torch.tensor([sample['attention_mask']]).to('cuda'),
    )

# Predicted label name for every token of the example.
[id2label[t.to('cpu').item()] for t in torch.max(op.logits[0], -1)[1]]

In [None]:
# Smaller sequences
# [X] Different model
# [X] Write test case for validating data
# [X] Data augmentation
# Resampling data?