In [None]:
import os
import wandb
import pandas as pd
from tqdm import tqdm
import torch
from datetime import datetime
from torch.cuda import empty_cache
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence

from data_utils import label2id, id2label
from datasets import load_from_disk
from model import get_model

from loguru import logger

In [None]:
# Number of target classes, i.e. the size of the label vocabulary from data_utils.
num_classes = len(label2id)

def stack(x, p=0):
    """Pad a batch of variable-length sequences into one batch-first tensor.

    Every element of ``x`` is converted with ``torch.tensor``; shorter
    sequences are filled up with the padding value ``p``.
    """
    tensors = []
    for seq in x:
        tensors.append(torch.tensor(seq))
    return pad_sequence(tensors, batch_first=True, padding_value=p)
def stack_wo_pad(x):
    """Turn an already rectangular batch (e.g. nested lists) into a tensor, without padding."""
    batch_tensor = torch.tensor(x)
    return batch_tensor

def load_data(path):
    """Load a saved dataset from ``path`` and return its train and val splits.

    Returns:
        A ``(train, val)`` tuple taken from the ``'train'`` and ``'val'`` keys
        of the object returned by ``load_from_disk``.
    """
    logger.info(f'Loading dataset from {path}')

    splits = load_from_disk(path)
    train_split = splits['train']
    val_split = splits['val']

    logger.info(f'Rows in train dataset: {len(train_split)}, rows in val dataset: {len(val_split)}')

    return train_split, val_split

def update_model(model, unfreeze_layers=0):
    """Swap in a fresh classification head and freeze everything that should not train.

    The model's label metadata is overwritten with ``num_classes`` /
    ``id2label`` / ``label2id``, ``model.classifier`` is replaced by a newly
    initialised ``Linear`` layer, and every parameter is frozen except the new
    head and, optionally, the last ``unfreeze_layers`` encoder layers.

    Args:
        model: Model exposing ``config``, a linear ``classifier`` and, when
            ``unfreeze_layers`` is positive, ``deberta.encoder.layer``.
        unfreeze_layers: Number of trailing encoder layers to leave trainable;
            zero or a negative value keeps the whole encoder frozen.

    Returns:
        The same ``model`` object, modified in place.
    """
    logger.info('Updating the model')

    model.config.num_labels = num_classes
    model.config.id2label = id2label
    model.config.label2id = label2id

    # Place the new head on the device of the head it replaces rather than
    # hard-coding 'cuda': that fails on CPU-only machines and produces a
    # device mismatch whenever the model lives anywhere but the default GPU.
    # The layer is still built first and moved afterwards, and keeps the
    # default dtype, exactly as before.
    head_device = model.classifier.weight.device
    model.classifier = torch.nn.Linear(
        model.classifier.in_features,
        num_classes
    ).to(head_device)
    model.num_labels = num_classes

    # Freeze everything first, then re-enable only the parts to be trained.
    for param in model.parameters():
        param.requires_grad = False

    if unfreeze_layers > 0:
        for param in model.deberta.encoder.layer[-unfreeze_layers:].parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True

    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.info(f'Layer: {name} will be trained with dtype {param.dtype}')

    return model

def eval_model(trained_model, eval_dataset, bs):
    """Collect per-label token counts for ``trained_model`` on ``eval_dataset``.

    Positions whose label is -100 are ignored. For every other position the
    true label's ``total_samples`` and the predicted label's
    ``total_predicted`` are incremented, plus the label's
    ``correct_predictions`` when the two agree.

    Args:
        trained_model: Model called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``logits``. It is switched to eval mode and
            left in that mode.
        eval_dataset: Dataset supporting ``len()`` and slicing, where a slice
            yields ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            batches that ``stack_wo_pad`` can turn into tensors.
        bs: Batch size.

    Returns:
        ``pandas.DataFrame`` with one row per label id (ascending) and the
        columns ``total_samples``, ``total_predicted``, ``correct_predictions``.
    """
    label_metrics = {
        label_id: {'total_samples': 0, 'total_predicted': 0, 'correct_predictions': 0}
        for label_id in label2id.values()
    }

    trained_model.eval()

    # Run on the device the model already lives on instead of depending on a
    # notebook-level `device` global that is only defined in a later cell.
    first_param = next(trained_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        for start in tqdm(range(0, len(eval_dataset), bs)):
            batch = eval_dataset[start:start+bs]

            input_ids = stack_wo_pad(batch['input_ids']).to(model_device)
            attention_mask = stack_wo_pad(batch['attention_mask']).to(model_device)

            outputs = trained_model(input_ids, attention_mask=attention_mask)

            # Bring predictions back as plain Python ints once per batch.
            # Iterating device tensors element by element forces a
            # synchronisation for every single comparison and .item() call.
            predicted = outputs.logits.argmax(-1).flatten().tolist()
            expected = stack_wo_pad(batch['labels']).flatten().tolist()

            for pred, gold in zip(predicted, expected):
                if gold == -100:
                    continue

                if pred == gold:
                    label_metrics[gold]['correct_predictions'] += 1

                label_metrics[gold]['total_samples'] += 1
                label_metrics[pred]['total_predicted'] += 1

    # orient='index' turns each label id into a row; sorting keeps the rows
    # in ascending label-id order.
    label_metrics = pd.DataFrame.from_dict(label_metrics, orient='index').sort_index()
    logger.info(f'Eval metrics: {label_metrics}')

    return label_metrics

def get_score(df, beta=5):
    """Micro-averaged F-beta score computed from per-label count columns.

    Args:
        df: DataFrame with ``correct_predictions``, ``total_predicted`` and
            ``total_samples`` columns; the counts are summed over all rows.
        beta: Weight of recall relative to precision.

    Returns:
        The micro F-beta score as a float. ``0.0`` is returned when the score
        is undefined (nothing predicted, nothing to find, or nothing correct)
        instead of the NaN that the bare 0/0 divisions would produce.
    """
    tp = df['correct_predictions'].sum()
    fp = df['total_predicted'].sum() - tp
    fn = df['total_samples'].sum() - tp

    predicted_total = tp + fp
    actual_total = tp + fn

    # Guard each division separately so a degenerate table yields 0.0.
    precision = tp / predicted_total if predicted_total else 0.0
    recall = tp / actual_total if actual_total else 0.0

    denominator = (beta**2 * precision) + recall
    if not denominator:
        return 0.0

    return float((1 + beta**2) * (precision * recall) / denominator)

### Main training loop

In [None]:
# Hyper-parameters and paths consumed by the training cells below.
# Number of passes over the training split.
num_epochs = 5
# Learning rate handed to the AdamW optimizer.
learning_rate = 3e-5
# Examples per batch, used for both the training loop and the evaluation calls.
batch_size = 4
# Pretrained checkpoint id passed to get_model.
model_id = 'sileod/deberta-v3-large-tasksource-nli'
# Alternative checkpoint, kept for reference:
# model_id = 'microsoft/deberta-v3-base'
# Directory of the processed dataset read by load_data.
dataset = './data/processed/dataset_3/'

In [None]:
# Load the data splits and the pretrained model/tokenizer pair.
train, val = load_data(path=dataset)
model, tokenizer = get_model(model_id=model_id)

# Replace the classification head; with the default unfreeze_layers=0 only the new head stays trainable.
model = update_model(model)

# NOTE(review): every parameter is handed to the optimizer, including the ones update_model froze.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): the training loop steps this scheduler once per epoch, so with step_size=5 and
# num_epochs=5 the 10x decay only takes effect after the final epoch — confirm this is intended.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1, verbose=True)
# NOTE(review): this is samples x epochs rather than optimizer steps, and nothing in the cells shown here reads it.
total_steps = len(train) * num_epochs

# Device string the training loop moves every batch to.
device = 'cuda'
# Token-level cross entropy; targets equal to -100 are excluded from the loss.
loss_fn = CrossEntropyLoss(
    # weight=torch.tensor([1, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]).to('cuda', dtype=torch.bfloat16),
    # label_smoothing=0.05,
    ignore_index=-100
)

# Per-step losses appended by the training loop.
all_losses = []
model.train()

In [None]:
for epoch in range(num_epochs):
    # Re-assert training mode every epoch: eval_model leaves the model in eval
    # mode, so re-running this cell after an evaluation cell would otherwise
    # train with the model still in eval mode.
    model.train()

    # One start offset per batch. len() of this range is the real number of
    # steps, including the final partial batch that `len(train)//batch_size`
    # left out of the progress-bar total.
    batch_starts = range(0, len(train), batch_size)

    with tqdm(total=len(batch_starts), desc=f'Epoch {epoch+1}/{num_epochs}') as pbar:
        for s in batch_starts:
            optimizer.zero_grad()
            batch = train[s:s+batch_size]

            input_ids = stack_wo_pad(batch['input_ids']).to(device)
            attention_mask = stack_wo_pad(batch['attention_mask']).to(device)
            labels = stack_wo_pad(batch['labels']).to(device)

            # Labels are not passed to the model: only outputs.logits is used,
            # and the loss is computed below with loss_fn.
            outputs = model(input_ids, attention_mask=attention_mask)

            # Flatten (batch, seq, classes) logits and (batch, seq) targets to token level.
            logits_flat = outputs.logits.view(-1, outputs.logits.size(-1))
            targets_flat = labels.view(-1)

            loss = loss_fn(logits_flat, targets_flat)

            loss.backward()
            all_losses.append(loss.detach())

            optimizer.step()

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
            pbar.update(1)

    # NOTE(review): stepped once per epoch; if the scheduler's step_size is not
    # smaller than num_epochs the learning rate never changes during training.
    scheduler.step()

In [None]:
# Per-label count tables for the freshly trained model on both splits.
# eval_model switches the model to eval mode and leaves it there.
train_metrics = eval_model(model, train, batch_size)
val_metrics = eval_model(model, val, batch_size)

In [None]:
# Write model and tokenizer into sibling sub-folders of a directory named
# after the current time (minute resolution).
save_path = './model/' + datetime.now().strftime('%Y%m%d_%H%M')
model.save_pretrained(os.path.join(save_path, 'model'))
tokenizer.save_pretrained(os.path.join(save_path, 'tokenizer'))

### Rough work: inspecting a saved checkpoint

In [None]:
import json
import matplotlib.pyplot as plt
from transformers import AutoModelForTokenClassification, AutoTokenizer

import os
import wandb
import pandas as pd
from tqdm import tqdm
import torch
from datetime import datetime
from torch.cuda import empty_cache
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence

from data_utils import label2id, id2label
from datasets import load_from_disk
from model import get_model

from loguru import logger
import numpy as np

In [None]:
# Device used by the checkpoint-inspection helpers below (get_trained_model, eval_single_ex).
# This rebinds the `device` name that the training section set to 'cuda'.
device = 'cuda:0'

In [None]:
def get_trained_model(model_path, tokenizer_path):
    """Load a saved token-classification model together with its tokenizer.

    The model is moved to the notebook-level ``device`` before it is returned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # .to() moves the module in place, so the original object is what gets returned.
    loaded_model.to(device)
    return loaded_model, loaded_tokenizer

In [None]:
# Model (m) and tokenizer (t) of one specific saved run; the timestamped path is hard-coded.
m, t = get_trained_model('./model/20240407_1032/model/', './model/20240407_1032/tokenizer/')

In [None]:
# NOTE(review): this reads dataset_4, while the training section loads dataset_3 — confirm the difference is intended.
ds = load_from_disk('./data/processed/dataset_4/')

In [None]:
def eval_single_ex(ex):
    """Run the loaded model ``m`` on one example and print its labelled tokens.

    Two listings are printed. Each line shows the token position, the decoded
    token, the predicted label and the true label:

    * "Predicted tokens": positions whose predicted label id is not 0.
    * "Expected tokens": positions whose true label is neither 0 nor -100.

    Positions whose token decodes to ``'[PAD]'`` are skipped in both listings.

    Args:
        ex: Mapping with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            sequences for a single example.

    Returns:
        The model's logits for the example, with the batch dimension of 1 kept.
    """
    with torch.no_grad():
        ii = torch.tensor(ex['input_ids']).reshape(1, -1).to(device)
        am = torch.tensor(ex['attention_mask']).reshape(1, -1).to(device)

        o = m(
            input_ids=ii,
            attention_mask=am
        )

    # A single host transfer instead of one .item() sync per token.
    predictions = o.logits.argmax(-1)[0].tolist()

    def true_label_name(label_id):
        # -100 marks an ignored position. The first listing used to look it up
        # in id2label without the guard the second listing has, so it failed
        # as soon as a non-zero prediction landed on such a position.
        if label_id == -100:
            return '-100 (ignored)'
        return id2label[label_id]

    def show(header, keep):
        # Print the header followed by every non-padding position accepted by `keep`.
        print(header)
        for i, (pred, token, label) in enumerate(zip(predictions, ex['input_ids'], ex['labels'])):
            text = t.decode(token)
            if text == '[PAD]' or not keep(pred, label):
                continue
            print(f'{i} {text} ==> Predicted: {id2label[pred]}, True: {true_label_name(label)}')

    show('Predicted tokens', lambda pred, label: pred != 0)
    show('Expected tokens', lambda pred, label: label != 0 and label != -100)

    empty_cache()

    return o.logits

In [None]:
# Print predictions for the first training example and keep its logits.
result = eval_single_ex(ds['train'][0])

In [None]:
# Drop the batch dimension of 1, then turn each position's logits into class probabilities on the CPU.
sm = torch.softmax(result[0], dim=-1).cpu()

In [None]:
# One line per class across the first 25 token positions. `sm` is already on
# the CPU, so no further device copy is needed before plotting.
plt.plot(sm[:25])
plt.xlabel('Token position')
plt.ylabel('Softmax probability')
plt.title('Class probabilities for the first 25 tokens')
plt.show()

In [None]:
# Per-label counts of the reloaded checkpoint on the validation split (batch size 8).
# This rebinds `val_metrics` from the training section.
val_metrics = eval_model(m, ds['val'], 8)

In [None]:
# Micro F-beta (beta=5) over every row except the first one of the per-label table.
# NOTE(review): the first row is presumably label id 0, the background class — confirm.
get_score(val_metrics[1:], 5)