In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import os
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoModel, T5ForConditionalGeneration
import torch.optim as optim
from sklearn.metrics import precision_score, f1_score, recall_score
from tqdm import tqdm

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# PYTHONHASHSEED is read at interpreter start-up, so this assignment cannot change
# hashing in the already-running kernel; it is only inherited by processes started later.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
# torch.manual_seed seeds the CPU generator and the CUDA generators;
# the explicit CUDA call is kept for clarity and is silently ignored without CUDA.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Deterministic cuDNN kernels only help if the auto-tuner is also disabled,
# otherwise a different algorithm may be selected from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float32


## Data preprocessing

In [3]:
# Relative paths to the training and test TSV files.
train_path = '../../data/tsv/train.tsv'
test_path = '../../data/tsv/test.tsv'

# Project-local dataset class used to build the train/dev/eval datasets further down.
from data.dataset import TextGraphDataset

## Model prep and finetuning

In [4]:
# %pip install accelerate bitsandbytes

In [5]:
# Checkpoint identifier shared by the tokenizer and the encoder backbone.
# Other checkpoints left as commented-out options:
# model_name = "whaleloops/phrase-bert"
# model_name = "DeepPavlov/t5-wikidata5M-with-neighbors"
model_name = "sentence-transformers/all-MiniLM-L6-v2"

pretrained_bert = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [6]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Element counts are gathered from ``model.named_parameters()``; a parameter
    counts as trainable when its ``requires_grad`` flag is set. The trainable
    share is printed as a percentage of the total.
    """
    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, is_trainable in counts if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
class QuestionClassifier(nn.Module):
    """Encoder backbone followed by a two-layer MLP that emits one logit per sequence."""

    def __init__(self, pretrained_bert):
        """Store the backbone and build the head from its ``config.hidden_size``."""
        super().__init__()
        self.bert_backbone = pretrained_bert
        self.hidden_size = pretrained_bert.config.hidden_size
        bottleneck = self.hidden_size // 2
        self.head = nn.Sequential(
            nn.Linear(self.hidden_size, bottleneck),
            nn.ELU(),
            nn.Linear(bottleneck, 1),
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Encode the inputs and return the head's output for the first token position."""
        encoded = self.bert_backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Position 0 of the last hidden state serves as the sequence summary
        # (the [CLS] token for BERT-style tokenizers).
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.head(first_token)
    
# Wrap the loaded backbone with the classification head and move it to the target device.
model = QuestionClassifier(pretrained_bert).to(DEVICE)


In [8]:
# %pip install peft -q

In [9]:
from peft import LoraConfig, LoraModel, get_peft_model

# LoRA hyper-parameters.
LORA_RANK = 16  # 16 default
LORA_ALPHA = 32.
LORA_DROPOUT = 1e-1

config = LoraConfig(
    task_type="SEQ_CLS",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    use_rslora=True,
    # Attention projections adapted for MiniLM.
    # For T5 the module names would be ["q", "v"] (optionally also "k", "o").
    target_modules=["query", "value"],
)

# The classifier is wrapped with LoraModel directly. The get_peft_model route is
# left here as the commented-out alternative:
# lora_model = get_peft_model(model, config)
lora_model = LoraModel(model, config, "default")
print_trainable_parameters(lora_model)

# The classification head is switched back to trainable explicitly after wrapping,
# then the counts are printed again for comparison.
print('Unfreezing head')
for head_param in lora_model.head.parameters():
    head_param.requires_grad = True
print_trainable_parameters(lora_model)
    

trainable params: 147456 || all params: 22934785 || trainable%: 0.6429360467080899
Unfreezing head
trainable params: 221569 || all params: 22934785 || trainable%: 0.9660827428728894


In [10]:
# See if the parameters are frozen

# for n, p in lora_model.named_parameters():
#     print(p.requires_grad, n)

### Functions for training

In [11]:
def train_epoch(model, loader, optimizer, loss_fn, scaler):
    """Run one training pass over ``loader`` and report the loss and binary metrics.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors. Assumed to return one logit per
            example with a trailing singleton dimension, i.e. ``(batch, 1)`` —
            TODO confirm when swapping in another model.
        loader: Sized iterable of dict batches with the keys ``"input_ids"``,
            ``"token_type_ids"``, ``"attention_mask"`` and ``"labels"``.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        scaler: Gradient scaler for the mixed-precision path. That path is
            currently disabled, so the argument is accepted but not used.

    Returns:
        Tuple ``(avg_loss, f1, precision, recall)`` where ``avg_loss`` is the mean
        of the per-batch losses and the metrics use a 0.5 probability threshold.
    """
    model.train()

    total_loss = 0.
    predictions = []
    true_labels = []

    for batch in tqdm(loader, total=len(loader)):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)  # not for T5
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()

        # Mixed precision (torch.autocast + scaler.scale/step/update) is disabled here; see
        # https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training

        # Drop only the trailing logit dimension. A bare squeeze() would also drop the
        # batch dimension when a batch holds a single example, which breaks both the
        # loss shape check and the list accumulation below.
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        ).squeeze(-1)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        with torch.no_grad():
            batch_preds = (torch.sigmoid(logits) > 0.5).long().cpu()
        predictions.extend(batch_preds.tolist())
        true_labels.extend(labels.detach().cpu().tolist())

    # Average over the number of batches actually processed (previously len(loader) + 1).
    avg_loss = total_loss / max(len(loader), 1)
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)

    return avg_loss, f1, precision, recall


@torch.no_grad()
def eval_epoch(model, loader, loss_fn):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors. Assumed to return one logit per
            example with a trailing singleton dimension, i.e. ``(batch, 1)`` —
            TODO confirm when swapping in another model.
        loader: Sized iterable of dict batches with the keys ``"input_ids"``,
            ``"attention_mask"``, ``"token_type_ids"`` and ``"labels"``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.

    Returns:
        Tuple ``(avg_loss, f1, precision, recall)`` where ``avg_loss`` is the mean
        of the per-batch losses and the metrics use a 0.5 probability threshold.
    """
    model.eval()

    total_loss = 0.
    predictions, true_labels = [], []

    for batch in tqdm(loader, total=len(loader)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()

        # Drop only the trailing logit dimension so that a final batch holding a single
        # example keeps its batch axis (a bare squeeze() would yield a 0-d tensor).
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        ).squeeze(-1)
        # logits = model(input_ids=input_ids, attention_mask=attention_mask).squeeze(-1) # for T5
        loss = loss_fn(logits, labels)

        total_loss += loss.item()
        batch_preds = (torch.sigmoid(logits) > 0.5).long().cpu()
        predictions.extend(batch_preds.tolist())
        true_labels.extend(labels.cpu().tolist())

    # Guard against an empty loader instead of dividing by zero.
    avg_loss = total_loss / max(len(loader), 1)
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)

    return avg_loss, f1, precision, recall


def train(model, train_loader, val_loader, optimizer, loss_fn, save_dir, epochs=10,
          ckpt_prefix="minilm--lora-fixed_oversampling-includegraphs"):
    """Train ``model`` for ``epochs`` epochs, validating and checkpointing after each one.

    After every epoch the full ``model.state_dict()`` is written to
    ``<save_dir>/<ckpt_prefix>-LAST.pth``. Whenever the validation F1 improves on the
    best value seen so far it is also written to ``<save_dir>/<ckpt_prefix>-BEST.pth``.
    Train and validation metrics are printed per epoch.

    Args:
        model: Model to train; passed to ``train_epoch`` and ``eval_epoch``.
        train_loader: Loader with the training batches.
        val_loader: Loader with the validation batches.
        optimizer: Optimizer used by ``train_epoch``.
        loss_fn: Loss callable ``loss_fn(logits, labels)``.
        save_dir (str): Folder the checkpoints are written to; created if missing.
        epochs (int, optional): Number of epochs. Defaults to 10.
        ckpt_prefix (str, optional): File-name stem of the checkpoints. The default
            reproduces the previously hard-coded names.
    """
    # Enable the scaler only with CUDA; train_epoch currently does not use it.
    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

    # makedirs also handles nested paths and an already existing folder.
    os.makedirs(save_dir, exist_ok=True)
    best_ckpt_path = os.path.join(save_dir, f"{ckpt_prefix}-BEST.pth")
    last_ckpt_path = os.path.join(save_dir, f"{ckpt_prefix}-LAST.pth")

    # Start below any reachable F1 so a BEST checkpoint exists after the first epoch,
    # even if the validation F1 is 0.
    best_f1_val = float("-inf")
    for e in range(epochs):
        loss, f1, prec, rec = train_epoch(model, train_loader, optimizer, loss_fn, scaler=scaler)
        print(f"Train epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")

        loss, f1, prec, rec = eval_epoch(model, val_loader, loss_fn)
        print(f"Eval epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")

        if f1 > best_f1_val:
            best_f1_val = f1
            torch.save(model.state_dict(), best_ckpt_path)

        torch.save(model.state_dict(), last_ckpt_path)

## Training, evaluation and submission

In [12]:
def split_train_dev_test(df: pd.DataFrame):
    """Split ``df`` into train/dev/test frames so that no question spans two splits.

    The unique values of the ``"question"`` column are shuffled with the global
    ``random`` generator. The first 80% form the train+dev pool and the rest is the
    test set. Within the pool the first 90% go to train and the remainder to dev.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of row subsets of ``df``.
    """
    train_dev_ratio = 0.8
    train_ratio = 0.9

    shuffled = list(df["question"].unique())
    random.shuffle(shuffled)

    # Cut points into the shuffled question list.
    test_start = int(len(shuffled) * train_dev_ratio)
    dev_start = int(test_start * train_ratio)

    def _rows_for(selected):
        return df[df["question"].isin(set(selected))]

    return (
        _rows_for(shuffled[:dev_start]),
        _rows_for(shuffled[dev_start:test_start]),
        _rows_for(shuffled[test_start:]),
    )
    
def split_train_dev(df: pd.DataFrame, train_ratio: float = 0.8):
    """Split ``df`` into train and dev frames so that no question spans both.

    The unique values of the ``"question"`` column are shuffled with the global
    ``random`` generator. The first ``train_ratio`` share of questions goes to
    train and the remaining questions go to dev.

    Args:
        df: Frame with a ``"question"`` column.
        train_ratio: Share of unique questions assigned to train. Defaults to 0.8,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(train_df, dev_df)`` of row subsets of ``df``.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    all_questions = list(df["question"].unique())
    random.shuffle(all_questions)

    num_train_questions = int(len(all_questions) * train_ratio)
    train_questions = set(all_questions[:num_train_questions])
    dev_questions = set(all_questions[num_train_questions:])

    train_df = df[df["question"].isin(train_questions)]
    dev_df = df[df["question"].isin(dev_questions)]

    return train_df, dev_df

In [13]:
from sklearn.utils.class_weight import compute_sample_weight
from torch.utils.data import WeightedRandomSampler

# Run-level hyper-parameters.
BATCH_SIZE = 64
MAX_LENGTH = 256
EPOCHS = 50
LR = 3e-4

INCLUDE_GRAPH = False

# Read the training TSV, derive a float label column and split the rows by question.
df = pd.read_csv(train_path, sep='\t')
df["label"] = df["correct"].astype(np.float32)
df_train, df_dev = split_train_dev(df)
# Three-way alternative:
# df_train, df_dev, df_test = split_train_dev_test(df)

train_ds = TextGraphDataset(
    tokenizer,
    MAX_LENGTH,
    train_path=train_path,
    test_path=test_path,
    split='train',
    df_split=df_train,
    include_graph=INCLUDE_GRAPH,
    is_T5=False,
)
dev_ds = TextGraphDataset(
    tokenizer,
    MAX_LENGTH,
    train_path=train_path,
    test_path=test_path,
    split='val',
    df_split=df_dev,
    include_graph=INCLUDE_GRAPH,
    is_T5=False,
)
# Held-out test dataset for the three-way split (disabled):
# test_ds = TextGraphDataset(tokenizer, MAX_LENGTH, train_path=train_path, test_path=test_path,
#                            split='test',
#                            df_split=df_test,
#                            include_graph=INCLUDE_GRAPH,
#                            is_T5=True,
#                           )

# Class-balanced sample weights drive the sampler, so correct answers are oversampled.
weights = compute_sample_weight('balanced', train_ds.labels)
sampler = WeightedRandomSampler(weights, len(weights))

CONFIG_DATALOADER = {"num_workers": 4, "pin_memory": True}
# A custom sampler cannot be combined with shuffle=True, hence no shuffle argument here.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, **CONFIG_DATALOADER)
dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, **CONFIG_DATALOADER)
# test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, **CONFIG_DATALOADER)

In [14]:
# Sanity check: decode one training example with special tokens left visible.
tokenizer.decode(train_ds[23]['input_ids'], skip_special_tokens=False)

"[CLS] roger federer, novak djokovic : who's won more head - to - head tennis matches between each other, novak djokovic or roger federer? [SEP] rafael nadal [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [15]:
# Sanity check: decode the first dev example with special tokens left visible.
tokenizer.decode(dev_ds[0]['input_ids'], skip_special_tokens=False)

'[CLS] major league baseball : whose is the oldest mlb player to hit a home run? [SEP] yogi berra [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [16]:
# AdamW receives every parameter of the LoRA-wrapped model.
# The loss is binary cross-entropy computed on raw logits.
optimizer = optim.AdamW(lora_model.parameters(), lr=LR)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [17]:
import gc

# Collect unreachable Python objects first so that any CUDA tensors they still hold
# are returned to PyTorch's caching allocator; only then release the cached blocks.
gc.collect()
torch.cuda.empty_cache()

20

## PQlet run - with linearized graph

### MiniLM training

In [18]:
%%time 

# Output folder for this run, named after the context length and the graph flag.
SAVE_DIR = f"logs-minilm-cl{MAX_LENGTH}-lingraph_{INCLUDE_GRAPH}"

train(
    model=lora_model,
    train_loader=train_loader,
    val_loader=dev_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    save_dir=SAVE_DIR,
    epochs=EPOCHS,
)

100%|██████████| 472/472 [02:09<00:00,  3.64it/s]


Train epoch 1 - loss: 0.659, f1: 0.597, precision: 0.586, recall: 0.608


100%|██████████| 118/118 [00:14<00:00,  8.35it/s]


Eval epoch 1 - loss: 0.605, f1: 0.252, precision: 0.159, recall: 0.599


100%|██████████| 472/472 [02:11<00:00,  3.60it/s]


Train epoch 2 - loss: 0.583, f1: 0.710, precision: 0.670, recall: 0.754


100%|██████████| 118/118 [00:13<00:00,  8.69it/s]


Eval epoch 2 - loss: 0.601, f1: 0.265, precision: 0.166, recall: 0.664


100%|██████████| 472/472 [02:01<00:00,  3.87it/s]


Train epoch 3 - loss: 0.529, f1: 0.754, precision: 0.702, recall: 0.815


100%|██████████| 118/118 [00:13<00:00,  8.83it/s]


Eval epoch 3 - loss: 0.640, f1: 0.263, precision: 0.163, recall: 0.685


100%|██████████| 472/472 [02:02<00:00,  3.87it/s]


Train epoch 4 - loss: 0.486, f1: 0.784, precision: 0.726, recall: 0.852


100%|██████████| 118/118 [00:13<00:00,  8.86it/s]


Eval epoch 4 - loss: 0.572, f1: 0.280, precision: 0.183, recall: 0.594


100%|██████████| 472/472 [02:01<00:00,  3.87it/s]


Train epoch 5 - loss: 0.457, f1: 0.798, precision: 0.743, recall: 0.861


100%|██████████| 118/118 [00:13<00:00,  8.90it/s]


Eval epoch 5 - loss: 0.532, f1: 0.290, precision: 0.195, recall: 0.570


100%|██████████| 472/472 [02:01<00:00,  3.87it/s]


Train epoch 6 - loss: 0.421, f1: 0.817, precision: 0.767, recall: 0.875


100%|██████████| 118/118 [00:13<00:00,  8.83it/s]


Eval epoch 6 - loss: 0.577, f1: 0.293, precision: 0.196, recall: 0.583


100%|██████████| 472/472 [02:05<00:00,  3.77it/s]


Train epoch 7 - loss: 0.405, f1: 0.828, precision: 0.775, recall: 0.888


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 7 - loss: 0.595, f1: 0.295, precision: 0.197, recall: 0.580


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 8 - loss: 0.384, f1: 0.839, precision: 0.790, recall: 0.895


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 8 - loss: 0.535, f1: 0.289, precision: 0.198, recall: 0.533


100%|██████████| 472/472 [02:04<00:00,  3.79it/s]


Train epoch 9 - loss: 0.372, f1: 0.848, precision: 0.798, recall: 0.904


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 9 - loss: 0.593, f1: 0.297, precision: 0.202, recall: 0.563


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 10 - loss: 0.354, f1: 0.853, precision: 0.807, recall: 0.905


100%|██████████| 118/118 [00:13<00:00,  8.75it/s]


Eval epoch 10 - loss: 0.616, f1: 0.292, precision: 0.192, recall: 0.611


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 11 - loss: 0.340, f1: 0.860, precision: 0.816, recall: 0.910


100%|██████████| 118/118 [00:13<00:00,  8.72it/s]


Eval epoch 11 - loss: 0.599, f1: 0.292, precision: 0.193, recall: 0.600


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 12 - loss: 0.327, f1: 0.868, precision: 0.825, recall: 0.917


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 12 - loss: 0.658, f1: 0.289, precision: 0.193, recall: 0.582


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 13 - loss: 0.322, f1: 0.869, precision: 0.826, recall: 0.918


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 13 - loss: 0.585, f1: 0.298, precision: 0.209, recall: 0.524


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 14 - loss: 0.309, f1: 0.878, precision: 0.838, recall: 0.921


100%|██████████| 118/118 [00:13<00:00,  8.68it/s]


Eval epoch 14 - loss: 0.620, f1: 0.296, precision: 0.207, recall: 0.516


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 15 - loss: 0.304, f1: 0.877, precision: 0.838, recall: 0.919


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 15 - loss: 0.641, f1: 0.293, precision: 0.202, recall: 0.530


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 16 - loss: 0.296, f1: 0.883, precision: 0.843, recall: 0.928


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 16 - loss: 0.537, f1: 0.302, precision: 0.219, recall: 0.485


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 17 - loss: 0.289, f1: 0.885, precision: 0.849, recall: 0.924


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 17 - loss: 0.604, f1: 0.307, precision: 0.219, recall: 0.513


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 18 - loss: 0.280, f1: 0.889, precision: 0.854, recall: 0.927


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 18 - loss: 0.631, f1: 0.302, precision: 0.210, recall: 0.533


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 19 - loss: 0.280, f1: 0.891, precision: 0.856, recall: 0.930


100%|██████████| 118/118 [00:13<00:00,  8.70it/s]


Eval epoch 19 - loss: 0.563, f1: 0.307, precision: 0.226, recall: 0.478


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 20 - loss: 0.276, f1: 0.892, precision: 0.857, recall: 0.930


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 20 - loss: 0.558, f1: 0.302, precision: 0.221, recall: 0.476


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 21 - loss: 0.267, f1: 0.895, precision: 0.861, recall: 0.933


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 21 - loss: 0.595, f1: 0.310, precision: 0.233, recall: 0.462


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 22 - loss: 0.264, f1: 0.897, precision: 0.862, recall: 0.935


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 22 - loss: 0.629, f1: 0.298, precision: 0.215, recall: 0.484


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 23 - loss: 0.260, f1: 0.898, precision: 0.864, recall: 0.935


100%|██████████| 118/118 [00:13<00:00,  8.71it/s]


Eval epoch 23 - loss: 0.611, f1: 0.302, precision: 0.224, recall: 0.464


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 24 - loss: 0.260, f1: 0.898, precision: 0.862, recall: 0.937


100%|██████████| 118/118 [00:13<00:00,  8.67it/s]


Eval epoch 24 - loss: 0.558, f1: 0.299, precision: 0.225, recall: 0.446


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 25 - loss: 0.250, f1: 0.904, precision: 0.872, recall: 0.939


100%|██████████| 118/118 [00:13<00:00,  8.60it/s]


Eval epoch 25 - loss: 0.606, f1: 0.302, precision: 0.226, recall: 0.459


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 26 - loss: 0.251, f1: 0.904, precision: 0.872, recall: 0.937


100%|██████████| 118/118 [00:13<00:00,  8.67it/s]


Eval epoch 26 - loss: 0.564, f1: 0.304, precision: 0.231, recall: 0.445


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 27 - loss: 0.244, f1: 0.908, precision: 0.875, recall: 0.943


100%|██████████| 118/118 [00:13<00:00,  8.63it/s]


Eval epoch 27 - loss: 0.596, f1: 0.307, precision: 0.233, recall: 0.450


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 28 - loss: 0.242, f1: 0.906, precision: 0.874, recall: 0.940


100%|██████████| 118/118 [00:13<00:00,  8.73it/s]


Eval epoch 28 - loss: 0.622, f1: 0.300, precision: 0.224, recall: 0.456


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 29 - loss: 0.237, f1: 0.911, precision: 0.882, recall: 0.941


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 29 - loss: 0.641, f1: 0.309, precision: 0.234, recall: 0.454


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 30 - loss: 0.235, f1: 0.911, precision: 0.880, recall: 0.944


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 30 - loss: 0.588, f1: 0.308, precision: 0.233, recall: 0.456


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 31 - loss: 0.232, f1: 0.912, precision: 0.884, recall: 0.943


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 31 - loss: 0.616, f1: 0.294, precision: 0.222, recall: 0.438


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 32 - loss: 0.224, f1: 0.915, precision: 0.885, recall: 0.946


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 32 - loss: 0.621, f1: 0.290, precision: 0.225, recall: 0.406


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 33 - loss: 0.225, f1: 0.915, precision: 0.888, recall: 0.945


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 33 - loss: 0.622, f1: 0.308, precision: 0.236, recall: 0.445


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 34 - loss: 0.224, f1: 0.914, precision: 0.887, recall: 0.943


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 34 - loss: 0.582, f1: 0.309, precision: 0.239, recall: 0.438


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 35 - loss: 0.220, f1: 0.916, precision: 0.887, recall: 0.947


100%|██████████| 118/118 [00:13<00:00,  8.71it/s]


Eval epoch 35 - loss: 0.589, f1: 0.312, precision: 0.252, recall: 0.410


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 36 - loss: 0.213, f1: 0.919, precision: 0.893, recall: 0.946


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 36 - loss: 0.657, f1: 0.312, precision: 0.229, recall: 0.492


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 37 - loss: 0.222, f1: 0.914, precision: 0.886, recall: 0.944


100%|██████████| 118/118 [00:13<00:00,  8.61it/s]


Eval epoch 37 - loss: 0.588, f1: 0.301, precision: 0.244, recall: 0.393


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 38 - loss: 0.222, f1: 0.915, precision: 0.887, recall: 0.945


100%|██████████| 118/118 [00:13<00:00,  8.71it/s]


Eval epoch 38 - loss: 0.548, f1: 0.300, precision: 0.238, recall: 0.406


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 39 - loss: 0.217, f1: 0.917, precision: 0.890, recall: 0.946


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 39 - loss: 0.571, f1: 0.290, precision: 0.229, recall: 0.396


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 40 - loss: 0.208, f1: 0.922, precision: 0.897, recall: 0.948


100%|██████████| 118/118 [00:13<00:00,  8.63it/s]


Eval epoch 40 - loss: 0.617, f1: 0.303, precision: 0.235, recall: 0.425


100%|██████████| 472/472 [02:05<00:00,  3.75it/s]


Train epoch 41 - loss: 0.216, f1: 0.918, precision: 0.892, recall: 0.946


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 41 - loss: 0.620, f1: 0.314, precision: 0.237, recall: 0.464


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 42 - loss: 0.209, f1: 0.920, precision: 0.896, recall: 0.945


100%|██████████| 118/118 [00:13<00:00,  8.74it/s]


Eval epoch 42 - loss: 0.602, f1: 0.310, precision: 0.228, recall: 0.484


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 43 - loss: 0.211, f1: 0.921, precision: 0.895, recall: 0.949


100%|██████████| 118/118 [00:13<00:00,  8.67it/s]


Eval epoch 43 - loss: 0.673, f1: 0.299, precision: 0.218, recall: 0.479


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 44 - loss: 0.206, f1: 0.921, precision: 0.897, recall: 0.947


100%|██████████| 118/118 [00:13<00:00,  8.66it/s]


Eval epoch 44 - loss: 0.644, f1: 0.314, precision: 0.241, recall: 0.449


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 45 - loss: 0.200, f1: 0.925, precision: 0.901, recall: 0.950


100%|██████████| 118/118 [00:13<00:00,  8.63it/s]


Eval epoch 45 - loss: 0.626, f1: 0.304, precision: 0.245, recall: 0.401


100%|██████████| 472/472 [02:04<00:00,  3.78it/s]


Train epoch 46 - loss: 0.203, f1: 0.923, precision: 0.899, recall: 0.949


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 46 - loss: 0.625, f1: 0.324, precision: 0.249, recall: 0.464


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 47 - loss: 0.204, f1: 0.923, precision: 0.896, recall: 0.951


100%|██████████| 118/118 [00:13<00:00,  8.72it/s]


Eval epoch 47 - loss: 0.645, f1: 0.304, precision: 0.239, recall: 0.420


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 48 - loss: 0.197, f1: 0.927, precision: 0.904, recall: 0.951


100%|██████████| 118/118 [00:13<00:00,  8.65it/s]


Eval epoch 48 - loss: 0.640, f1: 0.311, precision: 0.246, recall: 0.423


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 49 - loss: 0.191, f1: 0.928, precision: 0.905, recall: 0.951


100%|██████████| 118/118 [00:13<00:00,  8.64it/s]


Eval epoch 49 - loss: 0.675, f1: 0.313, precision: 0.237, recall: 0.463


100%|██████████| 472/472 [02:04<00:00,  3.80it/s]


Train epoch 50 - loss: 0.194, f1: 0.926, precision: 0.904, recall: 0.950


100%|██████████| 118/118 [00:13<00:00,  8.69it/s]

Eval epoch 50 - loss: 0.597, f1: 0.303, precision: 0.239, recall: 0.412
CPU times: total: 1h 54min 48s
Wall time: 2h 6min 34s





# Submission

In [21]:
@torch.no_grad()
def make_submit_predictions(model, tokenizer, include_graph, filename='submission.csv', is_t5=False):
    """Write a submission with an independent 0/1 prediction for every eval row.

    Each sample of the ``'eval'`` split is scored on its own and predicted positive
    when its logit is above 0 (probability above 0.5).

    Args:
        model: Classifier returning a single logit for a batch of one example.
        tokenizer: Tokenizer handed to ``TextGraphDataset``.
        include_graph: Forwarded to ``TextGraphDataset`` as ``include_graph``.
        filename: Destination of the tab-separated ``sample_id``/``prediction`` file.
        is_t5: When true, ``token_type_ids`` is not passed to the model, mirroring
            ``make_submit_predictions_ranked``.

    Note:
        The ``prediction`` column is added to ``eval_ds.df`` itself, not to a copy.
    """
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH,  train_path=train_path, test_path=test_path,
                               split='eval', include_graph=include_graph)
    preds = []
    # Iterate by explicit index so that termination does not depend on the dataset
    # raising IndexError, and so that tqdm can display the total.
    for idx in tqdm(range(len(eval_ds))):
        data = eval_ds[idx]
        model_inputs = {
            "input_ids": data["input_ids"].to(DEVICE).unsqueeze(0),
            "attention_mask": data["attention_mask"].to(DEVICE).unsqueeze(0),
        }
        if not is_t5:
            model_inputs["token_type_ids"] = data["token_type_ids"].to(DEVICE).unsqueeze(0)

        logit = model(**model_inputs).squeeze()
        # Store plain Python ints rather than 0-d numpy arrays.
        preds.append(int(logit.item() > 0))

    df = eval_ds.df
    df['prediction'] = preds
    df['prediction'] = df['prediction'].astype(int)
    df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

@torch.no_grad()
def make_submit_predictions_ranked(model, tokenizer, include_graph, filename='submission.csv', is_t5=False):
    """Write a submission that marks exactly one candidate answer per question.

    Based on Vika's idea: all candidate answers of a question are scored and only
    the one with the highest logit is predicted as correct.

    Args:
        model: Classifier returning a single logit for a batch of one example.
        tokenizer: Tokenizer handed to ``TextGraphDataset``.
        include_graph: Forwarded to ``TextGraphDataset`` as ``include_graph``.
        filename: Destination of the tab-separated ``sample_id``/``prediction`` file.
        is_t5: When true, ``token_type_ids`` is not passed to the model.

    Note:
        ``eval_ds.df`` is modified in place (``correct`` and ``prediction`` columns).
        Rows whose ``question`` is missing belong to no group and are predicted 0.
    """
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH,  train_path=train_path, test_path=test_path,
                               split='eval',
                               df_split=None,
                               include_graph=include_graph)
    eval_df = eval_ds.df
    # The placeholder column is created before any sample is fetched, as before.
    eval_df["correct"] = False

    # GroupBy.indices yields *positional* row numbers per question in a single pass.
    # Positions are what eval_ds[...] is indexed with here (assumes the dataset rows
    # are aligned with eval_ds.df — TODO confirm). The previous per-question scan used
    # index labels, which match positions only for a default RangeIndex.
    candidates_by_question = eval_df.groupby('question', sort=False).indices
    is_best = np.zeros(len(eval_df), dtype=bool)

    for positions in tqdm(candidates_by_question.values(), total=len(candidates_by_question)):
        logits = []
        for pos in positions:
            data = eval_ds[int(pos)]
            model_inputs = {
                "input_ids": data["input_ids"].to(DEVICE).unsqueeze(0),
                "attention_mask": data["attention_mask"].to(DEVICE).unsqueeze(0),
            }
            if not is_t5:
                model_inputs["token_type_ids"] = data["token_type_ids"].to(DEVICE).unsqueeze(0)

            logit = model(**model_inputs).squeeze()
            logits.append(logit.detach().cpu().item())

        # argmax returns the first maximum, so ties go to the earliest candidate.
        is_best[positions[int(np.argmax(logits))]] = True

    eval_df['correct'] = is_best
    eval_df['prediction'] = is_best.astype(int)
    eval_df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

In [23]:

import copy

# Machine-specific checkpoint location, referenced only by the disabled leaderboard block below.
best_leaderboard_path = r"C:\keep_pc\Skoltech-PC\Courses\Term 4\DL for NLP\Task 1\work-2-from_repo\text-graph\all-MiniLM-L6-v2--lora-fixed_oversampling-pqlet-includegraphs.pth"
# Build the paths with os.path.join, as train() does when saving. The previous
# f'{SAVE_DIR}\minilm…' literal relied on the invalid escape "\m" and on the
# Windows path separator.
load_path_best = os.path.join(SAVE_DIR, "minilm--lora-fixed_oversampling-includegraphs-BEST.pth")
load_path_last = os.path.join(SAVE_DIR, "minilm--lora-fixed_oversampling-includegraphs-LAST.pth")

# One ranked submission per checkpoint: best validation F1 first, then the last epoch.
for tag, ckpt_path in (("best", load_path_best), ("last", load_path_last)):
    # Load into a copy so that lora_model keeps its in-memory weights.
    submission_lora_model = copy.deepcopy(lora_model)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    state_dict_loaded = torch.load(ckpt_path, map_location=DEVICE)
    submission_lora_model.load_state_dict(state_dict_loaded)
    make_submit_predictions_ranked(
        submission_lora_model,
        tokenizer,
        include_graph=INCLUDE_GRAPH,
        filename=f"{SAVE_DIR}/{SAVE_DIR}-submission_{tag}.csv",
    )

# # Best model according to the public leaderboard
# submission_lora_model = copy.deepcopy(lora_model)
# state_dict_loaded = torch.load(best_leaderboard_path)
# submission_lora_model.model.load_state_dict(state_dict_loaded)
# make_submit_predictions_ranked(
#     submission_lora_model,
#     tokenizer,
#     include_graph=INCLUDE_GRAPH,
#     filename=f"{SAVE_DIR}/submission_best.csv",
# )

100%|██████████| 1000/1000 [00:52<00:00, 18.97it/s]
100%|██████████| 1000/1000 [00:52<00:00, 19.23it/s]
