# Beating Text-Graph-17 with only Text

Current plan is following

- preprocess text into **q-a connection prediction** (question + question entities [SEP] answer + answer entities (+ linear. graph))
- finetune bert-like model (bigger=better) with some cool LoRA (this one needs to be tuned too)
- abuse augmentations for upsampling minor "correct" label examples

In [4]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import os
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import torch.optim as optim
from sklearn.metrics import precision_score, f1_score, recall_score

In [5]:
SEED = 42

torch.manual_seed(SEED)
torch.random.manual_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.cuda.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

## Data Preproc

In [71]:
train_path = 'data/tsv/train.tsv'
test_path = 'data/tsv/test.tsv'

def linearize_graph(graph_dict, sep_token):
    """Borrowed from baseline, needs to be modified..."""
    
    nodes = sorted((node_dict for node_dict in graph_dict["nodes"]), key=lambda d:d["id"])
    for n_id, node_dict in enumerate(nodes):
        assert n_id == node_dict["id"]
    src_node_id2links = {}
    
    for link_dict in graph_dict["links"]:
        link_src =  link_dict["source"]
        if src_node_id2links.get(link_src) is None:
            src_node_id2links[link_src] = []
        src_node_id2links[link_src].append(link_dict)
    
    graph_s = ""
    for n_id, node_dict in enumerate(nodes):
        links = src_node_id2links.get(n_id, list())
        start_label = node_dict["label"]
        if node_dict["type"] == "ANSWER_CANDIDATE_ENTITY":
            start_label = f"{sep_token} {start_label} {sep_token}"
        for link_dict in links:
            target_label = nodes[link_dict["target"]]["label"]
            if nodes[link_dict["target"]]["type"] == "ANSWER_CANDIDATE_ENTITY":
                target_label = f"{sep_token} {target_label} {sep_token}"
            link_s = f" {start_label}, {link_dict['label']}, {target_label} "
            graph_s += link_s
    return graph_s

class TextGraphDataset(Dataset):
    def __init__(self,  tokenizer, max_length, split='train', include_graph=False):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split
        self.include_graph = include_graph
        if split in ['train', 'val', 'test']:
            df = pd.read_csv(train_path, sep='\t')
            df["label"] = df["correct"].astype(np.float32)
            self.df = self._split_train_dev_test(df, split)
        elif split == 'full': # use this to use all data for training (before submit)
            self.df = pd.read_csv(train_path, sep='\t')
            self.df["label"] = self.df["correct"].astype(np.float32)
        elif split == 'eval': # this corresponds to submit
            self.df = pd.read_csv(test_path, sep='\t')
        else:
            raise ValueError("Unrecognized split!")
        
        self.questions = []
        self.q_entities = []
        self.a_entities = []
        self.graphs = []
        self.labels = []

        self._get_data()

    def __getitem__(self, idx):
        q_entities = self.q_entities[idx] + ':'
        question = self.questions[idx]
        a_entities = self.a_entities[idx]
        
        if self.include_graph:
            raise NotImplementedError("Need to append graph in text form to answer")
        
        try: 
            tokenizer_out = self.tokenizer.encode_plus(
                text=q_entities + ' ' + question,
                text_pair=a_entities,
                max_length=self.max_length,
                padding="max_length",
                truncation="only_first",
                return_tensors="pt"
            )
        except Exception:
            print(question, q_entities, a_entities)

        res = {
            "input_ids": tokenizer_out["input_ids"].flatten(),
            "attention_mask": tokenizer_out["attention_mask"].flatten(),
        }
        
        if self.split != "eval":
            res["labels"] = self.labels[idx]
        
        if "token_type_ids" in tokenizer_out:
            res["token_type_ids"] = tokenizer_out["token_type_ids"].flatten()
        
        return res

    def __len__(self):
        return len(self.df)
    
    def _get_data(self):
        for idx, data in self.df.iterrows():
            self.questions.append(data["question"])
            self.q_entities.append(data["questionEntity"])
            self.a_entities.append(data["answerEntity"])
            if self.split != "eval":
                self.labels.append(data["label"])
            if self.include_graph:
                self.graphs.append(data["graph"].apply(eval))

    def _split_train_dev_test(self, df, split='train'):
        all_questions = list(df["question"].unique())
        num_questions = len(all_questions)
        random.shuffle(all_questions)

        train_dev_ratio = 0.8
        train_ratio = 0.9
        num_train_dev_questions = int(num_questions * train_dev_ratio)
        train_dev_questions = all_questions[:num_train_dev_questions]
        test_questions = set(all_questions[num_train_dev_questions:])
        num_train_questions = int(len(train_dev_questions) * train_ratio)
        train_questions = set(train_dev_questions[:num_train_questions])
        dev_questions = set(train_dev_questions[num_train_questions:])

        train_df = df[df["question"].isin(train_questions)]
        dev_df = df[df["question"].isin(dev_questions)]
        test_df = df[df["question"].isin(test_questions)]

        if split == 'train':
            return train_df
        elif split =='val':
            return dev_df
        else:
            return test_df

## Model prep and finetuning

In [34]:
# Load model directly
model_name = "whaleloops/phrase-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_bert = AutoModel.from_pretrained(model_name)

In [62]:
class QuestionClassifier(nn.Module):
    def __init__(self, pretrained_bert):
        super().__init__()
        self.bert_backbone = pretrained_bert
        self.hidden_size = pretrained_bert.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size // 2),
            nn.ELU(),
            nn.Linear(self.hidden_size // 2, 1)
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert_backbone(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state  # Access the last hidden states
        pooled_output = last_hidden_state[:, 0, :]  # Take the [CLS] token representation
        logits = self.head(pooled_output)
        return logits
    
model = QuestionClassifier(
    pretrained_bert
).to(DEVICE)

for p in model.bert_backbone.parameters():
    p.requires_grad = False

In [10]:
!pip install peft -q

[0m

In [63]:
from peft import LoraConfig, LoraModel

LORA_RANK=16
LORA_ALPHA=32.
LORA_DROPOUT=1e-1

config = LoraConfig(
    task_type="SEQ_CLS",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=["query", "value"],
    lora_dropout=LORA_DROPOUT,
    use_rslora=True,
)

lora_model = LoraModel(model, config, "default")

for p in lora_model.head.parameters():
    p.requires_grad = True

In [64]:
def get_trainable_params(model: nn.Module):
    params = []
    for name, p in model.named_parameters():
        if p.requires_grad:
            params.append(p)
    return params

trainable_params = get_trainable_params(lora_model)
len(trainable_params)

52

In [60]:
def train_epoch(model, loader, optimizer, loss_fn):
    model.train()

    avg_loss = 0.

    predictions = []
    true_labels = []
    
    for i, batch in enumerate(loader):

        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        with torch.no_grad():
            preds = F.sigmoid(logits).detach().cpu().numpy()
            preds = (preds > 0.5) * 1
            y_true = labels.detach().cpu().numpy()
            
            predictions += preds.tolist()
            true_labels += y_true.tolist()
    
    avg_loss /= len(loader) + 1
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)    
    
    return avg_loss, f1, precision, recall


@torch.no_grad
def eval_epoch(model, loader, loss_fn):
    model.eval()

    avg_loss = 0.
    predictions, true_labels = [], []

    for i, batch in enumerate(loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()
        
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        loss = loss_fn(logits, labels)
        
        avg_loss += loss.item()
        preds = F.sigmoid(logits).detach().cpu().numpy()
        preds = (preds > 0.5) * 1
        y_true = labels.detach().cpu().numpy()
        predictions += preds.tolist()
        true_labels += y_true.tolist()

    avg_loss /= len(loader)
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)

    return avg_loss, f1, precision, recall


def train(model, train_loader, val_loader, optimizer, loss_fn, epochs=10):
    for e in range(epochs):
        loss, f1, prec, rec = train_epoch(model, train_loader, optimizer, loss_fn)
        print(f"Train epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")
        
        loss, f1, prec, rec = eval_epoch(model, val_loader, loss_fn)
        print(f"Eval epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")
        

## Training, evaluation and submit

In [39]:
BATCH_SIZE=64
MAX_LENGTH=150
EPOCHS=50

from sklearn.utils.class_weight import compute_sample_weight
from torch.utils.data import WeightedRandomSampler

train_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='train')
dev_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='val')
test_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='test')

weights = compute_sample_weight('balanced', train_ds.labels)
sampler = WeightedRandomSampler(weights, len(weights)) # we will oversample correct answers :)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)
#train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [67]:
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(params=trainable_params, lr=3e-4)

In [65]:
import gc
torch.cuda.empty_cache()
gc.collect()

36

In [41]:
train(
    lora_model,
    train_loader,
    dev_loader,
    optimizer,
    loss_fn,
    epochs=EPOCHS
)

Train epoch 1 - loss: 0.636, f1: 0.637, precision: 0.618, recall: 0.657
Eval epoch 1 - loss: 0.565, f1: 0.286, precision: 0.180, recall: 0.696
Train epoch 2 - loss: 0.550, f1: 0.732, precision: 0.687, recall: 0.784
Eval epoch 2 - loss: 0.757, f1: 0.285, precision: 0.171, recall: 0.870
Train epoch 3 - loss: 0.509, f1: 0.766, precision: 0.717, recall: 0.822
Eval epoch 3 - loss: 0.533, f1: 0.340, precision: 0.215, recall: 0.813
Train epoch 4 - loss: 0.473, f1: 0.791, precision: 0.738, recall: 0.853
Eval epoch 4 - loss: 0.523, f1: 0.352, precision: 0.221, recall: 0.873
Train epoch 5 - loss: 0.459, f1: 0.800, precision: 0.751, recall: 0.856
Eval epoch 5 - loss: 0.438, f1: 0.404, precision: 0.269, recall: 0.809
Train epoch 6 - loss: 0.416, f1: 0.820, precision: 0.778, recall: 0.867
Eval epoch 6 - loss: 0.422, f1: 0.434, precision: 0.299, recall: 0.786
Train epoch 7 - loss: 0.390, f1: 0.837, precision: 0.794, recall: 0.885
Eval epoch 7 - loss: 0.415, f1: 0.432, precision: 0.296, recall: 0.803

In [42]:
_, f1, prec, rec = eval_epoch(model, test_loader, loss_fn)
print(f"Performance on hold-out test - f1: {f1:.2f}, precision: {prec:.2f}, recall: {rec:.2f}")

Performance on hold-out test - f1: 0.53, precision: 0.39, recall: 0.82


In [43]:
torch.save(model.state_dict(), "phrase_bert_lora_fixed_oversampling.pth")

In [None]:
# load saved model (maybe there is a better way with PEFT around, but...)

In [44]:
eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH, split='eval')
eval_df = eval_ds.df

In [48]:
q = eval_df.loc[0, "question"]
ids = eval_df.index[eval_df['question'] == q].tolist()
for id in ids:
    print(eval_ds.questions[id])

After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?
After publishing A Time to Kill, which book did its author begin working on immediately?


In [53]:
@torch.no_grad
def make_submit_predictions(model, tokenizer, filename='test_result_1.tsv'):
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH, split='eval')
    preds = []
    for idx, data in enumerate(eval_ds):
        input_ids = data["input_ids"].to(DEVICE).unsqueeze(0)
        attention_mask = data["attention_mask"].to(DEVICE).unsqueeze(0)
        token_type_ids = data["token_type_ids"].to(DEVICE).unsqueeze(0)
        
        logit = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        pred = (logit.detach().cpu().numpy() > 0) * 1
        preds.append(pred)

    df = eval_ds.df
    df['prediction'] = preds
    df['prediction'] = df['prediction'].astype(int)
    df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

@torch.no_grad
def make_submit_predictions_ranked(model, tokenizer, filename='test_result_2.tsv'):
    """based of Vika's idea - select all candidate answers for questions, select one with max prob"""
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH, split='eval')
    eval_df = eval_ds.df
    eval_df["correct"] = False

    for question in eval_df['question'].unique():
        ids = eval_df.index[eval_df['question'] == question].tolist()
        
        logits = []
        for idx in ids:
            data = eval_ds[idx]
            input_ids = data["input_ids"].to(DEVICE).unsqueeze(0)
            attention_mask = data["attention_mask"].to(DEVICE).unsqueeze(0)
            token_type_ids = data["token_type_ids"].to(DEVICE).unsqueeze(0)
            
            logit = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
            logits.append(logit.detach().cpu().item())

        right_ans_id = ids[np.argmax(logits)]
        eval_df.loc[right_ans_id, 'correct'] = True

    eval_df['prediction'] = eval_df['correct']
    eval_df['prediction'] = eval_df['prediction'].astype(int)
    eval_df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

In [132]:
make_submit_predictions(
    model,
    tokenizer
)

In [54]:
make_submit_predictions_ranked(
    model,
    tokenizer
)

### Complete retrain on full data (for best result)

In [72]:
BATCH_SIZE=64
MAX_LENGTH=150
EPOCHS=50

from sklearn.utils.class_weight import compute_sample_weight
from torch.utils.data import WeightedRandomSampler

ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='full')
weights = compute_sample_weight('balanced', train_ds.labels)
sampler = WeightedRandomSampler(weights, len(weights)) # we will oversample correct answers :)
loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)

In [76]:
for e in range(EPOCHS - 13):
    loss, f1, prec, rec = train_epoch(model, loader, optimizer, loss_fn)
    print(f"Train epoch {e + 1 + 13} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")

Train epoch 14 - loss: 0.193, f1: 0.930, precision: 0.904, recall: 0.957
Train epoch 15 - loss: 0.186, f1: 0.930, precision: 0.903, recall: 0.959
Train epoch 16 - loss: 0.186, f1: 0.932, precision: 0.907, recall: 0.959
Train epoch 17 - loss: 0.180, f1: 0.934, precision: 0.909, recall: 0.960
Train epoch 18 - loss: 0.180, f1: 0.937, precision: 0.914, recall: 0.960
Train epoch 19 - loss: 0.170, f1: 0.937, precision: 0.913, recall: 0.962
Train epoch 20 - loss: 0.168, f1: 0.938, precision: 0.916, recall: 0.961
Train epoch 21 - loss: 0.164, f1: 0.940, precision: 0.918, recall: 0.964
Train epoch 22 - loss: 0.168, f1: 0.939, precision: 0.915, recall: 0.963
Train epoch 23 - loss: 0.162, f1: 0.942, precision: 0.921, recall: 0.964
Train epoch 24 - loss: 0.161, f1: 0.941, precision: 0.921, recall: 0.962
Train epoch 25 - loss: 0.153, f1: 0.945, precision: 0.927, recall: 0.963
Train epoch 26 - loss: 0.154, f1: 0.943, precision: 0.925, recall: 0.962
Train epoch 27 - loss: 0.147, f1: 0.946, precision:

In [77]:
torch.save(model.state_dict(), "phrase_bert_lora_oversampling_full.pth")

In [None]:
make_submit_predictions_ranked(
    model,
    tokenizer,
    "test_res_overfit.csv"
)