# Beating Text-Graph-17 with only Text

Current plan is following

- preprocess text into **q-a connection prediction** (question + question entities [SEP] answer + answer entities (+ linear. graph))
- finetune bert-like model (bigger=better) with some cool LoRA (this one needs to be tuned too)
- abuse augmentations for upsampling minor "correct" label examples

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import os
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import torch.optim as optim
from sklearn.metrics import precision_score, f1_score, recall_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
SEED = 42

torch.manual_seed(SEED)
torch.random.manual_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.cuda.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

## Data Preproc

In [127]:
train_path = 'data/tsv/train.tsv'
test_path = 'data/tsv/test.tsv'

class TextGraphDataset(Dataset):
    def __init__(self,  tokenizer, max_length, split='train',include_graph=False):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split
        self.include_graph = include_graph
        if split in ['train', 'val', 'test']:
            df = pd.read_csv(train_path, sep='\t')
            df["label"] = df["correct"].astype(np.float32)
            self.df = self._split_train_dev_test(df, split)
        elif split == 'eval': # this corresponds to submit
            self.df = pd.read_csv(test_path, sep='\t')
        else:
            raise ValueError("Unrecognized split!")
        
        self.quetions = []
        self.q_entities = []
        self.a_entities = []
        self.graphs = []
        self.labels = []

        self._get_data()

    def __getitem__(self, idx):
        q_entities = self.q_entities[idx] + ':'
        question = self.quetions[idx]
        a_entities = self.a_entities[idx]
        
        if self.include_graph:
            raise NotImplementedError("Need to append graph in text form to answer")
        
        try: 
            tokenizer_out = self.tokenizer.encode_plus(
                text=q_entities + ' ' + question,
                text_pair=a_entities,
                max_length=self.max_length,
                padding="max_length",
                truncation="only_first",
                return_tensors="pt"
            )
        except Exception:
            print(question, q_entities, a_entities)

        res = {
            "input_ids": tokenizer_out["input_ids"].flatten(),
            "attention_mask": tokenizer_out["attention_mask"].flatten(),
        }
        
        if self.split != "eval":
            res["labels"] = self.labels[idx]
        
        if "token_type_ids" in tokenizer_out:
            res["token_type_ids"] = tokenizer_out["token_type_ids"].flatten()
        
        return res

    def __len__(self):
        return len(self.df)
    
    def _get_data(self):
        for idx, data in self.df.iterrows():
            self.quetions.append(data["question"])
            self.q_entities.append(data["answerEntity"])
            self.a_entities.append(data["questionEntity"])
            if self.split != "eval":
                self.labels.append(data["label"])
            if self.include_graph:
                self.graphs.append(data["graph"].apply(eval))

    def _split_train_dev_test(self, df, split='train'):
        all_questions = list(df["question"].unique())
        num_questions = len(all_questions)
        random.shuffle(all_questions)

        train_dev_ratio = 0.8
        train_ratio = 0.9
        num_train_dev_questions = int(num_questions * train_dev_ratio)
        train_dev_questions = all_questions[:num_train_dev_questions]
        test_questions = set(all_questions[num_train_dev_questions:])
        num_train_questions = int(len(train_dev_questions) * train_ratio)
        train_questions = set(train_dev_questions[:num_train_questions])
        dev_questions = set(train_dev_questions[num_train_questions:])

        train_df = df[df["question"].isin(train_questions)]
        dev_df = df[df["question"].isin(dev_questions)]
        test_df = df[df["question"].isin(test_questions)]

        if split == 'train':
            return train_df
        elif split =='dev_df':
            return dev_df
        else:
            return test_df

## Model prep and finetuning

In [109]:
# Load model directly
model_name = "whaleloops/phrase-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_bert = AutoModel.from_pretrained(model_name)

In [110]:
class QuestionClassifier(nn.Module):
    def __init__(self, pretrained_bert):
        super().__init__()
        self.bert_backbone = pretrained_bert
        self.hidden_size = pretrained_bert.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size // 2),
            nn.ELU(),
            nn.Linear(self.hidden_size // 2, 1)
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert_backbone(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state  # Access the last hidden states
        pooled_output = last_hidden_state[:, 0, :]  # Take the [CLS] token representation
        logits = self.head(pooled_output)
        return logits
    
model = QuestionClassifier(
    pretrained_bert
).to(DEVICE)

for p in model.bert_backbone.parameters():
    p.requires_grad = False

In [101]:
#!pip install peft -q

In [111]:
from peft import LoraConfig, LoraModel

LORA_RANK=8
LORA_ALPHA=10.
LORA_DROPOUT=1e-2

config = LoraConfig(
    task_type="SEQ_CLS",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=["query", "value"],
    lora_dropout=LORA_DROPOUT,
)

lora_model = LoraModel(model, config, "default")

for p in lora_model.head.parameters():
    p.requires_grad = True

In [112]:
def get_trainable_params(model: nn.Module):
    params = []
    for name, p in model.named_parameters():
        if p.requires_grad:
            params.append(p)
    return params

trainable_params = get_trainable_params(lora_model)
len(trainable_params)

52

In [114]:
def train_epoch(model, loader, optimizer, loss_fn):
    model.train()

    avg_loss = 0.

    predictions = []
    true_labels = []
    
    for i, batch in enumerate(loader):

        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        with torch.no_grad():
            preds = F.sigmoid(logits).detach().cpu().numpy()
            preds = (preds > 0.5) * 1
            y_true = labels.detach().cpu().numpy()
            
            predictions += preds.tolist()
            true_labels += y_true.tolist()
    
    avg_loss /= len(loader) + 1
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)    
    
    return avg_loss, f1, precision, recall


@torch.no_grad
def eval_epoch(model, loader, loss_fn):
    model.eval()

    avg_loss = 0.
    predictions, true_labels = [], []

    for i, batch in enumerate(loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        token_type_ids = batch["token_type_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE).float()
        
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        loss = loss_fn(logits, labels)
        
        avg_loss += loss.item()
        preds = F.sigmoid(logits).detach().cpu().numpy()
        preds = (preds > 0.5) * 1
        y_true = labels.detach().cpu().numpy()
        predictions += preds.tolist()
        true_labels += y_true.tolist()

    avg_loss /= len(loader)
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)

    return avg_loss, f1, precision, recall


def train(model, train_loader, val_loader, optimizer, loss_fn, epochs=10):
    for e in range(epochs):
        loss, f1, prec, rec = train_epoch(model, train_loader, optimizer, loss_fn)
        print(f"Train epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")
        
        loss, f1, prec, rec = eval_epoch(model, val_loader, loss_fn)
        print(f"Eval epoch {e + 1} - loss: {loss:.3f}, f1: {f1:.3f}, precision: {prec:.3f}, recall: {rec:.3f}")
        

## Training, evaluation and submit

In [115]:
BATCH_SIZE=64
MAX_LENGTH=128
EPOCHS=30

train_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='train')
dev_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='val')
test_ds = TextGraphDataset(tokenizer, MAX_LENGTH, split='test')

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [116]:
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=trainable_params, lr=1e-3)

In [117]:
import gc
torch.cuda.empty_cache()
gc.collect()

8934

In [118]:
train(
    lora_model,
    train_loader,
    dev_loader,
    optimizer,
    loss_fn,
    epochs=EPOCHS
)

Train epoch 1 - loss: 0.322, f1: 0.001, precision: 0.080, recall: 0.001


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval epoch 1 - loss: 0.305, f1: 0.000, precision: 0.000, recall: 0.000
Train epoch 2 - loss: 0.308, f1: 0.002, precision: 0.500, recall: 0.001


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval epoch 2 - loss: 0.297, f1: 0.000, precision: 0.000, recall: 0.000
Train epoch 3 - loss: 0.300, f1: 0.011, precision: 0.600, recall: 0.006
Eval epoch 3 - loss: 0.287, f1: 0.003, precision: 1.000, recall: 0.001
Train epoch 4 - loss: 0.290, f1: 0.018, precision: 0.490, recall: 0.009


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Eval epoch 4 - loss: 0.284, f1: 0.000, precision: 0.000, recall: 0.000
Train epoch 5 - loss: 0.281, f1: 0.037, precision: 0.598, recall: 0.019
Eval epoch 5 - loss: 0.269, f1: 0.013, precision: 1.000, recall: 0.007
Train epoch 6 - loss: 0.273, f1: 0.054, precision: 0.567, recall: 0.028
Eval epoch 6 - loss: 0.276, f1: 0.019, precision: 0.875, recall: 0.009
Train epoch 7 - loss: 0.266, f1: 0.070, precision: 0.592, recall: 0.037
Eval epoch 7 - loss: 0.258, f1: 0.045, precision: 0.773, recall: 0.023
Train epoch 8 - loss: 0.258, f1: 0.099, precision: 0.607, recall: 0.054
Eval epoch 8 - loss: 0.251, f1: 0.162, precision: 0.667, recall: 0.092
Train epoch 9 - loss: 0.254, f1: 0.135, precision: 0.655, recall: 0.075
Eval epoch 9 - loss: 0.247, f1: 0.070, precision: 0.771, recall: 0.037
Train epoch 10 - loss: 0.247, f1: 0.152, precision: 0.602, recall: 0.087
Eval epoch 10 - loss: 0.247, f1: 0.019, precision: 1.000, recall: 0.009
Train epoch 11 - loss: 0.241, f1: 0.184, precision: 0.642, recall: 0.

In [120]:
_, f1, prec, rec = eval_epoch(model, test_loader, loss_fn)
print(f"Performance on hold-out test - f1: {f1:.2f}, precision: {prec:.2f}, recall: {rec:.2f}")

Performance on hold-out test - f1 - 0.44, precision: 0.82, recall: 0.30


In [134]:
torch.save(model.state_dict(), "phrase_bert_lora_tuned.pth")

In [152]:
@torch.no_grad
def make_submit_predictions(model, tokenizer, filename='test_result_1.tsv'):
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH, split='eval')
    preds = []
    for idx, data in enumerate(eval_ds):
        input_ids = data["input_ids"].to(DEVICE).unsqueeze(0)
        attention_mask = data["attention_mask"].to(DEVICE).unsqueeze(0)
        token_type_ids = data["token_type_ids"].to(DEVICE).unsqueeze(0)
        
        logit = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
        pred = (logit.detach().cpu().numpy() > 0) * 1
        preds.append(pred)

    df = eval_ds.df
    df['prediction'] = preds
    df['prediction'] = df['prediction'].astype(int)
    df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

@torch.no_grad
def make_submit_predictions_ranked(model, tokenizer, filename='test_result_2.tsv'):
    """based of Vika's idea - select all candidate answers for questions, select one with max prob"""
    model.eval()
    eval_ds = TextGraphDataset(tokenizer, max_length=MAX_LENGTH, split='eval')
    eval_df = eval_ds.df
    eval_df["correct"] = False

    for question in eval_df['question'].unique():
        ids = eval_df.index[eval_df['question'] == question].tolist()
        
        logits = []
        for idx in ids:
            data = eval_ds[idx]
            input_ids = data["input_ids"].to(DEVICE).unsqueeze(0)
            attention_mask = data["attention_mask"].to(DEVICE).unsqueeze(0)
            token_type_ids = data["token_type_ids"].to(DEVICE).unsqueeze(0)
            
            logit = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).squeeze()
            logits.append(logit.detach().cpu().item())

        right_ans_id = ids[np.argmax(logits)]
        eval_df.loc[right_ans_id, 'correct'] = True

    eval_df['prediction'] = eval_df['correct']
    eval_df['prediction'] = eval_df['prediction'].astype(int)
    eval_df[["sample_id", "prediction"]].to_csv(filename, sep='\t', index=False)

In [132]:
make_submit_predictions(
    model,
    tokenizer
)

In [153]:
make_submit_predictions_ranked(
    model,
    tokenizer
)