In [1]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [2]:
import sys

# Make the project checkout importable from this notebook (parent dir).
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
kbqa_root = "/workspace/kbqa/"
if kbqa_root not in sys.path:
    sys.path.append(kbqa_root)

In [3]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datasets import load_dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

2024-03-15 10:30:24.915790: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-15 10:30:25.075661: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-15 10:30:25.639301: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-15 10:30:25.639379: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

#### Configuration

In the cell below, you can configure the type of training. For instance, you can choose which type of sequence to train MPNet on (both, gap, or g2t) and which dataset to use (either xl or large). The following options are valid:

* `sequence_type`: choices=[both, gap, or g2t] where both means both gap and g2t as features
* `is_special_tok_context`: whether to highlight the answer candidate in the sequence
* `model_weights`: `None` or the path to the provided model weights. If `None`, the model is trained from scratch; if model weights are loaded, the notebook only evaluates and reranks. *Note*: since this uses the HF interface to train, it is **highly** recommended to use the script to train instead of this notebook
* `seq_ds_path`: either "hle2000/Mintaka_Updated_Sequences_T5-large-ssm" or "hle2000/Mintaka_Updated_Sequences_T5-xl-ssm"
* `res_csv_path`: either "hle2000/Mintaka_T5_large_ssm_outputs" or "hle2000/Mintaka_T5_xl_ssm_outputs" (**please use the same dataset version for both `seq_ds_path` and `res_csv_path`**)

In [28]:
# Per-device batch sizes for training and evaluation.
train_bs = eval_bs = 32

# Which sequence feature to use and which Hugging Face datasets to read.
# Keep both dataset paths on the same T5 variant (large or xl).
sequence_type = "gap"
seq_ds_path = "hle2000/Mintaka_Updated_Sequences_T5-large-ssm"
res_csv_path = "hle2000/Mintaka_T5_large_ssm_outputs"

# Whether the answer candidate is highlighted in the input sequence.
is_special_tok_context = False

# Base model, plus optional fine-tuned weights. A truthy `model_weights`
# switches the notebook to evaluation / reranking only.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_weights = "/workspace/storage/misc/subgraphs_reranking_results/gap/T5-large-ssm/gap_hl_false_cutoff_50/outputs/checkpoint-best"

# Name used for output folders: <model>_<sequence_type>_highlight_<flag>.
model_save_name = "_".join(
    [
        model_name.rsplit("/", 1)[-1],
        sequence_type,
        "highlight",
        str(is_special_tok_context),
    ]
)

In [29]:
# Announce the mode first: fine-tuned weights mean evaluation only.
print("evaluating" if model_weights else "training from scratch")

# The tokenizer is built the same way in both modes: base tokenizer plus the
# two marker tokens registered as additional special tokens.
# NOTE(review): the model's embedding matrix is not resized here, so this
# presumably relies on both tokens already being in the vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["[unused1]", "[unused2]"]}
)

if model_weights:
    # Restore the fine-tuned classifier from the checkpoint directory.
    model = AutoModelForSequenceClassification.from_pretrained(model_weights)
else:
    # Start from the base checkpoint with a single-output head.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=1
    )
model = model.to(device)

evaluating


#### Building our dataset

In [39]:
# Fetch the sequence dataset and materialise its train/test splits as dataframes.
seq_dataset = load_dataset(seq_ds_path)
train_df, test_df = (
    seq_dataset[split_name].to_pandas() for split_name in ("train", "test")
)

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_Updated_Sequences_T5-large-ssm-245ee66053ade193/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
class SequenceDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text column of a dataframe on access.

    Args:
        dataframe: table holding the text column `seq_name` and a `correct`
            column that is used as the label.
        tokenizer: callable tokenizer; it is invoked with `truncation`,
            `padding`, `max_length` and `return_tensors="pt"`.
        seq_name: name of the column that holds the input sequence.
        max_length: length every sequence is padded / truncated to
            (defaults to 512, the value that used to be hard-coded).
    """

    def __init__(self, dataframe, tokenizer, seq_name, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.seq_name = seq_name
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenized sequence at position `idx` with its float label."""
        row = self.dataframe.iloc[idx]
        item = self.tokenizer(
            row[self.seq_name],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields tensors with a leading batch axis of size
        # one. Flatten every tensor field (not only input_ids/attention_mask)
        # so that extra fields such as token_type_ids collate correctly too.
        for key in list(item.keys()):
            if isinstance(item[key], torch.Tensor):
                item[key] = item[key].reshape(-1)
        item["labels"] = torch.tensor(
            row["correct"], dtype=torch.float
        )  # pylint: disable=no-member
        return item

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

In [41]:
# The sequence column is named "<highlighted|no_highlighted>_<type>_sequence".
if is_special_tok_context:
    hl_type = "highlighted"
else:
    hl_type = "no_highlighted"
seq_type = "_".join((hl_type, sequence_type, "sequence"))

# Wrap both splits with the same tokenizer and sequence column.
train_dataset, test_dataset = (
    SequenceDataset(split_df, tokenizer, seq_type)
    for split_df in (train_df, test_df)
)

#### Training

In [42]:
import numpy as np
import evaluate

# Scores strictly above this cut-off count as the positive class.
threshold = 0.5

# Regression-style metric, computed on the raw model scores.
metric_regression = evaluate.combine(["mae"])

# Classification metrics, computed on the thresholded scores.
metric_classifier = evaluate.combine(
    ["accuracy", "f1", "precision", "recall", "hyperml/balanced_accuracy"]
)


def compute_metrics(eval_pred):
    """Compute regression and classification metrics for one evaluation pass.

    `eval_pred` unpacks into (predictions, labels). The raw predictions are
    scored with `metric_regression`; they are then binarised with
    `predictions > threshold` and scored with `metric_classifier`. Both
    results are returned in a single dict.
    """
    raw_scores, references = eval_pred

    regression_scores = metric_regression.compute(
        predictions=raw_scores, references=references
    )
    classification_scores = metric_classifier.compute(
        predictions=raw_scores > threshold, references=references
    )

    return {**regression_scores, **classification_scores}

In [43]:
# Specify the arguments for the trainer
training_args = TrainingArguments(
    # --- where checkpoints are written ---
    output_dir=f"/workspace/storage/misc/subgraphs_reranking_results/new_sequences/t5-xl-ssm/{model_save_name}",
    # --- optimisation ---
    num_train_epochs=5,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    warmup_steps=500,  # learning-rate warm-up steps
    weight_decay=0.01,
    # --- logging / checkpointing / evaluation cadence ---
    logging_steps=500,
    save_steps=500,
    evaluation_strategy="steps",  # step-based evaluation; no eval_steps is set here
    # --- model selection: reload the checkpoint with the best balanced accuracy ---
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
    # --- experiment tracking ---
    report_to="wandb",
)

In [44]:
from torch.utils.data.sampler import WeightedRandomSampler
import numpy as np


class CustomTrainer(Trainer):
    """Trainer whose training sampler draws every class with equal probability."""

    def get_labels(self, dataset=None):
        """Return the integer label of every example in the training dataset.

        Args:
            dataset: dataset to read labels from; defaults to
                `self.train_dataset`.

        Returns:
            A list of ints, one per example, in dataset order.
        """
        dataset = self.train_dataset if dataset is None else dataset

        # Fast path: a dataset that exposes its backing dataframe with a
        # `correct` column (as SequenceDataset does) can provide the labels
        # directly, without tokenizing every example just to read one number.
        frame = getattr(dataset, "dataframe", None)
        if frame is not None and "correct" in getattr(frame, "columns", ()):
            return [int(value) for value in frame["correct"].tolist()]

        # Generic fallback: materialise each item and read its `labels` tensor.
        return [int(example["labels"].item()) for example in dataset]

    def _get_train_sampler(self, train_dataset=None) -> torch.utils.data.Sampler:
        """Build the class-balanced sampler used for training batches.

        `train_dataset` is optional so the override works whether or not the
        base Trainer passes the dataset explicitly.
        NOTE(review): confirm against the installed transformers version.
        """
        return self.create_sampler(self.get_labels(train_dataset))

    def create_sampler(self, target):
        """Create a WeightedRandomSampler with inverse class-frequency weights.

        Args:
            target: sequence of class labels, one per training example.

        Returns:
            A sampler that draws `len(target)` indices; each example is
            weighted by 1 / (number of examples sharing its label).

        Raises:
            ValueError: if `target` is empty.
        """
        target = np.asarray(target)
        if target.size == 0:
            raise ValueError("cannot build a sampler from an empty label list")

        # `class_index` maps each example to the position of its label in the
        # sorted unique labels, so the weight lookup stays correct even when
        # the labels are not exactly 0..K-1 (e.g. only class 1 is present).
        _, class_index, class_sample_count = np.unique(
            target, return_inverse=True, return_counts=True
        )
        weight = 1.0 / class_sample_count
        samples_weight = weight[class_index.reshape(-1)]

        samples_weight = torch.from_numpy(samples_weight)  # pylint: disable=no-member
        samples_weight = samples_weight.double()
        return WeightedRandomSampler(samples_weight, len(samples_weight))

In [45]:
# Instantiate the trainer: the model and its training arguments, the
# train / evaluation datasets, and the metrics callback run on each evaluation.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [46]:
# Fine-tune only when no pre-trained weights were configured; otherwise this
# cell is a no-op and the notebook goes straight to evaluation.
if not model_weights:
    trainer.train()

    # Persist model and tokenizer side by side in one checkpoint folder.
    checkpoint_best_path = (
        "/workspace/storage/misc/subgraphs_reranking_results/new_sequences/t5-xl-ssm/results/"
        + model_save_name
        + "/checkpoint-best"
    )
    model.save_pretrained(checkpoint_best_path)
    tokenizer.save_pretrained(checkpoint_best_path)

    print("Model dumped to ", checkpoint_best_path)
    print("\nFinal evaluation:\n\n", trainer.evaluate())

#### Evaluating

In [47]:
# Run the trainer's evaluation on the eval dataset; the returned dict holds
# the loss, the metrics from compute_metrics and runtime statistics.
evaluate_res = trainer.evaluate()
evaluate_res  # bare expression so the notebook displays the metrics

{'eval_loss': 0.14575406908988953,
 'eval_mae': 0.19223036955801073,
 'eval_accuracy': 0.8229367760617761,
 'eval_f1': 0.5437587439763718,
 'eval_precision': 0.4227701232777375,
 'eval_recall': 0.7617595818815331,
 'eval_balanced_accuracy': 0.7972663455626153,
 'eval_runtime': 130.9255,
 'eval_samples_per_second': 126.606,
 'eval_steps_per_second': 3.956}

#### Final Re-ranking

In [48]:
from datasets import load_dataset

# Load the test split of the seq2seq outputs (one row per question, with the
# target and the generated answer_* beams) as a dataframe.
res_csv = load_dataset(res_csv_path, verification_mode="no_checks")["test"].to_pandas()
res_csv.head()  # preview the first rows

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_T5_large_ssm_outputs-f31d696c971a731a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,target,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,...,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199,target_out_of_vocab,__index_level_0__
0,What man was a famous American author and also...,Mark Twain,Mark Twain,Mark Twain,Harriet Beecher Stowe,Charles Dickens,William Faulkner,Mark Twain,Harriet Beecher Stowe,H. G. Wells,...,Theodore Sturgeon,H. P. Lovecraft,Stephen Crane,Horatio Bottomley,William Faulkner,Mark Twain,Edgar Allan Poe.,Horatio Parker,False,0
1,How many Academy Awards has Jake Gyllenhaal be...,1,1,1,1,1,1,1,2,1,...,11,12,One,13,128,215,128,128,False,1
2,"Who is older, The Weeknd or Drake?",Drake,The Weeknd,Drake,Drake,The Weeknd,Drake,Drake,Drake,The Weeknd,...,DJ Khaled,TWiG,"Drake,",Weeknd,TWENTY,TWiT,Twice as old,"The Weeknd,",False,2
3,How many children did Donald Trump have?,5,2,3,2,3,2,3,2,4,...,7,9,4 children,11,8,10,12,13,False,3
4,Is the main hero in Final Fantasy IX named Kuja?,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,The Final Fantasy IX.,Is Kuja the Hero,The Answer Is No,Is Final Fantasy VIII,Yu Yu Hakuku,"Yep, yes",YYYY,The Final Fantasy VII Final Fantasy,False,4


In [49]:
# Sanity check: show which sequence column the reranker will read.
seq_type

'no_highlighted_gap_sequence'

In [50]:
# Re-rank the seq2seq beams with the sequence classifier and compare the result
# with the plain seq2seq top-1 / top-200 hit rates.
#   final_acc       - questions where the top-scored candidate is correct
#   top1_total      - questions with candidates whose first beam is the target
#   top200_total    - questions whose target appears anywhere in the beams
#   seq2seq_correct - questions without candidates whose first beam is the target
final_acc, top200_total, top1_total, seq2seq_correct = 0, 0, 0, 0

# Pick the beam columns by name. The previous positional slice `[2:-1]` also
# swept the non-answer column that follows the last beam into the beam set.
answer_cols = [col for col in res_csv.columns if str(col).startswith("answer_")]

# Group the candidate sequences once instead of scanning test_df per question;
# sort=False keeps the original row order inside every group.
candidates_by_question = {
    question: frame for question, frame in test_df.groupby("question", sort=False)
}

model.eval()  # inference only: make sure dropout is disabled
for _, row in tqdm(res_csv.iterrows(), total=len(res_csv)):
    all_beams = set(row[answer_cols].tolist())
    curr_question_df = candidates_by_question.get(row["question"])

    if curr_question_df is None or len(curr_question_df) == 0:
        # No subgraph for this question: take the answer from seq2seq.
        if row["answer_0"] == row["target"]:
            seq2seq_correct += 1
        elif row["target"] in all_beams:
            # The target is not the first beam but does appear among the beams.
            top200_total += 1
        continue

    # We have subgraphs for this question.
    if row["target"] not in all_beams:  # no correct answer in the beams
        continue

    # A correct answer exists somewhere in the beams.
    top1_total += 1 if row["answer_0"] == row["target"] else 0
    top200_total += 1

    # Reranking: score every candidate sequence and keep the best one.
    seqs = curr_question_df[seq_type].tolist()
    is_corrects = curr_question_df["correct"].astype(bool).tolist()

    tok_seq = tokenizer(
        seqs,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    # No gradients are needed for scoring; skipping autograd saves memory.
    with torch.no_grad():
        logits = model(
            input_ids=tok_seq["input_ids"].to(device),
            attention_mask=tok_seq["attention_mask"].to(device),
        ).logits
    max_idx = torch.flatten(logits).argmax(dim=0).item()

    if is_corrects[max_idx]:
        final_acc += 1

# Final reranking, top-1 and top-200 results (fractions of all questions).
reranking_res = (final_acc + seq2seq_correct) / len(res_csv)
top200 = (top200_total + seq2seq_correct) / len(res_csv)
top1 = (top1_total + seq2seq_correct) / len(res_csv)

0it [00:00, ?it/s]

4000it [01:20, 49.64it/s] 


In [51]:
# Report the seq2seq top-1 / top-200 hit rates next to the reranked top-1.
print(f"top1: {top1}, top200: {top200}, reranking top1: {reranking_res}")

top1: 0.25425, top200: 0.64375, reranking top1: 0.28425
