In [1]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression: shown as the cell output

device(type='cuda')

In [2]:
import sys

# Add the hard-coded project directory to the module search path.
# NOTE(review): no visible cell imports a project-local module — confirm this is still needed.
sys.path.append("/workspace/kbqa/")

In [3]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datasets import load_dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

2024-02-29 10:47:49.014306: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-29 10:47:49.174566: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-29 10:47:49.754688: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-02-29 10:47:49.754778: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

#### Getting JSONL subgraphs dataset

In [4]:
# --- Run configuration -------------------------------------------------------
# Which seq2seq model's outputs are used; selects the subgraph dataset name when
# processing from scratch and the directory of the final results file.
dataset_type = "t5-xl-ssm"
new_test_dataset = False  # NOTE(review): not read by any visible cell
train_bs = 32  # per-device training batch size
eval_bs = 32  # per-device evaluation batch size
# True -> feed the "highlighted_updated_sequence" column to the model,
# False -> feed the "no_highlighted_updated_sequence" column.
is_special_tok_context = False
# True -> rebuild the sequences from the subgraph dataset and YAML files,
# False -> load the already prepared sequences dataset.
proccess_data_from_scratch = False
# Truthy checkpoint path -> evaluation only; empty/None -> train `model_name` from scratch.
model_weights = "/workspace/storage/misc/subgraphs_reranking_results/new_sequences/t5-xl-ssm/all-mpnet-base-v2_wrapped_updated_seqs_no_highlight/checkpoint-13500"
# Base model; the tokenizer is always loaded from this name.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Directory name used for the training output and the final results file.
model_save_name = f"{model_name.split('/')[-1]}_wrapped_updated_seqs_highlight_{is_special_tok_context}"

In [5]:
# The tokenizer is prepared identically in both modes: the base tokenizer plus
# the two candidate-marker tokens registered as additional special tokens.
print("evaluating" if model_weights else "training from scratch")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"additional_special_tokens": ["[unused1]", "[unused2]"]})

if model_weights:
    # evaluating: restore the fine-tuned classifier from the checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_weights)
else:
    # training from scratch: base weights with a single-output head
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=1
    )
model = model.to(device)

evaluating


In [6]:
import yaml


def add_new_seqs(path, df):
    """Attach the predicted sequences stored in a YAML file to ``df``.

    The YAML document is expected to hold a ``data`` list whose items each
    carry a ``predicted`` entry; those values are written, in file order, to
    a new ``updated_sequence`` column.

    Args:
        path: location of the YAML file.
        df: dataframe to extend; it is modified in place.

    Returns:
        The same ``df`` object, with the ``updated_sequence`` column added.

    Raises:
        ValueError: if the file is not valid YAML, or if the number of
            sequences differs from the number of rows in ``df``.
    """
    with open(path, "r") as stream:
        try:
            new_seqs = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Previously the error was only printed and the function then
            # failed on the unbound `new_seqs`; fail with the real cause.
            raise ValueError(f"could not parse YAML file {path!r}") from exc

    updated_seqs = [curr_seq["predicted"] for curr_seq in new_seqs["data"]]
    if len(updated_seqs) != len(df):
        raise ValueError(
            f"{path!r} holds {len(updated_seqs)} sequences "
            f"but the dataframe has {len(df)} rows"
        )

    df["updated_sequence"] = updated_seqs
    return df

#### Converting graph to its sequences

In [7]:
def get_node_names(
    subgraph,
    candidate_start_token="[unused1]",
    candidate_end_token="[unused2]",
    highlight=False,
):
    """Collect the node labels of ``subgraph`` in node iteration order.

    Args:
        subgraph: graph whose nodes carry ``label`` and ``type`` attributes.
        candidate_start_token: text placed before the candidate label.
        candidate_end_token: text placed after the candidate label.
        highlight: when True, wrap the label of the first node typed
            ``ANSWER_CANDIDATE_ENTITY`` in the two tokens.

    Returns:
        The list of labels, or ``None`` when no node is typed
        ``ANSWER_CANDIDATE_ENTITY``.
    """
    attributes = subgraph.nodes

    labels = []
    for node_id in subgraph.nodes():
        labels.append(attributes[node_id]["label"])

    kinds = []
    for node_id in subgraph.nodes():
        kinds.append(attributes[node_id]["type"])

    # list.index raises ValueError exactly when the candidate type is absent
    try:
        answer_pos = kinds.index("ANSWER_CANDIDATE_ENTITY")
    except ValueError:
        return None

    if not highlight:
        return labels

    labels[answer_pos] = f"{candidate_start_token}{labels[answer_pos]}{candidate_end_token}"
    return labels

In [8]:
def graph_to_sequence(subgraph, node_names):
    """Linearise ``subgraph`` into a comma-separated string of triples.

    Every non-zero cell ``(i, j)`` of the adjacency matrix contributes
    ``node_names[i], <cell value>, node_names[j]``. Cells that correspond to an
    edge returned by ``subgraph.edges.data()`` hold that edge's ``label``.
    Cells are visited row by row.

    Args:
        subgraph: networkx graph whose edges carry a ``label`` attribute.
        node_names: one display name per node, in ``subgraph.nodes()`` order.

    Returns:
        The triples joined with ``","`` (an empty string when no cell is
        non-zero).

    Raises:
        KeyError: if an edge has no ``label`` attribute.
    """
    # Rows and columns follow the order of subgraph.nodes().
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()

    # Map each node id to its matrix position rather than assuming the ids
    # are the integers 0..n-1 in order (the previous int(i)/int(j) casts).
    position = {node: idx for idx, node in enumerate(subgraph.nodes())}

    # Replace the numeric weight of every listed edge with its label.
    for source, target, data in subgraph.edges.data():
        adj_matrix[position[source]][position[target]] = data["label"]

    sequence = []
    # In the adjacency matrix, cell (i, j) describes node i -> node j.
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]
        for j, edge_info in enumerate(row):
            to_node = node_names[j]
            if edge_info != 0:  # non-zero cell: there is an edge from_node -> to_node
                sequence.extend([from_node, edge_info, to_node])

    return ",".join(str(part) for part in sequence)

In [9]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` unchanged on failure.

    Args:
        s: usually the string form of a dict/list; values that are already
            parsed objects are handed back as they are.

    Returns:
        The evaluated literal, or ``s`` itself when it is not a valid literal.
    """
    try:
        return literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval reports non-literal input with any of these, not only
        # ValueError (e.g. free text that does not parse raises SyntaxError).
        return s


def get_sequences(df):
    """Build the graph-derived model inputs for every row of ``df``.

    For each row the ``graph`` value is parsed into a networkx graph and
    linearised twice, with and without the answer candidate wrapped in marker
    tokens. Each result is prefixed with the question and the tokenizer's
    separator token.

    Args:
        df: dataframe with ``question`` and ``graph`` columns.

    Returns:
        ``(no_highlight_sequences, highlight_sequences)`` — note the order.
        Both lists have one entry per row; a row gets ``None`` in both when the
        graph has no answer candidate node, lacks an expected attribute
        (KeyError), or networkx rejects it (NetworkXError).
    """
    questions = list(df["question"])
    graphs = list(df["graph"])
    hl_graph_seq, no_hl_graph_seq = [], []

    for question, graph in tqdm(zip(questions, graphs), total=len(questions)):
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))
        hl_seq, no_hl_seq = None, None
        try:
            hl_graph_node_names = get_node_names(graph_obj, highlight=True)
            no_hl_graph_node_names = get_node_names(graph_obj, highlight=False)

            # get_node_names returns None when there is no answer candidate;
            # passing that on would raise an uncaught TypeError.
            if hl_graph_node_names is not None and no_hl_graph_node_names is not None:
                hl_body = graph_to_sequence(graph_obj, hl_graph_node_names)
                no_hl_body = graph_to_sequence(graph_obj, no_hl_graph_node_names)

                hl_seq = f"{question}{tokenizer.sep_token}{hl_body}"
                no_hl_seq = f"{question}{tokenizer.sep_token}{no_hl_body}"
        except (KeyError, nx.NetworkXError):
            hl_seq, no_hl_seq = None, None

        hl_graph_seq.append(hl_seq)
        no_hl_graph_seq.append(no_hl_seq)

    return no_hl_graph_seq, hl_graph_seq

In [10]:
def find_label(graph, wd_id):
    """Return the ``label`` of the first node whose ``name_`` equals ``wd_id``.

    When no node matches, a ``"cannot find label for ..."`` placeholder string
    is returned instead of raising.
    """
    matching_labels = (
        graph.nodes[node_id]["label"]
        for node_id in graph.nodes
        if graph.nodes[node_id]["name_"] == wd_id
    )
    return next(matching_labels, f"cannot find label for {wd_id}")

In [11]:
def preproc_updated_sequences(
    df, candidate_start_token="[unused1]", candidate_end_token="[unused2]"
):
    """Build model inputs from the ``updated_sequence`` column.

    For each row the label of the answer entity is looked up in the row's
    graph. The first occurrence of that label in the updated sequence is
    wrapped in the candidate tokens for the highlighted variant. Both variants
    are prefixed with the question and the tokenizer's separator token.

    Args:
        df: dataframe with ``graph``, ``updated_sequence``, ``answerEntity``
            and ``question`` columns.
        candidate_start_token: text inserted before the answer label.
        candidate_end_token: text inserted after the answer label.

    Returns:
        ``(no_highlight_sequences, highlight_sequences)``, one entry per row.
        A row gets ``None`` in both lists when the label does not occur in the
        sequence or the row cannot be processed.
    """
    no_hl_seqs, hl_seqs = [], []
    for _, group in tqdm(df.iterrows(), total=len(df)):
        graph_obj = nx.readwrite.json_graph.node_link_graph(
            try_literal_eval(group["graph"])
        )
        updated_seq = group["updated_sequence"]

        hl_seq, no_hl_seq = None, None
        try:
            ans_ent_label = find_label(graph_obj, group["answerEntity"])
            # partition keeps everything after the first occurrence; the old
            # split(...)[1] dropped the text following a second occurrence.
            before, found, after = updated_seq.partition(ans_ent_label)
            if found:
                highlighted = (
                    f"{before.strip()} "
                    f"{candidate_start_token}{ans_ent_label}{candidate_end_token} "
                    f"{after.strip()}"
                )
                hl_seq = f"{group['question']}{tokenizer.sep_token}{highlighted}"
                no_hl_seq = f"{group['question']}{tokenizer.sep_token}{updated_seq}"
        except Exception:  # best effort per row; still lets KeyboardInterrupt through
            hl_seq, no_hl_seq = None, None

        no_hl_seqs.append(no_hl_seq)
        hl_seqs.append(hl_seq)
    return no_hl_seqs, hl_seqs

In [12]:
def preprocess_data(df):
    """Add the four sequence columns to ``df`` and drop unusable rows.

    Steps:
      1. keep only rows whose graph text mentions ``ANSWER_CANDIDATE_ENTITY``;
      2. add ``highlighted_updated_sequence`` / ``no_highlighted_updated_sequence``
         (built from the ``updated_sequence`` column);
      3. add ``highlighted_sequence`` / ``no_highlighted_sequence``
         (built from the graph itself);
      4. drop rows where any of the four columns is missing.

    Args:
        df: input dataframe; it is not modified.

    Returns:
        A new dataframe containing the surviving rows and the added columns.
    """
    # Filter all graphs without ANSWER_CANDIDATE_ENTITY. Work on an explicit
    # copy so the column assignments below never target a slice of the input.
    has_candidate = df["graph"].apply(lambda x: "ANSWER_CANDIDATE_ENTITY" in str(x))
    df = df[has_candidate].copy()

    print("getting updated sequences")
    no_hl_seqs, hl_seqs = preproc_updated_sequences(df)
    df["highlighted_updated_sequence"] = hl_seqs
    df["no_highlighted_updated_sequence"] = no_hl_seqs

    # get the deterministic (graph-derived) sequences
    print("getting sequences")
    no_hl_graph_seq, hl_graph_seq = get_sequences(df)
    df["highlighted_sequence"] = hl_graph_seq
    df["no_highlighted_sequence"] = no_hl_graph_seq

    # filter out all invalid graphs
    print("filtering")
    df = df.dropna(
        subset=[
            "highlighted_updated_sequence",
            "no_highlighted_updated_sequence",
            "highlighted_sequence",
            "no_highlighted_sequence",
        ]
    )

    return df

In [13]:
from datasets import Dataset, DatasetDict


def upload_dataset(train, test, path="hle2000/Mintaka_Sequences_T5_xl_ssm"):
    """Push the train/test dataframes to the Hugging Face Hub as one dataset.

    Args:
        train: dataframe stored as the ``train`` split.
        test: dataframe stored as the ``test`` split.
        path: Hub repository id to push to.

    NOTE(review): the loading cell of this notebook reads
    "hle2000/Mintaka_Sequences_T5-xl-ssm" (hyphens), while this default uses
    underscores — confirm which repository id is intended.
    """
    splits = {
        "train": Dataset.from_pandas(train),
        "test": Dataset.from_pandas(test),
    }
    DatasetDict(splits).push_to_hub(path)

In [14]:
if not proccess_data_from_scratch:
    # Default path: reuse the sequences dataset that was prepared earlier.
    seq_dataset = load_dataset("hle2000/Mintaka_Sequences_T5-xl-ssm")
    train_df = seq_dataset["train"].to_pandas()
    test_df = seq_dataset["test"].to_pandas()
else:
    # Rebuild everything from the raw subgraph dataset for the chosen model.
    if dataset_type == "t5-xl-ssm":
        path = "Mintaka_Subgraphs_T5_xl_ssm"
    else:
        path = "Mintaka_Subgraphs_T5_large_ssm"

    subgraphs_dataset = load_dataset(f"hle2000/{path}")
    train_df = subgraphs_dataset["train"].to_pandas()
    test_df = subgraphs_dataset["test"].to_pandas()

    # Attach the sequences stored in the YAML result files.
    train_df = add_new_seqs(
        "/workspace/storage/misc/train_results_mintaka_xl.yaml", train_df
    )
    test_df = add_new_seqs("/workspace/storage/misc/test_results_mintaka.yaml", test_df)

    # Derive the model input columns and drop unusable rows.
    train_df = preprocess_data(train_df)
    test_df = preprocess_data(test_df)

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_Sequences_T5-xl-ssm-4753c3af9c5ad7ad/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

#### Building our dataset

In [15]:
class SequenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one dataframe row per item.

    Args:
        dataframe: frame holding the text column and a ``correct`` column.
        tokenizer: callable tokenizer returning a mapping of tensors.
        seq_name: name of the column that holds the text to tokenise.
        max_length: length every item is padded/truncated to (default 512,
            the value that used to be hard-coded).
    """

    def __init__(self, dataframe, tokenizer, seq_name, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.seq_name = seq_name
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenised row ``idx`` with a float ``labels`` entry."""
        row = self.dataframe.iloc[idx]
        item = self.tokenizer(
            row[self.seq_name],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer is asked for "pt" tensors for a single text; flatten
        # every tensor it returns (not only input_ids / attention_mask) so
        # that no field keeps an extra leading dimension when batched.
        for key in list(item.keys()):
            if torch.is_tensor(item[key]):
                item[key] = item[key].reshape(-1)
        item["labels"] = torch.tensor(
            row["correct"], dtype=torch.float
        )  # pylint: disable=no-member
        return item

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

In [16]:
# Column fed to the model: the variant with the answer candidate wrapped in
# special tokens when `is_special_tok_context` is set, the plain one otherwise.
seq_type = (
    "highlighted_updated_sequence"
    if is_special_tok_context
    else "no_highlighted_updated_sequence"
)
# Both splits read the same column and share the tokenizer.
train_dataset = SequenceDataset(train_df, tokenizer, seq_type)
test_dataset = SequenceDataset(test_df, tokenizer, seq_type)

#### Training

In [17]:
import numpy as np
import evaluate

# Cut-off applied to the raw predictions in compute_metrics to obtain boolean predictions.
threshold = 0.5
# Metrics computed on the thresholded predictions.
metric_classifier = evaluate.combine(
    [
        "accuracy",
        "f1",
        "precision",
        "recall",
        "hyperml/balanced_accuracy",
    ]
)
# Metric computed on the raw (un-thresholded) predictions.
metric_regression = evaluate.combine(["mae"])


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``.

    Returns:
        A dict holding the regression metric computed on the raw predictions,
        extended with the classification metrics computed after cutting the
        predictions at ``threshold``.
    """
    raw_scores, references = eval_pred

    # regression view: raw outputs against the labels
    report = metric_regression.compute(predictions=raw_scores, references=references)

    # classification view: outputs above the threshold count as positive
    is_positive = raw_scores > threshold
    classification_report = metric_classifier.compute(
        predictions=is_positive, references=references
    )
    report.update(classification_report)

    return report

In [19]:
# Specify the arguments for the trainer.
# The output directory is derived from `dataset_type` so that it matches the
# directory the final results file is written to (it used to hard-code
# "t5-xl-ssm", which is the same path for the current configuration).
training_args = TrainingArguments(
    output_dir=f"/workspace/storage/misc/subgraphs_reranking_results/new_sequences/{dataset_type}/{model_save_name}",  # output directory
    num_train_epochs=5,  # total number of training epochs
    per_device_train_batch_size=train_bs,  # batch size per device during training
    per_device_eval_batch_size=eval_bs,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="balanced_accuracy",  # metric that decides which checkpoint is "best"
    logging_steps=500,  # log every 500 steps
    save_steps=500,  # save a checkpoint every 500 steps
    evaluation_strategy="steps",  # evaluate on a step schedule rather than per epoch
    report_to="wandb",
)

In [20]:
from torch.utils.data.sampler import WeightedRandomSampler
import numpy as np


class CustomTrainer(Trainer):
    """Trainer that draws training samples with a class-balancing sampler."""

    def get_labels(self, dataset=None):
        """Return the integer label of every training sample, in dataset order.

        Args:
            dataset: dataset to read; defaults to ``self.train_dataset``.

        When the dataset exposes a ``dataframe`` with a ``correct`` column (as
        ``SequenceDataset`` does, which builds its labels from that column),
        the labels are read from it directly. Otherwise every item is
        materialised and its ``labels`` tensor is read, which tokenises the
        whole dataset.
        """
        source = self.train_dataset if dataset is None else dataset

        frame = getattr(source, "dataframe", None)
        if frame is not None and "correct" in frame:
            return [int(value) for value in frame["correct"]]

        return [int(sample["labels"].cpu().detach().numpy()) for sample in source]

    def _get_train_sampler(self, train_dataset=None) -> torch.utils.data.Sampler:
        """Create the weighted sampler used for the training dataloader.

        ``train_dataset`` is optional so the override works whether or not the
        base class passes the dataset explicitly.
        NOTE(review): newer transformers releases appear to pass it — confirm
        against the installed version.
        """
        labels = self.get_labels(train_dataset)
        return self.create_sampler(labels)

    def create_sampler(self, target):
        """Build a sampler giving every class the same total sampling weight.

        Args:
            target: one label per training sample.

        Returns:
            A ``WeightedRandomSampler`` over ``len(target)`` draws, where each
            sample's weight is the inverse of its class frequency.
        """
        # `inverse` maps each sample to the position of its class in the
        # sorted unique labels, so the lookup below is valid for any label
        # values, not only for labels that happen to be 0..K-1.
        _, inverse, counts = np.unique(
            np.asarray(target), return_inverse=True, return_counts=True
        )
        class_weight = 1.0 / counts
        samples_weight = class_weight[inverse.reshape(-1)]

        samples_weight = torch.from_numpy(samples_weight)  # pylint: disable=no-member
        samples_weight = samples_weight.double()
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        return sampler

In [21]:
# Build the trainer. It is created in both modes: for training, and for
# running trainer.evaluate() on a loaded checkpoint.
trainer = CustomTrainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)

In [22]:
# Train the model (skipped when a checkpoint was given in `model_weights`).
if not model_weights:  # training
    trainer.train()
    
    # NOTE(review): this path has a "results/" segment that the trainer's
    # output_dir does not have — confirm the intended location.
    checkpoint_best_path = (
        f"/workspace/storage/misc/subgraphs_reranking_results/new_sequences/t5-xl-ssm/results/{model_save_name}/checkpoint-best"
    )
    # Save the model and tokenizer together under the checkpoint path.
    model.save_pretrained(checkpoint_best_path)
    tokenizer.save_pretrained(checkpoint_best_path)

    print("Model dumped to ", checkpoint_best_path)
    print("\nFinal evaluation:\n\n", trainer.evaluate())

#### Evaluating

In [23]:
# Evaluate on the test split; the metrics dict is kept because the final cell writes it to the results file.
evaluate_res = trainer.evaluate()
evaluate_res

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mhle2000[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 0.12478556483983994,
 'eval_mae': 0.14876305486096023,
 'eval_accuracy': 0.8579425784281122,
 'eval_f1': 0.5668413047152273,
 'eval_precision': 0.4930902675683623,
 'eval_recall': 0.6665341812400636,
 'eval_balanced_accuracy': 0.7777473173365074,
 'eval_runtime': 142.7337,
 'eval_samples_per_second': 126.403,
 'eval_steps_per_second': 3.951}

#### Final Re-ranking

In [24]:
from datasets import load_dataset

# Load the test split of the seq2seq outputs as a dataframe: per the preview
# below, one row per question with the target and the answer_0..answer_199 beams.
res_csv = load_dataset(
    f"hle2000/Mintaka_T5_xl_ssm_outputs", verification_mode="no_checks"
)["test"].to_pandas()
res_csv.head()

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_T5_xl_ssm_outputs-9a78025ce7d9a549/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,target,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,...,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199,target_out_of_vocab,__index_level_0__
0,What man was a famous American author and also...,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,...,Louisa May Alcott,Ambrose Bierce,Ishmael Lehman,"Mark Twain, Natchez, Missouri","Mark Twain, Louisa",Ishmael Levy,Ishmael Beam,"Mark Twain, Natchez, Mississippi",False,0
1,How many Academy Awards has Jake Gyllenhaal be...,1,3,2,3,2,3,2,3,2,...,13,12,8,11,10,6,9,13,False,1
2,"Who is older, The Weeknd or Drake?",Drake,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,...,The Weeknd (2017),The Weeknd's oldest,The Weeknd is older than Drake,The Weeknd's,Dierks Bentley,"The Weeknd""",Drake & The Weeknd,The Weeknd's age,False,2
3,How many children did Donald Trump have?,5,5,5,3,5,3,5,3,5,...,24,6,76,13,61,108,0,8,False,3
4,Is the main hero in Final Fantasy IX named Kuja?,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Is it a Final Fantasy,Does it include Kuja?,Is it,Is he Kuja,Is it No,Y Yes,Is Kuja,Is he called Kuja,False,4


In [25]:
# Counters:
#   final_acc       - questions with subgraphs where the re-ranker's top pick is correct
#   top200_total    - questions whose target appears among the beams (see branches below)
#   top1_total      - questions with subgraphs where the first beam is already correct
#   seq2seq_correct - questions without subgraphs where the first beam is correct
final_acc, top200_total, top1_total, seq2seq_correct = 0, 0, 0, 0

# Select the beam columns by name so that bookkeeping columns (for example
# target_out_of_vocab) never end up in the candidate set. Fall back to the
# former positional slice if the columns are named differently.
answer_columns = [
    col for col in res_csv.columns if str(col).startswith("answer_")
] or list(res_csv.columns[2:-1])

model.eval()  # inference only: make sure dropout is disabled

for idx, group in tqdm(res_csv.iterrows(), total=len(res_csv)):
    curr_question_df = test_df[test_df["question"] == group["question"]]
    all_beams = set(group[answer_columns].tolist())  # all beams of this question
    target_in_beams = group["target"] in all_beams

    if len(curr_question_df) == 0:
        # we don't have a subgraph for this question, take the answer from seq2seq
        if group["answer_0"] == group["target"]:
            seq2seq_correct += 1
        elif target_in_beams:
            # the answer exists among the beams of a question with no subgraphs
            top200_total += 1
        continue

    # we have subgraphs for this question
    if not target_in_beams:  # no correct answer in the beams
        continue

    # the correct answer exists in the beams
    top1_total += 1 if group["answer_0"] == group["target"] else 0
    top200_total += 1

    # reranking: score every candidate sequence of this question
    seqs = curr_question_df[seq_type].tolist()
    is_corrects = curr_question_df["correct"].astype(bool).tolist()

    tok_seq = tokenizer(
        seqs,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    # No gradients are needed for scoring; skipping them avoids building the
    # autograd graph for every question.
    with torch.no_grad():
        output = model(
            input_ids=tok_seq["input_ids"].to(device),
            attention_mask=tok_seq["attention_mask"].to(device),
        ).logits
    output = torch.flatten(output)

    # the candidate with the highest score is the re-ranked answer
    max_idx = output.argmax(dim=0).item()

    if is_corrects[max_idx]:
        final_acc += 1

# final reranking, top1 and top200 results (fractions of all questions)
reranking_res = (final_acc + seq2seq_correct) / len(res_csv)
top200 = (top200_total + seq2seq_correct) / len(res_csv)
top1 = (top1_total + seq2seq_correct) / len(res_csv)

0it [00:00, ?it/s]

4000it [01:37, 40.98it/s] 


In [26]:
# Summary: seq2seq top-1, target-within-beams rate, and top-1 after re-ranking.
print(f"top1: {top1}, top200: {top200}, reranking top1: {reranking_res}")

top1: 0.31725, top200: 0.69025, reranking top1: 0.35225


In [30]:
# Save the final results to a text file next to the training outputs.
results_path = Path(
    f"/workspace/storage/misc/subgraphs_reranking_results/new_sequences/{dataset_type}/{model_save_name}/final_results_seq2seq.txt"
)
# The directory may not exist yet (e.g. evaluation-only runs); create it
# instead of failing after the whole evaluation has finished.
results_path.parent.mkdir(parents=True, exist_ok=True)

with open(results_path, "w") as f:
    f.write(f"original top1: {top1}, top200: {top200}\n")
    f.write(f"Final reranking accuracy: {reranking_res}\n")
    f.write("\n")
    f.write("trainer.evaluate() result:\n")
    for k, v in evaluate_res.items():
        f.write(f"{k}:{v}\n")

wandb: Network error (ConnectionError), entering retry loop.
