In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
import sys

sys.path.append("/workspace/kbqa/")  # go to parent dir

In [3]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

2023-07-26 12:54:29.517983: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-26 12:54:29.684632: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-26 12:54:30.256929: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-26 12:54:30.257006: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

#### Getting JSONL subgraphs dataset

In [4]:
dataset_type = 't5-large-ssm'
new_test_dataset = False 
train_bs = 16
eval_bs = 32
is_special_tok_context = False
model_weights = None #f"/workspace/storage/subgraphs_reranking_results/{dataset_type}/results/mse_subgraph_mpnet_ranking_T5LargeSSM"
model_name = "roberta-large"
model_save_name = f"{model_name}_mse_token_context"

In [5]:
if model_weights: # evaluating
    tokenizer = AutoTokenizer.from_pretrained(model_weights)
    model = AutoModelForSequenceClassification.from_pretrained(model_weights).to(
        device
    )
else: # training from scratch
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ["[unused1]", "[unused2]"]}
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(
        device
    )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [6]:
def read_jsonl(path):
    jsonl_reader = jsonlines.open(path)
    jsonl_reader_list = list(jsonl_reader)
    df = []
    for line in tqdm(jsonl_reader_list):
        df.append(line)
    df = pd.DataFrame(df)
    return df


train_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_train_labeled.jsonl"
)
val_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_validation_labeled.jsonl"
)
test_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_test_labeled.jsonl"
)

100%|██████████| 98033/98033 [00:00<00:00, 2507333.96it/s]
100%|██████████| 14286/14286 [00:00<00:00, 2281095.89it/s]
100%|██████████| 28325/28325 [00:00<00:00, 2292666.03it/s]


#### Converting graph to its sequences

In [7]:
def get_node_names(subgraph, candidate_start_token="[unused1]", candidate_end_token="[unused2]",):
    node_names = [subgraph.nodes[node]["label"] for node in subgraph.nodes()]
    node_type = [subgraph.nodes[node]["type"] for node in subgraph.nodes()]
    
    if 'ANSWER_CANDIDATE_ENTITY' not in node_type:
        return None

    if is_special_tok_context:
        candidate_idx = node_type.index("ANSWER_CANDIDATE_ENTITY")
        node_names[
            candidate_idx
        ] = f"{candidate_start_token}{node_names[candidate_idx]}{candidate_end_token}"
    
    return node_names

In [8]:
def graph_to_sequence(subgraph, node_names):
    # getting adjency matrix and weight info
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()
    edge_data = subgraph.edges.data()

    # adding our edge info
    for edge in edge_data:
        i, j, data = edge
        i, j = int(i), int(j)
        adj_matrix[i][j] = data["label"]

    sequence = []
    # for adjency matrix, i, j means node i -> j
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]  # from node (node i)
        for j, edge_info in enumerate(row):
            to_node = node_names[j]
            if edge_info != 0:  # no endge from_node -> to_node
                sequence.extend([from_node, edge_info, to_node])
    
    sequence = ",".join(str(node) for node in sequence)
    return sequence

In [9]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s


def get_sequences(df):
    questions = list(df["question"])
    graphs = list(df["graph"])
    graph_seq = []
    for question, graph in tqdm(zip(questions, graphs)):
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))
        try:
            graph_node_names = get_node_names(graph_obj)
            if graph_node_names is None:
                curr_seq = "ERROR_NO_CANDIDATE"
            else:     
                curr_seq = graph_to_sequence(graph_obj, graph_node_names)
                if is_special_tok_context:
                    curr_seq = f"{question}{tokenizer.sep_token}{curr_seq}"
        except KeyError:
            curr_seq = "ERROR_NO_LABEL"
        except nx.NetworkXError:
            curr_seq = "ERROR_EMPTY_GRAPH"
        graph_seq.append(curr_seq)
    
    return graph_seq

In [10]:
def preprocess_data(df):
    # get the sequences
    df_sequences = get_sequences(df)
    df["graph_sequence"] = df_sequences
    
    # filter out all invalid graphs
    error_df = df[
        (df["graph_sequence"] == "ERROR_EMPTY_GRAPH")
        | (df["graph_sequence"] == "ERROR_NO_LABEL")
        | (df["graph_sequence"] == "ERROR_NO_CANDIDATE")
    ]
    df = df.drop(error_df.index)
    
    # turn list of entities into string
    df["answerEntity"] = df["answerEntity"].apply(lambda x: ", ".join(x))
    df["questionEntity"] = df["questionEntity"].apply(lambda x: ", ".join(x))
    df["groundTruthAnswerEntity"] = df["groundTruthAnswerEntity"].apply(
        lambda x: ", ".join(x)
    )
    df["correct"] = df.apply(
        lambda x: x["answerEntity"] in x["groundTruthAnswerEntity"], axis=1
    )
    
    return df

In [11]:
# get train and test texts & labels
concat_train_df = pd.concat([train_df, val_df])

In [12]:
concat_train_df = preprocess_data(concat_train_df)
concat_train_df.head()

0it [00:00, ?it/s]

112319it [00:25, 4399.01it/s]


Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,a9011ddf,What is the seventh tallest mountain in North ...,Q194057,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Mount Rainier,continent,North America",False
1,a9011ddf,What is the seventh tallest mountain in North ...,Q5401,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,shares border with,Eurasia,Euras...",False
2,a9011ddf,What is the seventh tallest mountain in North ...,Q223,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,has part(s),Greenland,Greenland,...",False
3,a9011ddf,What is the seventh tallest mountain in North ...,Q1153188,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Canada,country,Canada,Canada,part of,North Ame...",True
4,a9011ddf,What is the seventh tallest mountain in North ...,Q14946340,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,category for people who died her...",False


In [13]:
test_df = preprocess_data(test_df)
test_df.head()

28325it [00:06, 4676.07it/s]


Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,fae46b21,What man was a famous American author and also...,Q893594,"Q1497, Q846570",Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,country,United States...",False
1,fae46b21,What man was a famous American author and also...,Q102513,"Q1497, Q846570",Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,described by source,S...",False
2,fae46b21,What man was a famous American author and also...,Q7245,"Q1497, Q846570",Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","Mississippi River,described by source,Small Br...",True
3,fae46b21,What man was a famous American author and also...,Q34652890,"Q1497, Q846570",Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,country,United States...",False
4,fae46b21,What man was a famous American author and also...,Q5686,"Q1497, Q846570",Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","Mississippi River,described by source,Small Br...",False


#### Building our dataset

In [14]:
train_texts = concat_train_df["graph_sequence"].tolist()
train_labels = concat_train_df["correct"].astype(int).tolist()  # convert true false to 1 0

test_texts = test_df["graph_sequence"].tolist()
test_labels = test_df["correct"].astype(int).tolist()  # convert true false to 1 0

In [15]:
# get our encodings
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [16]:
class SequenceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = SequenceDataset(train_encodings, train_labels)
test_dataset = SequenceDataset(test_encodings, test_labels)

#### Training

In [17]:
import numpy as np
import evaluate

threshold = 0.5
metric_classifier = evaluate.combine(["accuracy", "f1", "precision", "recall", "hyperml/balanced_accuracy",])
metric_regression = evaluate.combine(["mae"])


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    results = metric_regression.compute(predictions=predictions, references=labels)

    predictions = predictions > threshold
    results.update(
        metric_classifier.compute(predictions=predictions, references=labels)
    )

    return results

In [18]:
# Specifiy the arguments for the trainer
training_args = TrainingArguments(
    output_dir=f"/workspace/storage/subgraphs_reranking_results/t5-xl-ssm/results/{model_save_name}",  # output directory
    num_train_epochs=5,  # total number of training epochs
    per_device_train_batch_size=train_bs,  # batch size per device during training
    per_device_eval_batch_size=eval_bs,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir=f"/workspace/storage/subgraphs_reranking_results/t5-xl-ssm/logs/{model_save_name}",  # directory for storing logs
    load_best_model_at_end=True,  # load the best model when finished training (default metric is loss)
    metric_for_best_model="balanced_accuracy",  # select the base metrics
    logging_steps=500,  # log & save weights each logging_steps
    save_steps=500,
    evaluation_strategy="steps",  # evaluate each `logging_steps`
    #report_to='wandb',
)

In [19]:
from torch.utils.data.sampler import WeightedRandomSampler
import numpy as np

def create_sampler(target):
    class_sample_count = np.array(
        [len(np.where(target == t)[0]) for t in np.unique(target)]
    )
    weight = 1.0 / class_sample_count
    samples_weight = np.array([weight[t] for t in target])

    samples_weight = torch.from_numpy(samples_weight)
    samples_weigth = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    return sampler

In [20]:
class CustomTrainer(Trainer):    
    def get_train_dataloader(self) -> torch.utils.data.DataLoader:
        train_sampler = create_sampler(concat_train_df["correct"].astype(int).ravel())
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=train_bs, sampler=train_sampler
        )
        return train_loader

In [21]:
# Call the Trainer
trainer = CustomTrainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Train the model
if not model_weights: # training
    trainer.train()

#### Evaluating

In [23]:
evaluate_res = trainer.evaluate()
evaluate_res

***** Running Evaluation *****
  Num examples = 28325
  Batch size = 32


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mhle2000[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 0.12268020957708359,
 'eval_mae': 0.18467992317326046,
 'eval_accuracy': 0.823936451897617,
 'eval_f1': 0.599855572494584,
 'eval_precision': 0.45036144578313253,
 'eval_recall': 0.8979101609416287,
 'eval_balanced_accuracy': 0.8545506437519996,
 'eval_runtime': 218.3198,
 'eval_samples_per_second': 129.741,
 'eval_steps_per_second': 4.058}

#### Final Re-ranking

In [24]:
if not new_test_dataset:
    res_csv = pd.read_csv(
        f"/workspace/storage/mintaka_seq2seq/{dataset_type}/test/results_filtered.csv"
    )
    final_acc, top200_total, top1_total = 0, 0, 0
    
    for idx, group in tqdm(res_csv.iterrows()):
        curr_question_df = test_df[test_df["question"] == group['question']]
        all_beams = group.tolist()[2:-1] # all 200 beams
        all_beams = list(set(all_beams))
        
        if group["target"] not in all_beams:  # no correct answer in beam
            continue

        top200_total += 1
        top1_total += 1 if group["answer_0"] in group["target"] else 0
        
        # reranking
        seqs = curr_question_df["graph_sequence"].tolist()
        is_corrects = curr_question_df["correct"].tolist()
        
        tok_seq = tokenizer(seqs, padding="max_length",
                            max_length=512,
                            truncation=True,
                            return_tensors="pt",
                            )
        mask = tok_seq["attention_mask"].to(device)
        input_id = tok_seq["input_ids"].squeeze(1).to(device)
        output = model(input_id, mask).logits
        output = torch.flatten(output)

        max_idx = output.argmax(dim=0).item()

        if is_corrects[max_idx] is True:
            final_acc += 1
    
    # final rerankinga, top1 and top200 result
    reranking_res = final_acc / len(res_csv)
    top200 = top200_total/len(res_csv)
    top1 = top1_total / len(res_csv)


2811it [02:31, 18.51it/s]


In [26]:
if new_test_dataset: #new_test_dataset: # reranking for new test dataset format
    test_dataset_path = '/workspace/storage/new_subgraph_dataset/t5-xl-ssm/mintaka_test_labeled_new.jsonl'
    new_test_df = read_jsonl(test_dataset_path)
    new_test_df = preprocess_data(new_test_df)
    new_test_df_group = new_test_df.groupby('question')
    final_acc, top200_total = 0, 0

    for question, row in new_test_df_group:
        answers = row['answerEntity'].tolist()
        ground_truth = row['groundTruthAnswerEntity'].tolist()[0]
        if ground_truth in answers:
            top200_total += 1
        # reranking
        seqs = row["graph_sequence"].tolist()
        is_corrects = row["correct"].tolist()
        
        tok_seq = tokenizer(seqs, padding="max_length",
                            max_length=512,
                            truncation=True,
                            return_tensors="pt",
                            )
        mask = tok_seq["attention_mask"].to(device)
        input_id = tok_seq["input_ids"].squeeze(1).to(device)
        output = model(input_id, mask).logits
        output = torch.flatten(output)

        max_idx = output.argmax(dim=0).item()

        if is_corrects[max_idx] is True:
            final_acc += 1   
    
    # final rerankinga, top1 and top200 result
    reranking_res = final_acc / len(new_test_df_group)
    top200 = top200_total/len(new_test_df_group)
    top1 = 'Not available for new test dataset'

In [28]:
# saving the final result to txt file
with open(
    f"/workspace/storage/subgraphs_reranking_results/{dataset_type}/results/{model_save_name}/final_results.txt",
    "w+",
) as f:
    f.write(f'original top1: {top1}, top200: {top200}\n')
    f.write(f"Final reranking accuracy: {reranking_res}\n")
    f.write("\n")
    f.write("trainer.evaluate() result:\n")
    for k, v in evaluate_res.items():
        f.write(f"{k}:{v}\n")