In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
import sys

sys.path.append("/workspace/kbqa/")  # go to parent dir

In [3]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm

from pathlib import Path

#### Getting JSONL subgraphs dataset

In [4]:
def read_jsonl(path):
    """Load a JSON Lines file into a DataFrame with one row per record.

    Args:
        path: location of the ``.jsonl`` file to read.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.
    """
    # The context manager closes the underlying file once all records have
    # been consumed; a bare ``jsonlines.open`` call leaves the handle open.
    with jsonlines.open(path) as jsonl_reader:
        records = list(tqdm(jsonl_reader))
    return pd.DataFrame(records)


train_df = read_jsonl(
    "/workspace/storage/new_subgraph_dataset/t5-large-ssm/MINTAKA/mintaka_train_labeled.jsonl"
)
val_df = read_jsonl(
    "/workspace/storage/new_subgraph_dataset/t5-large-ssm/MINTAKA/mintaka_validation_labeled.jsonl"
)
test_df = read_jsonl(
    "/workspace/storage/new_subgraph_dataset/t5-large-ssm/MINTAKA/mintaka_test_labeled.jsonl"
)

98033it [00:00, 1795672.20it/s]
14286it [00:00, 2040379.57it/s]
28325it [00:00, 2162701.12it/s]


#### Converting graph to its sequences

In [5]:
def get_node_names(subgraph):
    """Return the ``"label"`` attribute of every node, in ``subgraph.nodes()`` order.

    Raises:
        KeyError: if a node has no ``"label"`` attribute.
    """
    labels = []
    for node_id in subgraph.nodes():
        labels.append(subgraph.nodes[node_id]["label"])
    return labels

In [6]:
def graph_to_sequence(subgraph, node_names):
    """Serialise a labelled graph into a comma-separated string of triples.

    Every edge contributes ``from_label,edge_label,to_label``; triples are
    emitted in row-major order of the adjacency matrix.

    Args:
        subgraph: networkx graph whose edges carry a ``"label"`` attribute.
        node_names: node labels, ordered like ``subgraph.nodes()``.

    Returns:
        str: the joined triples (empty string when there are no edges).

    Raises:
        KeyError: if an edge has no ``"label"`` attribute.
        Any error raised by ``nx.adjacency_matrix`` is propagated unchanged.
    """
    # getting adjacency matrix and weight info
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()

    # Row/column k of the matrix belongs to the k-th node in the graph's own
    # node order, so translate node ids to positions explicitly instead of
    # assuming the ids are already the integers 0..n-1.
    position = {node: idx for idx, node in enumerate(subgraph.nodes())}

    # adding our edge info
    for source, target, data in subgraph.edges.data():
        adj_matrix[position[source]][position[target]] = data["label"]

    sequence = []
    # for adjacency matrix, i, j means node i -> j
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]  # from node (node i)
        for j, edge_info in enumerate(row):
            if edge_info == 0:  # no edge from_node -> to_node
                continue
            sequence.extend([from_node, edge_info, node_names[j]])
    return ",".join(str(item) for item in sequence)

In [7]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` itself when that fails.

    Args:
        s: usually the string form of a literal (e.g. a dict repr). Values
            that are not strings, such as an already-parsed dict, are
            returned unchanged.

    Returns:
        The evaluated literal, or ``s`` unchanged if it cannot be evaluated.
    """
    try:
        return literal_eval(s)
    # literal_eval reports malformed input with several exception types, not
    # only ValueError (e.g. SyntaxError for text that is not valid Python).
    except (ValueError, SyntaxError, TypeError):
        return s


def get_sequences(df):
    """Turn every entry of ``df["graph"]`` into its triple-sequence string.

    Each entry is parsed with ``try_literal_eval`` and loaded as a node-link
    graph. Graphs that cannot be serialised are replaced by a sentinel:
    ``"ERROR_NO_LABEL"`` on ``KeyError`` and ``"ERROR_EMPTY_GRAPH"`` on
    ``nx.NetworkXError``.

    Returns:
        list[str]: one sequence (or sentinel) per row, in row order.
    """
    sequences = []
    for raw_graph in tqdm(list(df["graph"])):
        parsed = nx.readwrite.json_graph.node_link_graph(try_literal_eval(raw_graph))
        try:
            serialised = graph_to_sequence(parsed, get_node_names(parsed))
        except KeyError:
            serialised = "ERROR_NO_LABEL"
        except nx.NetworkXError:
            serialised = "ERROR_EMPTY_GRAPH"
        sequences.append(serialised)
    return sequences

In [8]:
# Serialise every training subgraph; unusable graphs come back as sentinels.
train_df_sequences = get_sequences(train_df)
train_df["graph_sequence"] = train_df_sequences

# Drop the rows whose graph could not be serialised.
error_df = train_df[
    train_df["graph_sequence"].isin(["ERROR_EMPTY_GRAPH", "ERROR_NO_LABEL"])
]
train_df = train_df.drop(error_df.index)

# Collapse each entity column into one concatenated string.
for entity_col in ("answerEntity", "questionEntity", "groundTruthAnswerEntity"):
    train_df[entity_col] = train_df[entity_col].apply(lambda ids: "".join(ids))

# NOTE(review): after the joins above, `in` is a substring test on two strings,
# so e.g. "Q49" would also match inside "Q4917" -- confirm whether exact
# entity-id matching is intended.
train_df["correct"] = train_df.apply(
    lambda row: row["answerEntity"] in row["groundTruthAnswerEntity"], axis=1
)

100%|██████████| 98033/98033 [00:20<00:00, 4823.73it/s]


In [9]:
train_df.head()

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,a9011ddf,What is the seventh tallest mountain in North ...,Q194057,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Mount Rainier,continent,North America",False
1,a9011ddf,What is the seventh tallest mountain in North ...,Q5401,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,shares border with,Eurasia,Euras...",False
2,a9011ddf,What is the seventh tallest mountain in North ...,Q223,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,has part(s),Greenland,Greenland,...",False
3,a9011ddf,What is the seventh tallest mountain in North ...,Q1153188,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Canada,country,Canada,Canada,part of,North Ame...",True
4,a9011ddf,What is the seventh tallest mountain in North ...,Q14946340,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...","North America,category for people who died her...",False


In [10]:
# Serialise every validation subgraph; unusable graphs come back as sentinels.
val_df_sequences = get_sequences(val_df)
val_df["graph_sequence"] = val_df_sequences

# Drop the rows whose graph could not be serialised.
error_df = val_df[
    val_df["graph_sequence"].isin(["ERROR_EMPTY_GRAPH", "ERROR_NO_LABEL"])
]
val_df = val_df.drop(error_df.index)

# Collapse each entity column into one concatenated string.
for entity_col in ("answerEntity", "questionEntity", "groundTruthAnswerEntity"):
    val_df[entity_col] = val_df[entity_col].apply(lambda ids: "".join(ids))

# NOTE(review): after the joins above, `in` is a substring test on two strings,
# so e.g. "Q49" would also match inside "Q4917" -- confirm whether exact
# entity-id matching is intended.
val_df["correct"] = val_df.apply(
    lambda row: row["answerEntity"] in row["groundTruthAnswerEntity"], axis=1
)

  0%|          | 0/14286 [00:00<?, ?it/s]

100%|██████████| 14286/14286 [00:04<00:00, 3121.94it/s]


In [11]:
val_df.head()

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,9ace9041,What is the fourth book in the Twilight series?,Q189378,Q44523,Q53945,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Twilight,Twilight,part of...",False
1,9ace9041,What is the fourth book in the Twilight series?,Q19765983,Q44523,Q53945,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,romantic fiction",False
2,9ace9041,What is the fourth book in the Twilight series?,Q111019576,Q44523,Q53945,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,vampire fiction",False
3,9ace9041,What is the fourth book in the Twilight series?,Q849907,Q44523,Q53945,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),The Short Second Life of ...",False
4,9ace9041,What is the fourth book in the Twilight series?,Q53945,Q44523,Q53945,ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Breaking Dawn,Breaking Da...",True


In [12]:
# Serialise every test subgraph; unusable graphs come back as sentinels.
test_df_sequences = get_sequences(test_df)
test_df["graph_sequence"] = test_df_sequences

# Drop the rows whose graph could not be serialised.
error_df = test_df[
    test_df["graph_sequence"].isin(["ERROR_EMPTY_GRAPH", "ERROR_NO_LABEL"])
]
test_df = test_df.drop(error_df.index)

# Collapse each entity column into one concatenated string.
for entity_col in ("answerEntity", "questionEntity", "groundTruthAnswerEntity"):
    test_df[entity_col] = test_df[entity_col].apply(lambda ids: "".join(ids))

# NOTE(review): after the joins above, `in` is a substring test on two strings,
# so e.g. "Q49" would also match inside "Q4917" -- confirm whether exact
# entity-id matching is intended.
test_df["correct"] = test_df.apply(
    lambda row: row["answerEntity"] in row["groundTruthAnswerEntity"], axis=1
)

  0%|          | 0/28325 [00:00<?, ?it/s]

100%|██████████| 28325/28325 [00:06<00:00, 4670.36it/s]


In [13]:
test_df.head()

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,fae46b21,What man was a famous American author and also...,Q893594,Q1497Q846570,Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,country,United States...",False
1,fae46b21,What man was a famous American author and also...,Q102513,Q1497Q846570,Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,described by source,S...",False
2,fae46b21,What man was a famous American author and also...,Q7245,Q1497Q846570,Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","Mississippi River,described by source,Small Br...",True
3,fae46b21,What man was a famous American author and also...,Q34652890,Q1497Q846570,Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","United States of America,country,United States...",False
4,fae46b21,What man was a famous American author and also...,Q5686,Q1497Q846570,Q7245,intersection,"{'directed': True, 'multigraph': False, 'graph...","Mississippi River,described by source,Small Br...",False


#### Building our dataset

In [14]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

2023-07-20 10:06:41.238092: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-20 10:06:41.406913: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-20 10:06:41.905711: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-20 10:06:41.905789: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [None]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_save_name = "sentence-transformers_all-mpnet-base-v2_mse"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(
    device
)

In [16]:
concat_train_df = pd.concat([train_df, val_df])

In [32]:
train_texts = train_df["graph_sequence"].tolist()
train_labels = train_df["correct"].astype(int).tolist()  # convert true false to 1 0

test_texts = test_df["graph_sequence"].tolist()
test_labels = test_df["correct"].astype(int).tolist()  # convert true false to 1 0

In [33]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [34]:
class SequenceDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with float labels.

    Args:
        encodings: mapping from field name to a per-sample indexable
            collection (the tokenizer output is used here).
        labels: per-sample labels; the dataset length is ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # the label is stored as a float tensor
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


train_dataset = SequenceDataset(train_encodings, train_labels)
test_dataset = SequenceDataset(test_encodings, test_labels)

In [35]:
from torch.utils.data.sampler import WeightedRandomSampler
import numpy as np


def create_sampler(target):
    """Build a sampler that draws every class with equal total probability.

    Each sample is weighted by the inverse of its class frequency.

    Args:
        target: 1-D array-like of class labels, one per sample. Labels may be
            any values ``np.unique`` can handle; they need not be 0..k-1.

    Returns:
        WeightedRandomSampler drawing ``len(target)`` samples per epoch.
    """
    target = np.asarray(target).ravel()
    # ``inverse`` maps each sample to the position of its class in ``counts``,
    # so the lookup no longer depends on the label values themselves.
    _, inverse, counts = np.unique(target, return_inverse=True, return_counts=True)
    samples_weight = torch.from_numpy(1.0 / counts[inverse]).double()
    return WeightedRandomSampler(samples_weight, len(samples_weight))

#### Training

In [21]:
import numpy as np
import evaluate

threshold = 0.5
metric_classifier = evaluate.combine(["accuracy", "f1", "precision", "recall"])
metric_regression = evaluate.combine(["mae"])


def compute_metrics(eval_pred):
    """Compute regression and thresholded classification metrics.

    Args:
        eval_pred: a ``(predictions, labels)`` pair.

    Returns:
        dict holding the ``metric_regression`` results on the raw predictions
        plus the ``metric_classifier`` results on ``predictions > threshold``.
    """
    raw_scores, references = eval_pred
    metrics = metric_regression.compute(predictions=raw_scores, references=references)

    # binarise with the module-level threshold before the classification metrics
    binary_predictions = raw_scores > threshold
    classification = metric_classifier.compute(
        predictions=binary_predictions, references=references
    )
    metrics.update(classification)

    return metrics

In [37]:
# Hyper-parameters and bookkeeping settings handed to the trainer
training_args = TrainingArguments(
    output_dir=f"/workspace/storage/subgraphs_reranking_results/results/{model_save_name}",  # output directory
    num_train_epochs=5,  # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size per device during evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir=f"/workspace/storage/subgraphs_reranking_results/logs/{model_save_name}",  # directory for storing logs
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    metric_for_best_model="accuracy",  # "best" is judged by this metric instead of the default loss
    logging_steps=500,  # log every 500 steps
    save_steps=500,  # save a checkpoint every 500 steps
    evaluation_strategy="steps",  # evaluate on a step schedule (eval_steps is initialised from logging_steps)
    # report_to='wandb',
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
class CustomTrainer(Trainer):
    """Trainer whose training batches are drawn by a class-balancing sampler."""

    def get_train_dataloader(self) -> torch.utils.data.DataLoader:
        """Return a DataLoader over ``self.train_dataset`` with weighted sampling.

        The sampler weights come from the dataset's own ``labels`` and the
        batch size from the training arguments, so the loader stays in sync
        with what was passed to the trainer rather than with notebook globals.
        """
        # create_sampler compares the labels element-wise, so hand it an array
        labels = np.asarray(self.train_dataset.labels)
        train_sampler = create_sampler(labels)
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
        )

In [None]:
# Build the trainer from the model, arguments, datasets and metric callback.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so checkpoint selection is driven by test-set
# metrics; val_df is loaded earlier but is not used here -- confirm whether a
# validation dataset should be passed instead.
trainer = CustomTrainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)

In [None]:
# Train the model
trainer.train()

#### Evaluating

In [None]:
evaluate_res = trainer.evaluate()
evaluate_res

In [60]:
from sklearn.metrics import balanced_accuracy_score
import torch.nn.functional as F


def test_model(model, test_dataloader, threshold=0.5):
    """Evaluate ``model`` as a binary classifier over ``test_dataloader``.

    Args:
        model: sequence-classification model returning an object with
            ``.logits``.
        test_dataloader: loader yielding dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``.
        threshold: decision boundary applied to the raw score when the model
            has a single output; ignored for multi-logit heads.

    Returns:
        tuple: ``(accuracy, balanced_accuracy, y_true)``.
    """
    total_acc_val = 0
    y_true = []
    y_pred = []

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():
        for item in tqdm(test_dataloader):
            val_label = item["labels"].to(device)
            mask = item["attention_mask"].to(device)
            input_id = item["input_ids"].squeeze(1).to(device)

            logits = model(input_id, mask).logits
            if logits.shape[-1] == 1:
                # single-output head: argmax over one column is always 0,
                # so threshold the raw score instead
                preds = (logits.squeeze(-1) > threshold).long()
            else:
                preds = logits.argmax(dim=1)

            y_true.extend(val_label.tolist())
            y_pred.extend(preds.tolist())
            total_acc_val += (preds == val_label).sum().item()

    final_acc = total_acc_val / len(test_dataloader.dataset)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    return final_acc, balanced_acc, y_true


test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)
final_acc, balanced_acc, _ = test_model(model, test_loader)

100%|██████████| 885/885 [03:35<00:00,  4.10it/s]


In [61]:
final_acc, balanced_acc

(0.8241319628412984, 0.7600850420686618)

#### Final Re-ranking

In [78]:
combined_df = pd.concat([train_df, test_df])

In [82]:
res_csv = pd.read_csv(
    "/workspace/storage/mintaka_seq2seq/t5-large-ssm/results_filtered.csv"
)

In [83]:
res_csv

Unnamed: 0,question,target,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,...,answer_191,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199,target_out_of_vocab
0,What is the seventh tallest mountain in North ...,Mount Lucania,Mount McKinley,Mount McKinley,Mount St. Elias,Mount Rainier,Denali,Mount McKinley,Denali,Mount Rainier,...,Mount McKinlay,Ben Nevis,Mt. Whitney,Kangchenjunga Mountain,Mt. Massive,Mount Hood,Mt. Marcy,Cascade Peak,Mount McLoughlin,False
1,Which actor was the star of Titanic and was bo...,Leonardo DiCaprio,Leonardo DiCaprio,Leonardo DiCaprio,Leonardo DiCaprio,Leonardo DiCaprio,Meryl Streep,Matthew McConaughey,Leonardo Di Caprio,Robert Pattinson,...,Kevin Spacey,Kate Winslet.,Leonardo di Caprio,Robert Pattinson.,James Franco,Samuel L. Jackson.,Ryan Reynolds,Harrison Ford,Leonardo Di Caprio,False
2,Which actor starred in Vanilla Sky and was mar...,Tom Cruise,Tom Hanks,Tom Cruise,Tom Hanks,Tom Cruise,Tom Cruise,Tom Hanks,Tom Cruise,Tom Hanks,...,Tom Hanks.,Dustin Hoffman,Matt Damon,Will Smith,Harrison Ford,Tom Cruise.,James Franco,Russell Crowe,"Tom Cruise, Jr.",False
3,Who is the youngest current US governor?,Ron DeSantis,Bobby Jindal,Jon Corzine,Rick Perry,Jennifer Granholm,Bobby Jindal,Steve Beshear,Kay Ivey,Mike Pence,...,Jay Nixon,Scott Walker,Rick Perry,Chris Christie,Mike Pence.,Jennifer Granholm.,Gary Herbert,"Scott Walker, Jr.",Scott Walker,False
4,Which US president has had the most votes?,Joe Biden,Donald Trump,George W. Bush,Barack Obama,Donald Trump,John F. Kennedy,George Washington,Barack Obama,Theodore Roosevelt,...,John F Kennedy.,Franklin D. Roosevelt,John F Kennedy,George W Bush,Ronald Reagan,Bill Clinton,Barack Obama.,George Washington,Harry S Truman,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14101,Who was the first woman elected mayor of Cambr...,Barbara Ackermann,Laura Ingalls Wilder,Shirley Chisholm,Shirley Chisholm,Phyllis O'Brien,Priscilla Wentworth,Priscilla Levy,Jeannette Grasso,Priscilla O'Neill,...,Phyllis O'Hara Smith,Jeannette Platt,Jeannette Sullivant,Martha Coakley,Katherine O'Hara,Jeannette Grasser,Betty McFadden,Marjorie Merriam,"Katherine O'Hara, Jr.",False
14102,Who was the first woman mayor of San Francisco?,Dianne Feinstein,Jeannette Rankin,Jeannette Rankin,Jeannette Rankin,Libby Schaefer,Eleanor Roosevelt,Jeannette Rankin,Gloria Steinem,Eleanor Roosevelt,...,Elinor Ostrom,Yvonne Lee,Angela Davis,Angela Carter,Annette O'Toole,Rosa Parks,Gloria Allred,Rosemary Brown,Betty Williamson,False
14103,Where was the last Republican mayor of Boston ...,"Portland, Maine","Boston, Massachusetts","Boston, MA","Providence, RI","Boston, MA","Boston, Massachusetts","Northampton, Massachusetts","Boston, Mass.",New York City,...,"New Bedford, MA",North Carolina,"Boston, Massachusetts","Lowell, Mass.","Northampton, Maine","South Boston, MA.",Boston,New Hampshire,"Boston, MA",False
14104,Who was the first British monarch to have a pr...,George I,Queen Victoria,Queen Victoria,Queen Elizabeth I,George III,Queen Victoria,Queen Elizabeth I,George III,Queen Victoria,...,Elizabeth I,Queen Elizabeth I,William IV,Henry VIII,King George III,George III.,King George IV,Edward VI,Queen Anne,False


In [85]:
# Group by the column name itself (not a one-element list) so that iterating
# the groups yields plain question strings as keys rather than 1-tuples.
res_csv_grouped = res_csv.groupby("question")

In [97]:
# Re-rank: for every question, pick the candidate whose subgraph sequence gets
# the highest score among those the model predicts as correct, and count the
# questions for which that candidate really is correct.
final_acc, total = 0, 0
model.eval()  # disable dropout so the scores are deterministic
for name, group in tqdm(res_csv_grouped):
    total += 1
    # Grouping by a one-element list makes newer pandas yield 1-tuples as keys.
    question = name[0] if isinstance(name, tuple) else name
    curr_question_df = combined_df[combined_df["question"] == question]
    all_beams = set(group.iloc[0, 3:-1].values.tolist())  # distinct beam answers

    if group["target"].values[0] not in all_beams:  # no correct answer in beam
        curr_question_df = curr_question_df[~curr_question_df["correct"].astype(bool)]

    # reranking
    seqs = curr_question_df["graph_sequence"]
    is_corrects = curr_question_df["correct"]

    curr_max = 0
    best_pred_answer = None

    with torch.no_grad():  # inference only, no autograd graph needed
        for seq, is_correct in zip(seqs, is_corrects):
            # a batch of one needs no padding; only truncate over-long inputs
            tok_seq = tokenizer(
                seq,
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            mask = tok_seq["attention_mask"].to(device)
            input_id = tok_seq["input_ids"].to(device)
            logits = model(input_id, mask).logits

            if logits.shape[-1] == 1:
                # single-output head: the raw score is thresholded, because
                # argmax over one column is always 0
                score = logits.squeeze(-1).item()
                predicted_correct = score > threshold
            else:
                score = F.softmax(logits, dim=1)[0][1].item()
                predicted_correct = logits.argmax(dim=1).item() == 1

            # get the highest predicted correct sequence/beam answer
            if predicted_correct and score > curr_max:
                curr_max = score
                best_pred_answer = is_correct

    if best_pred_answer is True:
        final_acc += 1

100%|██████████| 14106/14106 [31:10<00:00,  7.54it/s] 


In [99]:
reranking_res = final_acc / total

0.4411597901602155

In [4]:
# saving the final result to txt file
results_path = Path(
    f"/workspace/storage/subgraphs_reranking_results/results/{model_save_name}/final_results.txt"
)
results_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the folder exists

# NOTE(review): `final_acc` is reassigned as the counter of the re-ranking
# loop, so by this point it no longer holds the accuracy returned by
# test_model -- confirm which value the "Accuracy" line should report.
with open(results_path, "w") as f:
    f.write(f"Accuracy: {final_acc}, Balanced Accuracy: {balanced_acc}\n")
    f.write(f"Final reranking accuracy: {reranking_res}\n")
    f.write("trainer.evaluate() result:\n")
    # one "key: value" pair per line; previously all pairs ran together
    for k, v in evaluate_res.items():
        f.write(f"{k}: {v}\n")