In [12]:
import torch
import sys

sys.path.append("/workspace/kbqa/")  # go to parent dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

#### Getting JSONL subgraphs dataset

In [14]:
dataset_type = "t5-xl-ssm"
train_bs = 16
eval_bs = 32
is_special_tok_context = True
model_weights = f"/workspace/storage/subgraphs_reranking_results/{dataset_type}/results/mse_subgraph_mpnet_ranking_T5XLSSMNQ"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_weights)
model = AutoModelForSequenceClassification.from_pretrained(model_weights).to(device)

In [16]:
def read_jsonl(path):
    jsonl_reader = jsonlines.open(path)
    jsonl_reader_list = list(jsonl_reader)
    df = []
    for line in tqdm(jsonl_reader_list):
        df.append(line)
    df = pd.DataFrame(df)
    return df


train_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_train_labeled.jsonl"
)
val_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_validation_labeled.jsonl"
)
test_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_test_labeled.jsonl"
)

100%|██████████| 94690/94690 [00:00<00:00, 1777051.84it/s]
100%|██████████| 13646/13646 [00:00<00:00, 1424547.57it/s]
100%|██████████| 27094/27094 [00:00<00:00, 1846971.66it/s]


#### Converting graph to its sequences

In [17]:
def get_node_names(
    subgraph,
    candidate_start_token="[unused1]",
    candidate_end_token="[unused2]",
):
    node_names = [subgraph.nodes[node]["label"] for node in subgraph.nodes()]
    node_type = [subgraph.nodes[node]["type"] for node in subgraph.nodes()]

    if "ANSWER_CANDIDATE_ENTITY" not in node_type:
        return None

    if is_special_tok_context:
        candidate_idx = node_type.index("ANSWER_CANDIDATE_ENTITY")
        node_names[
            candidate_idx
        ] = f"{candidate_start_token}{node_names[candidate_idx]}{candidate_end_token}"

    return node_names

In [18]:
def graph_to_sequence(subgraph, node_names):
    # getting adjency matrix and weight info
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()
    edge_data = subgraph.edges.data()

    # adding our edge info
    for edge in edge_data:
        i, j, data = edge
        i, j = int(i), int(j)
        adj_matrix[i][j] = data["label"]

    sequence = []
    # for adjency matrix, i, j means node i -> j
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]  # from node (node i)
        for j, edge_info in enumerate(row):
            to_node = node_names[j]
            if edge_info != 0:  # no endge from_node -> to_node
                sequence.extend([from_node, edge_info, to_node])

    sequence = ",".join(str(node) for node in sequence)
    return sequence

In [19]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s


def get_sequences(df):
    questions = list(df["question"])
    graphs = list(df["graph"])
    graph_seq = []
    for question, graph in tqdm(zip(questions, graphs)):
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))
        try:
            graph_node_names = get_node_names(graph_obj)
            if graph_node_names is None:
                curr_seq = "ERROR_NO_CANDIDATE"
            else:
                curr_seq = graph_to_sequence(graph_obj, graph_node_names)
                if is_special_tok_context:
                    curr_seq = f"{question}{tokenizer.sep_token}{curr_seq}"
        except KeyError:
            curr_seq = "ERROR_NO_LABEL"
        except nx.NetworkXError:
            curr_seq = "ERROR_EMPTY_GRAPH"
        graph_seq.append(curr_seq)

    return graph_seq

In [20]:
def preprocess_data(df):
    # get the sequences
    df_sequences = get_sequences(df)
    df["graph_sequence"] = df_sequences

    # filter out all invalid graphs
    error_df = df[
        (df["graph_sequence"] == "ERROR_EMPTY_GRAPH")
        | (df["graph_sequence"] == "ERROR_NO_LABEL")
        | (df["graph_sequence"] == "ERROR_NO_CANDIDATE")
    ]
    df = df.drop(error_df.index)

    # turn list of entities into string
    df["answerEntity"] = df["answerEntity"].apply(lambda x: ", ".join(x))
    df["questionEntity"] = df["questionEntity"].apply(lambda x: ", ".join(x))
    df["groundTruthAnswerEntity"] = df["groundTruthAnswerEntity"].apply(
        lambda x: ", ".join(x)
    )
    df["correct"] = df.apply(
        lambda x: x["answerEntity"] in x["groundTruthAnswerEntity"], axis=1
    )

    return df

In [21]:
# get train and test texts & labels
concat_train_df = pd.concat([train_df, val_df])

In [22]:
concat_train_df = preprocess_data(concat_train_df)

108336it [00:27, 3998.23it/s]


In [23]:
test_df = preprocess_data(test_df)

27094it [00:06, 4169.31it/s]


#### Final Re-ranking Based on Question Types

First, questions with subgraphs

In [24]:
test_res_csv = pd.read_csv(
    f"/workspace/storage/mintaka_seq2seq/{dataset_type}/test/results.csv"
)
question_types = test_df["complexityType"].unique().tolist()

In [56]:
def get_question_type_df_filtered(res_csv, jsonl_dataset, question_type):
    df_type = jsonl_dataset[jsonl_dataset["complexityType"] == question_type]
    # getting questions in result csv that exist in our jsonl
    res_filtered = []
    for index, row in res_csv.iterrows():
        question = row[0].strip()
        curr_ques_df = df_type[df_type["question"] == question]
        if len(curr_ques_df) > 0:
            res_filtered.append(row)
    res_filtered = pd.DataFrame(res_filtered)

    return res_filtered

In [57]:
def rerank_question_type(res_csv, jsonl_dataset):
    final_acc, top200_total, top1_total = 0, 0, 0

    for _, group in res_csv.iterrows():
        curr_question_df = jsonl_dataset[jsonl_dataset["question"] == group["question"]]
        all_beams = group.tolist()[2:-1]  # all 200 beams
        all_beams = list(set(all_beams))

        if group["target"] not in all_beams:  # no correct answer in beam
            continue

        top200_total += 1
        top1_total += 1 if group["answer_0"] in group["target"] else 0

        # reranking
        seqs = curr_question_df["graph_sequence"].tolist()
        is_corrects = curr_question_df["correct"].tolist()

        tok_seq = tokenizer(
            seqs,
            padding="max_length",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        mask = tok_seq["attention_mask"].to(device)
        input_id = tok_seq["input_ids"].squeeze(1).to(device)
        output = model(input_id, mask).logits
        output = torch.flatten(output)

        max_idx = output.argmax(dim=0).item()

        if is_corrects[max_idx] is True:
            final_acc += 1

    # final rerankinga, top1 and top200 result
    reranking_res = final_acc / len(res_csv)
    top200 = top200_total / len(res_csv)
    top1 = top1_total / len(res_csv)

    return top1, top200, reranking_res

In [27]:
res_dict = {}
for question_type in tqdm(question_types):
    res_csv = get_question_type_df_filtered(test_res_csv, test_df, question_type)
    top1, top200, reranking_res = rerank_question_type(res_csv, test_df)
    res_dict[
        question_type
    ] = f"top1: {top1}, top200: {top200}, reranking result: {reranking_res}"

4000it [00:01, 2112.28it/s]<?, ?it/s]
395it [00:26, 14.98it/s]
4000it [00:02, 1802.10it/s]<03:18, 28.31s/it]
380it [00:48,  7.90it/s]
4000it [00:01, 2694.56it/s]<04:07, 41.28s/it]
223it [00:18, 12.16it/s]
4000it [00:01, 2185.05it/s]<02:37, 31.52s/it]
306it [00:17, 17.90it/s]
4000it [00:01, 2287.41it/s]<01:46, 26.58s/it]
220it [00:12, 17.72it/s]
4000it [00:01, 2129.99it/s]<01:06, 22.17s/it]
349it [00:15, 22.79it/s]
4000it [00:01, 2147.68it/s]<00:40, 20.50s/it]
384it [00:22, 16.87it/s]
4000it [00:02, 1659.46it/s]<00:21, 21.87s/it]
554it [00:34, 16.22it/s]
100%|██████████| 8/8 [03:30<00:00, 26.33s/it]


In [30]:
res_dict

{'intersection': 'top1: 0.3822784810126582, top200: 0.6860759493670886, reranking result: 0.4379746835443038',
 'count': 'top1: 0.25263157894736843, top200: 0.9368421052631579, reranking result: 0.8921052631578947',
 'comparative': 'top1: 0.4977578475336323, top200: 0.9282511210762332, reranking result: 0.4798206278026906',
 'ordinal': 'top1: 0.21895424836601307, top200: 0.5816993464052288, reranking result: 0.28104575163398693',
 'multihop': 'top1: 0.20909090909090908, top200: 0.5409090909090909, reranking result: 0.2727272727272727',
 'difference': 'top1: 0.14613180515759314, top200: 0.4383954154727794, reranking result: 0.1977077363896848',
 'superlative': 'top1: 0.3046875, top200: 0.5729166666666666, reranking result: 0.3177083333333333',
 'generic': 'top1: 0.41696750902527074, top200: 0.6787003610108303, reranking result: 0.42057761732851984'}

In [31]:
with open("./reranking_types.txt", "w+") as file:
    for k, v in res_dict.items():
        file.write(f"{k}: {v}\n")

Questions without subgraphs

In [58]:
without_subgraphs = []
for idx, group in test_res_csv.iterrows():
    curr_question_df = test_df[test_df["question"] == group["question"]]
    if len(curr_question_df) == 0:
        without_subgraphs.append(group)

df_without_subgraphs = pd.DataFrame(without_subgraphs)
df_without_subgraphs.head()

Unnamed: 0,question,target,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,...,answer_191,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199,target_out_of_vocab
4,Is the main hero in Final Fantasy IX named Kuja?,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Is Final Fantasy IX,Is it a Final Fantasy,Does it include Kuja?,Is it,Is he Kuja,Is it No,Y Yes,Is Kuja,Is he called Kuja,False
5,Who performed at the Super Bowl XXIII halftime...,Elvis Presto,The Rolling Stones,Gladys Knight & the Pips,Phyllis Hyman,The Rolling Stones,Gladys Knight & The Pips,The Jackson 5,Carmen McRae,Gladys Knight and the Pips,...,The Three Degrees,Carmen McRae & The Time,"Phyllis Hyman, Madonna",Gladys Knight,Gloria Estefan & The Four Tops,Phyllis Hyman.,Grace Slick,Madonna,Carmen McRae and The Time,False
6,Did Free Guy come out in 2021?,Yes,Yes,Yes,Yes,Yes,No,Yes,No,Yes,...,Is it a Yes,I'm not sure,It's not too early to tell,Is it a No,Is Free Guy Coming out in 2021,Y Yes,"It's not gonna happen, yes",Is Free Guy Coming,YYYYY yes,False
8,When was the first Donkey Kong arcade game rel...,1981,1981,1981,1981,1981,1981,1981,1981,1981,...,1975,November 1981,1985,"January 8, 1981",1983,1980,1979,February 1981,January 1983,False
12,Did Avatar come out after Children of Men?,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,...,Is Avatar after Children of Men?,Is Avatar a sequel to Avatar?,Avatar No,YYYYYYYYYYY,Is Avatar after Children of Men,Avatar,Is Avatar After Children of Men?,Is Avatar a sequel,Is Avatar a sequel to Avatar,False


In [66]:
# make sure without subgraph + with subgraph = 4000
len(df_without_subgraphs) + len(test_df.groupby("question"))

4000

In [43]:
new_subgraphs_dataset = read_jsonl(
    "/workspace/storage/mintaka_t5xl_test_no_graph_with_sparql.jsonl"
)

100%|██████████| 90879/90879 [00:00<00:00, 2328605.88it/s]


In [59]:
question_types = new_subgraphs_dataset["complexityType"].unique().tolist()
res_dict = {}
for question_type in tqdm(question_types):
    res_csv = get_question_type_df_filtered(
        df_without_subgraphs, new_subgraphs_dataset, question_type
    )

    top1_total, top200_total = 0, 0
    for idx, group in tqdm(res_csv.iterrows()):
        if group["answer_0"] == group["target"]:
            top1_total += 1
            top200_total += 1
        else:  # check if answer exist in 200 beams for question with no subgraphs
            all_beams = group.tolist()[2:-1]  # all 200 beams
            all_beams = list(set(all_beams))
            top200_total += 1 if group["target"] in all_beams else 0

    top1 = top1_total / len(res_csv)
    top200 = top200_total / len(res_csv)
    res_dict[question_type] = f"top1: {top1}, top200: {top200}"

res_dict

  0%|          | 0/9 [00:00<?, ?it/s]

5it [00:00, 3681.80it/s]
20it [00:00, 13456.22it/s]0<00:06,  1.23it/s]
177it [00:00, 16160.03it/s]<00:06,  1.15it/s]
400it [00:00, 17782.27it/s]<00:04,  1.27it/s]
244it [00:00, 13255.92it/s]<00:03,  1.35it/s]
94it [00:00, 12923.74it/s]4<00:03,  1.03it/s]
180it [00:00, 14109.30it/s]<00:02,  1.08it/s]
51it [00:00, 14463.12it/s]6<00:01,  1.06it/s]
16it [00:00, 8929.99it/s]07<00:00,  1.12it/s]
100%|██████████| 9/9 [00:07<00:00,  1.13it/s]


{'intersection': 'top1: 0.2, top200: 0.4',
 'count': 'top1: 0.2, top200: 1.0',
 'comparative': 'top1: 0.5480225988700564, top200: 0.9887005649717514',
 'yesno': 'top1: 0.62, top200: 1.0',
 'generic': 'top1: 0.20491803278688525, top200: 0.5737704918032787',
 'ordinal': 'top1: 0.19148936170212766, top200: 0.5957446808510638',
 'multihop': 'top1: 0.07777777777777778, top200: 0.34444444444444444',
 'difference': 'top1: 0.1568627450980392, top200: 0.5098039215686274',
 'superlative': 'top1: 0.0, top200: 0.0'}

In [60]:
with open("./reranking_types_seq2seq_output.txt", "w+") as file:
    for k, v in res_dict.items():
        file.write(f"{k}: {v}\n")

#### Subgraphs Analysis

In [69]:
combined_subgraphs_dataset = pd.concat([concat_train_df, test_df])
combined_subgraphs_dataset

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,2723bb1b,Which actor was the star of Titanic and was bo...,Q100711983,"Q44578, Q65",Q38111,intersection,"{'directed': True, 'multigraph': False, 'graph...",Which actor was the star of Titanic and was bo...,False
1,a9011ddf,What is the seventh tallest mountain in North ...,Q130018,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...",What is the seventh tallest mountain in North ...,False
2,982450cf,Who is the youngest current US governor?,Q11673,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>Un...,False
3,982450cf,Who is the youngest current US governor?,Q30,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>[u...,False
4,982450cf,Who is the youngest current US governor?,Q132050,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>go...,False
...,...,...,...,...,...,...,...,...,...
27089,2761e54a,What is the oldest city building game develope...,Q249854,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27090,2761e54a,What is the oldest city building game develope...,Q5114201,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27091,2761e54a,What is the oldest city building game develope...,Q20796085,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27092,2761e54a,What is the oldest city building game develope...,Q30,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False


In [70]:
correct_subgraph_dataset = combined_subgraphs_dataset[
    combined_subgraphs_dataset["correct"] == True
]
wrong_subgraph_dataset = combined_subgraphs_dataset[
    combined_subgraphs_dataset["correct"] == False
]

In [71]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s


def get_graph_analysis(df):
    graphs = df["graph"].tolist()
    questions = df["question"].tolist()
    answers = df["answerEntity"].tolist()

    res = []
    for question, answer, graph in zip(questions, answers, graphs):
        curr_dict = {}
        curr_dict["question"] = question
        curr_dict["answer"] = answer
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))

        # number edges and nodes
        nodes_num = graph_obj.number_of_nodes()
        edges_num = graph_obj.number_of_edges()
        curr_dict["num_nodes"], curr_dict["num_edges"] = nodes_num, edges_num
        curr_dict["density"] = nx.density(
            graph_obj
        )  # edges_num / (nodes_num * (nodes_num - 1))

        # cycles
        cycle = nx.recursive_simple_cycles(graph_obj)
        curr_dict["num_recursive_simple_cycles"] = len(cycle)

        # bridges
        k_edge_comp = sorted(map(sorted, nx.k_edge_components(graph_obj, k=2)))
        curr_dict["num_bridges_connectivity"] = len(k_edge_comp)

        res.append(curr_dict)

    res = pd.DataFrame(res)
    return res

In [72]:
correct_analysis_df = get_graph_analysis(correct_subgraph_dataset)
correct_analysis_df.to_csv("./correct_subgraph_analysis.csv", index=False)
correct_analysis_df.head()

Unnamed: 0,question,answer,num_nodes,num_edges,density,num_recursive_simple_cycles,num_bridges_connectivity
0,Who is the youngest current US governor?,Q3105215,3,3,0.5,1,3
1,Which actor was the star of Titanic and was bo...,Q38111,3,2,0.333333,0,3
2,Which US president has had the most votes?,Q6279,2,3,1.5,2,2
3,Which river is longer than the Mississippi River?,Q3392,3,2,0.333333,0,3
4,What is the longest lake in the world?,Q5511,2,1,0.5,0,2


In [74]:
correct_analysis_df.mean(axis=0)

  correct_analysis_df.mean(axis=0)


num_nodes                      2.902530
num_edges                      3.228027
density                        0.649091
num_recursive_simple_cycles    1.063706
num_bridges_connectivity       2.880956
dtype: float64

In [75]:
wrong_analysis_df = get_graph_analysis(wrong_subgraph_dataset)
wrong_analysis_df.to_csv("./wrong_subgraph_analysis.csv", index=False)
wrong_analysis_df

Unnamed: 0,question,answer,num_nodes,num_edges,density,num_recursive_simple_cycles,num_bridges_connectivity
0,Which actor was the star of Titanic and was bo...,Q100711983,4,3,0.250000,0,4
1,What is the seventh tallest mountain in North ...,Q130018,2,2,1.000000,1,2
2,Who is the youngest current US governor?,Q11673,3,3,0.500000,1,3
3,Who is the youngest current US governor?,Q30,2,2,1.000000,1,2
4,Who is the youngest current US governor?,Q132050,2,1,0.500000,0,2
...,...,...,...,...,...,...,...
112747,What is the oldest city building game develope...,Q249854,7,9,0.214286,1,7
112748,What is the oldest city building game develope...,Q5114201,4,4,0.333333,1,4
112749,What is the oldest city building game develope...,Q20796085,4,4,0.333333,1,4
112750,What is the oldest city building game develope...,Q30,4,4,0.333333,1,4


In [76]:
wrong_analysis_df.mean(axis=0)

  wrong_analysis_df.mean(axis=0)


num_nodes                      3.323453
num_edges                      3.971424
density                        0.610797
num_recursive_simple_cycles    1.218550
num_bridges_connectivity       3.296456
dtype: float64