In [1]:
import torch
import sys

sys.path.append("/workspace/kbqa/")  # go to parent dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn.functional as F

2023-07-28 10:24:31.971706: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-28 10:24:32.191971: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-28 10:24:32.774054: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-28 10:24:32.774132: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

#### Getting JSONL subgraphs dataset

In [3]:
dataset_type = "t5-xl-ssm"
train_bs = 16
eval_bs = 32
is_special_tok_context = True
model_weights = f"/workspace/storage/subgraphs_reranking_results/{dataset_type}/results/mse_subgraph_mpnet_ranking_T5XLSSMNQ"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_weights)
model = AutoModelForSequenceClassification.from_pretrained(model_weights).to(device)

In [5]:
def read_jsonl(path):
    jsonl_reader = jsonlines.open(path)
    jsonl_reader_list = list(jsonl_reader)
    df = []
    for line in tqdm(jsonl_reader_list):
        df.append(line)
    df = pd.DataFrame(df)
    return df


train_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_train_labeled.jsonl"
)
val_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_validation_labeled.jsonl"
)
test_df = read_jsonl(
    f"/workspace/storage/new_subgraph_dataset/{dataset_type}/mintaka_test_labeled.jsonl"
)

100%|██████████| 94690/94690 [00:00<00:00, 2550384.63it/s]
100%|██████████| 13646/13646 [00:00<00:00, 2185310.69it/s]
100%|██████████| 27094/27094 [00:00<00:00, 2442410.43it/s]


#### Converting graph to its sequences

In [6]:
def get_node_names(
    subgraph,
    candidate_start_token="[unused1]",
    candidate_end_token="[unused2]",
):
    node_names = [subgraph.nodes[node]["label"] for node in subgraph.nodes()]
    node_type = [subgraph.nodes[node]["type"] for node in subgraph.nodes()]

    if "ANSWER_CANDIDATE_ENTITY" not in node_type:
        return None

    if is_special_tok_context:
        candidate_idx = node_type.index("ANSWER_CANDIDATE_ENTITY")
        node_names[
            candidate_idx
        ] = f"{candidate_start_token}{node_names[candidate_idx]}{candidate_end_token}"

    return node_names

In [7]:
def graph_to_sequence(subgraph, node_names):
    # getting adjency matrix and weight info
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()
    edge_data = subgraph.edges.data()

    # adding our edge info
    for edge in edge_data:
        i, j, data = edge
        i, j = int(i), int(j)
        adj_matrix[i][j] = data["label"]

    sequence = []
    # for adjency matrix, i, j means node i -> j
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]  # from node (node i)
        for j, edge_info in enumerate(row):
            to_node = node_names[j]
            if edge_info != 0:  # no endge from_node -> to_node
                sequence.extend([from_node, edge_info, to_node])

    sequence = ",".join(str(node) for node in sequence)
    return sequence

In [8]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s


def get_sequences(df):
    questions = list(df["question"])
    graphs = list(df["graph"])
    graph_seq = []
    for question, graph in tqdm(zip(questions, graphs)):
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))
        try:
            graph_node_names = get_node_names(graph_obj)
            if graph_node_names is None:
                curr_seq = "ERROR_NO_CANDIDATE"
            else:
                curr_seq = graph_to_sequence(graph_obj, graph_node_names)
                if is_special_tok_context:
                    curr_seq = f"{question}{tokenizer.sep_token}{curr_seq}"
        except KeyError:
            curr_seq = "ERROR_NO_LABEL"
        except nx.NetworkXError:
            curr_seq = "ERROR_EMPTY_GRAPH"
        graph_seq.append(curr_seq)

    return graph_seq

In [9]:
def preprocess_data(df):
    # get the sequences
    df_sequences = get_sequences(df)
    df["graph_sequence"] = df_sequences

    # filter out all invalid graphs
    error_df = df[
        (df["graph_sequence"] == "ERROR_EMPTY_GRAPH")
        | (df["graph_sequence"] == "ERROR_NO_LABEL")
        | (df["graph_sequence"] == "ERROR_NO_CANDIDATE")
    ]
    df = df.drop(error_df.index)

    # turn list of entities into string
    df["answerEntity"] = df["answerEntity"].apply(lambda x: ", ".join(x))
    df["questionEntity"] = df["questionEntity"].apply(lambda x: ", ".join(x))
    df["groundTruthAnswerEntity"] = df["groundTruthAnswerEntity"].apply(
        lambda x: ", ".join(x)
    )
    df["correct"] = df.apply(
        lambda x: x["answerEntity"] in x["groundTruthAnswerEntity"], axis=1
    )

    return df

In [10]:
# get train and test texts & labels
concat_train_df = pd.concat([train_df, val_df])

In [11]:
concat_train_df = preprocess_data(concat_train_df)

0it [00:00, ?it/s]

108336it [00:23, 4627.40it/s]


In [12]:
test_df = preprocess_data(test_df)

27094it [00:06, 4081.64it/s]


#### Final Re-ranking Based on Question Types

In [13]:
def get_question_type_df_filtered(res_csv, jsonl_dataset, question_type):
    df_type = jsonl_dataset[jsonl_dataset["complexityType"] == question_type]
    # getting questions in result csv that exist in our jsonl
    res_filtered = []
    for index, row in res_csv.iterrows():
        question = row[0].strip()
        curr_ques_df = df_type[df_type["question"] == question]
        if len(curr_ques_df) > 0:
            res_filtered.append(row)
    res_filtered = pd.DataFrame(res_filtered)

    return res_filtered

In [14]:
dataset_with_complexity = read_jsonl('/workspace/storage/mintaka_t5xl_test_no_graph_with_sparql.jsonl')
question_types = dataset_with_complexity['complexityType'].unique().tolist()
test_res_csv = pd.read_csv(
    f"/workspace/storage/mintaka_seq2seq/{dataset_type}/test/results.csv"
)


100%|██████████| 90879/90879 [00:00<00:00, 2879045.84it/s]


In [19]:
result_dict = {}
for curr_type in tqdm(question_types):
    df_type = dataset_with_complexity[dataset_with_complexity['complexityType']==curr_type]
    grouped_df_type = df_type.groupby('question')
    
    top1, top200, reranking_res, seq2seq_correct = 0, 0, 0, 0
    type_len = 0
    
    for question, group in grouped_df_type:
        curr_question_subgraphs = test_df[test_df['question']==question] # subgraphs of ques
        curr_question_seq2seq = test_res_csv[test_res_csv['question']==question] # seq2seq outputs
        
        # if question doesnt exist test results.csv -> not in test set
        if len(curr_question_seq2seq) == 0:
            continue
        
        type_len += 1
        target = curr_question_seq2seq['target'].values[0]
        all_beams = curr_question_seq2seq.iloc[:,2:].values.flatten().tolist()
       
        if len(curr_question_subgraphs) == 0: # no subgraphs
            if all_beams[0] == target: # first answer of seq2seq is correct
                seq2seq_correct += 1
            elif target in all_beams:
                top200 += 1
        else:  # have subgraphs -> rerank
            if target not in all_beams: # no need to rerank
                continue
                
            top1 += 1 if all_beams[0] == target else 0
            top200 += 1
            
            seqs = curr_question_subgraphs["graph_sequence"].tolist()
            is_corrects = curr_question_subgraphs["correct"].tolist()

            tok_seq = tokenizer(seqs, padding="max_length",
                                max_length=512,
                                truncation=True,
                                return_tensors="pt",
                                )
            mask = tok_seq["attention_mask"].to(device)
            input_id = tok_seq["input_ids"].squeeze(1).to(device)
            output = model(input_id, mask).logits
            output = torch.flatten(output)

            max_idx = output.argmax(dim=0).item()

            if is_corrects[max_idx] is True:
                reranking_res += 1
    
    reranking_res += seq2seq_correct
    top200 += seq2seq_correct
    top1 += seq2seq_correct
    
    result_dict[curr_type] = [top1, reranking_res, top200, type_len]

100%|██████████| 9/9 [02:42<00:00, 18.04s/it]


In [20]:
result_dict

{'intersection': [143, 174, 273, 400],
 'count': [99, 343, 376, 400],
 'comparative': [202, 204, 382, 400],
 'yesno': [248, 248, 400, 400],
 'generic': [268, 283, 516, 797],
 'ordinal': [83, 104, 234, 400],
 'multihop': [57, 74, 181, 400],
 'difference': [57, 77, 179, 400],
 'superlative': [112, 122, 220, 400]}

In [25]:
acc, total = 0, 0
for k, v in result_dict.items():
    acc += v[1]
    total += v[-1]
acc/total

0.40755566675006255

In [28]:
with open("./reranking_types_all_output.txt", "w+") as file:
    for k, v in result_dict.items():
        top1_acc = v[0]/v[-1]
        reranking_acc = v[1]/v[-1]
        top200_acc = v[2]/v[-1]
        
        file.write(f"{k}: top 1: {top1_acc} ({v[0]} of {v[-1]}), reranked resul: {reranking_acc} ({v[1]} of {v[-1]}), top200: {top200_acc}  ({v[2]} of {v[-1]})\n")

#### Subgraphs Analysis

In [69]:
combined_subgraphs_dataset = pd.concat([concat_train_df, test_df])
combined_subgraphs_dataset

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,2723bb1b,Which actor was the star of Titanic and was bo...,Q100711983,"Q44578, Q65",Q38111,intersection,"{'directed': True, 'multigraph': False, 'graph...",Which actor was the star of Titanic and was bo...,False
1,a9011ddf,What is the seventh tallest mountain in North ...,Q130018,Q49,Q1153188,ordinal,"{'directed': True, 'multigraph': False, 'graph...",What is the seventh tallest mountain in North ...,False
2,982450cf,Who is the youngest current US governor?,Q11673,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>Un...,False
3,982450cf,Who is the youngest current US governor?,Q30,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>[u...,False
4,982450cf,Who is the youngest current US governor?,Q132050,Q889821,Q3105215,superlative,"{'directed': True, 'multigraph': False, 'graph...",Who is the youngest current US governor?</s>go...,False
...,...,...,...,...,...,...,...,...,...
27089,2761e54a,What is the oldest city building game develope...,Q249854,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27090,2761e54a,What is the oldest city building game develope...,Q5114201,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27091,2761e54a,What is the oldest city building game develope...,Q20796085,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False
27092,2761e54a,What is the oldest city building game develope...,Q30,"Q588289, Q2299192",Q1025416,superlative,"{'directed': True, 'multigraph': False, 'graph...",What is the oldest city building game develope...,False


In [70]:
correct_subgraph_dataset = combined_subgraphs_dataset[
    combined_subgraphs_dataset["correct"] == True
]
wrong_subgraph_dataset = combined_subgraphs_dataset[
    combined_subgraphs_dataset["correct"] == False
]

In [71]:
from ast import literal_eval
from unidecode import unidecode


def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s


def get_graph_analysis(df):
    graphs = df["graph"].tolist()
    questions = df["question"].tolist()
    answers = df["answerEntity"].tolist()

    res = []
    for question, answer, graph in zip(questions, answers, graphs):
        curr_dict = {}
        curr_dict["question"] = question
        curr_dict["answer"] = answer
        graph_obj = nx.readwrite.json_graph.node_link_graph(try_literal_eval(graph))

        # number edges and nodes
        nodes_num = graph_obj.number_of_nodes()
        edges_num = graph_obj.number_of_edges()
        curr_dict["num_nodes"], curr_dict["num_edges"] = nodes_num, edges_num
        curr_dict["density"] = nx.density(
            graph_obj
        )  # edges_num / (nodes_num * (nodes_num - 1))

        # cycles
        cycle = nx.recursive_simple_cycles(graph_obj)
        curr_dict["num_recursive_simple_cycles"] = len(cycle)

        # bridges
        k_edge_comp = sorted(map(sorted, nx.k_edge_components(graph_obj, k=2)))
        curr_dict["num_bridges_connectivity"] = len(k_edge_comp)

        res.append(curr_dict)

    res = pd.DataFrame(res)
    return res

In [72]:
correct_analysis_df = get_graph_analysis(correct_subgraph_dataset)
correct_analysis_df.to_csv("./correct_subgraph_analysis.csv", index=False)
correct_analysis_df.head()

Unnamed: 0,question,answer,num_nodes,num_edges,density,num_recursive_simple_cycles,num_bridges_connectivity
0,Who is the youngest current US governor?,Q3105215,3,3,0.5,1,3
1,Which actor was the star of Titanic and was bo...,Q38111,3,2,0.333333,0,3
2,Which US president has had the most votes?,Q6279,2,3,1.5,2,2
3,Which river is longer than the Mississippi River?,Q3392,3,2,0.333333,0,3
4,What is the longest lake in the world?,Q5511,2,1,0.5,0,2


In [74]:
correct_analysis_df.mean(axis=0)

  correct_analysis_df.mean(axis=0)


num_nodes                      2.902530
num_edges                      3.228027
density                        0.649091
num_recursive_simple_cycles    1.063706
num_bridges_connectivity       2.880956
dtype: float64

In [75]:
wrong_analysis_df = get_graph_analysis(wrong_subgraph_dataset)
wrong_analysis_df.to_csv("./wrong_subgraph_analysis.csv", index=False)
wrong_analysis_df

Unnamed: 0,question,answer,num_nodes,num_edges,density,num_recursive_simple_cycles,num_bridges_connectivity
0,Which actor was the star of Titanic and was bo...,Q100711983,4,3,0.250000,0,4
1,What is the seventh tallest mountain in North ...,Q130018,2,2,1.000000,1,2
2,Who is the youngest current US governor?,Q11673,3,3,0.500000,1,3
3,Who is the youngest current US governor?,Q30,2,2,1.000000,1,2
4,Who is the youngest current US governor?,Q132050,2,1,0.500000,0,2
...,...,...,...,...,...,...,...
112747,What is the oldest city building game develope...,Q249854,7,9,0.214286,1,7
112748,What is the oldest city building game develope...,Q5114201,4,4,0.333333,1,4
112749,What is the oldest city building game develope...,Q20796085,4,4,0.333333,1,4
112750,What is the oldest city building game develope...,Q30,4,4,0.333333,1,4


In [76]:
wrong_analysis_df.mean(axis=0)

  wrong_analysis_df.mean(axis=0)


num_nodes                      3.323453
num_edges                      3.971424
density                        0.610797
num_recursive_simple_cycles    1.218550
num_bridges_connectivity       3.296456
dtype: float64

In [None]:
import datasets
ds = datasets.load_dataset('AmazonScience/mintaka', name='en')


In [77]:
train = ds['train'].to_pandas()
train[train['complexityType'] == 'ordinal']

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity
0,a9011ddf,en,What is the seventh tallest mountain in North ...,Mount Lucania,geography,ordinal,"[{'name': 'Q49', 'entityType': 'entity', 'labe...","[{'name': 'Q1153188', 'label': 'Mount Lucania'}]"
9,9e087ee3,en,How old was Taylor Swift when she won her firs...,20,music,ordinal,"[{'name': 'Q26876', 'entityType': 'entity', 'l...",[]
16,0aeebf1e,en,When did Metallica put out their fourth album?,1988,music,ordinal,"[{'name': 'Q15920', 'entityType': 'entity', 'l...",[]
654,ae20cc58,en,When did Jennifer Lawrence win her first Oscar?,2012,movies,ordinal,"[{'name': 'Q19020', 'entityType': 'entity', 'l...",[]
655,203a9487,en,When did Denzel Washington win his first Acade...,1988,movies,ordinal,"[{'name': 'Q42101', 'entityType': 'entity', 'l...",[]
...,...,...,...,...,...,...,...,...
13987,6b752d22,en,Who was the last Labour prime minister of the UK?,Gordon Brown,politics,ordinal,"[{'name': 'Q9630', 'entityType': 'entity', 'la...","[{'name': 'Q10648', 'label': 'Gordon Brown'}]"
13988,ee491ab0,en,Who was the first prime minister during George...,H. H. Asquith,politics,ordinal,"[{'name': 'Q269412', 'entityType': 'entity', '...","[{'name': 'Q166714', 'label': 'H. H. Asquith'}]"
13989,f4514171,en,Who was the first prime minister during Elizab...,Winston Churchill,politics,ordinal,"[{'name': 'Q9682', 'entityType': 'entity', 'la...","[{'name': 'Q8016', 'label': 'Winston Churchill'}]"
13990,e20c5e10,en,Who was the first Labour prime minister during...,Harold Wilson,politics,ordinal,"[{'name': 'Q9682', 'entityType': 'entity', 'la...","[{'name': 'Q128956', 'label': 'Harold Wilson'}]"
