In [31]:
import sys
import torch

# Extend the import search path with the project checkout (this does not
# change the working directory); presumably so project modules can be imported.
sys.path.append("/workspace/kbqa/")
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook echoes the selected device

device(type='cuda')

In [None]:
# Destination CSV files for the feature tables built by get_features; they are
# written by the to_csv calls in the export cell further down.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# export cell needs both names -- it must be run before exporting.
train_res_path = (
    "/workspace/storage/misc/features_reranking/features_train_new_seqs.csv"
)
test_res_path = "/workspace/storage/misc/features_reranking/features_test_new_seqs.csv"

In [32]:
from datasets import load_dataset

# Chooses which subgraph dataset to load: only the exact value "t5-xl-ssm"
# selects the XL variant, every other value falls back to the large one.
dataset_type = "t5-large-ssm"

path = (
    "Mintaka_Subgraphs_T5_xl_ssm"
    if dataset_type == "t5-xl-ssm"
    else "Mintaka_Subgraphs_T5_large_ssm"
)
# Load the dataset published under the "hle2000" namespace and convert the
# "train" and "test" splits into pandas DataFrames.
subgraphs_dataset = load_dataset(f"hle2000/{path}")
train_df = subgraphs_dataset["train"].to_pandas()
test_df = subgraphs_dataset["test"].to_pandas()

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_Subgraphs_T5_large_ssm-6867cf00fefe81ee/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
import yaml


def add_new_seqs(path, df):
    """Attach the predicted sequences stored in a YAML file to ``df``.

    The YAML document must be a mapping with a ``"data"`` list whose items each
    carry a ``"predicted"`` entry. Those values are written, in file order, to
    a new ``"updated_sequence"`` column.

    Args:
        path: Location of the YAML results file.
        df: DataFrame to extend; it is modified in place.

    Returns:
        The same ``df`` object, now containing ``"updated_sequence"``.

    Raises:
        yaml.YAMLError: If the file cannot be parsed. The error is propagated
            instead of being printed and ignored, which previously left the
            parsed document undefined and crashed later with an unrelated
            ``UnboundLocalError``.
        ValueError: Raised by pandas when the number of sequences differs from
            the number of rows in ``df``.
    """
    with open(path, "r") as stream:
        # safe_load only builds plain Python containers and scalars.
        new_seqs = yaml.safe_load(stream)

    df["updated_sequence"] = [entry["predicted"] for entry in new_seqs["data"]]
    return df


# Add the "updated_sequence" column to each split from its YAML results file.
# add_new_seqs writes the column into the DataFrame it is given and returns it.
train_df = add_new_seqs("/workspace/storage/misc/train_results_mintaka.yaml", train_df)
test_df = add_new_seqs("/workspace/storage/misc/test_results_mintaka.yaml", test_df)

In [19]:
from networkx.readwrite import json_graph
from ast import literal_eval
import networkx as nx
import pandas as pd


def try_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` unchanged on failure.

    Args:
        s: Usually the string form of a Python literal (e.g. a dict dumped
            with ``str``). Values that are already parsed are passed through.

    Returns:
        The evaluated literal, or ``s`` itself when it is not a valid literal.
    """
    try:
        return literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval reports text that is not Python syntax with SyntaxError
        # and some malformed literals with TypeError; ValueError alone would
        # let ordinary free text crash the caller.
        return s

In [20]:
def get_distance_ans_cand(graph, ans_cand_id):
    """Average shortest-path size from every other node to the answer candidate.

    Edge direction is ignored by working on an undirected copy. Each path
    returned by ``nx.shortest_path`` is measured by the number of nodes it
    contains (both endpoints included), so a direct neighbour contributes 2.

    Raises:
        ZeroDivisionError: If no node other than the candidate has a path.
    """
    undirected = graph.to_undirected()
    paths_to_candidate = nx.shortest_path(undirected, target=ans_cand_id)

    path_sizes = [
        len(node_path)
        for source, node_path in paths_to_candidate.items()
        if source != ans_cand_id
    ]
    return sum(path_sizes) / len(path_sizes)

In [21]:
from gtda.graphs import GraphGeodesicDistance
from gtda.homology import FlagserPersistence
from gtda.graphs import GraphGeodesicDistance
from gtda.diagrams import Scaler, HeatKernel
import numpy as np

# Single HeatKernel transformer shared by every get_graph_vector call
# (sigma=0.15, 60 bins); n_jobs=-1 asks for all available processors.
heat = HeatKernel(sigma=0.15, n_bins=60, n_jobs=-1)


def get_graph_vector(graph):
    """Turn a graph into a flat persistence-based feature vector.

    Pipeline: adjacency matrix -> directed, unweighted geodesic distances ->
    Flagser persistence diagrams -> NaNs replaced via ``np.nan_to_num`` ->
    heat-kernel representation -> flattened to one dimension. No TF-IDF is
    computed here, although the result is stored under a "tfidf_vector" key.

    Note that the module-level ``heat`` transformer is re-fitted on each call.
    """
    adjacency = nx.adjacency_matrix(graph)
    geodesic = GraphGeodesicDistance(directed=True, unweighted=True)
    distances = geodesic.fit_transform([adjacency])
    diagrams = FlagserPersistence().fit_transform(distances)
    finite_diagrams = np.nan_to_num(diagrams)
    return heat.fit_transform(finite_diagrams).ravel()

In [22]:
def find_candidate_note(graph):
    """Return the id of the first node typed ``ANSWER_CANDIDATE_ENTITY``.

    Nodes are inspected in the graph's own iteration order.

    Raises:
        ValueError: If no node carries that type.
    """
    node_view = graph.nodes
    candidates = (
        node_id
        for node_id in node_view
        if node_view[node_id]["type"] == "ANSWER_CANDIDATE_ENTITY"
    )
    try:
        return next(candidates)
    except StopIteration:
        raise ValueError("cannot find answer candidate entity") from None

In [23]:
def find_label(graph, wd_id):
    """Look up the ``label`` of the first node whose ``name_`` equals ``wd_id``.

    When no node matches, a placeholder message string is returned instead of
    raising, so callers always receive a string-like value.
    """
    node_view = graph.nodes
    not_found = object()
    label = next(
        (
            node_view[node_id]["label"]
            for node_id in node_view
            if node_view[node_id]["name_"] == wd_id
        ),
        not_found,
    )
    if label is not_found:
        return f"cannot find label for {wd_id}"
    return label

In [24]:
def get_node_names(
    subgraph,
    candidate_start_token="[unused1]",
    candidate_end_token="[unused2]",
    highlight_candidate=False,
):
    """Collect node labels in the subgraph's node order.

    Args:
        subgraph: Graph whose nodes carry ``"label"`` and ``"type"`` attributes.
        candidate_start_token: Marker put before the candidate's label when
            highlighting is enabled.
        candidate_end_token: Marker put after the candidate's label when
            highlighting is enabled.
        highlight_candidate: When true, wrap the label of the first
            ``ANSWER_CANDIDATE_ENTITY`` node in the two tokens. Defaults to
            ``False``, which matches the previous behaviour where this step
            was disabled by a hard-coded ``if False:``.

    Returns:
        A list of labels, or ``None`` when the subgraph has no
        ``ANSWER_CANDIDATE_ENTITY`` node.
    """
    node_ids = list(subgraph.nodes())
    node_names = [subgraph.nodes[node]["label"] for node in node_ids]
    node_type = [subgraph.nodes[node]["type"] for node in node_ids]

    if "ANSWER_CANDIDATE_ENTITY" not in node_type:
        return None

    if highlight_candidate:
        candidate_idx = node_type.index("ANSWER_CANDIDATE_ENTITY")
        node_names[candidate_idx] = (
            f"{candidate_start_token}{node_names[candidate_idx]}{candidate_end_token}"
        )

    return node_names


def graph_to_sequence(subgraph, node_names):
    """Linearise a labelled graph into a comma-separated triple string.

    Every edge becomes ``from_name,edge_label,to_name``; triples are emitted
    in row-major order of the adjacency matrix (source position first, then
    target position).

    Args:
        subgraph: Graph whose edges carry a ``"label"`` attribute.
        node_names: Display names aligned with ``subgraph.nodes()`` order
            (as produced by ``get_node_names``).

    Returns:
        The joined sequence string; empty when the graph has no edges.
    """
    # Rows/columns follow node order, so edges are placed by the node's
    # position rather than by int(node_id). The two only coincide when ids
    # are exactly 0..n-1 in order; otherwise labels landed in the wrong cell
    # or raised IndexError.
    node_order = list(subgraph.nodes())
    position = {node: idx for idx, node in enumerate(node_order)}

    adj_matrix = (
        nx.adjacency_matrix(subgraph, nodelist=node_order).todense().tolist()
    )

    # Overwrite the numeric adjacency entries with the edge labels.
    for source, target, data in subgraph.edges.data():
        adj_matrix[position[source]][position[target]] = data["label"]

    sequence = []
    # Entry (i, j) describes the edge node i -> node j; 0 means no edge.
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]
        for j, edge_info in enumerate(row):
            if edge_info != 0:
                sequence.extend([from_node, edge_info, node_names[j]])

    return ",".join(str(item) for item in sequence)

In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model ("all-mpnet-base-v2") used by get_features to encode
# the graph sequences and the question/answer text.
model = SentenceTransformer("all-mpnet-base-v2")

2024-02-14 10:11:32.669462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-14 10:11:32.879667: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-14 10:11:33.467904: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-02-14 10:11:33.468012: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [7]:
import numpy as np


def arr_to_str(arr):
    """Serialise an iterable of values into one comma-separated string.

    Each element is converted with ``str``; an empty iterable yields ``""``.
    """
    return ",".join(map(str, arr))


def str_to_arr(str):
    """Parse a comma-separated string of numbers into a float NumPy array.

    Inverse of ``arr_to_str`` for numeric data. The parameter is named ``str``
    and therefore hides the builtin inside this function.

    Raises:
        ValueError: If any comma-separated piece is not a valid float.
    """
    return np.array([float(token) for token in str.split(",")])

In [27]:
from tqdm import tqdm
from sklearn import preprocessing

# NOTE(review): this scaler is not referenced by any cell visible in this part
# of the notebook -- confirm whether a later cell uses it.
min_max_scaler = preprocessing.MinMaxScaler()


def get_features(df, encode_device=None):
    """Build the per-row feature table for reranking.

    Each row's ``"graph"`` is turned into a networkx graph, from which text,
    numerical and embedding features are derived. Rows that cannot be
    processed (for example a graph without an answer candidate) are skipped;
    the number of skipped rows is reported once at the end.

    Args:
        df: DataFrame with the columns ``"graph"``, ``"question"``,
            ``"answerEntity"``, ``"updated_sequence"`` and ``"correct"``.
        encode_device: Device passed to ``model.encode``. Defaults to
            ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise, so the
            function no longer silently drops every row on a CPU-only host.

    Returns:
        A DataFrame with one row of features per successfully processed row.
    """
    if encode_device is None:
        # Relies on the notebook-level `import torch`.
        encode_device = "cuda" if torch.cuda.is_available() else "cpu"

    def embed(text):
        """Encode ``text`` with the shared model and serialise the vector."""
        return arr_to_str(
            model.encode(text, device=encode_device, convert_to_numpy=True)
        )

    dict_list = []
    skipped = 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # convert from json dict to networkx graph
        graph_json = try_literal_eval(row["graph"])
        graph_obj = json_graph.node_link_graph(graph_json)
        graph_node_names = get_node_names(graph_obj)

        # Best effort: skip rows whose graph has no answer candidate or whose
        # features cannot be computed. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop a long run.
        try:
            ans_cand_id = find_candidate_note(graph_obj)
            ques_ans = (
                f"{row['question']} ; {find_label(graph_obj, row['answerEntity'])}"
            )
            graph_seq = graph_to_sequence(graph_obj, graph_node_names)
            updated_graph_seq = row["updated_sequence"]
            # build the features
            curr_dict = {
                # text data
                "question": row["question"],
                "question_answer": ques_ans,
                # numerical data
                "num_nodes": graph_obj.number_of_nodes(),
                "num_edges": graph_obj.number_of_edges(),
                "density": nx.density(graph_obj),
                "cycle": len(nx.recursive_simple_cycles(graph_obj)),
                # Number of 2-edge-connected components; the "bridge" key is
                # kept because it is the column name written to the CSV.
                "bridge": len(list(nx.k_edge_components(graph_obj, k=2))),
                "katz_centrality": nx.katz_centrality(graph_obj)[ans_cand_id],
                "page_rank": nx.pagerank(graph_obj)[ans_cand_id],
                "avg_ssp_length": get_distance_ans_cand(graph_obj, ans_cand_id),
                "graph_sequence": graph_seq,
                "updated_graph_sequence": updated_graph_seq,
                # embedding data
                "graph_sequence_embedding": embed(graph_seq),
                "updated_graph_sequence_embedding": embed(updated_graph_seq),
                "question_answer_embedding": embed(ques_ans),
                "tfidf_vector": arr_to_str(get_graph_vector(graph_obj)),
                # label
                "correct": float(row["correct"]),
            }
        except Exception:
            skipped += 1
            continue
        dict_list.append(curr_dict)

    if skipped:
        print(f"get_features: skipped {skipped} of {len(df)} rows")

    final_df = pd.DataFrame(dict_list)
    return final_df

In [28]:
# Run the per-row feature pipeline (get_features) on both splits.
processed_train_df = get_features(train_df)
processed_test_df = get_features(test_df)

# Persist the feature tables as CSV, omitting the DataFrame index column.
processed_train_df.to_csv(train_res_path, index=False)
processed_test_df.to_csv(test_res_path, index=False)

0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (step_size ** 2)
  heats_ /= (s