In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
import random
import json

import numpy as np
from transformers import set_seed
import torch
from tqdm.auto import tqdm

from pathlib import Path

from ranking_model import LogisticRegressionRanker, LinearRegressionRanker, MPNetRanker, FullRandomRanker, NORanker, SemanticRanker
from ranking_data_utils import prepare_data

2024-04-30 17:43:24.933877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-30 17:43:25.090768: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-30 17:43:25.801544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-04-30 17:43:25.801630: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [3]:
# Single source of truth for the seed: every RNG below is initialised from the
# same value, so changing it in one place keeps all libraries in sync.
SEED = 42

random.seed(SEED)                 # Python stdlib RNG
np.random.seed(SEED)              # NumPy legacy global RNG
torch.manual_seed(SEED)           # PyTorch RNG
torch.cuda.manual_seed_all(SEED)  # PyTorch RNGs on all CUDA devices
set_seed(SEED)                    # transformers helper

# Ask cuDNN for deterministic kernels and turn off its benchmark mode so
# repeated runs use the same algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# os.environ['HF_DATASETS_CACHE'] = '/workspace/storage/misc/huggingface'

# Hugging Face dataset identifiers.
mintaka_dataset_path = "AmazonScience/mintaka"
kgqa_dataset_path = "s-nlp/KGQASubgraphsRanking"

# Selects which "<source>_subgraphs" / "<source>_outputs" configurations are
# loaded and, further down, the results sub-directory.
candidate_source = 'mixtral'

features_ds = load_dataset(kgqa_dataset_path, f"{candidate_source}_subgraphs")
outputs_ds = load_dataset(kgqa_dataset_path, f"{candidate_source}_outputs")
mintaka_ds = load_dataset(mintaka_dataset_path)

# Build one frame per split from the three sources, in train -> validation ->
# test order.
split_frames = {
    split: prepare_data(mintaka_ds[split], outputs_ds[split], features_ds[split])
    for split in ("train", "validation", "test")
}
train_df = split_frames["train"]
valid_df = split_frames["validation"]
test_df = split_frames["test"]

# Summary statistics of the per-(id, question) row counts in the test split.
test_df.groupby(["id", "question"]).count().describe()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,target,index,answerEntity,groundTruthAnswerEntity,questionEntity,complexityType,graph,correct,t5_sequence,gap_sequence,...,bridge,katz_centrality,page_rank,avg_ssp_length,determ_sequence,determ_sequence_embedding,gap_sequence_embedding,t5_sequence_embedding,question_answer_embedding,model_answers
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,2.8455,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,...,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.43725,2.8455
std,2.013863,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,...,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.409256,2.013863
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,...,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0


In [5]:
# Names of the scalar graph features fed to the rankers below.
graph_feature_names = [
    "num_nodes",
    "num_edges",
    "density",
    "cycle",
    "bridge",
    "katz_centrality",
    "page_rank",
    "avg_ssp_length",
]

# Feature-group name -> list of feature names passed to the ranker
# constructors. Groups are concatenated with `+` to form combined feature sets.
features_map = dict(
    text=["question_answer_embedding"],
    graph=graph_feature_names,
    g2t_determ=["determ_sequence_embedding"],
    g2t_t5=["t5_sequence_embedding"],
    g2t_gap=["gap_sequence_embedding"],
)

In [6]:
# Directory that collects one JSONL result file per ranker for the selected
# candidate source; created on demand.
results_path = Path(
    f"/workspace/storage/misc/subgraphs_reranking_runs/reranking_model_results/{candidate_source}/"
)
results_path.mkdir(exist_ok=True, parents=True)

# FullRandomRanker needs no fitting (presumably a random-order baseline, going
# by its name). Each reranking result is serialised as one JSON line.
full_random_ranker = FullRandomRanker()
full_random_output = results_path / "full_random_reranking_results.jsonl"
with open(full_random_output, "w") as f:
    f.writelines(
        json.dumps(result) + "\n" for result in full_random_ranker.rerank(test_df)
    )

In [7]:
# NORanker needs no fitting; its reranking of the test split is written out as
# one JSON object per line.
no_ranker = NORanker()
no_ranker_output = results_path / "NO_reranking_results.jsonl"
with open(no_ranker_output, "w") as f:
    f.writelines(json.dumps(result) + "\n" for result in no_ranker.rerank(test_df))

In [8]:
# SemanticRanker is used without fitting; its reranking of the test split is
# written out as one JSON object per line.
semantic_ranker = SemanticRanker()
semantic_output = results_path / "semantic_reranking_results.jsonl"
with open(semantic_output, "w") as f:
    f.writelines(
        json.dumps(result) + "\n" for result in semantic_ranker.rerank(test_df)
    )

  0%|          | 0/4000 [00:00<?, ?it/s]

### Logistic Regression

In [9]:
# Logistic-regression sweep: one ranker per feature-set combination.
#
# Each entry maps the tag embedded in the output filename to the ordered list
# of `features_map` groups whose feature names are concatenated for that run.
# Dict order is the execution order (text, graph, text+graph, the three g2t
# variants alone, then text+g2t+graph for each g2t variant).
logreg_feature_sets = {
    "text": ["text"],
    "graph": ["graph"],
    "text_graph": ["text", "graph"],
    "g2t_determ": ["g2t_determ"],
    "g2t_t5": ["g2t_t5"],
    "g2t_gap": ["g2t_gap"],
    "text_g2t_determ_graph": ["text", "g2t_determ", "graph"],
    "text_g2t_t5_graph": ["text", "g2t_t5", "graph"],
    "text_g2t_gap_graph": ["text", "g2t_gap", "graph"],
}

for feature_tag, group_names in logreg_feature_sets.items():
    # Flatten the selected groups into one feature list, preserving group order.
    feature_names = [
        name for group in group_names for name in features_map[group]
    ]

    # Fit on the train split, then rerank the test split.
    logreg_ranker = LogisticRegressionRanker(feature_names)
    logreg_ranker.fit(train_df, n_jobs=8)

    # One JSON object per line, one file per feature set.
    with open(
        results_path / f"logreg_{feature_tag}_reranking_results.jsonl", "w"
    ) as f:
        for result in logreg_ranker.rerank(test_df):
            f.write(json.dumps(result) + "\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

### Linear Regression

In [10]:
# Linear-regression sweep: one ranker per feature-set combination.
#
# Each entry maps the tag embedded in the output filename to the ordered list
# of `features_map` groups whose feature names are concatenated for that run.
# Dict order is the execution order.
linreg_feature_sets = {
    "text": ["text"],
    "graph": ["graph"],
    "text_graph": ["text", "graph"],
    "g2t_determ": ["g2t_determ"],
    "g2t_t5": ["g2t_t5"],
    "g2t_gap": ["g2t_gap"],
    "text_g2t_determ_graph": ["text", "g2t_determ", "graph"],
    "text_g2t_t5_graph": ["text", "g2t_t5", "graph"],
    "text_g2t_gap_graph": ["text", "g2t_gap", "graph"],
}

for feature_tag, group_names in linreg_feature_sets.items():
    # Flatten the selected groups into one feature list, preserving group order.
    feature_names = [
        name for group in group_names for name in features_map[group]
    ]

    # Held in `linreg_ranker` rather than `logreg_ranker`, so this cell no
    # longer overwrites the logistic-regression ranker under a misleading name.
    linreg_ranker = LinearRegressionRanker(feature_names)
    linreg_ranker.fit(train_df, n_jobs=8)

    # One JSON object per line, one file per feature set.
    with open(
        results_path / f"linreg_{feature_tag}_reranking_results.jsonl", "w"
    ) as f:
        for result in linreg_ranker.rerank(test_df):
            f.write(json.dumps(result) + "\n")

### MPNet 

In [None]:
# NOTE(review): this whole cell is disabled (commented out).
# If re-enabled it would overwrite test_df["question_answer"] in place: string
# values are split on ";" and rebuilt as the first and last segments joined by
# the tokenizer's sep token; non-string values pass through unchanged.
# Whether test_df has a "question_answer" column is not visible here — confirm
# before re-enabling.

# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")


# def apply_sep(seq):
#     if isinstance(seq, str):
#         q_a_splits = seq.split(";")
#         seq = f"{q_a_splits[0]}{tokenizer.sep_token}{q_a_splits[-1]}"
#     return seq


# test_df["question_answer"] = test_df["question_answer"].apply(apply_sep)

In [None]:
# NOTE(review): this whole cell is disabled (commented out).
# Before re-enabling:
#   - `ds_type` is used in the model paths and output filenames but is not
#     defined in any cell shown in this notebook.
#   - The first stanza opens its output with "w+" while the others use "w".
#   - Model paths mix two roots (/workspace/storage/... and /mnt/storage/...);
#     confirm which one exists on the target machine.

# device = torch.device("cuda")

# model_path = "/workspace/storage/misc/subgraphs_reranking_results/question_answer/T5-xl-ssm/question_answer_nocherries_fixed_train/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("question_answer", model_path, device)
# with open(
#     results_path / f"mpnet_text_only_determ_reranking_seq2seq_{ds_type}_results.jsonl",
#     "w+",
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/determ/T5-{ds_type}-ssm/no_cherries_hl_false_determ/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("no_highlighted_determ_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_no_hl_g2t_determ_reranking_seq2seq_{ds_type}_results.jsonl",
#     "w",
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/determ/T5-{ds_type}-ssm/no_cherries_hl_true_determ/outputs/checkpoint-best/"
# mpnet_ranker = MPNetRanker("highlighted_determ_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_hl_g2t_determ_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")


# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/g2t/T5-{ds_type}-ssm/no_cherries_fixed_train_g2t_hl_true_large/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("highlighted_t5_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_hl_g2t_t5_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/g2t/T5-{ds_type}-ssm/no_cherries_fixed_train_g2t_hl_false_large/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("no_highlighted_t5_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_no_hl_g2t_t5_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")


# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/gap/T5-{ds_type}-ssm/no_cherries_fixed_train_hl_true_gap_large/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("highlighted_gap_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_hl_g2t_gap_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

# model_path = f"/mnt/storage/QA_System_Project/subgraphs_reranking_runs/gap/T5-{ds_type}-ssm/no_cherries_fixed_train_hl_false_gap_large/outputs/checkpoint-best"
# mpnet_ranker = MPNetRanker("no_highlighted_gap_sequence", model_path, device)
# with open(
#     results_path / f"mpnet_no_hl_g2t_gap_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in mpnet_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

Try load model...
Model Loaded.


  0%|          | 0/4000 [00:00<?, ?it/s]

Try load model...
Model Loaded.


  0%|          | 0/4000 [00:00<?, ?it/s]

### CatBoost

In [None]:
# NOTE(review): this whole cell is disabled (commented out).
# Before re-enabling:
#   - `CatboostRanker` is not among the names imported from `ranking_model` in
#     the import cell.
#   - `ds_type` is used in the output filename but is not defined in any cell
#     shown in this notebook.
#   - `CatBoostRegressor` is imported here but not referenced in this cell.

# from catboost import CatBoostRegressor

# model_weights = "/workspace/storage/misc/features_reranking/catboost/unified_reranking/T5-large-ssm/catboost_text_large_ASK/best_model"
# catboost_ranker = CatboostRanker(model_weights, features_map["text"])

# with open(
#     results_path / f"catboost_text_reranking_seq2seq_{ds_type}_results.jsonl", "w"
# ) as f:
#     for result in catboost_ranker.rerank(test_df):
#         f.write(json.dumps(result) + "\n")

Trying to load the model...
Model Loaded.


  0%|          | 0/4000 [00:00<?, ?it/s]