In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
import random
import json

import numpy as np
from transformers import set_seed
import torch
from tqdm.auto import tqdm

from pathlib import Path

from ranking_model import LogisticRegressionRanker, LinearRegressionRanker, MPNetRanker, FullRandomRanker, NORanker
from ranking_data_utils import prepare_data

2024-04-22 07:35:56.045077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-22 07:35:56.244147: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-22 07:35:56.935544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-04-22 07:35:56.935621: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [2]:
# Single source of truth for the seed: every RNG below is seeded from this one
# constant, so changing the seed is a one-line edit and the values cannot drift apart.
SEED = 42

random.seed(SEED)                  # Python stdlib RNG
np.random.seed(SEED)               # NumPy legacy global RNG
torch.manual_seed(SEED)            # PyTorch RNG
torch.cuda.manual_seed_all(SEED)   # PyTorch RNGs on all CUDA devices
set_seed(SEED)                     # transformers seeding helper

# Request deterministic cuDNN kernels and disable cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Optional: point the Hugging Face datasets cache somewhere else (needs `import os`).
# os.environ['HF_DATASETS_CACHE'] = '/workspace/storage/misc/huggingface'

# Which T5-ssm variant's outputs/features are loaded below.
ds_type = 'large' # 'large' or 'xl'

# Hub dataset identifiers; the features and seq2seq-output repos depend on ds_type.
mintaka_dataset_path = "AmazonScience/mintaka"
features_dataset_path = f"hle2000/KGQA_T5-{ds_type}-ssm"
seq2seq_outputs_dataset_path = f"s-nlp/Mintaka_T5_{ds_type}_ssm_outputs"

features_ds = load_dataset(features_dataset_path)
outputs_ds = load_dataset(seq2seq_outputs_dataset_path)
mintaka_ds = load_dataset(mintaka_dataset_path)

# Build one frame per split by handing the matching split of each dataset to prepare_data.
train_df, valid_df, test_df = (
    prepare_data(mintaka_ds[split], outputs_ds[split], features_ds[split])
    for split in ('train', 'validation', 'test')
)

# Displayed as the cell output: distribution, over test questions, of the
# per-question non-null count of every column.
test_df.groupby(['id', 'question']).count().describe()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,target,target_out_of_vocab,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,correct,t5_sequence,gap_sequence,...,gap_sequence_embedding,t5_sequence_embedding,question_answer_embedding,highlighted_determ_sequence,no_highlighted_determ_sequence,highlighted_t5_sequence,no_highlighted_t5_sequence,highlighted_gap_sequence,no_highlighted_gap_sequence,model_answers
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,4.5345,4.5345,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,...,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.14175,4.5345
std,3.300005,3.300005,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,...,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.729026,3.300005
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
75%,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
max,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,...,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0


In [4]:
# Named groups of test_df/train_df columns that the feature-based rankers are fitted on.
features_map = {
    'text': ['question_answer_embedding'],
    'graph': [
        'num_nodes',
        'num_edges',
        'density',
        'cycle',
        'bridge',
        'katz_centrality',
        'page_rank',
        'avg_ssp_length',
    ],
    'g2t_determ': ['determ_sequence_embedding'],
    'g2t_t5': ['t5_sequence_embedding'],
    'g2t_gap': ['gap_sequence_embedding'],
}

# Directory that receives one JSONL results file per reranking run.
results_path = Path(f'/mnt/storage/QA_System_Project/subgraphs_reranking_runs/reranking_model_results/t5_{ds_type}_ssm/')
results_path.mkdir(parents=True, exist_ok=True)

# Baseline: FullRandomRanker results on the test split, one JSON object per line.
full_random_ranker = FullRandomRanker()
full_random_results_file = results_path / f'full_random_reranking_seq2seq_{ds_type}_results.jsonl'
with open(full_random_results_file, 'w') as f:
    f.writelines(
        json.dumps(result) + '\n'
        for result in full_random_ranker.rerank(test_df)
    )

In [5]:
# Baseline: NORanker results on the test split, one JSON object per line.
no_ranker = NORanker()
no_reranking_results_file = results_path / f'NO_reranking_seq2seq_{ds_type}_results.jsonl'
with open(no_reranking_results_file, 'w') as f:
    f.writelines(
        json.dumps(result) + '\n'
        for result in no_ranker.rerank(test_df)
    )

### Logistic Regression

In [10]:
# Feature-group combinations to evaluate, in run order. Each tuple lists keys of
# `features_map`; the run's column list AND its output-file tag are both derived
# from the same tuple, so the file name can never disagree with the features used.
logreg_feature_combos = [
    ('text',),
    ('graph',),
    ('text', 'graph'),
    ('g2t_determ',),
    ('g2t_t5',),
    ('g2t_gap',),
    ('text', 'g2t_determ', 'graph'),
    ('text', 'g2t_t5', 'graph'),
    ('text', 'g2t_gap', 'graph'),
]

for feature_groups in logreg_feature_combos:
    # Concatenate the column lists of the selected groups, preserving group order.
    feature_columns = [
        column
        for group in feature_groups
        for column in features_map[group]
    ]
    run_tag = '_'.join(feature_groups)  # e.g. 'text_g2t_determ_graph'

    # Fit on the train split, then write the reranked test split as JSONL.
    logreg_ranker = LogisticRegressionRanker(feature_columns)
    logreg_ranker.fit(train_df, n_jobs=8)
    with open(results_path / f'logreg_{run_tag}_reranking_seq2seq_{ds_type}_results.jsonl', 'w') as f:
        for result in logreg_ranker.rerank(test_df):
            f.write(json.dumps(result)+'\n')

### Linear Regression

In [11]:
# Feature-group combinations to evaluate, in run order. Each tuple lists keys of
# `features_map`; the run's column list AND its output-file tag are both derived
# from the same tuple, so the file name can never disagree with the features used.
linreg_feature_combos = [
    ('text',),
    ('graph',),
    ('text', 'graph'),
    ('g2t_determ',),
    ('g2t_t5',),
    ('g2t_gap',),
    ('text', 'g2t_determ', 'graph'),
    ('text', 'g2t_t5', 'graph'),
    ('text', 'g2t_gap', 'graph'),
]

for feature_groups in linreg_feature_combos:
    # Concatenate the column lists of the selected groups, preserving group order.
    feature_columns = [
        column
        for group in feature_groups
        for column in features_map[group]
    ]
    run_tag = '_'.join(feature_groups)  # e.g. 'text_g2t_determ_graph'

    # Named `linreg_ranker` (the original reused `logreg_ranker` here, which was
    # misleading because the object is a LinearRegressionRanker).
    linreg_ranker = LinearRegressionRanker(feature_columns)
    linreg_ranker.fit(train_df, n_jobs=8)
    with open(results_path / f'linreg_{run_tag}_reranking_seq2seq_{ds_type}_results.jsonl', 'w') as f:
        for result in linreg_ranker.rerank(test_df):
            f.write(json.dumps(result)+'\n')

### MPNet 

In [15]:
# Use the GPU when one is visible; otherwise fall back to CPU instead of failing
# later when the model is moved to a non-existent CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Root directory holding the fine-tuned MPNet reranker checkpoints.
mpnet_runs_root = "/mnt/storage/QA_System_Project/subgraphs_reranking_runs"

# One entry per run, in execution order:
#   (test_df column fed to the ranker, checkpoint directory, tag used in the output file name)
# NOTE(review): the g2t/gap run directories end in a literal "_large" while the parent
# directory uses {ds_type}; confirm the directory names before running with ds_type='xl'.
mpnet_runs = [
    (
        'no_highlighted_determ_sequence',
        f"{mpnet_runs_root}/determ/T5-{ds_type}-ssm/no_cherries_hl_false_determ/outputs/checkpoint-best",
        'no_hl_g2t_determ',
    ),
    (
        'highlighted_determ_sequence',
        f"{mpnet_runs_root}/determ/T5-{ds_type}-ssm/no_cherries_hl_true_determ/outputs/checkpoint-best/",
        'hl_g2t_determ',
    ),
    (
        'highlighted_t5_sequence',
        f"{mpnet_runs_root}/g2t/T5-{ds_type}-ssm/no_cherries_fixed_train_g2t_hl_true_large/outputs/checkpoint-best",
        'hl_g2t_t5',
    ),
    (
        'no_highlighted_t5_sequence',
        f"{mpnet_runs_root}/g2t/T5-{ds_type}-ssm/no_cherries_fixed_train_g2t_hl_false_large/outputs/checkpoint-best",
        'no_hl_g2t_t5',
    ),
    (
        'highlighted_gap_sequence',
        f"{mpnet_runs_root}/gap/T5-{ds_type}-ssm/no_cherries_fixed_train_hl_true_gap_large/outputs/checkpoint-best",
        'hl_g2t_gap',
    ),
    (
        'no_highlighted_gap_sequence',
        f"{mpnet_runs_root}/gap/T5-{ds_type}-ssm/no_cherries_fixed_train_hl_false_gap_large/outputs/checkpoint-best",
        'no_hl_g2t_gap',
    ),
]

for sequence_column, model_path, run_tag in mpnet_runs:
    # Load the checkpoint for this run and write the reranked test split as JSONL.
    mpnet_ranker = MPNetRanker(sequence_column, model_path, device)
    with open(results_path / f'mpnet_{run_tag}_reranking_seq2seq_{ds_type}_results.jsonl', 'w') as f:
        for result in mpnet_ranker.rerank(test_df):
            f.write(json.dumps(result)+'\n')


Try load model...
Model Loaded.


  0%|          | 0/4000 [00:00<?, ?it/s]

Try load model...
Model Loaded.


  0%|          | 0/4000 [00:00<?, ?it/s]