In [1]:
import sys
import torch
sys.path.append("/workspace/kbqa/")  # go to parent dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

#### Reading and Processing Data

In [2]:
import pandas as pd
print('reading train and test')
processed_train_df = pd.read_csv('/workspace/storage/misc/features_reranking/features_train_new_seqs.csv')
processed_test_df = pd.read_csv('/workspace/storage/misc/features_reranking/features_test_new_seqs.csv')

reading train and test


In [3]:
# to whether use embedding features or not
use_embeddings = True

In [4]:
import numpy as np

def get_numeric_cols(df):
    """return all cols with numeric features"""
    cols_numeric = []
    for k, v in df.dtypes.to_dict().items():
        if (v is np.dtype("int64") or v is np.dtype("float64")) and k != "correct":
            cols_numeric.append(k)

    return cols_numeric

In [5]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()


def apply_col_scale(df, col):
    """apply min max scaling"""
    df[col] = min_max_scaler.fit_transform(df[col])
    return df


# get the numerica cols in train and test to scale
train_numeric_cols = get_numeric_cols(processed_train_df)
test_numeric_cols = get_numeric_cols(processed_test_df)

processed_train_df = apply_col_scale(processed_train_df, train_numeric_cols)
processed_test_df = apply_col_scale(processed_test_df, test_numeric_cols)

In [6]:
processed_train_df.head(3)

Unnamed: 0,question,question_answer,num_nodes,num_edges,density,cycle,bridge,katz_centrality,page_rank,avg_ssp_length,graph_sequence,graph_sequence_embedding,updated_graph_sequence_embedding,question_answer_embedding,tfidf_vector,correct,updated_graph_sequence
0,What is the seventh tallest mountain in North ...,What is the seventh tallest mountain in North ...,0.0,0.0,0.217391,0.0,0.0625,0.874914,0.53152,0.0,"Mount Rainier,continent,North America","0.034756005,-0.023538165,-0.01432218,0.0284024...","0.0307673,0.009273613,-0.018254131,0.04655365,...","0.010093523,0.0059077474,-0.0011233741,0.02769...","7.074399696475794,7.029381555603603,6.89603869...",0.0,Mount Rainier is located in North America.
1,What is the seventh tallest mountain in North ...,What is the seventh tallest mountain in North ...,0.0,0.017857,0.478261,0.001898,0.0625,0.938945,0.76576,0.0,"North America,shares border with,Eurasia,Euras...","0.006314811,0.013446206,-0.030191861,-0.004628...","0.0018651504,0.018196316,-0.014988141,-0.01599...","0.021559492,-0.003825337,-0.0013305206,0.00150...","7.074399696475794,7.029381555603603,6.89603869...",0.0,North America and Eurasia share a border.
2,What is the seventh tallest mountain in North ...,What is the seventh tallest mountain in North ...,0.0,0.035714,0.73913,0.003795,0.0625,0.938945,0.76576,0.0,"North America,has part(s),Greenland,Greenland,...","0.024753027,-0.004211257,-0.042537164,-0.03131...","0.02677703,0.013835486,-0.029348638,-0.0018101...","0.024931252,0.007847421,-0.010111074,-5.701548...","7.074399696475794,7.029381555603603,6.89603869...",0.0,Greenland is part of North America.


In [7]:
def str_to_arr(str):
    arr = str.split(',')
    arr = [float(a) for a in arr]
    return np.array(arr)

In [8]:
embedding_features = [
        "graph_sequence_embedding",
        "tfidf_vector",
        "question_answer_embedding",
        "updated_graph_sequence_embedding",
    ]
text_features = ["question_answer", "graph_sequence", "updated_graph_sequence"]

In [9]:
def process_emb(df, em_type):
    """ split embeddings into individual rows"""
    embeddings = df[em_type].tolist()
    emb_dict = {}
    for emb in embeddings:
        for i, val in enumerate(emb):
            curr_key = f'{em_type}_{i}'
            if curr_key not in emb_dict:
                emb_dict[f'{em_type}_{i}'] = [val]
            else:
                emb_dict[f'{em_type}_{i}'].append(val)
    
    return pd.DataFrame(emb_dict)

In [10]:
def add_processed_emb(df):
    """ add all processed embeddings to df"""
    em_df_list = []
    for em in embedding_features:   
        em_df_list.append(process_emb(df, em))
    
    df = pd.concat([df] +  em_df_list, axis=1)
    return df

In [11]:
if use_embeddings: # splitting embeddings into individual cols
    print('turning embeddings str to arr')
    for e_f in embedding_features:
        print(e_f)
        processed_train_df[e_f] = processed_train_df[e_f].apply(str_to_arr)
        processed_test_df[e_f] = processed_test_df[e_f].apply(str_to_arr)
    
    print('adding processed embeddings to train & test')
    processed_train_df = add_processed_emb(processed_train_df)
    processed_test_df = add_processed_emb(processed_test_df)
    
    processed_train_df = processed_train_df.dropna()
    processed_test_df = processed_test_df.dropna()

turning embeddings str to arr
graph_sequence_embedding
tfidf_vector
question_answer_embedding
updated_graph_sequence_embedding
adding processed embeddings to train & test


In [13]:
drop_cols = ['correct', 'question'] 

# dropping all text and embedding features 
X_train = processed_train_df.drop(drop_cols + embedding_features + text_features, axis=1)
y_train = processed_train_df['correct'].tolist()
X_test = processed_test_df.drop(drop_cols + embedding_features + text_features, axis=1)
y_test = processed_test_df['correct'].tolist()

In [15]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [16]:
y_pred = regr.predict(X_test)

In [17]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

Mean squared error: 619240740293742080.00


#### Reranking

In [18]:
from datasets import load_dataset
test_res_csv = load_dataset(f'hle2000/Mintaka_T5_xl_ssm_outputs', verification_mode='no_checks')['test'].to_pandas()
test_res_csv.head()

Found cached dataset parquet (/root/.cache/huggingface/datasets/hle2000___parquet/hle2000--Mintaka_T5_xl_ssm_outputs-9a78025ce7d9a549/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,target,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,...,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199,target_out_of_vocab,__index_level_0__
0,What man was a famous American author and also...,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,Mark Twain,...,Louisa May Alcott,Ambrose Bierce,Ishmael Lehman,"Mark Twain, Natchez, Missouri","Mark Twain, Louisa",Ishmael Levy,Ishmael Beam,"Mark Twain, Natchez, Mississippi",False,0
1,How many Academy Awards has Jake Gyllenhaal be...,1,3,2,3,2,3,2,3,2,...,13,12,8,11,10,6,9,13,False,1
2,"Who is older, The Weeknd or Drake?",Drake,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,The Weeknd,...,The Weeknd (2017),The Weeknd's oldest,The Weeknd is older than Drake,The Weeknd's,Dierks Bentley,"The Weeknd""",Drake & The Weeknd,The Weeknd's age,False,2
3,How many children did Donald Trump have?,5,5,5,3,5,3,5,3,5,...,24,6,76,13,61,108,0,8,False,3
4,Is the main hero in Final Fantasy IX named Kuja?,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Is it a Final Fantasy,Does it include Kuja?,Is it,Is he Kuja,Is it No,Y Yes,Is Kuja,Is he called Kuja,False,4


In [21]:
from tqdm import tqdm
final_acc, top200_total, top1_total, seq2seq_correct = 0, 0, 0, 0
    
for idx, group in tqdm(test_res_csv.iterrows()):
    curr_question_df = processed_test_df[processed_test_df["question"] == group['question']]
    curr_question_df = curr_question_df.drop(embedding_features + text_features + ['question'], axis=1)
    
    if len(curr_question_df) == 0: # we don't have subgraph for this question, take answer from seq2seq
        if group["answer_0"] == group["target"]:
            seq2seq_correct += 1
        else: # check if answer exist in 200 beams for question with no subgraphs
            all_beams = group.tolist()[2:-1] # all 200 beams
            all_beams = set(all_beams)
            top200_total += 1 if group["target"] in all_beams else 0
            
    else: # we have subgraph for this question  
        all_beams = group.tolist()[2:-1] # all 200 beams
        all_beams = set(all_beams)
        
        if group["target"] not in all_beams: # no correct answer in beam
            continue
            
        # correct answer exist in beam
        top1_total += 1 if group["answer_0"] == group["target"] else 0
        top200_total += 1
        
        is_corrects = curr_question_df["correct"].astype(bool).tolist()
        curr_question_df = curr_question_df.drop('correct', axis=1)
        
        preds = regr.predict(curr_question_df)
        max_idx = preds.argmax()
        
        if is_corrects[max_idx] is True:
            final_acc += 1
        
            
# final rerankinga, top1 and top200 result
reranking_res = (final_acc + seq2seq_correct)/ len(test_res_csv)
top200 = (top200_total + seq2seq_correct)/len(test_res_csv)
top1 = (top1_total + seq2seq_correct)/ len(test_res_csv)


4000it [02:13, 29.88it/s] 


In [20]:
print(f'top1: {top1}, top200: {top200}, reranking top1: {reranking_res}')

top1: 0.31725, top200: 0.69025, reranking top1: 0.27025
