Collect all the source files to re-rank from the input folder

In [19]:
import os

folder_path = "dataset-v2-vanilla"

# Keep only the regular files found directly inside the input folder
# (sub-directories are filtered out); names are stored without the folder prefix.
files_to_re_rank = list(
    filter(
        lambda name: os.path.isfile(os.path.join(folder_path, name)),
        os.listdir(folder_path),
    )
)

In [20]:
# Display the collected file names as a quick sanity check.
files_to_re_rank

['CS-META-ALL-QUERIES-output.txt',
 'CS-EXTRACTED-ALL-QUERIES-output.txt',
 'LMD-META+EXTRACTED-ALL-QUERIES-output.txt',
 'BM25-EXTRACTED-ALL-QUERIES-output.txt',
 'LMD-META-ALL-QUERIES-output.txt',
 'CS-META+EXTRACTED-ALL-QUERIES-output.txt',
 'BM25-META-ALL-QUERIES-output.txt',
 'BM25-META+EXTRACTED-ALL-QUERIES-output.txt',
 'LMD-EXTRACTED-ALL-QUERIES-output.txt']

Load the regressor model

In [21]:
# Names of the feature columns selected from `features_df` and handed to the
# regressor's `predict` call in `re_rank`.
# NOTE(review): the column order presumably has to match the order used when
# the model was trained — confirm before reordering or adding entries.
features = [
    "size",
    "number_of_classes",
    "number_of_literals",
    "number_of_entities",
    "number_of_properties",
    "number_of_connections",
    "number_of_connected_vertices",
    "average_literals_per_vertex",
]

In [22]:
import pickle

# Load the saved model.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
# NOTE: `model` below is the open file handle; the deserialised estimator is
# bound to `reg_tree_model` (presumably a random-forest regressor, judging by
# the file name — confirm).
with open('random_forest_regressor_model.pkl', 'rb') as model:
    reg_tree_model = pickle.load(model)

Load the features extracted from the datasets

In [23]:
import pandas as pd

# Table of pre-extracted dataset features; `re_rank` looks rows up by their
# `dataset_id` column and feeds the `features` columns to the regressor.
features_df = pd.read_csv('features.csv')

In [24]:
# Spot-check: show the feature row(s) stored for one specific dataset id.
features_df[features_df['dataset_id'] == 25054]

Unnamed: 0,dataset_id,size,number_of_classes,number_of_literals,number_of_entities,number_of_properties,number_of_connections,number_of_connected_vertices,average_literals_per_vertex,relevance


Read all the rows from each file in the folder and re-rank them per query

In [25]:
def re_rank(rows: list) -> list:
    """Re-rank the result rows of a single query.

    Every row is a whitespace-separated line in which field 2 is the dataset
    id, field 3 the rank and field 4 the Lucene score (0-based indices). The
    Lucene score is normalised by the score of the first row and averaged with
    the relevance predicted by ``reg_tree_model`` from the dataset's row in
    ``features_df`` (columns listed in ``features``).

    Args:
        rows: Result lines belonging to one query. The first row is expected
            to carry the highest Lucene score, since it is used as the
            normalisation constant.

    Returns:
        The same rows, ordered by descending new score (ties keep their input
        order), as tab-joined strings with the rank field renumbered from 1
        and the score field replaced by the new score with six decimals.
        An empty list is returned when ``rows`` contains no usable line.

    Raises:
        ValueError: If a dataset id or score field cannot be parsed.
        IndexError: If a non-blank row has fewer than five fields.
    """
    # Blank lines carry no result and would otherwise crash the field access.
    parsed_rows = [row.split() for row in rows if row.strip()]
    if not parsed_rows:
        return []

    # Normalisation constant: the score of the first (top-ranked) row.
    max_score = float(parsed_rows[0][4])

    scored_rows = []
    for values in parsed_rows:
        dataset_id = int(values[2])
        lucene_score = float(values[4])
        # Guard against a zero top score instead of raising ZeroDivisionError.
        normalised_score = lucene_score / max_score if max_score else 0.0

        fr = features_df[features_df['dataset_id'] == dataset_id]
        if not fr.empty:
            # Only the first prediction is used, even if several feature rows match.
            predicted_score = reg_tree_model.predict(fr[features])[0]
            new_score = (normalised_score + predicted_score) / 2
        else:
            print(f"DATASET EMPTY:{dataset_id}")
            # NOTE(review): without features the normalised Lucene score is kept
            # as-is (not halved), so these rows sit on a different scale than
            # the blended ones — confirm this is intended.
            new_score = normalised_score

        # Keep each row paired with its own score so that a dataset id that
        # occurs more than once is emitted exactly once per input row.
        scored_rows.append((values, new_score))

    # list.sort is stable, so equal scores keep their original relative order.
    scored_rows.sort(key=lambda pair: pair[1], reverse=True)

    # Create the output format
    rl = []
    for index, (values, new_score) in enumerate(scored_rows, start=1):
        values[3] = index
        values[4] = "{:.6f}".format(new_score)
        rl.append('\t'.join(map(str, values)))

    return rl

In [26]:
from collections import defaultdict


def re_rank_file(path: str):
    """Re-rank every query contained in one result file.

    The file is read line by line; lines are grouped by the integer query id
    in their first whitespace-separated field and each group is passed to
    ``re_rank``. Blank lines are ignored.

    Args:
        path: Path of the result file to read.

    Returns:
        A list with one entry per query, in order of first appearance in the
        file; each entry is the list of re-ranked rows returned by ``re_rank``.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    rows_per_query = defaultdict(list)
    with open(path, "r") as file:
        # Stream the file instead of materialising it with readlines().
        for line in file:
            # Split on any whitespace, the same way re_rank parses the row.
            fields = line.split()
            if not fields:
                # A blank or trailing empty line carries no result row.
                continue
            rows_per_query[int(fields[0])].append(line)

    return [re_rank(rows) for rows in rows_per_query.values()]

In [27]:
# Re-rank every input file and write the result, under the same file name,
# into a sibling "<folder_path>-re-ranked" folder.
output_folder = f"{folder_path}-re-ranked"
# open(..., 'w') does not create missing directories, so make sure the
# destination exists before writing (no error if it is already there).
os.makedirs(output_folder, exist_ok=True)

for item in files_to_re_rank:
    path = os.path.join(folder_path, item)
    re_ranked = re_rank_file(path)

    output_path = os.path.join(output_folder, item)
    with open(output_path, 'w') as file:
        # One block of lines per query, each block terminated by a newline.
        for rows in re_ranked:
            sr = '\n'.join(map(str, rows))
            file.write(sr + "\n")
