In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.integrate import simps
from sklearn.preprocessing import StandardScaler
import re
from pandas.plotting import table
import subprocess
import os

In [2]:
# Folder that is scanned for trec output files (handed to create_prec_recall_plots).
trec_outputs_folder_path = 'trec-models/'

# Root folder that receives one sub-folder of plots/stats per trec output file.
trec_model_outputs_folder_path = 'Model-outputs/'

# Queries file -- presumably the tab-separated file read by build_queries_df; verify against caller.
queries_filename = 'Queries/queries_offline.txt'

## DataFrame info: recall levels and trec_eval measure names used as row labels


In [3]:
# Recall levels, in the order used for the x axis of the precision-recall plots.
recall_order = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# One "iprec_at_recall_X.XX" label per recall level, derived from recall_order so the
# two lists always stay aligned.
iprec_at_recallN = ["iprec_at_recall_{:.2f}".format(level) for level in recall_order]

# "P_<k>" labels for the precision-at-k cut-offs.
P_N = ["P_{}".format(cutoff) for cutoff in (5, 10, 15, 20, 30, 100, 200, 500, 1000)]

## Preprocess a trec_eval output DataFrame

In [4]:
def preprocess(df_):
    """Clean a raw trec_eval output frame and index it by (Id, Desc).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with the columns ``Desc``, ``Id`` and ``Values``. It is copied,
        never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` in which text cells are stripped of surrounding
        whitespace, letter-free ``Values`` strings are converted to ``float``
        (everything else is kept as is), and the rows are indexed by a sorted
        ``(Id, Desc)`` MultiIndex.
    """
    def _is_text_column(column):
        # Covers both the classic object dtype and pandas' dedicated string dtypes.
        return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)

    def _strip(value):
        # Only strings carry padding; numbers and missing values pass through untouched.
        return value.strip() if isinstance(value, str) else value

    def _to_number(value):
        # Cells that are not strings (already-parsed numbers, NaN) need no conversion.
        if not isinstance(value, str):
            return value
        # Anything containing a letter is kept as text.
        if re.search('[a-zA-Z]', value) is not None:
            return value
        try:
            return float(value)
        except ValueError:
            # Letter-free but still not a number (e.g. "2019-05" or ""): keep the text.
            return value

    df_cpy = df_.copy()
    # remove whitespaces that came from trec_eval
    df_cpy = df_cpy.apply(lambda col: col.map(_strip) if _is_text_column(col) else col)
    # convert the number values that are represented as strings to be represented as floats
    df_cpy["Values"] = df_cpy["Values"].apply(_to_number)

    # make the dataframe a double index; sort it so MultiIndex slicing works
    df_cpy = df_cpy.set_index(['Id', 'Desc'])
    df_cpy = df_cpy.sort_index()
    return df_cpy

## Plot (and optionally save) the precision-recall curves

In [5]:
def plot_prec_recall(df_,save_path,filename,model,save=False,showplot=True):
    """Draw one precision-recall curve per query id on a single figure.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame indexed by (Id, Desc) with a ``Values`` column; for every Id the
        rows labelled by ``iprec_at_recallN`` are plotted against ``recall_order``.
    save_path, filename : str
        When ``save`` is true the figure is written to
        ``os.path.join(save_path, filename) + ".png"``.
    model : str
        Text placed on the plot at data coordinates (0.7, 1).
    save : bool
        Whether to write the PNG.
    showplot : bool
        Despite its name, this flag only controls whether the legend is drawn.

    Side effects: sets the global ``figure.figsize`` rcParam, closes the figure
    when done and prints ``Ok``.
    """
    idx = pd.IndexSlice

    frame = df_.copy()
    # every query id present in the frame, in order of first appearance
    query_ids = frame.reset_index()["Id"].unique()

    plt.rcParams["figure.figsize"] = (16,14)
    plt.figure(1)

    for query_id in query_ids:
        precisions = frame.loc[idx[query_id, iprec_at_recallN], 'Values'].values
        # the 'all' row is drawn with a thick line so it stands out from single queries
        emphasis = {"linewidth": 8} if query_id == 'all' else {}
        plt.plot(recall_order, precisions, label="QueryId {}".format(query_id), **emphasis)

    plt.text(0.7, 1, model)
    plt.title("Precision-recall curves for all queries")
    plt.xlabel("Recall")
    plt.ylabel("Precision")

    if showplot:
        plt.legend()
    if save:
        target = os.path.join(save_path, filename) + ".png"
        plt.savefig(target, bbox_inches="tight")

    plt.close()
    print("Ok")

## Fetch all trec_output files from a directory

In [6]:
# inputs: path of the folder that holds the trec outputs
# outputs: list with the names of all entries in that folder that do not start with '.'
def get_all_trec_outputs(path):
    """Return the names of the entries in ``path``, skipping dot-prefixed ones."""
    visible_entries = []
    for entry in os.listdir(path):
        if not entry.startswith('.'):
            visible_entries.append(entry)
    return visible_entries

## Load dataframe

In [7]:
def build_df(path):
    """Read a tab-separated, header-less trec output file and preprocess it.

    The three columns are named ``Desc``, ``Id`` and ``Values`` (in file order)
    before the frame is handed to ``preprocess``, whose result is returned.
    """
    column_names = ["Desc", "Id", "Values"]
    raw = pd.read_csv(path, sep='\t', header=None, names=column_names)
    return preprocess(raw)

## build dataframe of queries

In [8]:
def build_queries_df(queries_filename):
    """Load the queries file into a frame indexed by query id.

    Parameters
    ----------
    queries_filename : str
        Path of a tab-separated, header-less file with two columns: the query
        id and the query text.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``query`` column, indexed by ``Id`` (as text).
    """
    # Read the ids as text from the start: letting pandas parse them as integers and
    # converting back to str afterwards would silently turn "001" into "1".
    q_df = pd.read_csv(
        queries_filename,
        delimiter="\t",
        header=None,
        names=["Id", "query"],
        dtype={"Id": str},
    )
    # drop padding around the ids so they compare equal to stripped ids elsewhere
    q_df["Id"] = q_df["Id"].str.strip()
    return q_df.set_index("Id")

## Remove outlier queries (by average precision) from the DataFrame

In [9]:
def rem_outliers_by_ap(df_, threshold=-0.7):
    """Drop the queries whose standardized AP falls below ``threshold``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame indexed by (Id, Desc) with a ``Values`` column; the rows whose
        ``Desc`` is ``"map"`` provide the AP value of each Id. Not modified.
    threshold : float, optional
        Cut-off applied to the standardized AP, ``(AP - mean) / std``. Ids whose
        score is strictly below it are removed. Defaults to -0.7.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_`` that belong neither to an outlier Id nor to the
        ``"all"`` Id, in their original order.
    """
    idx = pd.IndexSlice

    # One row per Id holding its "map" value, renamed to AP.
    ap_scores = (
        df_.loc[idx[:, "map"], :]
        .reset_index()
        .drop("Desc", axis=1)
        .set_index("Id")
        .rename(columns={"Values": "AP"})
    )

    # NOTE(review): if df_ contains an "all" row it takes part in the mean/std below,
    # exactly as before -- confirm whether it should be excluded from the statistics.
    standardized = (ap_scores - ap_scores.mean()) / ap_scores.std()
    ids_to_drop = set(standardized.index[standardized["AP"] < threshold])
    # also remove the "all" index since it doesn't anymore represent the mean
    ids_to_drop.add("all")

    # Filter on the Ids that are actually present; index.levels may still list
    # labels that an earlier filtering step already removed.
    keep_mask = ~df_.index.get_level_values(0).isin(ids_to_drop)
    return df_[keep_mask]

## create and save prec_recall plots

In [10]:
def rem_ext(file):
    """Return ``file`` cut off at the first occurrence of ``".txt"``.

    Names that do not contain ``".txt"`` are returned unchanged.
    """
    cut = file.find(".txt")
    # find() returns -1 when ".txt" is absent; slicing with -1 would drop the last character.
    if cut == -1:
        return file
    return file[:cut]

In [11]:
def create_prec_recall_plots(trec_output_path,save_output_path,filename):
    """Create precision-recall plots and summary stats for every trec output file.

    For each non-hidden file in ``trec_output_path`` a folder
    ``save_output_path/<file name without .txt>`` is created and filled with:

    * ``<filename>.png`` and ``stats.csv``      -- computed on the full frame
      (stats are the ``P_10`` and ``map`` values of the ``"all"`` row);
    * ``<filename>_o.png`` and ``stats_o.csv``  -- computed after
      ``rem_outliers_by_ap`` (stats are the means of the remaining ``P_10`` and
      ``map`` rows).

    Parameters
    ----------
    trec_output_path : str
        Folder containing the trec output files.
    save_output_path : str
        Root folder for the generated plots and stats; created if missing.
    filename : str
        Base name (without extension) of the plot files.
    """
    idx = pd.IndexSlice

    os.makedirs(save_output_path, exist_ok=True)

    for file in get_all_trec_outputs(trec_output_path):
        # Read from the folder that was listed, not from the module-level default path.
        df = build_df(os.path.join(trec_output_path,file))

        model = rem_ext(file)
        save_path = os.path.join(save_output_path,model)
        os.makedirs(save_path, exist_ok=True)

        #normal df
        plot_prec_recall(df,save_path,filename,model,True,showplot=False)
        stats = pd.DataFrame({"P_10":df.loc[idx["all","P_10"],],"map":df.loc[idx["all","map"]]})
        stats.to_csv(os.path.join(save_path,"stats.csv"),index=False)

        #df without outliers by AP
        df_o_ap = rem_outliers_by_ap(df)
        plot_prec_recall(df_o_ap,save_path,filename+"_o",model,True,showplot=True)
        stats = pd.DataFrame({"P_10":df_o_ap.loc[idx[:,"P_10"],].mean(),"map":df_o_ap.loc[idx[:,"map"],].mean()})
        stats.to_csv(os.path.join(save_path,"stats_o.csv"),index=False)

# Generate the plots and stats for every file in the configured trec outputs folder.
create_prec_recall_plots(
    trec_output_path=trec_outputs_folder_path,
    save_output_path=trec_model_outputs_folder_path,
    filename="prec_recall",
)

Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
