In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.integrate import simps
from sklearn.preprocessing import StandardScaler
import re
from pandas.plotting import table
import subprocess
import os

In [8]:
# Root folder of the local project checkout. The default is the original
# author's machine; set the RI_TREC_ROOT environment variable to run the
# notebook elsewhere without editing this cell.
_project_root = os.environ.get(
    "RI_TREC_ROOT",
    "/Users/simaonovais/Desktop/Mestrado/RI/Ri-parse_trec_eval/Ri-parse_trec_eval/",
).rstrip("/") + "/"

# Both paths keep their trailing slash so that plain string concatenation with
# a file name still yields a valid path.
# Folder holding the trec_eval output files (one file per model).
trec_outputs_folder_path = _project_root + "trec-models/"
# Folder under which the per-model plots are written.
trec_model_outputs_folder_path = _project_root + "Model-outputs/"

## DataFrame info: recall levels and trec_eval metric names


In [20]:
# Recall levels used as the x axis of the precision-recall curves.
recall_order = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Row labels of the interpolated-precision entries, one per recall level and in
# the same order as `recall_order` (two decimals: "iprec_at_recall_0.00", ...).
iprec_at_recallN = ["iprec_at_recall_{:.2f}".format(level) for level in recall_order]

# Row labels of the "P_k" entries for each cutoff k.
P_N = ["P_{}".format(cutoff) for cutoff in (5, 10, 15, 20, 30, 100, 200, 500, 1000)]

## Preprocess a trec_eval output DataFrame

In [2]:
def preprocess(df_):
    """Clean a raw trec_eval table and index it by (Id, Desc).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with the columns ``Desc``, ``Id`` and ``Values``. It is copied,
        never modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy: surrounding whitespace is removed from every string
        cell, ``Values`` strings that contain no letter are converted to
        ``float``, and the frame is indexed by a sorted ``(Id, Desc)``
        MultiIndex.

    Raises
    ------
    ValueError
        If a ``Values`` string contains no letter but still is not a valid
        number (for example an empty string).
    """
    def _strip(value):
        # Only strings carry padding; numbers and NaN pass through untouched
        # (Series.str.strip would turn them into NaN).
        return value.strip() if isinstance(value, str) else value

    def _to_number(value):
        # A string without any letter is treated as a number. Everything else
        # (text values, already-parsed numbers, NaN) is returned unchanged, so
        # a numeric or partly-missing column no longer breaks the regex.
        if isinstance(value, str) and re.search('[a-zA-Z]', value) is None:
            return float(value)
        return value

    df_cpy = df_.copy()

    # remove whitespaces that came from trec_eval
    for column in df_cpy.columns:
        dtype = df_cpy[column].dtype
        # Cover plain object columns as well as dedicated string dtypes.
        if dtype == object or pd.api.types.is_string_dtype(dtype):
            df_cpy[column] = df_cpy[column].astype(object).map(_strip)

    # convert the number values that are represented as strings to floats
    df_cpy["Values"] = df_cpy["Values"].astype(object).map(_to_number)

    # make the dataframe a double index
    df_cpy = df_cpy.set_index(['Id', 'Desc'])
    # sorting the MultiIndex keeps .loc lookups on both levels efficient
    df_cpy = df_cpy.sort_index()
    return df_cpy

## Plot (and optionally save) the precision-recall curves

In [89]:
def plot_prec_recall(df_,save_path,model,save=False,showplot=True):
    """Plot the interpolated precision-recall curve of every query in ``df_``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame indexed by a two-level MultiIndex whose first level is the query
        id, with a ``Values`` column. For every query id it must contain the
        rows listed in ``iprec_at_recallN``. The frame is only read.
    save_path : str
        Directory that receives ``map_plot.png`` when ``save`` is true; it is
        created if it does not exist yet.
    model : str
        Text drawn inside the axes at data coordinates (0.7, 1).
    save : bool, default False
        Write the figure to ``save_path`` as ``map_plot.png``.
    showplot : bool, default True
        Add the legend and display the figure. When false the figure is only
        (optionally) saved, without a legend.

    Notes
    -----
    The figure is created with its own size, so the global matplotlib
    ``rcParams`` are left untouched, and it is always closed before returning.
    """
    idx = pd.IndexSlice

    # MultiIndex levels may still list ids that were filtered out of the frame;
    # drop those so every id looked up below really has rows.
    query_ids = df_.index.remove_unused_levels().levels[0]

    fig, ax = plt.subplots(figsize=(16, 14))
    for query_id in query_ids:
        precision = df_.loc[idx[query_id, iprec_at_recallN], 'Values'].values
        # The 'all' row is drawn much thicker so it stands out from the
        # individual queries.
        line_style = {"linewidth": 8} if query_id == 'all' else {}
        ax.plot(recall_order, precision, label="QueryId {}".format(query_id), **line_style)

    ax.text(0.7, 1, model)
    ax.set_title("Precision-recall curves for all queries")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")

    if showplot:
        ax.legend()
    if save:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(os.path.join(save_path, "map_plot") + ".png", bbox_inches="tight")
    if showplot:
        # Show only after saving: some backends clear the figure once shown.
        plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

## Fetch all trec_output files from a directory

In [84]:
def get_all_trec_outputs(path):
    """List the non-hidden entries of a folder.

    Parameters
    ----------
    path : str
        Path of the folder where all the trec outputs are.

    Returns
    -------
    list of str
        The entry names (not full paths) that do not start with a dot, in the
        order reported by ``os.listdir``.
    """
    visible_entries = []
    for entry in os.listdir(path):
        # Skip dot-files such as .DS_Store or .ipynb_checkpoints.
        if not entry.startswith('.'):
            visible_entries.append(entry)
    return visible_entries

## Load dataframe

In [23]:
def build_df(path):
    """Read one tab-separated trec_eval output file and preprocess it.

    The file is read without a header row; its three columns are named
    ``Desc``, ``Id`` and ``Values`` before the frame is passed to
    ``preprocess``, whose result is returned.
    """
    column_names = ["Desc", "Id", "Values"]
    raw_table = pd.read_csv(path, sep='\t', header=None, names=column_names)
    return preprocess(raw_table)

## Create and save precision-recall plots

In [92]:
def create_prec_recall_plots(trec_output_path,plot_save_output_path):
    """Create and save one precision-recall plot per trec_eval output file.

    Parameters
    ----------
    trec_output_path : str
        Folder containing the trec_eval output files; every non-hidden entry
        is loaded with ``build_df``.
    plot_save_output_path : str
        Folder under which one sub-folder per input file is created. The
        sub-folder is named after the file without its extension and receives
        the plot written by ``plot_prec_recall``.
    """
    for file in get_all_trec_outputs(trec_output_path):
        # Use the function arguments rather than the notebook-level globals so
        # the function works for any pair of folders.
        df = build_df(os.path.join(trec_output_path, file))

        # The file name without its extension serves both as the text drawn on
        # the plot and as the name of the output sub-folder.
        model_name = os.path.splitext(file)[0]
        save_path = os.path.join(plot_save_output_path, model_name)
        os.makedirs(save_path, exist_ok=True)

        # showplot=False: the figure is only written to disk, without a legend.
        plot_prec_recall(df, save_path, model_name, True, showplot=False)

# Generate the plots for the folders configured at the top of the notebook.
create_prec_recall_plots(trec_outputs_folder_path,trec_model_outputs_folder_path)

## Testing

In [90]:
# Scratch example, independent of the trec_eval data: 3-sigma outlier filtering
# on synthetic, normally distributed values. No trec_eval file is loaded here;
# build_df needs a path and this example only uses the generated frame.
# A fixed seed makes the cell reproducible without touching numpy's global
# random state.
df = pd.DataFrame({'Data': np.random.RandomState(42).normal(size=200)})  # example dataset of normally distributed data

deviation = np.abs(df.Data - df.Data.mean())
threshold = 3 * df.Data.std()

# keep only the rows that are within +3 to -3 standard deviations in the column 'Data'
within_3_sigma = df[deviation <= threshold]
# or, if you prefer the other way around: drop the rows that are further away
within_3_sigma_alt = df[~(deviation > threshold)]

within_3_sigma