In [1]:
import pandas as pd
from os import listdir

# Directory that holds the ranking CSV files read by the loading cell.
# The trailing slash is required: file paths are built by plain string
# concatenation (rankings_dir + file name), not with a path-joining helper.
rankings_dir = "rankings/"

In [63]:
# Load every ranking CSV found in `rankings_dir` and merge them into a single
# table with one row per (model, rank, song_id) evaluation.
EVAL_COLUMNS = ["model", "rank", "song_id", "internal"]
# Substrings of the file name that identify a file as coming from a group member
INTERNAL_RATERS = ("arnau", "dominik", "corina", "alex")

ranking_frames = []
# listdir() returns names in arbitrary order; sorting keeps the row order
# (and therefore the head() preview below) stable between runs.
for file_name in sorted(listdir(rankings_dir)):
    if not file_name.endswith(".csv"):
        continue
    # The files have no header row, so the column names are assigned here
    ranking = pd.read_csv(rankings_dir + file_name, header=None)
    ranking.columns = ["model", "rank", "song_id"]
    # set column internal to 1 if the file is from one member of the group
    ranking["internal"] = int(any(name in file_name for name in INTERNAL_RATERS))
    ranking_frames.append(ranking)

# Concatenate once instead of growing the frame inside the loop: this avoids
# quadratic copying and keeps `rank` / `internal` as integer columns rather
# than the object dtype inherited from an empty placeholder frame.
if ranking_frames:
    human_eval = pd.concat(ranking_frames, ignore_index=True)
else:
    # No CSV files found: fall back to an empty table with the expected columns
    human_eval = pd.DataFrame(columns=EVAL_COLUMNS)

N = len(human_eval)
n_internal = len(human_eval[human_eval["internal"] == 1])
n_external = len(human_eval[human_eval["internal"] == 0])
print("Number of evaluations: ", N)
print(" internal:", n_internal, "external:", n_external)
human_eval.head()

Number of evaluations:  243
 internal: 147 external: 96


Unnamed: 0,model,rank,song_id,internal
0,lstm_attn_noaug,3,ckOe-8qdaew,0
1,gpt2_enc_noaug,1,ckOe-8qdaew,0
2,gpt2_enc_chataug,2,ckOe-8qdaew,0
3,gpt2_enc_chataug,1,R_HAtyDbw1M,0
4,lstm_attn_noaug,3,R_HAtyDbw1M,0


In [68]:
# Create results table with the average rank and std for each model
results = human_eval.groupby(["model"]).agg({"rank": ["mean", "std"]}).round(3)

# Number of evaluations each model received. Used as the denominator for the
# normalised columns; for three equally evaluated models this equals N / 3,
# but it stays correct if the models were not evaluated equally often.
evals_per_model = human_eval.groupby(["model"]).size()

# Add counts of each model for each of the ranks 1, 2 and 3
rank_counts = {}
for rank_value in (1, 2, 3):
    counts = (
        human_eval[human_eval["rank"] == rank_value]
        .groupby(["model"])
        .size()
        # A model that never received this rank gets 0 instead of NaN
        .reindex(results.index, fill_value=0)
    )
    rank_counts[rank_value] = counts
    results[f"ranks_{rank_value}"] = counts

# Same but as a fraction of the model's evaluations (rounded to 3 decimals).
# Done in a second loop so the normalised columns come after all count columns.
for rank_value, counts in rank_counts.items():
    results[f"ranks_{rank_value}_norm"] = (counts / evals_per_model).round(3)

results

Unnamed: 0_level_0,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,mean,std,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
gpt2_enc_chataug,1.728,0.652,31,41,9,0.383,0.506,0.111
gpt2_enc_noaug,1.63,0.749,43,25,13,0.531,0.309,0.16
lstm_attn_noaug,2.642,0.639,7,15,59,0.086,0.185,0.728


In [70]:
# Same summary as the per-model table, but split by the `internal` flag so
# internal and external raters can be compared.
group_keys = ["model", "internal"]
results_ext_int = human_eval.groupby(group_keys).agg({"rank": ["mean", "std"]}).round(3)

# Total number of evaluations per model (internal + external together),
# repeated for every (model, internal) row of the table. The normalised
# columns are fractions of this model total, NOT of the rater group's size,
# so for each model they sum to 1 across both groups and all three ranks.
evals_per_model = human_eval.groupby(["model"]).size()
model_totals = (
    results_ext_int.index.get_level_values("model").map(evals_per_model).to_numpy()
)

# Counts of each rank per (model, internal) group
rank_counts = {}
for rank_value in (1, 2, 3):
    counts = (
        human_eval[human_eval["rank"] == rank_value]
        .groupby(group_keys)
        .size()
        # A group that never gave this rank to the model gets 0 instead of NaN
        .reindex(results_ext_int.index, fill_value=0)
    )
    rank_counts[rank_value] = counts
    results_ext_int[f"ranks_{rank_value}"] = counts

# Normalised counts (rounded to 3 decimals); second loop keeps these columns
# after all the raw count columns.
for rank_value, counts in rank_counts.items():
    results_ext_int[f"ranks_{rank_value}_norm"] = (counts / model_totals).round(3)

results_ext_int


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
model,internal,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gpt2_enc_chataug,0,1.75,0.762,14,12,6,0.173,0.148,0.074
gpt2_enc_chataug,1,1.714,0.577,17,29,3,0.21,0.358,0.037
gpt2_enc_noaug,0,1.719,0.729,14,13,5,0.173,0.16,0.062
gpt2_enc_noaug,1,1.571,0.764,29,12,8,0.358,0.148,0.099
lstm_attn_noaug,0,2.531,0.718,4,7,21,0.049,0.086,0.259
lstm_attn_noaug,1,2.714,0.577,3,8,38,0.037,0.099,0.469
