In [72]:
import pandas as pd
from os import listdir

# Directory that is scanned for the ranking CSV files. Keep the trailing slash:
# file paths are built by plain string concatenation (rankings_dir + filename).
rankings_dir = "rankings/"

In [80]:
# Read every ranking CSV found in `rankings_dir` and merge them into a single
# frame. The files are read without a header row and are given the column
# names model, rank, song_id; an `internal` flag is added per file.
human_eval_columns = ["model", "rank", "song_id", "internal"]
# File-name fragments that mark a file as coming from one member of the group
# (matched as case-sensitive substrings of the file name).
internal_markers = ("arnau", "dominik", "corina", "alex")

ranking_frames = []
# listdir() returns entries in arbitrary order; sorting makes the merged frame
# (and therefore head()) identical on every run.
for f in sorted(listdir(rankings_dir)):
    if not f.endswith(".csv"):
        continue
    df = pd.read_csv(rankings_dir + f, header=None)
    df.columns = human_eval_columns[:3]
    # set column internal to 1 if the file is from one member of the group
    df["internal"] = int(any(marker in f for marker in internal_markers))
    ranking_frames.append(df)

# Concatenate once instead of growing the frame inside the loop: this avoids
# quadratic copying, keeps the dtypes read from the CSVs (no empty object-typed
# seed frame takes part in the concat) and yields a unique 0..N-1 index.
if ranking_frames:
    human_eval = pd.concat(ranking_frames, ignore_index=True)
else:
    # No CSV files found: keep an empty frame with the expected columns.
    human_eval = pd.DataFrame(columns=human_eval_columns)

N = len(human_eval)
print("Number of evaluations: ", N)
n_internal = int((human_eval["internal"] == 1).sum())
n_external = int((human_eval["internal"] == 0).sum())
print(" internal:", n_internal, "external:", n_external)
human_eval.head()

Number of evaluations:  267
 internal: 144 external: 123


Unnamed: 0,model,rank,song_id,internal
0,lstm_attn_noaug,3,ckOe-8qdaew,0
1,gpt2_enc_noaug,1,ckOe-8qdaew,0
2,gpt2_enc_chataug,2,ckOe-8qdaew,0
3,gpt2_enc_chataug,1,R_HAtyDbw1M,0
4,lstm_attn_noaug,3,R_HAtyDbw1M,0


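**Note:** be careful when interpreting means and standard deviations of ordinal data such as ranks; the median and the per-rank counts reported below are more appropriate summaries. See: https://www.researchgate.net/post/How-to-calculate-mean-and-standard-deviation-for-likert-scale-and-how-to-analyze-that-data-to-write-in-research-paper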

In [87]:
# Create results table with the average rank, std and median for each model
results = human_eval.groupby(["model"]).agg({"rank": ["mean", "std", "median"]}).round(3)

# Count how often each model received each of the ranks 1, 2 and 3.
# fill_value=0 makes a model that never received a given rank show 0 instead
# of NaN; the reindex guarantees one row per model and one column per rank.
rank_values = [1, 2, 3]
rank_counts_by_model = (
    human_eval.groupby(["model", "rank"])
    .size()
    .unstack("rank", fill_value=0)
    .reindex(index=results.index, columns=rank_values, fill_value=0)
)
# Number of rankings each model received. Dividing by this instead of the
# hard-coded N/3 stays correct when the number of models is not three or when
# the models were not ranked equally often.
rankings_per_model = human_eval.groupby(["model"]).size()

# Add counts of each model for each of the ranks 1, 2 and 3
for rank_value in rank_values:
    results[f"ranks_{rank_value}"] = rank_counts_by_model[rank_value]
# Same but normalised by the number of rankings of the model (3 decimals)
for rank_value in rank_values:
    results[f"ranks_{rank_value}_norm"] = (
        rank_counts_by_model[rank_value] / rankings_per_model
    ).round(3)

results

Unnamed: 0_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,mean,std,median,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gpt2_enc_chataug,1.697,0.664,2.0,37,42,10,0.416,0.472,0.112
gpt2_enc_noaug,1.674,0.75,2.0,44,30,15,0.494,0.337,0.169
lstm_attn_noaug,2.629,0.646,3.0,8,17,64,0.09,0.191,0.719


In [86]:
# Same summary as the per-model table, but split by external (0) / internal (1)
ext_int_keys = ["model", "internal"]
results_ext_int = human_eval.groupby(ext_int_keys).agg({"rank": ["mean", "std", "median"]}).round(3)

# Count of ranks 1, 2 and 3 per (model, internal) group. fill_value=0 makes a
# group that never received a given rank show 0 instead of NaN.
ext_int_rank_values = [1, 2, 3]
rank_counts_ext_int = (
    human_eval.groupby(ext_int_keys + ["rank"])
    .size()
    .unstack("rank", fill_value=0)
    .reindex(index=results_ext_int.index, columns=ext_int_rank_values, fill_value=0)
)
# Number of rankings each model received over both groups; this replaces the
# hard-coded N/3, which is only right for three equally often ranked models.
rankings_per_model = human_eval.groupby(["model"]).size()

for rank_value in ext_int_rank_values:
    results_ext_int[f"ranks_{rank_value}"] = rank_counts_ext_int[rank_value]
# Normalised by ALL rankings of the model (internal + external), so the values
# of one (model, internal) row do not sum to 1; the internal and external rows
# of a model together do.
for rank_value in ext_int_rank_values:
    results_ext_int[f"ranks_{rank_value}_norm"] = (
        rank_counts_ext_int[rank_value].div(rankings_per_model, level="model").round(3)
    )
results_ext_int


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,median,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
model,internal,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gpt2_enc_chataug,0,1.707,0.782,2.0,20,13,8,0.225,0.146,0.09
gpt2_enc_chataug,1,1.688,0.552,2.0,17,29,2,0.191,0.326,0.022
gpt2_enc_noaug,0,1.78,0.725,2.0,16,18,7,0.18,0.202,0.079
gpt2_enc_noaug,1,1.583,0.767,1.0,28,12,8,0.315,0.135,0.09
lstm_attn_noaug,0,2.512,0.711,3.0,5,10,26,0.056,0.112,0.292
lstm_attn_noaug,1,2.729,0.574,3.0,3,7,38,0.034,0.079,0.427
