In [43]:
import pandas as pd
from os import listdir

# Directory scanned for the ranking CSV files of the full-caption evaluation.
# The trailing slash is required: file paths are built below by plain string
# concatenation (directory + file name), not with a path-joining helper.
rankings_dir = "rankings/"
# Directory scanned for the ranking CSV files of the summarized evaluation;
# the same trailing-slash requirement applies.
summarized_rankings_dir = "summarised_rankings/"

## Full captions

In [44]:
# Read every ranking CSV in `rankings_dir` and stack them into one table.
# Each file is header-less with three columns: model, rank, song_id.
# A file counts as "internal" (1) when its name contains the name of one of
# the group members, otherwise it is "external" (0).
internal_raters = ("arnau", "dominik", "corina", "alex")

# Collect the per-file frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with
# an empty object-dtype frame can degrade the numeric columns to object.
rater_frames = []
# listdir() returns entries in arbitrary order; sort so the row order (and
# therefore the preview below) is reproducible.
for f in sorted(listdir(rankings_dir)):
    if not f.endswith(".csv"):
        continue
    df = pd.read_csv(rankings_dir + f, header=None)
    df.columns = ["model", "rank", "song_id"]
    df["internal"] = int(any(name in f for name in internal_raters))
    rater_frames.append(df)

if rater_frames:
    # ignore_index avoids duplicated row labels coming from the individual files
    human_eval = pd.concat(rater_frames, ignore_index=True)
else:
    # No CSV found: keep an empty table with the expected schema
    human_eval = pd.DataFrame(columns=["model", "rank", "song_id", "internal"])

# NOTE: the name `N` is reassigned further down when the summarized rankings
# are loaded, so it is only valid for this section until that cell runs.
N = len(human_eval)
n_internal = int((human_eval["internal"] == 1).sum())
n_external = int((human_eval["internal"] == 0).sum())
# Every song is ranked for 3 models, hence N//3 ranked songs
print("Number of evaluations: ", N, "number of songs:", N//3)
print(" internal:", n_internal, "external:", n_external)
human_eval.head()

Number of evaluations:  540 number of songs: 180
 internal: 393 external: 147


Unnamed: 0,model,rank,song_id,internal
0,lstm_attn_noaug,3,ckOe-8qdaew,0
1,gpt2_enc_noaug,1,ckOe-8qdaew,0
2,gpt2_enc_chataug,2,ckOe-8qdaew,0
3,gpt2_enc_chataug,1,R_HAtyDbw1M,0
4,lstm_attn_noaug,3,R_HAtyDbw1M,0


Be careful when interpreting means and standard deviations of ordinal data such as ranks: https://www.researchgate.net/post/How-to-calculate-mean-and-standard-deviation-for-likert-scale-and-how-to-analyze-that-data-to-write-in-research-paper

In [45]:
# Per-model summary of the ranks: mean, std and median, followed by how often
# each model was placed 1st, 2nd and 3rd (absolute counts and fractions).
# Ranks are ordinal, so the mean/std columns should be read with care.
results = human_eval.groupby(["model"]).agg({"rank": ["mean", "std", "median"]}).round(3)

# Number of rankings each model received. Used as the denominator below so the
# fractions do not depend on the notebook-level `N` (which is reassigned when
# the summarized rankings are loaded) nor on every model having exactly N/3 rows.
evals_per_model = human_eval.groupby(["model"]).size()

# Counts of each model for each of the ranks 1, 2 and 3.
# reindex(..., fill_value=0) makes a model that never got a given rank show 0
# instead of NaN (which would also turn the column into floats).
rank_counts = {}
for rank in (1, 2, 3):
    rank_counts[rank] = (
        human_eval[human_eval["rank"] == rank]
        .groupby(["model"])
        .size()
        .reindex(results.index, fill_value=0)
    )
    results["ranks_{}".format(rank)] = rank_counts[rank]

# Same but normalised by the number of rankings of the model (rounded to 3 decimals)
for rank, counts in rank_counts.items():
    results["ranks_{}_norm".format(rank)] = (counts / evals_per_model).round(3)

results

Unnamed: 0_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,mean,std,median,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gpt2_enc_chataug,1.761,0.712,2.0,72,79,29,0.4,0.439,0.161
gpt2_enc_noaug,1.744,0.799,2.0,86,54,40,0.478,0.3,0.222
lstm_attn_noaug,2.494,0.705,3.0,22,47,111,0.122,0.261,0.617


In [46]:
# Same summary as the per-model table, but split further by rater group
# (`internal` == 1 for group members, 0 for external raters).
group_keys = ["model", "internal"]
results_ext_int = human_eval.groupby(group_keys).agg({"rank": ["mean", "std", "median"]}).round(3)

# Number of rankings in each (model, internal) group: the normalisation denominator
sizes = human_eval.groupby(group_keys).size()

# Absolute counts per rank. A group with no rows at a given rank gets 0
# rather than NaN thanks to reindex(..., fill_value=0).
rank_counts = {}
for rank in (1, 2, 3):
    rank_counts[rank] = (
        human_eval[human_eval["rank"] == rank]
        .groupby(group_keys)
        .size()
        .reindex(results_ext_int.index, fill_value=0)
    )
    results_ext_int["ranks_{}".format(rank)] = rank_counts[rank]

# Fractions of the group's rankings that fall on each rank (rounded to 3 decimals)
for rank, counts in rank_counts.items():
    results_ext_int["ranks_{}_norm".format(rank)] = (counts / sizes).round(3)

results_ext_int


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_3,ranks_1_norm,ranks_2_norm,ranks_3_norm
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,median,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
model,internal,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gpt2_enc_chataug,0,1.735,0.785,2.0,23,16,10,0.469,0.327,0.204
gpt2_enc_chataug,1,1.771,0.686,2.0,49,63,19,0.374,0.481,0.145
gpt2_enc_noaug,0,1.735,0.73,2.0,21,20,8,0.429,0.408,0.163
gpt2_enc_noaug,1,1.748,0.826,2.0,65,34,32,0.496,0.26,0.244
lstm_attn_noaug,0,2.531,0.68,3.0,5,13,31,0.102,0.265,0.633
lstm_attn_noaug,1,2.481,0.716,3.0,17,34,80,0.13,0.26,0.611


## Summarized

In [47]:
# Read every ranking CSV in `summarized_rankings_dir` and stack them into one table.
# Each file is header-less with two columns: model, rank.
# A file counts as "internal" (1) when its name contains the name of one of
# the group members, otherwise it is "external" (0).
internal_raters = ("arnau", "dominik", "corina", "alex")

# Collect the per-file frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with
# an empty object-dtype frame can degrade the numeric columns to object.
rater_frames = []
# listdir() returns entries in arbitrary order; sort so the row order (and
# therefore the preview below) is reproducible.
for f in sorted(listdir(summarized_rankings_dir)):
    if not f.endswith(".csv"):
        continue
    df = pd.read_csv(summarized_rankings_dir + f, header=None)
    df.columns = ["model", "rank"]
    df["internal"] = int(any(name in f for name in internal_raters))
    rater_frames.append(df)

if rater_frames:
    # ignore_index avoids duplicated row labels coming from the individual files
    summ_human_eval = pd.concat(rater_frames, ignore_index=True)
else:
    # No CSV found: keep an empty table with the expected schema
    summ_human_eval = pd.DataFrame(columns=["model", "rank", "internal"])

# NOTE: this reassigns `N`, which the full-caption section also uses for its
# own row count; from here on `N` refers to the summarized rankings.
N = len(summ_human_eval)
n_internal = int((summ_human_eval["internal"] == 1).sum())
n_external = int((summ_human_eval["internal"] == 0).sum())
# Every song is ranked for 2 models, hence N//2 ranked songs
print("Number of evaluations:", N, "number of songs:", N//2)
print(" internal:", n_internal, "external:", n_external)
summ_human_eval.head()

Number of evaluations: 152 number of songs: 76
 internal: 136 external: 16


Unnamed: 0,model,rank,internal
0,lstm,2,1
1,gpt2,1,1
2,gpt2,2,1
3,lstm,1,1
4,lstm,1,1


In [48]:
# Per-model summary of the summarized rankings: mean, std and median of the
# rank, followed by how often each model was placed 1st and 2nd (absolute
# counts and fractions). Ranks are ordinal, so read mean/std with care.
summ_results = summ_human_eval.groupby(["model"]).agg({"rank": ["mean", "std", "median"]}).round(3)

# Number of rankings each model received. Used as the denominator below so the
# fractions do not depend on the notebook-level `N` (a name shared with the
# full-caption section) nor on every model having exactly N/2 rows.
summ_evals_per_model = summ_human_eval.groupby(["model"]).size()

# Counts of each model for each of the ranks 1, 2.
# reindex(..., fill_value=0) makes a model that never got a given rank show 0
# instead of NaN (which would also turn the column into floats).
summ_rank_counts = {}
for rank in (1, 2):
    summ_rank_counts[rank] = (
        summ_human_eval[summ_human_eval["rank"] == rank]
        .groupby(["model"])
        .size()
        .reindex(summ_results.index, fill_value=0)
    )
    summ_results["ranks_{}".format(rank)] = summ_rank_counts[rank]

# Same but normalised by the number of rankings of the model (rounded to 3 decimals)
for rank, counts in summ_rank_counts.items():
    summ_results["ranks_{}_norm".format(rank)] = (counts / summ_evals_per_model).round(3)

summ_results

Unnamed: 0_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_1_norm,ranks_2_norm
Unnamed: 0_level_1,mean,std,median,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
gpt2,1.289,0.457,1.0,54,22,0.711,0.289
lstm,1.711,0.457,2.0,22,54,0.289,0.711


In [49]:
# Same summary as the per-model table of the summarized rankings, but split
# further by rater group (`internal` == 1 for group members, 0 for external).
group_keys = ["model", "internal"]
summ_results_ext_int = summ_human_eval.groupby(group_keys).agg({"rank": ["mean", "std", "median"]}).round(3)

# Number of rankings in each (model, internal) group: the normalisation denominator
summ_sizes = summ_human_eval.groupby(group_keys).size()

# Absolute counts per rank. Groups can be small, so a group may have no rows
# at a given rank; reindex(..., fill_value=0) reports 0 instead of NaN then.
summ_rank_counts = {}
for rank in (1, 2):
    summ_rank_counts[rank] = (
        summ_human_eval[summ_human_eval["rank"] == rank]
        .groupby(group_keys)
        .size()
        .reindex(summ_results_ext_int.index, fill_value=0)
    )
    summ_results_ext_int["ranks_{}".format(rank)] = summ_rank_counts[rank]

# Fractions of the group's rankings that fall on each rank (rounded to 3 decimals)
for rank, counts in summ_rank_counts.items():
    summ_results_ext_int["ranks_{}_norm".format(rank)] = (counts / summ_sizes).round(3)

summ_results_ext_int


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,rank,rank,ranks_1,ranks_2,ranks_1_norm,ranks_2_norm
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,median,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
model,internal,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
gpt2,0,1.5,0.535,1.5,4,4,0.5,0.5
gpt2,1,1.265,0.444,1.0,50,18,0.735,0.265
lstm,0,1.5,0.535,1.5,4,4,0.5,0.5
lstm,1,1.735,0.444,2.0,18,50,0.265,0.735
