# Imports

In [1]:
import collections
import pandas as pd
from nltk import agreement
import scipy.stats as stats

import vrme_lib

# Performance

In [11]:
# Load the labelled evaluation set and each annotator's ratings, then score
# every title-generation method and test the baselines against VRM-E.
df_labeled = pd.read_csv('../data/evaulation_data/evaluation_set_with_labels_DONT_LOOK.csv')

# One frame per annotator, keyed by initials; only the join key and the rating are kept.
rating_dfs = {
  initials: pd.read_csv(f'../data/evaulation_data/Title_Evaluation_Final_{initials}.csv')[['unique_id', 'Rating']]
  for initials in "RZ KK NK".split()
}

# The merge order matters: with pandas' default suffixes the second merge renames
# the colliding columns to 'Rating_x' (RZ) and 'Rating_y' (NK), and the third
# merge then adds KK's column as plain 'Rating'.
# NOTE(review): this assumes df_labeled has no 'Rating' column of its own - confirm.
df_label_rated = (
  df_labeled
  .merge(rating_dfs['RZ'], on='unique_id')
  .merge(rating_dfs['NK'], on='unique_id')
  .merge(rating_dfs['KK'], on='unique_id')
)
df_label_rated['rating_sum'] = df_label_rated[['Rating_x', 'Rating_y', 'Rating']].sum(axis=1)

# Each row's share of all rating points awarded. The denominator is derived from
# the data instead of the previously hard-coded 303 ("300 + 3 ties"), so the
# percentages stay correct if the evaluation files change.
total_ratings = df_label_rated['rating_sum'].sum()
df_label_rated['rating_percentage'] = df_label_rated['rating_sum'] / total_ratings
df_performance = df_label_rated[['method', 'rating_sum', 'rating_percentage']].groupby('method').sum()

# VRM-E was called KNN in the annotation process, so relabel it for display.
for row in df_performance.itertuples():
  print(row.Index.replace("KNN", 'VRM-E'), row.rating_sum, f'{row.rating_percentage:.2%}')
print()

# Per-method list of summed ratings, in the row order of df_label_rated.
method_lists = {
  method: df_label_rated.loc[df_label_rated['method'] == method, 'rating_sum'].tolist()
  for method in 'SPSM KNN naive'.split()
}

# Paired t-tests of each baseline against VRM-E (labelled KNN in the data).
# NOTE(review): ttest_rel pairs observations by position, so this presumes the
# rows of every method appear in the same item order - confirm against the data.
naive_vrme_ttest = stats.ttest_rel(method_lists['naive'], method_lists['KNN'])
spsm_vrme_ttest = stats.ttest_rel(method_lists['SPSM'], method_lists['KNN'])
vrme_lib.print_ttest("naive", naive_vrme_ttest)
vrme_lib.print_ttest("SPSM", spsm_vrme_ttest)

VRM-E 212 69.97%
SPSM 53 17.49%
naive 38 12.54%

T-Test: naive vs VRM-E
statistic: -10.75
p-value: 2.5121072302919953e-18

T-Test: SPSM vs VRM-E
statistic: -8.733
p-value: 6.306465360934115e-14



# Inter-rater reliability

In [12]:
# Collect each annotator's ratings as plain lists, in the row order of df_label_rated.
rater1 = df_label_rated['Rating_x'].tolist()
rater2 = df_label_rated['Rating_y'].tolist()
rater3 = df_label_rated['Rating'].tolist()

# AnnotationTask takes (coder, item, label) triples. Emit all of coder 0's
# triples first, then coder 1's, then coder 2's; item ids and labels are strings.
taskdata = [
  [coder, str(item), str(label)]
  for coder, labels in enumerate((rater1, rater2, rater3))
  for item, label in enumerate(labels)
]
ratingtask = agreement.AnnotationTask(data=taskdata)

# Agreement coefficients reported as Table 8, each to four significant digits.
print('Table 8')
print(f'kappa {ratingtask.kappa():.4}')
print(f'fleiss {ratingtask.multi_kappa():.4}')
print(f'alpha {ratingtask.alpha():.4}')
print(f'scotts {ratingtask.pi():.4}')

Table 8
kappa 0.5621
fleiss 0.5622
alpha 0.5627
scotts 0.5622
