In [1]:
import krippendorff
import glob
import numpy as np
import pandas as pd  
import IPython

In [2]:
df_kappa_alpha = pd.DataFrame(columns=['Coders','#Similar','#Dissimilar','#Total','Kappa_Score','Kappa_Agreement','Alpha_Score','Alpha_Agreement'])
df_kappa_alpha.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 8 columns):
Coders             0 non-null object
#Similar           0 non-null object
#Dissimilar        0 non-null object
#Total             0 non-null object
Kappa_Score        0 non-null object
Kappa_Agreement    0 non-null object
Alpha_Score        0 non-null object
Alpha_Agreement    0 non-null object
dtypes: object(8)
memory usage: 0.0+ bytes


In [3]:
def kappa_score(label1, label2, dir):
    files = [f for f in glob.glob(dir)]
    
    same = 0
    diff = 0
    label_dic1 = {'T':0, 'P':0, 'O':0, 'D': 0, 'H':0, 'U':0}
    label_dic2 = {'T':0, 'P':0, 'O':0, 'D': 0, 'H':0, 'U':0}
    
    for file in files:
        df = pd.read_csv(file)
        df = df.replace(np.nan, '', regex=True)
        
        for i in range(len(df['text'])):
            if df['file'][i]=='':
                break
            if df[label1][i]==df[label2][i] and df[label1][i] == 'U':
                continue
            if len(df[label2])==0 and df[label1][i] == 'U':
                continue
            if len(df[label1])==0 and df[label2][i] == 'U':
                continue

            label_1 = str(df[label1][i]).replace(" ", "")
            label_1 = label_1.split(',')[0]
            if len(label_1)>0:
                label_dic1[label_1]+=1

            label_2 = str(df[label2][i]).replace(" ", "")
            label_2 = label_2.split(',')[0]
            if len(label_2)>0:
                label_dic2[label_2]+=1

            if label_1==label_2:
                same+=1
            else:
                diff+=1
                
    po=same/(same+diff)
    for k in label_dic1:
        label_dic1[k]=label_dic1[k]/(same+diff)
        
    for k in label_dic2:
        label_dic2[k]=label_dic2[k]/(same+diff)
        
    label_prob = {'T':0, 'P':0, 'O':0, 'D': 0, 'H':0, 'U':0}
    
    for k in label_prob:
        label_prob[k] = label_dic1[k]*label_dic2[k]
        
    pe = 0
    for k in label_prob:
        pe += label_prob[k]
    
    kappa = (po-pe)/(1-pe)
    
    if kappa<0.1:
        kappa_agreement = 'No'
    elif kappa>=0.1 and kappa<0.2:
        kappa_agreement =   'Slight'
    elif kappa>=0.2 and kappa<0.4:
        kappa_agreement = 'Fair'
    elif kappa>=0.4 and kappa<0.6:
        kappa_agreement = 'Moderate'
    elif kappa>=0.6 and kappa<0.8:
        kappa_agreement = 'Substatial'
    elif kappa>=0.8 and kappa<1:
        kappa_agreement = 'Near Perfect'    
    elif kappa==1:
        kappa_agreement = 'Perfect'
    
    total = same+diff
    
    return same, diff, total, kappa, kappa_agreement

In [4]:
labels_without_u = {'T':1, 'P':2, 'O':3, 'D': 4, 'H':5, 'U':np.nan}
def key_to_value_ignoring_u(key):
    if type(key) == float:
        return np.nan
    key = key.strip()
    key.split(',')
    key = key[0]
    return labels_without_u[key]

In [5]:
def krippendorff_alpha(filter_key, label1, label2, dir):
    files = [f for f in glob.glob(dir)]
    
    label_1 = []
    label_2 = []
    
    rel_mat = []
    for file in files:
        df = pd.read_csv(file)
        df[label2] = df[label2].apply(filter_key)
        df[label1] = df[label1].apply(filter_key)
        
        label_1.extend(df[label1])
        label_2.extend(df[label2])
        
    reliability_matrix = np.asarray([label_1,label_2])
        
    alpha = krippendorff.alpha(reliability_matrix[[0,1]])
    
    if alpha<0.1:
        alpha_agreement = 'No'
    elif alpha>=0.1 and alpha<0.2:
        alpha_agreement =   'Slight'
    elif alpha>=0.2 and alpha<0.4:
        alpha_agreement = 'Fair'
    elif alpha>=0.4 and alpha<0.6:
        alpha_agreement = 'Moderate'
    elif alpha>=0.6 and alpha<0.8:
        alpha_agreement = 'Substatial'
    elif alpha>=0.8 and alpha<1:
        alpha_agreement = 'Near Perfect'    
    elif alpha==1:
        alpha_agreement = 'Perfect'
    
    return alpha, alpha_agreement

In [6]:
coders = []
similars = []
dissimilars = []
totals = []
kappas = []
kappas_agrees = []
alphas = []
alphas_agrees = []

In [7]:
coder = "Sally_Frenard"
same, diff, total, kappa, kappa_agree = kappa_score('label_Frenard', 'label_SALLY', "MergedFiles-SALLY-Frenard/*.csv")
alpha, alpha_agree = krippendorff_alpha(key_to_value_ignoring_u, 'label_Frenard', 'label_SALLY', "MergedFiles-SALLY-Frenard/*.csv")

coders.append(coder)
similars.append(same)
dissimilars.append(diff)
totals.append(total)
kappas.append(kappa)
kappas_agrees.append(kappa_agree)
alphas.append(alpha)
alphas_agrees.append(alpha_agree)

In [8]:
coder = "Sally_Trixy"
same, diff, total, kappa, kappa_agree = kappa_score('label_struck', 'label_SALLY', "MergedFiles-SALLY-struck/*.csv")
alpha, alpha_agree = krippendorff_alpha(key_to_value_ignoring_u, 'label_struck', 'label_SALLY', "MergedFiles-SALLY-struck/*.csv")

coders.append(coder)
similars.append(same)
dissimilars.append(diff)
totals.append(total)
kappas.append(kappa)
kappas_agrees.append(kappa_agree)
alphas.append(alpha)
alphas_agrees.append(alpha_agree)

In [9]:
df_kappa_alpha['Coders']= coders
df_kappa_alpha['#Similar'] = similars
df_kappa_alpha['#Dissimilar'] = dissimilars
df_kappa_alpha['#Total'] = totals
df_kappa_alpha['Kappa_Score'] = kappas
df_kappa_alpha['Kappa_Agreement'] = kappas_agrees
df_kappa_alpha['Alpha_Score'] = alphas
df_kappa_alpha['Alpha_Agreement'] = alphas_agrees

In [10]:
df_kappa_alpha

Unnamed: 0,Coders,#Similar,#Dissimilar,#Total,Kappa_Score,Kappa_Agreement,Alpha_Score,Alpha_Agreement
0,Sally_Frenard,70,166,236,0.064409,No,0.429068,Moderate
1,Sally_Trixy,49,155,204,0.092813,No,0.491475,Moderate


In [12]:
df_kappa_alpha.to_csv('Results/Overall_Results.csv', header=True)