In [291]:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
import numpy as np
from math import sqrt
import random

In [292]:
# Bootstrap estimate of Cohen's kappa (optionally quadratic-weighted).
def bootstrap_cqk(y_true, y_pred, quad=False, num_resamples=999, seed=None):
    """Bootstrap Cohen's kappa between two paired sequences of ratings.

    The pairs ``(y_true[i], y_pred[i])`` are resampled with replacement
    ``num_resamples`` times and kappa is recomputed on every resample.

    Parameters
    ----------
    y_true, y_pred : sequence
        Paired ratings of equal length (one entry per rated subject).
    quad : bool, default False
        If true, compute the quadratic-weighted kappa; otherwise the
        unweighted kappa.
    num_resamples : int, default 999
        Number of bootstrap resamples.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` drives the resampling so
        the result is reproducible. When ``None`` the global ``random`` state
        is used, exactly as before.

    Returns
    -------
    tuple
        ``(mean, std, ci_low, ci_high)`` of the bootstrap kappas, where the
        bounds are the 2.5th and 97.5th percentiles (a 95% percentile interval).
    """
    # One row per subject, one column per reviewer, so a row is resampled as a pair.
    Y = np.array([y_true, y_pred]).T
    rng = random if seed is None else random.Random(seed)
    weights = "quadratic" if quad else None

    weighted_kappas = []
    for _ in range(num_resamples):
        Y_resample = np.array(rng.choices(Y, k=len(Y)))
        y_true_resample = Y_resample[:, 0]
        y_pred_resample = Y_resample[:, 1]
        # NOTE(review): ratings are cast to str before scoring; with
        # weights="quadratic" the category distances then presumably follow the
        # string sort order of the labels — confirm this is intended.
        weighted_kappas.append(
            cohen_kappa_score(
                y_true_resample.astype(str),
                y_pred_resample.astype(str),
                weights=weights,
            )
        )

    # 2.5 / 97.5 give a symmetric 95% interval (the lower bound used to be 2.25).
    return (
        np.mean(weighted_kappas),
        np.std(weighted_kappas),
        np.percentile(weighted_kappas, 2.5),
        np.percentile(weighted_kappas, 97.5),
    )



In [293]:
# Build the normalised confusion matrix (with marginals) of two reviewers' ratings.
def create_confusion_matrix(list_1: list, list_2: list) -> pd.DataFrame:
    """Return the joint-proportion table of two paired rating lists.

    Categories are numbered in order of first appearance, scanning ``list_1``
    first and then ``list_2``. For ``size`` distinct categories the result is a
    ``(size + 1) x (size + 1)`` DataFrame with default integer labels where

    * cell ``[k, l]`` (``k, l < size``) is the proportion of subjects rated
      category ``k`` by reviewer 1 and category ``l`` by reviewer 2,
    * column ``size`` holds the row totals (reviewer 1 marginals),
    * row ``size`` holds the column totals (reviewer 2 marginals),
    * cell ``[size, size]`` is the grand total.

    Parameters
    ----------
    list_1, list_2 : list
        Ratings of reviewer 1 and reviewer 2, aligned by subject. Ratings must
        be hashable (e.g. strings or numbers).

    Raises
    ------
    ValueError
        If the two lists do not have the same length. The previous version
        only printed a message and then failed on an unbound variable.
    """
    if len(list_1) != len(list_2):
        raise ValueError(
            "list_1 and list_2 must have the same length "
            f"(got {len(list_1)} and {len(list_2)}): both reviewers must rate "
            "the same list of subjects"
        )

    # dict.fromkeys keeps first-seen order while removing duplicates.
    labels = list(dict.fromkeys([*list_1, *list_2]))
    position = {label: idx for idx, label in enumerate(labels)}
    size = len(labels)

    counts = np.zeros((size + 1, size + 1))
    # Single pass over the rating pairs instead of one scan per (k, l) cell.
    for rating_1, rating_2 in zip(list_1, list_2):
        counts[position[rating_1], position[rating_2]] += 1

    counts[:size, size] = counts[:size, :size].sum(axis=1)  # row totals
    counts[size, :size] = counts[:size, :size].sum(axis=0)  # column totals
    counts[size, size] = counts[:size, size].sum()  # grand total

    # Turn counts into proportions of the number of rated subjects.
    return pd.DataFrame(counts) / len(list_1)

In [294]:
# Overall proportion of agreement expected by chance.
def expected_proportion(matrix: pd.DataFrame):
    """Sum, over categories, the product of the two marginals.

    ``matrix`` is indexed with integer labels; its last row and last column
    (label ``len(matrix) - 1``) are read as the marginal totals.
    """
    last = len(matrix) - 1
    return sum(matrix.loc[idx, last] * matrix.loc[last, idx] for idx in range(last))

# Overall proportion of observed agreement.
def observed_proportion(matrix: pd.DataFrame):
    """Sum the diagonal cells of ``matrix``, leaving out the last row/column.

    ``matrix`` is indexed with integer labels; the cell at label
    ``len(matrix) - 1`` on the diagonal is not included in the sum.
    """
    last = len(matrix) - 1
    return sum(matrix.loc[idx, idx] for idx in range(last))

In [295]:
# Spread of kappa from the observed (po) and chance (pe) agreement proportions.
def sd_cohen(po, pe):
    """Return ``sqrt(po * (1 - po) / (1 - pe) ** 2)``.

    The sample size is not part of this value; ``write_stat`` divides it by
    ``sqrt(N)`` to obtain the standard error.
    """
    numerator = po * (1 - po)
    denominator = (1 - pe) * (1 - pe)
    return sqrt(numerator / denominator)

In [313]:

def sd_fleiss(po, pe, matrix):
    """Large-sample spread of kappa (Fleiss, Cohen & Everitt, 1969).

    Computes ``sqrt(x) / (1 - pe) ** 2`` with::

        x =   sum_i      p_ii * [(1 - pe) - (p_.i + p_i.) * (1 - po)] ** 2
            + (1 - po)**2 * sum_{i != j} p_ij * (p_.i + p_j.) ** 2
            - (po * pe - 2 * pe + po) ** 2

    The sample size is not included; divide the result by ``sqrt(N)`` to get
    the standard error (``write_stat`` does this).

    Parameters
    ----------
    po, pe : float
        Observed and chance-expected agreement proportions.
    matrix : pd.DataFrame
        Table of joint proportions with integer labels whose last column holds
        the row totals (``p_i.``) and whose last row holds the column totals
        (``p_.i``), as produced by ``create_confusion_matrix``.

    Notes
    -----
    The previous expansion squared the marginal sum in the cross term and left
    out the squared diagonal term, which could make ``x`` negative and make
    ``sqrt`` raise.
    """
    k = len(matrix) - 1

    diagonal_term = 0
    off_diagonal_term = 0
    for i in range(k):
        for j in range(k):
            if j != i:
                marginals = matrix.loc[k, i] + matrix.loc[j, k]
                off_diagonal_term += matrix.loc[i, j] * marginals * marginals
        deviation = (1 - pe) - (matrix.loc[k, i] + matrix.loc[i, k]) * (1 - po)
        diagonal_term += matrix.loc[i, i] * deviation * deviation

    chance_term = (po * pe) - (2 * pe) + po
    chance_term = chance_term * chance_term

    x = diagonal_term + ((1 - po) * (1 - po) * off_diagonal_term) - chance_term
    # x is a variance; only floating-point round-off can push it below zero.
    x = max(x, 0.0)

    return sqrt(x) / ((1 - pe) * (1 - pe))



In [314]:
# Kappa score from the observed (po) and chance-expected (pe) agreement.
def kappa(po, pe):
    """Return ``(po - pe) / (1 - pe)``."""
    agreement_beyond_chance = po - pe
    max_beyond_chance = 1 - pe
    return agreement_beyond_chance / max_beyond_chance

In [315]:
# Store one method's statistics for one category in the results table.
def write_stat(df_final, category, method, kappa_, low_, high_, sd_, N):
    """Write kappa, CI bounds, sd and se into row ``category`` of ``df_final``.

    Values go into the columns ``(stat, method)`` for the stats "kappa score",
    "ci low", "ci high", "sd" and "se". The table is modified in place and
    nothing is returned. "se" is computed here as ``sd_ / sqrt(N)``.
    """
    for stat_name, value in (
        ("kappa score", kappa_),
        ("ci low", low_),
        ("ci high", high_),
        ("sd", sd_),
    ):
        df_final.loc[category, (stat_name, method)] = value
    # Written last, after the four values passed in by the caller.
    df_final.loc[category, ("se", method)] = sd_ / sqrt(N)

In [316]:

# Path to the TSV file with the ratings from the first reviewer.
path_tsv = "../rating/rating_E.tsv"
# The file is read without a header row; rows containing any missing value are dropped.
df_rating_1 = pd.read_csv(path_tsv, sep="\t", header=None, index_col=False).dropna()


In [317]:
# Path to the TSV file with the ratings from the second reviewer.
path_tsv = "../rating/rating_O.tsv"
# The file is read without a header row; rows containing any missing value are dropped.
df_rating_2 = pd.read_csv(path_tsv, sep="\t", header=None, index_col=False).dropna()

In [318]:
# Categories for which agreement statistics are computed.
# The order matters: the analysis loop derives the rating-file column of each
# category from its position in this list, and treats "Meta-categories" as a
# special case with its own column offset.
list_categories = [
        "Models and algorithms",
        "Datasets",
        "Code",
        "Experimental results",
        "Error bars or statistical significance",
        "Statement",
        "Comments",
        "Meta-categories",
    ]
# Method labels; they form the second level of the result table's columns.
list_methods = ["bootstrap", "cohen", "fleiss"]

In [319]:
# Build the empty results table: one row per category and one column per
# (stat, method) pair.

list_stats = ["kappa score", "ci low", "ci high", "sd", "se"]

index_line = pd.Index(list_categories)
index_column = pd.MultiIndex.from_product(
    [list_stats, list_methods],
    names=["stat", "method"],
)

df_final = pd.DataFrame(columns=index_column, index=index_line)

# Show both axes to check the layout.
print(df_final.index)
print(df_final.columns)

Index(['Models and algorithms', 'Datasets', 'Code', 'Experimental results',
       'Error bars or statistical significance', 'Statement', 'Comments',
       'Meta-categories'],
      dtype='object')
MultiIndex([('kappa score', 'bootstrap'),
            ('kappa score',     'cohen'),
            ('kappa score',    'fleiss'),
            (     'ci low', 'bootstrap'),
            (     'ci low',     'cohen'),
            (     'ci low',    'fleiss'),
            (    'ci high', 'bootstrap'),
            (    'ci high',     'cohen'),
            (    'ci high',    'fleiss'),
            (         'sd', 'bootstrap'),
            (         'sd',     'cohen'),
            (         'sd',    'fleiss'),
            (         'se', 'bootstrap'),
            (         'se',     'cohen'),
            (         'se',    'fleiss')],
           names=['stat', 'method'])


In [325]:

# Compute bootstrap, Cohen and Fleiss statistics for every category.
for category, category_name in enumerate(list_categories):
    all_reviews_1 = []
    all_reviews_2 = []
    # Each category is read from three columns and the ratings are pooled.
    # Regular categories sit 8 columns apart starting at column 3 + category;
    # "Meta-categories" uses columns 20, 21 and 22.
    # NOTE(review): presumably one column per rated item/round — confirm against the TSV layout.
    for i in range(3):

        if category_name == "Meta-categories":
            column_id = i + 20
        else:
            column_id = i * 8 + 3 + category

        # Rows are taken from index label 2 onwards.
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        list_review_2 = df_rating_2.loc[2:, column_id].values.tolist()

        all_reviews_1 = all_reviews_1 + list_review_1
        all_reviews_2 = all_reviews_2 + list_review_2

    # Computed before branching: both branches need it (it used to be assigned
    # only in the first branch, leaving the other with an undefined or stale N).
    N = len(all_reviews_1)

    if all_reviews_1 != all_reviews_2:
        kappa_btp, sd_btp, low_btp, high_btp = bootstrap_cqk(y_true=all_reviews_1, y_pred=all_reviews_2)
        write_stat(df_final, category_name, "bootstrap", kappa_btp, low_btp, high_btp, sd_btp, N)

        confusion_matrix = create_confusion_matrix(list_1=all_reviews_1, list_2=all_reviews_2)
        po_ = observed_proportion(confusion_matrix)
        pe_ = expected_proportion(confusion_matrix)
        kappa_ = kappa(po_, pe_)

        # 95% normal-approximation interval: kappa +/- 1.96 * standard error,
        # with standard error = sd / sqrt(N). The interval used to be built
        # from the sd itself, which made it sqrt(N) times too wide.
        sd_cohen_ = sd_cohen(po_, pe_)
        se_cohen_ = sd_cohen_ / sqrt(N)
        write_stat(df_final, category_name, "cohen", kappa_, kappa_ - 1.96 * se_cohen_, kappa_ + 1.96 * se_cohen_, sd_cohen_, N)

        sd_fleiss_ = sd_fleiss(po_, pe_, confusion_matrix)
        se_fleiss_ = sd_fleiss_ / sqrt(N)
        write_stat(df_final, category_name, "fleiss", kappa_, kappa_ - 1.96 * se_fleiss_, kappa_ + 1.96 * se_fleiss_, sd_fleiss_, N)

        print(f"For '{category_name}' review :")
        print(f"kappa bootstrap = {kappa_btp}")
        print(f"ci bootstrap = [{low_btp}, {high_btp}]")
        print(f"standard deviation bootstrap = {sd_btp}")
        print(f"standard error = sd / sqrt(N) = {sd_btp/sqrt(N)}")
        print("**************************************************")
    else:
        # Identical ratings: kappa is reported as 1 with a degenerate interval
        # [1, 1]. The spread is 0 (it used to be stored as 1, which
        # contradicted the interval).
        write_stat(df_final, category_name, "cohen", 1, 1, 1, 0, N)
        write_stat(df_final, category_name, "bootstrap", 1, 1, 1, 0, N)
        write_stat(df_final, category_name, "fleiss", 1, 1, 1, 0, N)


For 'Models and algorithms' review :
kappa bootstrap = 0.9022508652632083
ci bootstrap = [0.8068087471565124, 0.9803295939817478]
standard deviation bootstrap = 0.04352843982853407
standard error = sd / sqrt(N) = 0.0039735847311824024
**************************************************
For 'Datasets' review :
kappa bootstrap = 0.9305092126802864
ci bootstrap = [0.8612046941239226, 0.9832789878336804]
standard deviation bootstrap = 0.033276258911821996
standard error = sd / sqrt(N) = 0.003037692939231203
**************************************************
For 'Code' review :
kappa bootstrap = 0.9830424516173248
ci bootstrap = [0.9444303732391952, 1.0]
standard deviation bootstrap = 0.01713455401175053
standard error = sd / sqrt(N) = 0.0015641636241710674
**************************************************
For 'Experimental results' review :
kappa bootstrap = 0.859122566921538
ci bootstrap = [0.712793733681462, 0.971671388101983]
standard deviation bootstrap = 0.0640156004556051
standard er

In [324]:

# Statistics for the "Agreement" rating, read from column 29 of both rating
# files (rows from index label 2 onwards).

list_agreement_1 = df_rating_1.loc[2:, 29].values.tolist()
list_agreement_2 = df_rating_2.loc[2:, 29].values.tolist()
N = len(list_agreement_1)

kappa_btp, sd_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_agreement_1, y_pred=list_agreement_2)
write_stat(df_final, "Agreement", "bootstrap", kappa_btp, low_btp, high_btp, sd_btp, N)

confusion_matrix = create_confusion_matrix(list_1=list_agreement_1, list_2=list_agreement_2)
po_ = observed_proportion(confusion_matrix)
pe_ = expected_proportion(confusion_matrix)
kappa_ = kappa(po_, pe_)

# 95% normal-approximation interval: kappa +/- 1.96 * standard error, with
# standard error = sd / sqrt(N). The interval used to be built from the sd
# itself, which made it sqrt(N) times too wide.
sd_cohen_ = sd_cohen(po_, pe_)
se_cohen_ = sd_cohen_ / sqrt(N)
write_stat(df_final, "Agreement", "cohen", kappa_, kappa_ - 1.96 * se_cohen_, kappa_ + 1.96 * se_cohen_, sd_cohen_, N)

sd_fleiss_ = sd_fleiss(po_, pe_, confusion_matrix)
se_fleiss_ = sd_fleiss_ / sqrt(N)
write_stat(df_final, "Agreement", "fleiss", kappa_, kappa_ - 1.96 * se_fleiss_, kappa_ + 1.96 * se_fleiss_, sd_fleiss_, N)

print("For Agreement review :")
print(f"kappa bootstrap = {kappa_btp}")
print(f"ci bootstrap = [{low_btp}, {high_btp}]")
print(f"standard deviation bootstrap = {sd_btp}")
print(f"standard error = sd / sqrt(N) = {sd_btp/sqrt(N)}")

For Agreement review :
kappa bootstrap = 0.8105857068367698
ci bootstrap = [0.6433393387028401, 0.9595873011082102]
standard deviation bootstrap = 0.07834263164702614
standard error = sd / sqrt(N) = 0.012387057694809552


In [323]:

# Export the results table, row labels included, as a ';'-separated UTF-8 file.
df_final.to_csv(
    "../rating/rating_analysis.csv",
    sep=";",
    index=True,
    encoding="utf-8",
)
