In [350]:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
import numpy as np
from math import sqrt
import random

In [351]:
# boostrap function for cohen kappa score 
def bootstrap_cqk(y_true, y_pred, quad=False, num_resamples = 999):
    Y = np.array([y_true, y_pred]).T

    weighted_kappas = []
    for i in range(num_resamples):
        Y_resample = np.array(random.choices(Y, k=len(Y)))
        y_true_resample = Y_resample[:, 0]
        y_pred_resample = Y_resample[:, 1]
        if quad==False:
            weighted_kappa = cohen_kappa_score(y_true_resample.astype(str), y_pred_resample.astype(str))
        else: 
            weighted_kappa = cohen_kappa_score(y_true_resample.astype(str), y_pred_resample.astype(str), weights='quadratic')
        weighted_kappas.append(weighted_kappa)

    return np.mean(weighted_kappas), np.std(weighted_kappas), np.percentile(weighted_kappas, 2.25), np.percentile(weighted_kappas, 97.5)



In [352]:
# function to create confusion matrix 
def create_confusion_matrix(list_1: list , list_2: list)-> pd.DataFrame:
    if not len(list_1)==len(list_2):
        print("reviwer 1 and 2 may haven't rated the same list of subjects")

    else:
        list_attributs = []
        for i in list_1:
            if i not in list_attributs:
                list_attributs.append(i)
        for i in list_2:
            if i not in list_attributs:
                list_attributs.append(i)


        size = len(list_attributs)
        matrix = pd.DataFrame(np.zeros((size + 1, size + 1)))
        for k in range(size):
            for l in range(size):
                att_1 = list_attributs[k]
                att_2 = list_attributs[l]
                for i in range(len(list_1)):
                    if (list_1[i]== att_1):
                        if list_2[i] == att_2 :
                                matrix.loc[k,l]+=1
        for i in range(size):
            for j in range(size):
                matrix.loc[size, i] += matrix.loc[j, i]
                matrix.loc[i, size] += matrix.loc[i, j]
        for i in range(size):      
            matrix.loc[size, size] += matrix.loc[i, size]
        matrix = matrix / len(list_1)
    return matrix

In [353]:
# function to calculate the overall proportion of agreement expected by chance
def expected_proportion(matrix: pd.DataFrame):
    pe = 0
    k = len(matrix)-1
    for i in range(k):
        pe += matrix.loc[i, k] * matrix.loc[k, i]
    return pe

# function to calculate the overall proportion of observed agreement 
def observed_proportion(matrix: pd.DataFrame):
    po = 0
    k = len(matrix) - 1
    for i in range(k):
        po += matrix.loc[i, i]
    return po

In [354]:
# function to calculate the standard error
def sd_cohen(po, pe):
    sd_= sqrt((po*(1-po))/((1-pe)*(1-pe)))
    return sd_

In [355]:

def sd_fleiss(po, pe, matrix):
    y1 = 0
    y2 = 0 
    k = len(matrix) -1

    for i in range (k):
        for j in range(k):
            if j != i:
                carre = (matrix.loc[k,i] + matrix.loc[j,k])
                carre = carre*carre
                y1 += matrix.loc[i,j] * carre
        carre2 = (matrix.loc[k,i] + matrix.loc[i,k])  
        carre2 = carre2*carre2   
        y2 += matrix.loc[i,i] * carre2
    y3 = ((po*pe) - (2*pe) + po)
    y3 = y3*y3
    x = (po*(1-pe)*(1-pe)) + ((1-po)*(1-po)* y1) - (2*(1-pe)*(1-po)*y2) - y3
    
    if x < 0:
        x = 0
    res = sqrt(x)
    sd_ = res/((1-pe)*(1-pe))
    return sd_



In [356]:
# function to calculcate the kappa score
def kappa(po, pe):
    return (po-pe)/(1-pe)

In [357]:
# function to write stats in the dataframe
def write_stat(df_final, category, method, kappa_, low_, high_, se_):
    df_final.loc[category, ("kappa score", method)]=kappa_
    df_final.loc[category, ("ci low", method)]=low_
    df_final.loc[category, ("ci high", method)]=high_
    df_final.loc[category, ("se", method)]=se_

In [358]:

# Enter the path to the tsv file with the rating from the first reviwer
path_tsv = "../rating/rating_90/rating_90_O.tsv"
df_rating_1 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)
print(df_rating_1.columns)
print(len(df_rating_1))

Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41],
      dtype='int64')
92


In [359]:
# Enter the path to the tsv file with the rating from the second reviwer
path_tsv = "../rating/rating_90/rating_90_E.tsv"
df_rating_2 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)
#df_rating_2 = df_rating_2.dropna()

In [360]:
# list of categories you want to make statistics for
list_categories = [
        "Models and algorithms",
        "Datasets",
        "Code",
        "Experimental results",
        "Error bars or statistical significance",
        "Code is or will be available",
        "Statement",
        "Comments",
        "Meta-categories",
    ]
list_methods = ["bootstrap", "cohen", "fleiss"]

In [361]:
# function to create the DataFrame 

list_stats = ["kappa score", "ci low", "ci high", "sd", "se"]

index_line = pd.Index(list_categories)
index_column = pd.MultiIndex.from_product( [list_stats, list_methods], names=["stat", "method"])

df_final = pd.DataFrame(index=index_line, columns=index_column)

print(df_final.index)
print(df_final.columns)

Index(['Models and algorithms', 'Datasets', 'Code', 'Experimental results',
       'Error bars or statistical significance',
       'Code is or will be available', 'Statement', 'Comments',
       'Meta-categories'],
      dtype='object')
MultiIndex([('kappa score', 'bootstrap'),
            ('kappa score',     'cohen'),
            ('kappa score',    'fleiss'),
            (     'ci low', 'bootstrap'),
            (     'ci low',     'cohen'),
            (     'ci low',    'fleiss'),
            (    'ci high', 'bootstrap'),
            (    'ci high',     'cohen'),
            (    'ci high',    'fleiss'),
            (         'sd', 'bootstrap'),
            (         'sd',     'cohen'),
            (         'sd',    'fleiss'),
            (         'se', 'bootstrap'),
            (         'se',     'cohen'),
            (         'se',    'fleiss')],
           names=['stat', 'method'])


In [378]:

for category in range(len(list_categories)):
    all_reviews_1 = []
    all_reviews_2 = []
    for i in range(3):

        if list_categories[category] == "Meta-categories":
            column_id = i + 29
        else: 
            column_id = i*9 + 3 + category
        
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        list_review_2 = df_rating_2.loc[2:, column_id].values.tolist()

        all_reviews_1 = all_reviews_1 + list_review_1
        all_reviews_2 = all_reviews_2 + list_review_2

    N = len(all_reviews_1)
    if list_categories[category] != "Error bars or statistical significance":
        kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=all_reviews_1, y_pred=all_reviews_2)
        write_stat(df_final, list_categories[category], "bootstrap", kappa_btp, low_btp, high_btp, se_btp)
        
    confusion_matrix = create_confusion_matrix(list_1=all_reviews_1, list_2=all_reviews_2)
    po_ = observed_proportion(confusion_matrix)
    pe_ = expected_proportion(confusion_matrix)
    kappa_ = kappa(po_, pe_)
        
    sd_cohen_ = sd_cohen(po_, pe_)
    se_cohen = sd_cohen_ / sqrt(N)
    write_stat(df_final, list_categories[category], "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen )

    sd_fleiss_ = sd_fleiss(po_, pe_, confusion_matrix)
    se_fleiss = sd_fleiss_ / sqrt(N)
    write_stat(df_final, list_categories[category], "fleiss", kappa_, -1.96 * se_fleiss + kappa_, 1.96 * se_fleiss + kappa_, se_fleiss)


    print(f"For {df_rating_1.loc[1, column_id]} review :")
    print(f"Number of reviews = {N}")
    print(f"kappa = {kappa_}")
    print(f"kappa bootstrap = {kappa_btp}")
    print(f"ci bootstrap = [{low_btp}, {high_btp}]")
    print(f"standard error (bootstrap) = {se_btp}")
    print(f"standard error (fleiss) = { sd_fleiss_ / sqrt(N)}")
    print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")
    print("**************************************************")
    

For Models and algorithms review :
Number of reviews = 270
kappa = 0.7544080604534007
kappa bootstrap = 0.7539979303519965
ci bootstrap = [0.6617661485411376, 0.8412931667891257]
standard error (bootstrap) = 0.04550509715271859
standard error (fleiss) = 0.0
standard error (cohen) = 0.04578681833590641
**************************************************
For Datasets review :
Number of reviews = 270
kappa = 0.9085872576177285
kappa bootstrap = 0.9087207438402407
ci bootstrap = [0.85288316288647, 0.9582224422850777]
standard error (bootstrap) = 0.026573507173972208
standard error (fleiss) = 0.0
standard error (cohen) = 0.02699469316029271
**************************************************
For Code review :
Number of reviews = 270
kappa = 0.9107142857142856
kappa bootstrap = 0.9108468158915793
ci bootstrap = [0.8591805996066364, 0.9552678753378085]
standard error (bootstrap) = 0.02401677197831318
standard error (fleiss) = 0.022274454372247
standard error (cohen) = 0.025195287895238696
*****

In [379]:

# add agreement review

list_agreement_1 = df_rating_1.loc[2:, 38].values.tolist()
list_agreement_2 = df_rating_2.loc[2:, 38].values.tolist()
N = len(list_agreement_1)
kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_agreement_1, y_pred=list_agreement_2)
write_stat(df_final, "Agreement", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)

confusion_matrix = create_confusion_matrix(list_1=list_agreement_1, list_2=list_agreement_2)
po_ = observed_proportion(confusion_matrix)
pe_ = expected_proportion(confusion_matrix)
kappa_ = kappa(po_, pe_)

sd_cohen_ = sd_cohen(po_, pe_)
se_cohen = sd_cohen_ / sqrt(N)
write_stat(df_final, "Agreement", "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen)

sd_fleiss_ = sd_fleiss(po_, pe_, confusion_matrix)
se_fleiss = sd_fleiss_ / sqrt(N)
write_stat(df_final, "Agreement", "fleiss", kappa_, -1.96 * se_fleiss + kappa_, 1.96 * se_fleiss + kappa_, se_fleiss)

print(f"For {df_rating_1.loc[1, 38]} review :")
print(f"Number of reviews = {N}")
print(f"kappa = {kappa_}")
print(f"kappa bootstrap = {kappa_btp}")
print(f"ci bootstrap = [{low_btp}, {high_btp}]")
print(f"standard error (bootstrap) = {se_btp}")
print(f"standard error (fleiss) = { sd_fleiss_ / sqrt(N)}")
print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")

For Agreement review :
Number of reviews = 90
kappa = 0.63302752293578
kappa bootstrap = 0.6307787360019255
ci bootstrap = [0.45785859263027323, 0.7821175052550798]
standard error (bootstrap) = 0.08417355963498516
standard error (fleiss) = 0.0
standard error (cohen) = 0.08318942207177639


In [380]:
# add repo provided review

list_agreement_1 = df_rating_1.loc[2:, 40].values.tolist()
list_agreement_2 = df_rating_2.loc[2:, 40].values.tolist()
N = len(list_agreement_1)
kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_agreement_1, y_pred=list_agreement_2)
write_stat(df_final, "repo provided and not empty", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)

confusion_matrix = create_confusion_matrix(list_1=list_agreement_1, list_2=list_agreement_2)
po_ = observed_proportion(confusion_matrix)
pe_ = expected_proportion(confusion_matrix)
kappa_ = kappa(po_, pe_)

sd_cohen_ = sd_cohen(po_, pe_)
se_cohen = sd_cohen_ / sqrt(N)
write_stat(df_final, "repo provided and not empty", "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen)

sd_fleiss_ = sd_fleiss(po_, pe_, confusion_matrix)
se_fleiss = sd_fleiss_ / sqrt(N)
write_stat(df_final, "repo provided and not empty", "fleiss", kappa_, -1.96 * se_fleiss + kappa_, 1.96 * se_fleiss + kappa_, se_fleiss)

print(f"For {df_rating_1.loc[1, 40]} review :")
print(f"Number of reviews = {N}")
print(f"kappa = {kappa_}")
print(f"kappa bootstrap = {kappa_btp}")
print(f"ci bootstrap = [{low_btp}, {high_btp}]")
print(f"standard error (bootstrap) = {se_btp}")
print(f"standard error (fleiss) = { sd_fleiss_ / sqrt(N)}")
print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")

For Repo provided and not empty review :
Number of reviews = 90
kappa = 1.0
kappa bootstrap = 1.0
ci bootstrap = [1.0, 1.0]
standard error (bootstrap) = 0.0
standard error (fleiss) = 0.0
standard error (cohen) = 0.0


In [381]:
# save final df to csv file 
df_final.to_csv("../rating/rating_90/inter_raters_analysis.csv", index = True, sep=";", encoding='utf-8')


In [366]:
# TODO 

# split cells and delete some function 
# carreful about path 