In [71]:
import os
import random
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from tabulate import tabulate

In [72]:
def bootstrap_cqk(y_true, y_pred, quad=False, num_resamples = 999):
    """
    Bootstrap estimate of Cohen's kappa between two raters.

    The (y_true, y_pred) rating pairs are resampled with replacement
    ``num_resamples`` times and kappa is recomputed on every resample.
    Resampling uses the stdlib ``random`` module, so call ``random.seed(...)``
    beforehand if reproducible numbers are needed.

    Parameters
    ----------
    y_true, y_pred : sequences of the same length
        Ratings of the first and second rater. Labels are compared as strings.
    quad : bool, default False
        When true, kappa is computed with quadratic weights; otherwise it is
        unweighted.
    num_resamples : int, default 999
        Number of bootstrap resamples.

    Returns
    -------
    tuple
        ``(mean, std, low, high)``: mean and standard deviation of the
        bootstrap kappas, followed by their 2.5th and 97.5th percentiles
        (bounds of the 95% percentile interval).
    """
    # One row per rated subject: column 0 = first rater, column 1 = second rater.
    pairs = np.array([y_true, y_pred]).T
    weights = 'quadratic' if quad else None

    kappas = []
    for _ in range(num_resamples):
        # Draw whole rows so the two ratings of a subject stay paired.
        resample = np.array(random.choices(pairs, k=len(pairs)))
        kappas.append(
            cohen_kappa_score(
                resample[:, 0].astype(str),
                resample[:, 1].astype(str),
                weights=weights,
            )
        )

    # The lower bound is the 2.5th percentile so that, together with the 97.5th,
    # the interval is symmetric and covers 95%.
    return np.mean(kappas), np.std(kappas), np.percentile(kappas, 2.5), np.percentile(kappas, 97.5)

In [73]:
def create_confusion_matrix(list_1: list , list_2: list)-> pd.DataFrame:
    """
    Build the normalised confusion matrix of two raters.

    Labels are numbered in order of first appearance (all of ``list_1`` first,
    then ``list_2``). With ``size`` distinct labels the result has
    ``size + 1`` rows and columns, all divided by the number of subjects:

    * cell ``[k, l]`` (k, l < size): subjects rated label k by rater 1 and
      label l by rater 2;
    * column ``size``: row totals (rater 1 marginals);
    * row ``size``: column totals (rater 2 marginals);
    * cell ``[size, size]``: grand total.

    Parameters
    ----------
    list_1, list_2 : list
        Ratings of the first and second rater, aligned subject by subject.
        Labels must be hashable.

    Returns
    -------
    pd.DataFrame
        The normalised matrix, indexed by integer positions.

    Raises
    ------
    ValueError
        If the two lists do not have the same length.
    """
    if len(list_1) != len(list_2):
        # Previously this only printed a message and then crashed on an
        # unbound variable; fail explicitly instead.
        raise ValueError(
            "reviewer 1 and reviewer 2 have not rated the same list of subjects "
            f"({len(list_1)} vs {len(list_2)} ratings)"
        )

    # dict.fromkeys keeps first-appearance order while dropping duplicates.
    labels = list(dict.fromkeys([*list_1, *list_2]))
    position = {label: idx for idx, label in enumerate(labels)}
    size = len(labels)

    counts = np.zeros((size + 1, size + 1))
    # Single pass over the paired ratings instead of one pass per label pair.
    for rating_1, rating_2 in zip(list_1, list_2):
        counts[position[rating_1], position[rating_2]] += 1

    counts[size, :size] = counts[:size, :size].sum(axis=0)
    counts[:size, size] = counts[:size, :size].sum(axis=1)
    counts[size, size] = counts[:size, size].sum()

    return pd.DataFrame(counts) / len(list_1)

In [74]:
def expected_proportion(matrix: pd.DataFrame):
    """
    Proportion of agreement expected by chance.

    For every label, multiplies the value in the last column of its row by the
    value in the last row of its column, and adds the products up. The last
    row and column of ``matrix`` are therefore not treated as labels.
    """
    last = len(matrix) - 1
    chance_agreement = 0
    for label in range(last):
        row_total = matrix.loc[label, last]
        column_total = matrix.loc[last, label]
        chance_agreement += row_total * column_total
    return chance_agreement

def observed_proportion(matrix: pd.DataFrame):
    """
    Proportion of observed agreement.

    Adds up the diagonal cells of ``matrix``, leaving out the bottom-right
    cell (the last row and column are not treated as labels).
    """
    last = len(matrix) - 1
    agreement = 0
    for label in range(last):
        agreement += matrix.loc[label, label]
    return agreement

In [75]:
def sd_cohen(po, pe):
    """
    Standard deviation term used for Cohen's kappa.

    Returns ``sqrt(po * (1 - po) / (1 - pe)**2)`` for the observed agreement
    ``po`` and the chance agreement ``pe``.
    """
    chance_complement = 1 - pe
    variance = (po * (1 - po)) / (chance_complement * chance_complement)
    return sqrt(variance)

In [76]:
def kappa(po, pe):
    """
    Cohen's kappa from the observed agreement ``po`` and the chance
    agreement ``pe``: ``(po - pe) / (1 - pe)``.
    """
    agreement_beyond_chance = po - pe
    attainable_beyond_chance = 1 - pe
    return agreement_beyond_chance / attainable_beyond_chance

In [77]:
def write_stat(df_final, category, method, kappa_, low_, high_, se_):
    """
    Store the four statistics of one (category, method) pair in ``df_final``.

    Writes, in place, the cells of row ``category`` under the columns
    ``("kappa score", method)``, ``("ci low", method)``,
    ``("ci high", method)`` and ``("se", method)``.
    """
    values_by_stat = (
        ("kappa score", kappa_),
        ("ci low", low_),
        ("ci high", high_),
        ("se", se_),
    )
    for stat_name, value in values_by_stat:
        df_final.loc[category, (stat_name, method)] = value

In [78]:

# Path to the TSV file holding the ratings of the first reviewer
path_tsv = "../human_rating/rating_90/rating_90_O.tsv"

# header=None: no row is used as header, so columns are addressed by integer position
df_rating_1 = pd.read_csv(
    path_tsv,
    sep="\t",
    header=None,
    index_col=False,
)

In [79]:
# Path to the TSV file holding the ratings of the second reviewer
path_tsv = "../human_rating/rating_90/rating_90_E.tsv"

# header=None: no row is used as header, so columns are addressed by integer position
df_rating_2 = pd.read_csv(
    path_tsv,
    sep="\t",
    header=None,
    index_col=False,
)


In [80]:
# Rated items for which inter-rater statistics are computed
list_categories = [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    "Code is or will be available",
    "Statement",
    "Comments",
]

# Methods that get a full (kappa, CI, SE) set of columns in the results table
list_methods = ["bootstrap", "cohen"]

In [81]:
# Empty results table: one row per rated item, one column per (stat, method) pair

list_stats = ["kappa score", "ci low", "ci high", "se"]

index_line = pd.Index([*list_categories, "Meta-categories", "Agreement", "Repo provided"])
index_column = pd.MultiIndex.from_product(
    [list_stats, list_methods],
    names=["stat", "method"],
)

df_final = pd.DataFrame(index=index_line, columns=index_column)

In [91]:

# Per-category agreement: pool the ratings of the three column groups that
# belong to a category, compute kappa with every method and store the results.
for category, category_name in enumerate(list_categories):
    all_reviews_1 = []
    all_reviews_2 = []
    for i in range(3):
        # NOTE(review): presumably the TSV holds 3 groups of 9 columns starting
        # at column 3, one group per rated block - confirm against the file layout.
        column_id = i*9 + 3 + category

        # Rows 0-1 are not ratings (row 1 is printed below as the item name).
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        list_review_2 = df_rating_2.loc[2:, column_id].values.tolist()

        all_reviews_1 = all_reviews_1 + list_review_1
        all_reviews_2 = all_reviews_2 + list_review_2

    N = len(all_reviews_1)

    if category_name != "Error bars or statistical significance":
        kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=all_reviews_1, y_pred=all_reviews_2)
        write_stat(df_final, category_name, "bootstrap", kappa_btp, low_btp, high_btp, se_btp)
    else:
        # The bootstrap is skipped for this item. Reset the variables so the
        # prints below do not silently show the previous category's values.
        kappa_btp = se_btp = low_btp = high_btp = np.nan

    # Cohen's kappa computed by hand from the confusion matrix
    confusion_matrix = create_confusion_matrix(list_1=all_reviews_1, list_2=all_reviews_2)
    po_ = observed_proportion(confusion_matrix)
    pe_ = expected_proportion(confusion_matrix)
    kappa_ = kappa(po_, pe_)

    # Cross-checks with library implementations
    kappa_sklearn = cohen_kappa_score(all_reviews_1, all_reviews_2)
    df_final.loc[category_name, ("kappa score", "sklearn")]=kappa_sklearn

    data = [all_reviews_1, all_reviews_2]
    data_T = np.array(data).T
    data_fleiss_ = aggregate_raters(data_T)
    kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
    df_final.loc[category_name, ("kappa score", "fleiss")]=kappa_fleiss_

    # Parametric interval: kappa +/- 1.96 standard errors
    sd_cohen_ = sd_cohen(po_, pe_)
    se_cohen = sd_cohen_ / sqrt(N)
    low_parametric_cohen = -1.96 * se_cohen + kappa_
    high_parametric_cohen = 1.96 * se_cohen + kappa_
    write_stat(df_final, category_name, "cohen", kappa_, low_parametric_cohen, high_parametric_cohen, se_cohen)


    print(f"For \'{df_rating_1.loc[1, column_id]}\' item (over {N} reviews):")

    print(f"Cohen's kappa = {kappa_}")
    ######### For sanity check
    # print(f"kappa cohen sklearn = {kappa_sklearn}")
    # print(f"kappa cohen bootstrap = {kappa_btp}")
    # print(f"kappa fleiss statsmodels = {kappa_fleiss_}")

    print(f"standard error (bootstrap) = {se_btp}")
    ######### For sanity check
    # print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")

    print(f"CI bootstrap = [{low_btp}, {high_btp}]")
    ######### For sanity check
    #print(f"CI parametric from Cohen's SE = [{low_parametric_cohen}, {high_parametric_cohen}]")
    print("**************************************************")
    

For 'Models and algorithms' item (over 270 reviews):
Cohen's kappa = 0.7544080604534007
standard error (bootstrap) = 0.04572129537387089
CI bootstrap = [0.6642971219193222, 0.8415432666133558]
**************************************************
For 'Datasets' item (over 270 reviews):
Cohen's kappa = 0.9085872576177285
standard error (bootstrap) = 0.026294218817813443
CI bootstrap = [0.8542383379271602, 0.9572242534659395]
**************************************************
For 'Code' item (over 270 reviews):
Cohen's kappa = 0.9107142857142856
standard error (bootstrap) = 0.02452274286617275
CI bootstrap = [0.865600669681564, 0.9558122545118058]
**************************************************
For 'Experimental results' item (over 270 reviews):
Cohen's kappa = 0.8637248539909151
standard error (bootstrap) = 0.03485545968200329
CI bootstrap = [0.7927693548507789, 0.9287536234528341]
**************************************************
For 'Error bars or statistical significance' item (over

In [94]:
# Meta-categories

list_meta_1 = []
list_meta_2 = []

# Pool the three meta-category columns (29, 30 and 31) of both raters.
for i in range(3):

    column_id = i + 29

    list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
    list_review_2 = df_rating_2.loc[2:, column_id].values.tolist()

    list_meta_1 = list_meta_1 + list_review_1
    list_meta_2 = list_meta_2 + list_review_2


# Counts used only by the optional sanity-check print further down.
test = list_meta_1.count("Unusable (meta)")
test2 = list_meta_2.count("Unusable (meta)")

N = len(list_meta_1)
# The bootstrap runs unconditionally here. It used to be guarded by a condition
# on `category`, a loop variable left over from the per-category cell, which
# made this cell depend on hidden kernel state without ever changing the result.
kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_meta_1, y_pred=list_meta_2)
write_stat(df_final, "Meta-categories", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)

# Cohen's kappa computed by hand from the confusion matrix
confusion_matrix = create_confusion_matrix(list_1=list_meta_1, list_2=list_meta_2)
po_ = observed_proportion(confusion_matrix)
pe_ = expected_proportion(confusion_matrix)
kappa_ = kappa(po_, pe_)

# Cross-checks with library implementations
kappa_sklearn = cohen_kappa_score(list_meta_1, list_meta_2)
df_final.loc["Meta-categories", ("kappa score", "sklearn")]=kappa_sklearn

data = [list_meta_1, list_meta_2]
data_T = np.array(data).T
data_fleiss_ = aggregate_raters(data_T)
kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
df_final.loc["Meta-categories", ("kappa score", "fleiss")]=kappa_fleiss_

# Parametric interval: kappa +/- 1.96 standard errors
sd_cohen_ = sd_cohen(po_, pe_)
se_cohen = sd_cohen_ / sqrt(N)
write_stat(df_final, "Meta-categories", "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen )

print(f"For \'Meta-category\' item (over {N} reviews):")
# For sanity check
# print(f"We can count {test} reviews unusable for the first rater and {test2} reviews unusable for the second.")

print(f"Cohen's kappa = {kappa_}")
######### For sanity check
# print(f"kappa cohen sklearn = {kappa_sklearn}")
# print(f"kappa cohen bootstrap = {kappa_btp}")
# print(f"kappa fleiss statsmodels = {kappa_fleiss_}")

print(f"standard error (bootstrap) = {se_btp}")
######### For sanity check
#print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")

print(f"CI bootstrap = [{low_btp}, {high_btp}]")
######### For sanity check
# low_parametric_cohen=-1.96 * se_cohen + kappa_
# high_parametric_cohen=1.96 * se_cohen + kappa_
# print(f"CI parametric from Cohen's SE = [{low_parametric_cohen}, {high_parametric_cohen}]")


For 'Meta-category' item (over 270 reviews):
Cohen's kappa = 0.8014027898179528
standard error (bootstrap) = 0.03513093822996074
CI bootstrap = [0.7293335962912879, 0.8690400826049199]


In [85]:
# Agreement on the single rating stored in column 40 of both rating files

list_repo_1 = df_rating_1.loc[2:, 40].values.tolist()
list_repo_2 = df_rating_2.loc[2:, 40].values.tolist()
N = len(list_repo_1)

# Bootstrap estimate
kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_repo_1, y_pred=list_repo_2)
write_stat(df_final, "Repo provided", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)

# Cohen's kappa computed by hand from the confusion matrix
confusion_matrix = create_confusion_matrix(list_1=list_repo_1, list_2=list_repo_2)
po_, pe_ = observed_proportion(confusion_matrix), expected_proportion(confusion_matrix)
kappa_ = kappa(po_, pe_)

# Cross-checks with library implementations
kappa_sklearn = cohen_kappa_score(list_repo_1, list_repo_2)
df_final.loc["Repo provided", ("kappa score", "sklearn")] = kappa_sklearn

data = [list_repo_1, list_repo_2]
data_T = np.array(data).T
data_fleiss_ = aggregate_raters(data_T)
kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
df_final.loc["Repo provided", ("kappa score", "fleiss")] = kappa_fleiss_

# Parametric interval: kappa +/- 1.96 standard errors
sd_cohen_ = sd_cohen(po_, pe_)
se_cohen = sd_cohen_ / sqrt(N)
ci_half_width = 1.96 * se_cohen
write_stat(
    df_final,
    "Repo provided",
    "cohen",
    kappa_,
    kappa_ - ci_half_width,
    kappa_ + ci_half_width,
    se_cohen,
)

print(f"For {df_rating_1.loc[1, 40]} review :")
print(f"Number of reviews = {N}")
# print(f"kappa fleiss statsmodels = {kappa_fleiss_}")
# print(f"kappa cohen sklearn = {kappa_sklearn}")
# print(f"kappa cohen = {kappa_}")
print(f"kappa cohen bootstrap = {kappa_btp}")
print(f"ci bootstrap = [{low_btp}, {high_btp}]")
print(f"standard error (bootstrap) = {se_btp}")
# print(f"standard error (cohen) = {sd_cohen_ / sqrt(N)}")

For Repo provided and not empty review :
Number of reviews = 90
kappa cohen bootstrap = 1.0
ci bootstrap = [1.0, 1.0]
standard error (bootstrap) = 0.0


In [86]:
# Write the results table to a ';'-separated CSV file
output_directory = Path("../stats_inter_rater")
# Create the output folder on first use; an existing folder is left untouched
output_directory.mkdir(exist_ok=True)
path_inter_rater_stats = output_directory / 'inter_rater_stats.csv'
df_final.to_csv(path_inter_rater_stats, sep=";", index=True, encoding='utf-8')
# NOTE(review): the columns are sorted only after the CSV is written, so the
# file keeps whatever column order df_final had when this cell ran, and
# re-running the cell writes the sorted order - confirm this is intended.
df_final.sort_index(axis=1, inplace=True)

In [87]:
# Plain-text rendering of the results table (tabulate is imported in the first cell)
print(tabulate(df_final, headers='keys', tablefmt='psql'))

+----------------------------------------+----------------------------+------------------------+---------------------------+-----------------------+--------------------------------+----------------------------+-----------------------------+------------------------------+-----------------------+-------------------+
|                                        |   ('ci high', 'bootstrap') |   ('ci high', 'cohen') |   ('ci low', 'bootstrap') |   ('ci low', 'cohen') |   ('kappa score', 'bootstrap') |   ('kappa score', 'cohen') |   ('kappa score', 'fleiss') |   ('kappa score', 'sklearn') |   ('se', 'bootstrap') |   ('se', 'cohen') |
|----------------------------------------+----------------------------+------------------------+---------------------------+-----------------------+--------------------------------+----------------------------+-----------------------------+------------------------------+-----------------------+-------------------|
| Models and algorithms                  |          

In [88]:
# LaTeX output: wrap the results table in a minimal standalone article.
latex_path = Path('../latex/inter_raters_analysis.tex')
# Make sure the target folder exists, as is done for the CSV output.
latex_path.parent.mkdir(exist_ok=True)

# `with` closes the file even if rendering the table fails. Every LaTeX
# backslash is written as "\\": sequences such as "\d", "\m", "\s", "\c" and
# "\e" are invalid Python escapes that only worked by accident.
with open(latex_path, 'w', encoding='utf-8') as file1:
    file1.write("\\documentclass{article}\n\n")
    file1.write("\\usepackage{float}\n\n")
    file1.write("\\title{Inter raters analysis}\n\n")
    file1.write("\\begin{document}\n\n")
    file1.write("\\maketitle\n\n")

    # categories
    file1.write("\\section{Kappa table} \n\n")
    file1.write("\\begin{table}[H]\n\n")
    file1.write("\\centering\n\n")
    file1.write(tabulate(df_final, headers='keys', tablefmt='latex'))
    file1.write("\\caption{Different method to calculate the kappa scores with confidence intervals and standard errors}\n\n")
    file1.write("\\end{table}\n\n")
    file1.write("\n\n")

    file1.write("\\end{document}")