In [65]:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
import numpy as np
from math import sqrt
import random

In [66]:
# Enter the path to the tsv file with the rating from the first reviwer
path_tsv = "../rating/rating_E.tsv"
df_rating_1 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)
df_rating_1 = df_rating_1.dropna()

In [67]:
# Enter the path to the tsv file with the rating from the second reviwer
path_tsv = "../rating/rating_O.tsv"
df_rating_2 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)
df_rating_2 = df_rating_2.dropna()

In [68]:
# list of categories you want to make statistics for
list_categories = [
        "Models and algorithms",
        "Datasets",
        "Code",
        "Experimental results",
        "Error bars or statistical significance",
        "Statement",
        "Comments",
        "Meta-categories",
    ]
list_methods = ["bootstrap", "cohen", "fleiss"]

In [69]:
# function toi create the DataFrame 
def create_dataframe(list_categories:list, list_methods:list):

    list_stats = ["kappa score", "confidence interval low", "confidence interval high", "standard error"]

    index_line = pd.MultiIndex.from_product([list_categories, ["review 1", "review 2", "review 3"]], names=["category", "review"])
    index_column = pd.MultiIndex.from_product( [list_stats, list_methods], names=["stat", "method"])

    return pd.DataFrame(index=index_line, columns=index_column)

df_final = create_dataframe(list_categories=list_categories, list_methods=list_methods)
print(df_final)

stat                                            kappa score               \
method                                            bootstrap cohen fleiss   
category                               review                              
Models and algorithms                  review 1         NaN   NaN    NaN   
                                       review 2         NaN   NaN    NaN   
                                       review 3         NaN   NaN    NaN   
Datasets                               review 1         NaN   NaN    NaN   
                                       review 2         NaN   NaN    NaN   
                                       review 3         NaN   NaN    NaN   
Code                                   review 1         NaN   NaN    NaN   
                                       review 2         NaN   NaN    NaN   
                                       review 3         NaN   NaN    NaN   
Experimental results                   review 1         NaN   NaN    NaN   
            

In [70]:
# boostrap function for cohen kappa score 
def bootstrap_cqk(y_true, y_pred, quad=False, num_resamples = 999):
    Y = np.array([y_true, y_pred]).T

    weighted_kappas = []
    for i in range(num_resamples):
        Y_resample = np.array(random.choices(Y, k=len(Y)))
        y_true_resample = Y_resample[:, 0]
        y_pred_resample = Y_resample[:, 1]
        if quad==False:
            weighted_kappa = cohen_kappa_score(y_true_resample.astype(str), y_pred_resample.astype(str))
        else: 
            weighted_kappa = cohen_kappa_score(y_true_resample.astype(str), y_pred_resample.astype(str), weights='quadratic')
        weighted_kappas.append(weighted_kappa)

    return np.mean(weighted_kappas), np.std(weighted_kappas), np.percentile(weighted_kappas, 2.25), np.percentile(weighted_kappas, 97.5)



In [71]:
# function that return the list of differents attributs
def count(list_: list):
    results_list = []
    for i in list_:
        if i not in results_list:
            results_list.append(i)
    return results_list

In [72]:
# function to create confucion matrix 
def create_matrix(list_1: list , list_2: list)-> pd.DataFrame:
    if not len(list_1)==len(list_2):
        print("reviwer 1 and 2 may haven't rated the same list of subjects")

    else:
        list_attributs = count(list_1)

        size = len(list_attributs)
        matrix = pd.DataFrame(np.zeros((size + 1, size + 1)))
        for k in range(size):
            for l in range(size):
                att_1 = list_attributs[k]
                att_2 = list_attributs[l]
                for i in range(len(list_1)):
                    if (list_1[i]== att_1):
                        if list_2[i] == att_2 :
                                matrix.loc[k,l]+=1
        for i in range(size):
            for j in range(size):
                matrix.loc[size, i] += matrix.loc[j, i]
                matrix.loc[i, size] += matrix.loc[i, j]
        for i in range(size):      
            matrix.loc[size, size] += matrix.loc[i, size]
        matrix = matrix / len(list_1)
    return matrix

In [73]:
# function to calculate the overall proportion of agreement expected by chance
def expected_proportion(matrix: pd.DataFrame):
    pe = 0
    k = len(matrix)-1
    for i in range(k):
        pe += matrix.loc[i, k] * matrix.loc[k, i]
    return pe

# function to calculate the overall proportion of observed agreement 
def observed_proportion(matrix: pd.DataFrame):
    po = 0
    k = len(matrix) - 1
    for i in range(k):
        po += matrix.loc[i, i]
    return po

In [74]:
# function to calculate the standard error
def se_cohen(po, pe, N):
    se_= sqrt((po*(1-po))/((1-pe)*(1-pe)))
    return se_ /sqrt(N)

In [75]:

def se_fleiss(po, pe, matrix, N):
    y1 = 0
    y2 = 0 
    k = len(matrix) -1

    for i in range (k):
        for j in range(k):
            if j != i:
                carre = (matrix.loc[k,i] + matrix.loc[j,k])
                carre = carre*carre
                y1 += matrix.loc[i,j] * carre
        carre2 = (matrix.loc[k,i] + matrix.loc[i,k])  
        carre2 = carre2*carre2   
        y2 += matrix.loc[i,i] * carre2
    y3 = ((po*pe) - (2*pe) + po)
    y3 = y3*y3
    x = (po*(1-pe)*(1-pe)) + ((1-po)*(1-po)* y1) - (2*(1-pe)*(1-po)*y2) - y3

    
    if x < 0:
        x = 0
        print(x)
    res = sqrt(x)
    se_ = res/((1-pe)*(1-pe))
    return se_ / sqrt(N)



In [76]:
# function to calculcate the kappa score
def kappa(po, pe):
    return (po-pe)/(1-pe)

In [77]:
# function to write stats in the dataframe
def write_stat(df_final, category, review, method, kappa_, low_, high_, se_):
    df_final.loc[(category, review), ("kappa score", method)]=kappa_
    df_final.loc[(category, review), ("confidence interval low", method)]=low_
    df_final.loc[(category, review), ("confidence interval high", method)]=high_
    df_final.loc[(category, review), ("standard error", method)]=se_

In [78]:

for category in range(len(list_categories)):
    all_reviews_1 = []
    all_reviews_2 = []
    for i in range(3):

        if list_categories[category] == "Meta-categories":
            column_id = i + 20
        else: 
            column_id = i*8 + 3 + category
        
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        list_review_2 = df_rating_2.loc[2:, column_id].values.tolist()
        N = len(list_review_1)
        all_reviews_1 = all_reviews_1 + list_review_1
        all_reviews_2 = all_reviews_2 + list_review_2

        if "bootstrap" in list_methods:
            kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_review_1, y_pred=list_review_2)
            write_stat(df_final, list_categories[category], f"review {i + 1}", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)
        
        if "cohen" in list_methods or "fleiss" in list_methods:
            confusion_matrix = create_matrix(list_1=list_review_1, list_2=list_review_2)
            po_ = observed_proportion(confusion_matrix)
            pe_ = expected_proportion(confusion_matrix)
            kappa_ = kappa(po_, pe_)
            if "cohen" in list_methods:
                se_cohen_ = se_cohen(po_, pe_, N)
                write_stat(df_final, list_categories[category], f"review {i + 1}", "cohen", kappa_, -1.96 * se_cohen_ + kappa_, 1.96 * se_cohen_ + kappa_, se_cohen_)
            if "fleiss" in list_methods:
                se_fleiss_ = se_fleiss(po_, pe_, confusion_matrix, N)
                write_stat(df_final, list_categories[category], f"review {i + 1}", "fleiss", kappa_, -1.96 * se_fleiss_ + kappa_, 1.96 * se_fleiss_ + kappa_, se_fleiss_)

    # all reviews 
    N = len(all_reviews_1)
    if "bootstrap" in list_methods:
        kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=all_reviews_1, y_pred=all_reviews_2)
        write_stat(df_final, list_categories[category], "all reviews", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)
    if "cohen" in list_methods or "fleiss" in list_methods:
        confusion_matrix = create_matrix(list_1=all_reviews_1, list_2=all_reviews_2)
        po_ = observed_proportion(confusion_matrix)
        pe_ = expected_proportion(confusion_matrix)
        kappa_ = kappa(po_, pe_)
        if "cohen" in list_methods:
            se_cohen_ = se_cohen(po_, pe_, N)
            write_stat(df_final, list_categories[category], "all reviews", "cohen", kappa_, -1.96 * se_cohen_ + kappa_, 1.96 * se_cohen_ + kappa_, se_cohen_)
        if "fleiss" in list_methods:
            se_fleiss_ = se_fleiss(po_, pe_, confusion_matrix, N)
            write_stat(df_final, list_categories[category], "all reviews", "fleiss", kappa_, -1.96 * se_fleiss_ + kappa_, 1.96 * se_fleiss_ + kappa_, se_fleiss_)

# add agreement review
if "Agreement" in list_categories: 

    list_agreement_1 = df_rating_1.loc[2:, 29].values.tolist()
    list_agreement_2 = df_rating_2.loc[2:, 29].values.tolist()
    N = len(list_agreement_1)
    if "bootstrap" in list_methods:
        kappa_btp, se_btp, low_btp, high_btp = bootstrap_cqk(y_true=list_agreement_1, y_pred=list_agreement_2)
        write_stat(df_final, "Agreement", "all reviews", "bootstrap", kappa_btp, low_btp, high_btp, se_btp)

    if "cohen" in list_methods or "fleiss" in list_methods:
        confusion_matrix = create_matrix(list_1=list_agreement_1, list_2=list_agreement_2)
        po_ = observed_proportion(confusion_matrix)
        pe_ = expected_proportion(confusion_matrix)
        kappa_ = kappa(po_, pe_)

        if "cohen" in list_methods:
            se_cohen_ = se_cohen(po_, pe_, N)
            write_stat(df_final, "Agreement", "all reviews", "cohen", kappa_, -1.96 * se_cohen_ + kappa_, 1.96 * se_cohen_ + kappa_, se_cohen_)
        if "fleiss" in list_methods:        
            se_fleiss_ = se_fleiss(po_, pe_, confusion_matrix, N)
            write_stat(df_final, "Agreement", "all reviews", "fleiss", kappa_, -1.96 * se_fleiss_ + kappa_, 1.96 * se_fleiss_ + kappa_, se_fleiss_)

df_final.sort_index(axis=0, ascending=True, inplace=True)
df_final.to_csv("../rating/rating_analysis.csv", index = True, sep=";", encoding='utf-8')



0
0
0
0


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

0
0


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

0
