In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import random
from tabulate import tabulate
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from pathlib import Path
import os
import datetime
import json 

In [2]:

def kappa(po, pe):
    """
    Chance-corrected agreement (kappa) from two agreement proportions.

    Parameters
    ----------
    po : float
        Observed proportion of agreement.
    pe : float
        Proportion of agreement expected by chance.

    Returns
    -------
    float
        (po - pe) / (1 - pe). No guard is applied for pe == 1.
    """
    agreement_beyond_chance = po - pe
    max_agreement_beyond_chance = 1 - pe
    return agreement_beyond_chance / max_agreement_beyond_chance

def expected_proportion(matrix: pd.DataFrame):
    """
    Overall proportion of agreement expected by chance.

    Parameters
    ----------
    matrix : pd.DataFrame
        Square agreement table whose last row and last column hold the
        marginal totals (the layout produced by `create_confusion_matrix`).

    Returns
    -------
    float
        Sum over categories of (row total * column total). The result is on
        the squared scale of the inputs, so pass proportions in [0, 1] if a
        proportion is wanted (a table expressed in % must be divided by 100
        first).
    """
    # Positional access: the tables built in this notebook are labelled with
    # category names, so integer labels with `.loc` would raise KeyError.
    k = len(matrix) - 1
    return sum(matrix.iloc[i, k] * matrix.iloc[k, i] for i in range(k))

def observed_proportion(matrix: pd.DataFrame):
    """
    Overall proportion of observed agreement.

    Parameters
    ----------
    matrix : pd.DataFrame
        Square agreement table whose last row and last column hold the
        marginal totals (the layout produced by `create_confusion_matrix`).

    Returns
    -------
    float
        Sum of the diagonal cells, excluding the bottom-right "total" cell.
        The result is on the same scale as the table (proportion or %).
    """
    # Positional access: the tables built in this notebook are labelled with
    # category names, so integer labels with `.loc` would raise KeyError.
    k = len(matrix) - 1
    return sum(matrix.iloc[i, i] for i in range(k))

def create_confusion_matrix(list_1: list , list_2: list, list_attributs)-> pd.DataFrame:
    """
    Create the confusion matrix (in %) between two raters.

    Parameters
    ----------
    list_1, list_2 : list
        Ratings of the two raters, in the same subject order.
    list_attributs : list
        Possible rating labels; they become the rows (rater 1) and the
        columns (rater 2) of the matrix, followed by a "total" row/column.

    Returns
    -------
    pd.DataFrame
        (len(list_attributs) + 1) square table. Cells are percentages of the
        number of subjects, rounded to 2 decimals, when every pair of ratings
        belongs to `list_attributs`; otherwise the raw counts are returned
        unchanged (same behaviour as before). Empty input yields a table of
        zeros.

    Raises
    ------
    ValueError
        If the two raters did not rate the same number of subjects.
    """
    if len(list_1) != len(list_2):
        # Previously this case only printed a message and then crashed on an
        # unbound local variable; fail explicitly instead.
        raise ValueError(
            "reviewer 1 and 2 have not rated the same list of subjects "
            f"({len(list_1)} vs {len(list_2)} ratings)"
        )

    labels = list(list_attributs)
    size = len(labels)
    position = {label: idx for idx, label in enumerate(labels)}

    # Single pass over the paired ratings instead of one pass per label pair.
    counts = np.zeros((size + 1, size + 1))
    for rating_1, rating_2 in zip(list_1, list_2):
        row = position.get(rating_1)
        col = position.get(rating_2)
        if row is not None and col is not None:
            counts[row, col] += 1

    # Marginal totals in the last row / last column.
    counts[:size, size] = counts[:size, :size].sum(axis=1)
    counts[size, :size] = counts[:size, :size].sum(axis=0)
    counts[size, size] = counts[:size, :size].sum()

    matrix = pd.DataFrame(counts, index=labels + ["total"], columns=labels + ["total"])

    # Convert to % only when every subject was counted (all labels known).
    n_subjects = len(list_1)
    if n_subjects > 0 and counts[size, size] == n_subjects:
        matrix = (matrix * 100 / n_subjects).round(2)
    return matrix



In [3]:
def ci_bp(proportion, N):
    """
    Normal-approximation 95% confidence interval of a binomial proportion.

    Parameters
    ----------
    proportion : float
        Observed proportion, between 0 and 1.
    N : int
        Number of observations the proportion was computed on.

    Returns
    -------
    tuple of float
        (lower bound, upper bound), both expressed in %.
    """
    z_95 = 1.96
    half_width = z_95 * sqrt(proportion * (1 - proportion) / N)
    lower_percent = (proportion - half_width) * 100
    upper_percent = (half_width + proportion) * 100
    return lower_percent, upper_percent

In [4]:
def ci_bootstrap(data, val, num_resamples = 1000):
    """
    Bootstrap 95% confidence interval of the percentage of `val` in `data`.

    Parameters
    ----------
    data : list
        Observations to resample (with replacement).
    val : object
        Value whose frequency is estimated.
    num_resamples : int, optional
        Number of bootstrap resamples (default 1000).

    Returns
    -------
    tuple of float
        (2.5th percentile, 97.5th percentile) of the bootstrap distribution
        of the percentage of `val`, both expressed in %.

    Raises
    ------
    ValueError
        If `data` is empty.
    """
    sample_size = len(data)
    if sample_size == 0:
        raise ValueError("ci_bootstrap needs at least one observation in `data`")

    percentages = []
    for _ in range(num_resamples):
        resample = random.choices(data, k=sample_size)
        percentages.append(resample.count(val) * 100 / sample_size)

    # Symmetric 95% interval: 2.5 and 97.5 (the lower bound used to be 2.25).
    return np.percentile(percentages, 2.5), np.percentile(percentages, 97.5)


In [5]:
def kappa_fleiss_3(data_1, data_2, data_3):
    """
    Fleiss' kappa between three raters.

    Parameters
    ----------
    data_1, data_2, data_3 : sequence
        Ratings given by each of the three raters (assumed to be in the same
        subject order and of equal length).

    Returns
    -------
    float
        Fleiss' kappa computed once on the given ratings; no resampling is
        done here.
    """
    # One row per subject, one column per rater.
    ratings_per_subject = np.transpose(np.array([data_1, data_2, data_3]))
    # aggregate_raters returns several items; the first is passed to fleiss_kappa.
    aggregated = aggregate_raters(ratings_per_subject)
    return fleiss_kappa(aggregated[0])

def ci_bootstrap_3(data_1, data_2, data_3, num_resamples = 1000):
    """
    Confidence interval for a 3 raters kappa (bootstrap)

    Subjects (i.e. triplets of ratings) are resampled with replacement and
    Fleiss' kappa is recomputed on every resample.

    Parameters
    ----------
    data_1, data_2, data_3 : sequence
        Ratings given by each of the three raters, in the same subject order.
    num_resamples : int, optional
        Number of bootstrap resamples (default 1000).

    Returns
    -------
    float
        95% CI lower bound (2.5 centile of the sorted bootstrap distribution)
    float
        95% CI upper bound (97.5 centile of the sorted bootstrap distribution)
    """
    # One row per subject, one column per rater.
    ratings = np.array([data_1, data_2, data_3]).T
    n_subjects = len(ratings)

    bootstrap_kappas = []
    for _ in range(num_resamples):
        resample = np.array(random.choices(ratings, k=n_subjects))
        bootstrap_kappas.append(
            kappa_fleiss_3(
                resample[:, 0].astype(str),
                resample[:, 1].astype(str),
                resample[:, 2].astype(str),
            )
        )

    # Symmetric 95% interval: 2.5 and 97.5 (the lower bound used to be 2.25).
    return np.percentile(bootstrap_kappas, 2.5), np.percentile(bootstrap_kappas, 97.5)


In [6]:
# Names of the 0/1 categories. The position in this list selects the column
# read by the 0/1 statistics cell (column = review * 9 + 3 + position).
list_0_1= [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    "Code is or will be available"
    ]

# Rating labels of the "statement" columns. They are compared verbatim with
# the cell values of the rating file, so they must not be reformatted.
statement_list = [
    "3. (+) statement",
    "2. (-) statement ",  # the trailing space is used consistently by the comparisons below; presumably it matches the raw file
    "1. (none) statement",
    "0. Unusable (statement)",
]

# Rating labels of the "comments" columns (compared verbatim with the file values).
comments_list = [
    "4. (-/+) comments",
    "3. (+) comments",
    "2. (-) comments",
    "1. (none) comments",
    "0. Unusable (comments)",
]

# Labels of the meta-categories derived from the statement and comments ratings.
meta_categories_list = [
    "(+) meta",
    "(-) meta",
    "Unusable (meta)",
]

In [7]:
# Path to the tab-separated rating file of the first observer (the file has a
# .csv extension but is read with sep="\t").
# You can change it to analyze the ratings of another observer.
path_csv = "../human_rating/rating_90/rating_90_O.csv"
df_rating_1 = pd.read_csv(path_csv, sep = "\t", index_col=False, header= None)

# Every analysis below starts at row 2 (`.loc[2:]`), hence the `- 2`
# (presumably the first two rows are header rows; confirm against the file).
print(f"This notebook analyzes the 3 reviews of {len(df_rating_1)-2} different papers based on the file {path_csv}")

# Seed the global RNG used by the bootstrap functions so that the confidence
# intervals are reproducible with "Restart kernel & Run all".
BOOTSTRAP_SEED = 0
random.seed(BOOTSTRAP_SEED)

output_directory = Path("../miccai2023/stats_rating")
# parents/exist_ok: also works when "../miccai2023" does not exist yet and on re-run.
output_directory.mkdir(parents=True, exist_ok=True)
print(f"Outputs will be saved in {output_directory}.")

This notebook analyzes the 3 reviews of 90 different papers based on the file ../human_rating/rating_90/rating_90_O.csv
Outputs will be saved in ../miccai2023/stats_rating.


In [8]:
########################################################################
# Statistics on the rating for the 0/1 categories
########################################################################

# Column layout used below: the columns of two consecutive reviews are 9
# apart, and the first 0/1 category of the first review is in column 3.
N_REVIEWS_PER_PAPER = 3
COLUMNS_PER_REVIEW = 9
FIRST_CATEGORY_COLUMN = 3

# Create the output dataframe
df_category = pd.DataFrame(np.zeros((len(list_0_1),4)), index=list_0_1, columns=["number", "percent", "ci low", "ci high"])

for category, category_name in enumerate(list_0_1):
    # Pool the ratings of the 3 reviews for this category.
    all_reviews_1 = []
    for i in range(N_REVIEWS_PER_PAPER):
        column_id = i * COLUMNS_PER_REVIEW + FIRST_CATEGORY_COLUMN + category
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        all_reviews_1 = all_reviews_1 + list_review_1

    n_reviews = len(all_reviews_1)
    # Ratings are read as strings, hence the comparison with "1".
    count_ = all_reviews_1.count("1")
    percent_ = round(count_ * 100 / n_reviews, 2)

    df_category.loc[category_name, "number"] = count_
    df_category.loc[category_name, "percent"] = percent_

    print(f"For category {category_name}, {percent_}% of reviewers ({count_}/{n_reviews}) have commented on at least one of the items of the category.")

    proportion = count_ / n_reviews
    ci_bp_low, ci_bp_high = ci_bp(proportion, n_reviews)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(all_reviews_1, '1')

    # Rounded to 2 decimals, like the CI columns of the other result tables.
    df_category.loc[category_name, "ci low"] = round(ci_bootstrap_low, 2)
    df_category.loc[category_name, "ci high"] = round(ci_bootstrap_high, 2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()


# The number of pooled reviews is the same for every category; the value of the
# last loop iteration is used as the table header.
df_category = df_category.rename_axis(f"For {len(all_reviews_1)} reviews")

path_category = output_directory / '1-category.csv'
df_category.to_csv(path_category, index = True, sep=";", encoding='utf-8')

print(tabulate(df_category, headers='keys', tablefmt='psql'))



For category Models and algorithms, 28.89% of reviewers (78/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [23.48%, 34.30%]
confidence intervals (bootstrap): [23.51%, 34.08%]

For category Datasets, 33.33% of reviewers (90/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [27.71%, 38.96%]
confidence intervals (bootstrap): [27.78%, 38.89%]

For category Code, 46.67% of reviewers (126/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [40.72%, 52.62%]
confidence intervals (bootstrap): [40.74%, 52.96%]

For category Experimental results, 25.56% of reviewers (69/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [20.35%, 30.76%]
confidence intervals (bootstrap): [20.37%, 30.75%]

For category Error bars or statistical significance, 1.85% of reviewers (5

In [9]:
########################################################################
# Statistics on the rating for the statements category
########################################################################

# Output table: one row per statement label.
df_statement = pd.DataFrame(
    np.zeros((len(statement_list), 4)),
    index=statement_list,
    columns=["number", "percent", "ci low", "ci high"],
)

# Statement ratings of the 3 reviews (columns 9, 18 and 27), then pooled.
list_statement_1, list_statement_2, list_statement_3 = (
    df_rating_1.loc[2:, column].values.tolist() for column in (9, 18, 27)
)
reviews_statement = list_statement_1 + list_statement_2 + list_statement_3
N_statement = len(reviews_statement)

print(f"Statistics for statements (on {N_statement} reviewers):")
print()

# Frequency and confidence intervals of every statement label.
for rating in statement_list:
    count_ = reviews_statement.count(rating)
    percent_ = round(count_ * 100 / N_statement, 2)
    print(f"- {percent_}% of reviews ({count_}/{N_statement}) in category {rating}")

    proportion = count_ / N_statement
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_statement)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_statement, rating)

    df_statement.loc[rating, "number"] = count_
    df_statement.loc[rating, "percent"] = percent_
    df_statement.loc[rating, "ci low"] = round(ci_bootstrap_low, 2)
    df_statement.loc[rating, "ci high"] = round(ci_bootstrap_high, 2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

# Share of reviews that provided a statement, positive or negative (1) or not (0).
new_reviews_statement = [
    1 if label in ("3. (+) statement", "2. (-) statement ") else 0
    for label in reviews_statement
]

new_count_ = new_reviews_statement.count(1)
new_percent_ = round(new_count_ * 100 / N_statement, 2)
print(f"- {new_percent_}% of reviews ({new_count_}/{N_statement}) provided a statement")

proportion = new_count_ / N_statement
ci_bp_low, ci_bp_high = ci_bp(proportion, N_statement)
ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(new_reviews_statement, 1)
print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
print()

# Display and save the table.
df_statement = df_statement.rename_axis(f"For {N_statement} reviews")
print(tabulate(df_statement, headers='keys', tablefmt='psql'))
print()

path_statements = output_directory / '2-statements.csv'
df_statement.to_csv(path_statements, index = True, sep=";", encoding='utf-8')

# Agreement between the 3 reviews (Fleiss' kappa) and its bootstrap CI.
kappa_fleiss_statement = kappa_fleiss_3(list_statement_1, list_statement_2, list_statement_3)
print(f"kappa fleiss: {kappa_fleiss_statement}")

ci_low_fleiss_statement, ci_high_fleiss_statement = ci_bootstrap_3(list_statement_1, list_statement_2, list_statement_3)
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_statement:.2f}, {ci_high_fleiss_statement:.2f}]")
print()

# Pairwise confusion matrices between the reviews.
statement_pairs = [
    ("Review 1 VS review 2", list_statement_1, list_statement_2),
    ("Review 2 VS review 3", list_statement_2, list_statement_3),
    ("Review 1 VS review 3", list_statement_1, list_statement_3),
]
statement_matrices = []
for pair_title, first_review, second_review in statement_pairs:
    print(pair_title)
    pair_matrix = create_confusion_matrix(first_review, second_review, statement_list)
    print(tabulate(pair_matrix, headers='keys', tablefmt='psql'))
    print()
    statement_matrices.append(pair_matrix)
m1_statement, m2_statement, m3_statement = statement_matrices


Statistics for statements (on 270 reviewers):

- 48.89% of reviews (132/270) in category 3. (+) statement
confidence intervals (binomial proportion): [42.93%, 54.85%]
confidence intervals (bootstrap): [43.33%, 54.81%]

- 11.85% of reviews (32/270) in category 2. (-) statement 
confidence intervals (binomial proportion): [8.00%, 15.71%]
confidence intervals (bootstrap): [7.78%, 15.93%]

- 37.04% of reviews (100/270) in category 1. (none) statement
confidence intervals (binomial proportion): [31.28%, 42.80%]
confidence intervals (bootstrap): [31.11%, 42.96%]

- 2.22% of reviews (6/270) in category 0. Unusable (statement)
confidence intervals (binomial proportion): [0.46%, 3.98%]
confidence intervals (bootstrap): [0.74%, 4.07%]

- 60.74% of reviews (164/270) provided a statement
confidence intervals (binomial proportion): [54.92%, 66.57%]
confidence intervals (bootstrap): [54.99%, 66.30%]

+-------------------------+----------+-----------+----------+-----------+
| For 270 reviews         

In [10]:
# comments
# Output table: one row per comment label.
df_comments = pd.DataFrame(np.zeros((len(comments_list),4)), index=comments_list, columns=["number", "percent", "ci low", "ci high"])

# Comment ratings of the 3 reviews (columns 10, 19 and 28), then pooled.
list_comment_1 = df_rating_1.loc[2:, 10].values.tolist()
list_comment_2 = df_rating_1.loc[2:, 19].values.tolist()
list_comment_3 = df_rating_1.loc[2:, 28].values.tolist()
reviews_comments = list_comment_1 + list_comment_2 + list_comment_3

print(f"Statistics for comments (on {len(reviews_comments)} reviewers):")
print()
N_comments = len(reviews_comments)

# Frequency and confidence intervals of every comment label.
for rating in comments_list:
    count_ = reviews_comments.count(rating)
    percent_ = round(count_ * 100 / N_comments, 2)
    df_comments.loc[rating, "number"] = count_
    df_comments.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_comments}) in category {rating}")

    proportion = count_ / N_comments
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_comments)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_comments, rating)

    df_comments.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_comments.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

### percent and ci about reviews which provided a comment
provided_comment_labels = ("4. (-/+) comments", "3. (+) comments", "2. (-) comments")
new_reviews_comments = [
    1 if label in provided_comment_labels else 0
    for label in reviews_comments
]

new_count_ = new_reviews_comments.count(1)
# Denominator is the number of comment ratings (it used to be N_statement,
# a variable of the statements cell).
new_percent_ = round(new_count_ * 100 / N_comments, 2)
print(f"- {new_percent_}% of reviews ({new_count_}/{N_comments}) provided a comment")

proportion = new_count_ / N_comments
ci_bp_low, ci_bp_high = ci_bp(proportion, N_comments)
ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(new_reviews_comments, 1)
print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
print()


df_comments = df_comments.rename_axis(f"For {N_comments} reviews")
print(tabulate(df_comments, headers='keys', tablefmt='psql'))

path_comments = output_directory / '3-comments.csv'
df_comments.to_csv(path_comments, index = True, sep=";", encoding='utf-8')

# Agreement between the 3 reviews (Fleiss' kappa) and its bootstrap CI.
kappa_fleiss_comment = kappa_fleiss_3(list_comment_1, list_comment_2, list_comment_3)
print(f"kappa fleiss: {kappa_fleiss_comment}")
ci_low_fleiss_comment, ci_high_fleiss_comment = ci_bootstrap_3(list_comment_1, list_comment_2, list_comment_3)
# Kappa is not a percentage: no "%" suffix (consistent with the statements cell).
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_comment:.2f}, {ci_high_fleiss_comment:.2f}]")
print()

# Pairwise confusion matrices between the reviews.
print("Review 1 VS review 2")
m1_comment = create_confusion_matrix(list_comment_1, list_comment_2, comments_list)
print(tabulate(m1_comment, headers='keys', tablefmt='psql'))
print()
print("Review 2 VS review 3")
m2_comment = create_confusion_matrix(list_comment_2, list_comment_3, comments_list)
print(tabulate(m2_comment, headers='keys', tablefmt='psql'))
print()
print("Review 1 VS review 3")
m3_comment = create_confusion_matrix(list_comment_1, list_comment_3, comments_list)
print(tabulate(m3_comment, headers='keys', tablefmt='psql'))
print()


Statistics for comments (on 270 reviewers):

- 22.96% of reviews (62/270) in category 4. (-/+) comments
confidence intervals (binomial proportion): [17.95%, 27.98%]
confidence intervals (bootstrap): [18.15%, 28.15%]

- 30.74% of reviews (83/270) in category 3. (+) comments
confidence intervals (binomial proportion): [25.24%, 36.24%]
confidence intervals (bootstrap): [25.19%, 36.30%]

- 18.52% of reviews (50/270) in category 2. (-) comments
confidence intervals (binomial proportion): [13.89%, 23.15%]
confidence intervals (bootstrap): [14.07%, 23.34%]

- 22.22% of reviews (60/270) in category 1. (none) comments
confidence intervals (binomial proportion): [17.26%, 27.18%]
confidence intervals (bootstrap): [17.21%, 27.04%]

- 5.56% of reviews (15/270) in category 0. Unusable (comments)
confidence intervals (binomial proportion): [2.82%, 8.29%]
confidence intervals (bootstrap): [2.96%, 8.89%]

- 72.22% of reviews (195/270) provided a comment
confidence intervals (binomial proportion): [66.8

In [19]:
# statements and comments

# Cross-tabulation (counts) of the statement rating against the comment rating
# of every review, with marginal totals.
df_stat_com = pd.DataFrame(np.zeros((len(statement_list)+1,len(comments_list)+1)), index=statement_list+["total"], columns=comments_list+["total"])
df_stat_com.index.rename("number", inplace = True)
for statement_label, comment_label in zip(reviews_statement, reviews_comments):
    df_stat_com.loc[statement_label, comment_label] += 1
    df_stat_com.loc["total", comment_label] += 1
    df_stat_com.loc[statement_label, "total"] += 1
df_stat_com.loc["total", "total"] = len(reviews_comments)

print("Analysis of statements VS comments")
print(tabulate(df_stat_com, headers='keys', tablefmt='psql'))
print()

# Among the reviews with a positive statement: 1 when no usable comment backs
# the statement, 0 otherwise. The labels are the exact entries of comments_list
# (the previous literal "0. (Unusable) comments" could never match).
no_comment_labels = ("1. (none) comments", "0. Unusable (comments)")
reviews_statement_no_comments = [
    1 if comment_label in no_comment_labels else 0
    for statement_label, comment_label in zip(reviews_statement, reviews_comments)
    if statement_label == "3. (+) statement"
]

new_count_ = reviews_statement_no_comments.count(1)
new_percent_ = round(new_count_ * 100 / len(reviews_statement_no_comments), 2)
print(f"- {new_percent_}% of reviewers ({new_count_}/{len(reviews_statement_no_comments)}) made a positive statement but didn't provide a comment to substantiate their statement.")
print()

proportion = new_count_ / len(reviews_statement_no_comments)
ci_bp_low, ci_bp_high = ci_bp(proportion, len(reviews_statement_no_comments))
ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_statement_no_comments, 1)
print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
print()

path_statements_comments = output_directory / '4-statements_comments.csv'
df_stat_com.to_csv(path_statements_comments, index = True, sep=";", encoding='utf-8')

# Same table in % of all reviews. A new frame is built so that the counts
# table (and its "number" header) is left untouched.
df_stat_com_percent = (df_stat_com * 100 / len(reviews_comments)).round(2).rename_axis("in %")

print(tabulate(df_stat_com_percent, headers='keys', tablefmt='psql'))


Analysis of statements VS comments
+-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------+
| number                  |   4. (-/+) comments |   3. (+) comments |   2. (-) comments |   1. (none) comments |   0. Unusable (comments) |   total |
|-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------|
| 3. (+) statement        |                  21 |                52 |                 7 |                   52 |                        0 |     132 |
| 2. (-) statement        |                   8 |                 0 |                21 |                    3 |                        0 |      32 |
| 1. (none) statement     |                  33 |                31 |                22 |                    5 |                        9 |     100 |
| 0. Unusable (statement) |                   0 |                

In [12]:
# meta categories
# Output table: one row per meta-category.
df_meta = pd.DataFrame(np.zeros((len(meta_categories_list),4)), index=meta_categories_list, columns=["number", "percent", "ci low", "ci high"])

# Meta-categories stored in the rating file for the 3 reviews (columns 29, 30, 31).
list_meta_1 = df_rating_1.loc[2:, 29].values.tolist()
list_meta_2 = df_rating_1.loc[2:, 30].values.tolist()
list_meta_3 = df_rating_1.loc[2:, 31].values.tolist()

reviews_meta = list_meta_1 + list_meta_2 + list_meta_3

# Recompute the meta-category of every review from its statement and comment
# ratings: the statement decides when it is (+) or (-), otherwise the comment does.
reviews_meta_python = []
for i in range(len(reviews_statement)):
    if reviews_statement[i] == "3. (+) statement":
        reviews_meta_python.append("(+) meta")
    elif reviews_statement[i] == "2. (-) statement ":
        reviews_meta_python.append("(-) meta")
    elif reviews_comments[i] == "3. (+) comments":
        reviews_meta_python.append("(+) meta")
    elif reviews_comments[i] == "2. (-) comments" or reviews_comments[i] =="4. (-/+) comments":
        reviews_meta_python.append( "(-) meta")
    else:
        reviews_meta_python.append("Unusable (meta)")

# When the file disagrees with the recomputed values, the recomputed values are
# used for the frequency statistics below.
# NOTE(review): the kappa and the confusion matrices further down still use the
# values read from the file (list_meta_1/2/3) - confirm this is intended.
if not (reviews_meta==reviews_meta_python):
    reviews_meta = reviews_meta_python
    print("Meta categories calculated values aren't the same as in the provided tsv file.")


print(f"Statistics for meta-categories (on {len(reviews_meta)} reviewers):")
print()

N_meta = len(reviews_meta)
for rating in meta_categories_list:
    count_ = reviews_meta.count(rating)
    percent_ = round(count_ * 100 / N_meta, 2)
    df_meta.loc[rating, "number"] = count_
    df_meta.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_meta}) in category {rating}")

    proportion = count_ / N_meta
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_meta)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_meta, rating)

    df_meta.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_meta.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    # NOTE(review): this interval comes from ci_bp, i.e. the same per-category
    # binomial formula labelled "binomial proportion" in the other cells.
    print(f"confidence intervals (multinomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

df_meta = df_meta.rename_axis(f"For {N_meta} reviews")
print(tabulate(df_meta, headers='keys', tablefmt='psql'))

path_meta = output_directory / '5-meta.csv'
df_meta.to_csv(path_meta, index = True, sep=";", encoding='utf-8')

# Agreement between the 3 reviews (Fleiss' kappa) and its bootstrap CI.
kappa_fleiss_meta = kappa_fleiss_3(list_meta_1, list_meta_2, list_meta_3)
print(f"kappa fleiss: {kappa_fleiss_meta}")
ci_low_fleiss_meta, ci_high_fleiss_meta = ci_bootstrap_3(list_meta_1, list_meta_2, list_meta_3)
# Kappa is not a percentage: no "%" suffix (consistent with the statements cell).
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_meta:.2f}, {ci_high_fleiss_meta:.2f}]")
print()

# Pairwise confusion matrices between the reviews.
print("Review 1 VS review 2")
m1_meta = create_confusion_matrix(list_meta_1, list_meta_2, meta_categories_list)
print(tabulate(m1_meta, headers='keys', tablefmt='psql'))
print()
print("Review 2 VS review 3")
m2_meta = create_confusion_matrix(list_meta_2, list_meta_3, meta_categories_list)
print(tabulate(m2_meta, headers='keys', tablefmt='psql'))
print()
print("Review 1 VS review 3")
m3_meta = create_confusion_matrix(list_meta_1, list_meta_3, meta_categories_list)
print(tabulate(m3_meta, headers='keys', tablefmt='psql'))
print()




Statistics for meta-categories (on 270 reviewers):

- 60.37% of reviews (163/270) in category (+) meta
confidence intervals (multinomial proportion): [54.54%, 66.20%]
confidence intervals (bootstrap): [54.07%, 65.93%]

- 32.22% of reviews (87/270) in category (-) meta
confidence intervals (multinomial proportion): [26.65%, 37.80%]
confidence intervals (bootstrap): [26.47%, 37.41%]

- 7.41% of reviews (20/270) in category Unusable (meta)
confidence intervals (multinomial proportion): [4.28%, 10.53%]
confidence intervals (bootstrap): [4.44%, 10.74%]

+-------------------+----------+-----------+----------+-----------+
| For 270 reviews   |   number |   percent |   ci low |   ci high |
|-------------------+----------+-----------+----------+-----------|
| (+) meta          |      163 |     60.37 |    54.07 |     65.93 |
| (-) meta          |       87 |     32.22 |    26.47 |     37.41 |
| Unusable (meta)   |       20 |      7.41 |     4.44 |     10.74 |
+-------------------+----------+-----

In [13]:
# kappa fleiss table

# One row per rating family: Fleiss' kappa and the bounds of its bootstrap CI,
# as computed in the statements, comments and meta-categories cells.
kappa_rows = [
    ("Statements", kappa_fleiss_statement, ci_low_fleiss_statement, ci_high_fleiss_statement),
    ("Comments", kappa_fleiss_comment, ci_low_fleiss_comment, ci_high_fleiss_comment),
    ("Meta-categories", kappa_fleiss_meta, ci_low_fleiss_meta, ci_high_fleiss_meta),
]

df_kappas = pd.DataFrame(
    [[float(value) for value in values] for _, *values in kappa_rows],
    index=[row_name for row_name, *_ in kappa_rows],
    columns=["kappa", "ci low", "ci high"],
)

# Round for display/export and label the index column.
df_kappas = df_kappas.round(2).rename_axis("kappa fleiss")
print(tabulate(df_kappas, headers='keys', tablefmt='psql'))

path_kappas = output_directory / '6-kappas.csv'
df_kappas.to_csv(path_kappas, index = True, sep=";", encoding='utf-8')

+-----------------+---------+----------+-----------+
| kappa fleiss    |   kappa |   ci low |   ci high |
|-----------------+---------+----------+-----------|
| Statements      |   -0.05 |    -0.14 |      0.03 |
| Comments        |    0.05 |    -0.03 |      0.12 |
| Meta-categories |    0.02 |    -0.1  |      0.13 |
+-----------------+---------+----------+-----------+


In [14]:
# Code
# Column 39 is tested for an "http" link and column 40 for a "1"/"0" flag
# (repository usable or not, judging by the messages printed below).
column_id = 39
code_link_df = df_rating_1.loc[2:, column_id].values.tolist()
column_id = 40
code_avail_df = df_rating_1.loc[2:, column_id].values.tolist()

code_list = [
    "bad link",
    "good link",
    "no link",
]

print(f"Statistics for code (on {len(code_link_df)} papers):")
print()

# Exactly one status per paper, so that list_code stays aligned with the other
# per-paper lists. A link with an unexpected flag used to be dropped silently.
list_code = []
for paper_index, (link, available) in enumerate(zip(code_link_df, code_avail_df)):
    if not str(link).startswith("http"):
        list_code.append("no link")
    elif available == "1":
        list_code.append("good link")
    elif available == "0":
        list_code.append("bad link")
    else:
        raise ValueError(
            f"Unexpected availability flag {available!r} for paper #{paper_index} "
            f"(link {link!r}); expected '1' or '0'."
        )

nb_papers= len(list_code)


ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code, "good link")
print(f" - for {(list_code.count('good link') *100 / nb_papers):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('good link')}/{nb_papers}), the repository exists and is not empty.")


ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code, "bad link")
print(f" - for {(list_code.count('bad link') *100 / nb_papers):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('bad link')}/{nb_papers}), the link to the repository is provided but it is empty or wrong.")
ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code, "no link")
print(f" - for {(list_code.count('no link')*100 / nb_papers):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('no link')}/{nb_papers}), no link/code was provided.")
print()

# 1 when a link was provided (usable or not), 0 otherwise.
list_code_binary = [1 if status in ("good link", "bad link") else 0 for status in list_code]

ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code_binary, 1)
print(f" - for {(100*list_code_binary.count(1)/len(list_code_binary)):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code_binary.count(1)}/{len(list_code_binary)}), an associated repository for the code was provided.")
print()

# Share of bad links among the papers that provided a link; skipped when no
# paper provided one (the ratio would be undefined).
list_code_bis = [x for x in list_code if (x=="good link" or x=="bad link")]
if list_code_bis:
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code_bis, "bad link")
    print(f" - for {(100*list_code_bis.count('bad link')/len(list_code_bis)):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers for which a link was provided ({list_code.count('bad link')}/{len(list_code_bis)}), the link to the repository is provided but it is empty or wrong.")
    print()




Statistics for code (on 90 papers):

 - for 47.78% (95%CI (bootstrap): [37.78%, 57.78%]) of the papers (43/90), the repository exists and is not empty.
 - for 20.00% (95%CI (bootstrap): [11.11%, 28.89%]) of the papers (18/90), the link to the repository is provided but it is empty or wrong.
 - for 32.22% (95%CI (bootstrap): [23.33%, 41.11%]) of the papers (29/90), no link/code was provided.

 - for 67.78% (95%CI (bootstrap): [57.78%, 76.67%]) of the papers (61/90), an associated repository for the code was provided.

 - for 29.51% (95%CI (bootstrap): [18.03%, 40.98%]) of the papers for which a link was provided (18/61), the link to the repository is provided but it is empty or wrong.



In [15]:
# repository and review 

list_code_1 = df_rating_1.loc[2:, 5].values.tolist()
list_code_2 = df_rating_1.loc[2:, 14].values.tolist()
list_code_3 = df_rating_1.loc[2:, 23].values.tolist()

list_code_review = []
for i in range(len(list_code_1)):
    x = list_code_1[i]
    if x== '0' and x==list_code_2[i] and x==list_code_3[i] :
        list_code_review.append("no info")
    else :
        list_code_review.append('code promised')


ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(list_code_review, "code promised")
print(f"For {(list_code_review.count('code promised')*100/len(list_code_review)):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code_review.count('code promised')}/{len(list_code_review)}), at least one of the reviewers said that the code was available.")
print()

# Cross-tabulate, per paper, what the reviews reported about the code (rows)
# against the state of the code link (columns). A combined
# "<review label> <link label>" string is also recorded for every counted
# paper in list_code_final, which the later summaries filter on.
review_labels = ["no info", "code promised"]

# The table shape is derived from the labels instead of being hard-coded, so
# it stays valid if code_list gains or loses a category. Counts are kept as
# floats so that the exported CSV keeps its existing format.
df_code_reviews = pd.DataFrame(
    np.zeros((len(review_labels) + 1, len(code_list) + 1)),
    index=review_labels + ["total"],
    columns=code_list + ["total"],
)

list_code_final = []
for i in range(len(list_code)):
    link_label = list_code[i]
    review_label = list_code_review[i]
    # Papers whose labels are not part of the table are left out of both the
    # table and list_code_final (same as the previous if-chain).
    if link_label in code_list and review_label in review_labels:
        df_code_reviews.loc[review_label, link_label] += 1
        list_code_final.append(f"{review_label} {link_label}")

# Marginal totals. Values are assigned positionally (to_numpy) so that no
# index alignment is involved.
df_code_reviews.loc[review_labels, "total"] = (
    df_code_reviews.loc[review_labels, code_list].sum(axis=1).to_numpy()
)
df_code_reviews.loc["total", code_list] = (
    df_code_reviews.loc[review_labels, code_list].sum(axis=0).to_numpy()
)
df_code_reviews.loc["total", "total"] = df_code_reviews.loc[review_labels, "total"].sum()

# Papers for which the reviews reported the code as available
# ("code promised"), broken down by the actual state of the code link.
good_review_list = [label for label in list_code_final if label.startswith("code promised")]
nb_good_reviews = len(good_review_list)

# Binary view of the same papers: 1 when the promised code had a good link,
# 0 when it was missing (no link or bad link).
good_reviews_list_bis = [
    1 if label == "code promised good link" else 0 for label in good_review_list
]

print(f"For these {nb_good_reviews} papers:")

if nb_good_reviews == 0:
    # Every proportion below divides by nb_good_reviews; without this guard an
    # empty selection ends the cell with a ZeroDivisionError.
    print(" - no breakdown can be computed because no paper was reported as having its code available.")
    print()
else:
    # (link label, end of the printed sentence); the order is kept so that the
    # bootstrap calls happen in the same sequence as before.
    good_review_breakdown = [
        ("good link", "and it was."),
        ("bad link", "even if the link was leading to an error message or to an empty repository."),
        ("no link", "even if no link/code was provided."),
    ]
    for link_label, outcome in good_review_breakdown:
        combined_label = f"code promised {link_label}"
        # Counted from the list so the count is printed as an integer
        # ("37/78") rather than as the float stored in df_code_reviews ("37.0/78").
        nb_combined = good_review_list.count(combined_label)
        ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(good_review_list, combined_label)
        print(f" - for {(nb_combined*100 / nb_good_reviews):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({nb_combined}/{nb_good_reviews}), at least one of the reviewers said that the code was available {outcome}")
    print()

    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(good_reviews_list_bis, 0)
    print(f" - for {(good_reviews_list_bis.count(0)*100 / nb_good_reviews):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({good_reviews_list_bis.count(0)}/{nb_good_reviews}), at least one of the reviewers said that the code was available even if the code was actually missing in the published version (no link, broken link or empty repository)")
print()


# Papers for which the reviews gave no indication that code was available
# ("no info"), and how many of them nevertheless had a good link.
bad_review_list = [label for label in list_code_final if label.startswith("no info")]
nb_bad_reviews = len(bad_review_list)

if nb_bad_reviews == 0:
    # All papers were reported as "code promised": the proportion below would
    # divide by zero, so report the situation instead of crashing the cell.
    print(" - no paper was reported without code, so this proportion cannot be computed.")
else:
    # Counted from the list so the count is printed as an integer ("6/12")
    # rather than as the float stored in df_code_reviews ("6.0/12").
    nb_bad_reviews_good_link = bad_review_list.count("no info good link")
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(bad_review_list, "no info good link")
    print(f" - for {(nb_bad_reviews_good_link*100 / nb_bad_reviews):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({nb_bad_reviews_good_link}/{nb_bad_reviews}), the reviewers said that the code wasn't available even if it was.")
print()
# Legend for the two tables printed below. The wording uses the actual row and
# column labels of the tables ("code promised"/"no info", "good link"/
# "bad link"/"no link").
print("The tables below indicate, for each paper, if at least one reviewer said that the \n code was/will be available (code promised) or not (no info), and whether the link was present and non-empty (good link), \n present but broken or leading to an empty repository (bad link) or whether there was no link (no link).")
print()

# Table of counts: label the index, display it and export it.
df_code_reviews.index.name = "code availableness (in number)"
print(tabulate(df_code_reviews, headers='keys', tablefmt='psql'))

path_code = output_directory / '7-code_number.csv'
df_code_reviews.to_csv(path_code, index=True, sep=";", encoding='utf-8')

# Table of percentages over all papers. It is built as a NEW frame with its
# own index label: aliasing df_code_reviews and renaming its index in place
# would also relabel the counts table as "(in %)".
df_code_review_percent = (df_code_reviews * 100 / len(list_code)).rename_axis(
    index="code availableness (in %)"
)

path_code_percent = output_directory / '8-code_percent.csv'
df_code_review_percent.to_csv(path_code_percent, index=True, sep=";", encoding='utf-8')

print(tabulate(df_code_review_percent, headers='keys', tablefmt='psql'))



For 86.67% (95%CI (bootstrap): [78.89%, 93.33%]) of the papers (78/90), at least one of the reviewers said that the code was available.



For these 78 papers:
 - for 47.44% (95%CI (bootstrap): [36.51%, 58.97%]) of the papers (37.0/78), at least one of the reviewers said that the code was available and it was.
 - for 20.51% (95%CI (bootstrap): [11.54%, 29.49%]) of the papers (16.0/78), at least one of the reviewers said that the code was available even if the link was leading to an error message or to an empty repository.
 - for 32.05% (95%CI (bootstrap): [21.79%, 42.31%]) of the papers (25.0/78), at least one of the reviewers said that the code was available even if no link/code was provided.

 - for 52.56% (95%CI (bootstrap): [41.03%, 64.10%]) of the papers (41/78), at least one of the reviewers said that the code was available even if the code was actually missing in the published version (no link, broken link or empty repository)

 - for 50.00% (95%CI (bootstrap): [25.00%, 75.00%]) of the papers (6.0/12), the reviewers said that the code wasn't available even if it was.

The tables below indicate, for each paper if 

In [16]:
# Metadata describing this run, written as JSON next to the generated tables.
data = {}
# NOTE(review): the "- 2" presumably discards two non-paper rows of
# df_rating_1, and "* 3" presumably reflects three reviews per paper - confirm.
data["nb_papers"] = len(df_rating_1) - 2
data["nb_reviews"] = data["nb_papers"] * 3
# "date" is the local calendar date, "time" is a UTC timestamp (unchanged).
data["date"] = str(datetime.date.today())
# Same naive-UTC text format as the deprecated datetime.utcnow().
data["time"] = str(datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None))
# Stored as text: a pathlib.Path value is not JSON serializable and would make
# json.dumps raise TypeError (skipkeys only applies to dictionary keys).
data["path_rating"] = str(path_csv)


json_data = json.dumps(data, skipkeys=True, indent=4)
json_path = output_directory / "data.json"
with open(json_path, "w", encoding="utf-8") as f:
    f.write(json_data)