In [19]:
import pandas as pd
import numpy as np
from math import sqrt
import random
from tabulate import tabulate
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters


In [20]:

def kappa(po, pe):
    """
    Calculate the kappa score
    """
    return (po-pe)/(1-pe)

def expected_proportion(matrix: pd.DataFrame):
    """
    Overall proportion of agreement expected by chance
    """
    pe = 0
    k = len(matrix)-1
    for i in range(k):
        pe += matrix.loc[i, k] * matrix.loc[k, i]
    return pe

def observed_proportion(matrix: pd.DataFrame):
    """
    Overall proportion of observed agreement 
    """
    po = 0
    k = len(matrix) - 1
    for i in range(k):
        po += matrix.loc[i, i]
    return po

def create_confusion_matrix(list_1: list , list_2: list, list_attributs)-> pd.DataFrame:
    """
    Create confusion matrix (in %)
    """
    if not len(list_1)==len(list_2):
        print("reviwer 1 and 2 may haven't rated the same list of subjects")

    else:
        size = len(list_attributs)
        matrix = pd.DataFrame(np.zeros((size + 1, size + 1)), index = list_attributs + ["total"], columns = list_attributs + ["total"])
        for k in range(size):
            for l in range(size):
                att_k = list_attributs[k]
                att_l = list_attributs[l]
                for i in range(len(list_1)):
                    if (list_1[i]== att_k):
                        if list_2[i] == att_l :
                            matrix.loc[att_k,att_l]+=1
        for i in range(size):
            for j in range(size):
                att_i = list_attributs[i]
                att_j = list_attributs[j]
                matrix.loc["total", att_i] += matrix.loc[att_j, att_i]
                matrix.loc[att_i, "total"] += matrix.loc[att_i, att_j]
                matrix.loc["total", "total"] += matrix.loc[att_i, att_j]

        if len(list_1) == matrix.loc["total", "total"]:
            matrix = round(matrix* 100/ len(list_1), 2)
    return matrix



In [21]:
list_0_1= [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    "Code is or will be available"
    ]

statement_list = [
    "3. (+) statement",
    "2. (-) statement ",
    "1. (none) statement",
    "0. Unusable (statement)",
]

comments_list = [
    "4. (-/+) comments",
    "3. (+) comments",
    "2. (-) comments",
    "1. (none) comments",
    "0. Unusable (comments)",
]

meta_categories_list = [
    "(+) meta",
    "(-) meta",
    "Unusable (meta)",
]

agreement_list = [
    "Agreement",
    "Disagreement",
    "Unusable",
]

In [22]:
def ci_bp(proportion, N):
    """
    Confidence interval (binomial proportion)
    """
    ci_bp_low = (-1.96 * sqrt(proportion*(1-proportion)/N) + proportion) * 100
    ci_bp_high = (1.96 * sqrt(proportion*(1-proportion)/N) + proportion) * 100

    return ci_bp_low, ci_bp_high

In [23]:
def ci_bootstrap(data, val, num_resamples = 999):
    """
    Confidence interval (bootstrap)
    """
    K = len(data)
    outputs = []
    for i in range(num_resamples):
        Y_resample = random.choices(data, k=K)
        out = Y_resample.count(val)
        outputs.append(out*100 / K )

    return np.percentile(outputs, 2.25), np.percentile(outputs, 97.5)


In [24]:
def kappa_fleiss_3(data_1, data_2, data_3):
    """
    Calculate kappa score for 3 raters
    """
    data_T = np.array([data_1, data_2, data_3]).T
    data_fleiss_ = aggregate_raters(data_T)
    kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
    return kappa_fleiss_

def ci_bootstrap_3(data_1, data_2, data_3, num_resamples = 999):
    """
    Confidence interval for a 3 raters kappa (bootstrap)
    """
    Y = np.array([data_1, data_2, data_3]).T
    list_kappa = []
    for i in range(num_resamples):
        Y_resample = np.array(random.choices(Y, k=len(Y)))
        data_1 = Y_resample[:, 0]
        data_2 = Y_resample[:, 1]
        data_3 = Y_resample[:, 2]

        kappa = kappa_fleiss_3(data_1.astype(str), data_2.astype(str), data_3.astype(str))
        list_kappa.append(kappa)

    return np.percentile(list_kappa, 2.25), np.percentile(list_kappa, 97.5)



In [25]:
# Enter the path to the tsv file with the rating from the first reviwer
path_tsv = "../rating/rating_90/rating_90_O.tsv"
df_rating_1 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)


print(f"This notebook analyzes the 3 reviews of {len(df_rating_1)-2} different papers.")

This notebook analyzes the 3 reviews of the 90 different papers.


In [26]:
df_category = pd.DataFrame(np.zeros((len(list_0_1),4)), index=list_0_1, columns=["number", "percent", "ci low", "ci high"])

for category in range(len(list_0_1)):
    all_reviews_1 = []
    for i in range(3):
        column_id = i*9 + 3 + category
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        all_reviews_1 = all_reviews_1 + list_review_1

    count_ = all_reviews_1.count("1")
    percent_ = round(count_ * 100 / len(all_reviews_1),2)

    df_category.loc[list_0_1[category], "number"] = count_
    df_category.loc[list_0_1[category], "percent"] = percent_

    print(f"For category {list_0_1[category]}, {percent_}% of reviewers ({count_}/{len(all_reviews_1)}) have commented on at least one of the items of the category.")

    proportion = count_ / len(all_reviews_1)
    ci_bp_low, ci_bp_high = ci_bp(proportion, len(all_reviews_1))
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(all_reviews_1, '1')

    df_category.loc[list_0_1[category], "ci low"] = round(ci_bootstrap_low,2)
    df_category.loc[list_0_1[category], "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

df_category.index.rename(f"For {len(all_reviews_1)} reviews", inplace = True)
print(tabulate(df_category, headers='keys', tablefmt='psql'))



For category Models and algorithms, 28.89% of reviewers (78/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [23.48%, 34.30%]
confidence intervals (bootstrap): [23.33%, 34.46%]

For category Datasets, 33.33% of reviewers (90/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [27.71%, 38.96%]
confidence intervals (bootstrap): [27.78%, 39.26%]

For category Code, 46.67% of reviewers (126/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [40.72%, 52.62%]
confidence intervals (bootstrap): [40.74%, 52.59%]

For category Experimental results, 25.56% of reviewers (69/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [20.35%, 30.76%]
confidence intervals (bootstrap): [20.74%, 30.74%]

For category Error bars or statistical significance, 1.85% of reviewers (5

In [27]:
# statements
df_statement = pd.DataFrame(np.zeros((len(statement_list),4)), index=statement_list, columns=["number", "percent", "ci low", "ci high"])

reviews_statement = []
list_statement_1 = df_rating_1.loc[2:, 9].values.tolist()
list_statement_2 = df_rating_1.loc[2:, 18].values.tolist()
list_statement_3 = df_rating_1.loc[2:, 27].values.tolist()

reviews_statement = list_statement_1 + list_statement_2 + list_statement_3

print(f"Statistics for statements (on {len(reviews_statement)} reviewers):")
print()
N_statement = len(reviews_statement)

for rating in statement_list:
    count_ = reviews_statement.count(rating)
    percent_ = round(count_ * 100 / N_statement, 2)
    df_statement.loc[rating, "number"] = count_
    df_statement.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_statement}) in category {rating}")

    proportion = count_ / len(reviews_statement)
    ci_bp_low, ci_bp_high = ci_bp(proportion, len(reviews_statement))
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_statement, rating)

    df_statement.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_statement.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()


df_statement.index.rename(f"For {N_statement} reviews", inplace = True)
print(tabulate(df_statement, headers='keys', tablefmt='psql'))
print()

kappa_fleiss_statement = kappa_fleiss_3(list_statement_1, list_statement_2, list_statement_3)
print(f"kappa fleiss: {kappa_fleiss_statement}")

# from sklearn.metrics import cohen_kappa_score
# kappa_sk_statement = cohen_kappa_score(list_statement_1, list_statement_2)
# print(f"kappa sklearn: {kappa_sk_statement}")

ci_low_fleiss_statement, ci_high_fleiss_statement = ci_bootstrap_3(list_statement_1, list_statement_2, list_statement_3)
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_statement:.2f}, {ci_high_fleiss_statement:.2f}]")
print()



print("Review 1 VS review 2") 
m1_statement = create_confusion_matrix(list_statement_1, list_statement_2, statement_list)
print(tabulate(m1_statement, headers='keys', tablefmt='psql'))
print()
print("Review 2 VS review 3")
m2_statement = create_confusion_matrix(list_statement_2, list_statement_3, statement_list)
print(tabulate(m2_statement, headers='keys', tablefmt='psql'))
print()
print("Review 1 VS review 3")
m3_statement = create_confusion_matrix(list_statement_1, list_statement_3, statement_list)
print(tabulate(m3_statement, headers='keys', tablefmt='psql'))
print()


Statistics for statements (on 270 reviewers):

- 48.89% of reviews (132/270) in category 3. (+) statement
confidence intervals (binomial proportion): [42.93%, 54.85%]
confidence intervals (bootstrap): [42.96%, 55.19%]

- 11.85% of reviews (32/270) in category 2. (-) statement 
confidence intervals (binomial proportion): [8.00%, 15.71%]
confidence intervals (bootstrap): [7.78%, 15.93%]

- 37.04% of reviews (100/270) in category 1. (none) statement
confidence intervals (binomial proportion): [31.28%, 42.80%]
confidence intervals (bootstrap): [31.48%, 42.96%]

- 2.22% of reviews (6/270) in category 0. Unusable (statement)
confidence intervals (binomial proportion): [0.46%, 3.98%]
confidence intervals (bootstrap): [0.74%, 4.07%]

+-------------------------+----------+-----------+----------+-----------+
| For 270 reviews         |   number |   percent |   ci low |   ci high |
|-------------------------+----------+-----------+----------+-----------|
| 3. (+) statement        |      132 |    

In [28]:
# comments
df_comments = pd.DataFrame(np.zeros((len(comments_list),4)), index=comments_list, columns=["number", "percent", "ci low", "ci high"])

reviews_comments = []
list_comment_1 = df_rating_1.loc[2:, 10].values.tolist()
list_comment_2 = df_rating_1.loc[2:, 19].values.tolist()
list_comment_3 = df_rating_1.loc[2:, 28].values.tolist()
reviews_comments = list_comment_1 + list_comment_2 + list_comment_3

print(f"Statistics for comments (on {len(reviews_comments)} reviewers):")
print()
N_comments = len(reviews_comments)

for rating in comments_list:
    count_ = reviews_comments.count(rating)
    percent_ = round(count_ * 100 / N_comments, 2)
    df_comments.loc[rating, "number"] = count_
    df_comments.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_comments}) in category {rating}")

    proportion = count_ / N_comments
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_comments)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_comments, rating)

    df_comments.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_comments.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

df_comments.index.rename(f"For {N_comments} reviews", inplace = True)
print(tabulate(df_comments, headers='keys', tablefmt='psql'))

kappa_fleiss_comment = kappa_fleiss_3(list_comment_1, list_comment_2, list_comment_3)
print(f"kappa fleiss: {kappa_fleiss_comment}")
ci_low_fleiss_comment, ci_high_fleiss_comment = ci_bootstrap_3(list_comment_1, list_comment_2, list_comment_3)
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_comment:.2f}%, {ci_high_fleiss_comment:.2f}%]")
print()
print("Review 1 VS review 2")
m1_comment = create_confusion_matrix(list_comment_1, list_comment_2, comments_list)
print(tabulate(m1_comment, headers='keys', tablefmt='psql'))
print()
print("Review 2 VS review 3")
m2_comment = create_confusion_matrix(list_comment_2, list_comment_3, comments_list)
print(tabulate(m2_comment, headers='keys', tablefmt='psql'))
print()
print("Review 1 VS review 3")
m3_comment =create_confusion_matrix(list_comment_1, list_comment_3, comments_list)
print(tabulate(m3_comment, headers='keys', tablefmt='psql'))
print()


Statistics for comments (on 270 reviewers):

- 22.96% of reviews (62/270) in category 4. (-/+) comments
confidence intervals (binomial proportion): [17.95%, 27.98%]
confidence intervals (bootstrap): [17.78%, 27.78%]

- 30.74% of reviews (83/270) in category 3. (+) comments
confidence intervals (binomial proportion): [25.24%, 36.24%]
confidence intervals (bootstrap): [25.56%, 36.30%]

- 18.52% of reviews (50/270) in category 2. (-) comments
confidence intervals (binomial proportion): [13.89%, 23.15%]
confidence intervals (bootstrap): [13.70%, 23.33%]

- 22.22% of reviews (60/270) in category 1. (none) comments
confidence intervals (binomial proportion): [17.26%, 27.18%]
confidence intervals (bootstrap): [17.41%, 27.04%]

- 5.56% of reviews (15/270) in category 0. Unusable (comments)
confidence intervals (binomial proportion): [2.82%, 8.29%]
confidence intervals (bootstrap): [2.96%, 8.52%]

+------------------------+----------+-----------+----------+-----------+
| For 270 reviews        

In [29]:
# statements and comments 

df_stat_com = pd.DataFrame(np.zeros((len(statement_list)+1,len(comments_list)+1)), index=statement_list+["total"], columns=comments_list+["total"])
df_stat_com.index.rename("number", inplace = True)
for i in range (len(reviews_comments)):
    df_stat_com.loc[reviews_statement[i], reviews_comments[i]] +=1
    df_stat_com.loc["total", reviews_comments[i]] +=1
    df_stat_com.loc[reviews_statement[i], "total"] +=1
df_stat_com.loc["total", "total"] = len(reviews_comments)

print(tabulate(df_stat_com, headers='keys', tablefmt='psql'))
print()
df_stat_com_percent = df_stat_com
df_stat_com_percent.index.rename("in %", inplace = True)
df_stat_com_percent = round(df_stat_com*100 /len(reviews_comments), 2)

print(tabulate(df_stat_com_percent, headers='keys', tablefmt='psql'))


+-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------+
| number                  |   4. (-/+) comments |   3. (+) comments |   2. (-) comments |   1. (none) comments |   0. Unusable (comments) |   total |
|-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------|
| 3. (+) statement        |                  21 |                52 |                 7 |                   52 |                        0 |     132 |
| 2. (-) statement        |                   8 |                 0 |                21 |                    3 |                        0 |      32 |
| 1. (none) statement     |                  33 |                31 |                22 |                    5 |                        9 |     100 |
| 0. Unusable (statement) |                   0 |                 0 |                 0 |           

In [30]:
# meta categories
df_meta = pd.DataFrame(np.zeros((len(meta_categories_list),4)), index=meta_categories_list, columns=["number", "percent", "ci low", "ci high"])

reviews_meta = []
list_meta_1 = df_rating_1.loc[2:, 29].values.tolist()
list_meta_2 = df_rating_1.loc[2:, 30].values.tolist()
list_meta_3 = df_rating_1.loc[2:, 31].values.tolist()

reviews_meta = list_meta_1 + list_meta_2 + list_meta_3

reviews_meta_python = []
for i in range(len(reviews_statement)):
    if reviews_statement[i] == "3. (+) statement":
        reviews_meta_python.append("(+) meta")
    elif reviews_statement[i] == "2. (-) statement ":
        reviews_meta_python.append("(-) meta")
    elif reviews_comments[i] == "3. (+) comments":
        reviews_meta_python.append("(+) meta")
    elif reviews_comments[i] == "2. (-) comments" or reviews_comments[i] =="4. (-/+) comments":
        reviews_meta_python.append( "(-) meta")
    else:
        reviews_meta_python.append("Unusable (meta)")

if not (reviews_meta==reviews_meta_python):
    reviews_meta = reviews_meta_python
    print("Meta categories calculated values aren't the same as in the provided tsv file.")


print(f"Statistics for meta-categories (on {len(reviews_meta)} reviewers):")
print()

N_meta = len(reviews_meta)
for rating in meta_categories_list:
    count_ = reviews_meta.count(rating)
    percent_ = round(count_ * 100 / N_meta, 2)
    df_meta.loc[rating, "number"] = count_
    df_meta.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_meta}) in category {rating}")

    proportion = count_ / N_meta
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_meta)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_meta, rating)

    df_meta.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_meta.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (multinomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

df_meta.index.rename(f"For {N_meta} reviews", inplace = True)
print(tabulate(df_meta, headers='keys', tablefmt='psql'))

kappa_fleiss_meta = kappa_fleiss_3(list_meta_1, list_meta_2, list_meta_3)
print(f"kappa fleiss: {kappa_fleiss_meta}")
ci_low_fleiss_meta, ci_high_fleiss_meta = ci_bootstrap_3(list_meta_1, list_meta_2, list_meta_3)
print(f"confidence intervals (bootstrap): [{ci_low_fleiss_meta:.2f}%, {ci_high_fleiss_meta:.2f}%]")
print()
print("Review 1 VS review 2")
m1_meta = create_confusion_matrix(list_meta_1, list_meta_2, meta_categories_list)
print(tabulate(m1_meta, headers='keys', tablefmt='psql'))
print()
print("Review 2 VS review 3")
m2_meta = create_confusion_matrix(list_meta_2, list_meta_3, meta_categories_list)
print(tabulate(m2_meta, headers='keys', tablefmt='psql'))
print()
print("Review 1 VS review 3")
m3_meta = create_confusion_matrix(list_meta_1, list_meta_3, meta_categories_list)
print(tabulate(m3_meta, headers='keys', tablefmt='psql'))
print()




Statistics for meta-categories (on 270 reviewers):

- 60.37% of reviews (163/270) in category (+) meta
confidence intervals (multinomial proportion): [54.54%, 66.20%]
confidence intervals (bootstrap): [54.44%, 65.93%]

- 32.22% of reviews (87/270) in category (-) meta
confidence intervals (multinomial proportion): [26.65%, 37.80%]
confidence intervals (bootstrap): [26.67%, 38.15%]

- 7.41% of reviews (20/270) in category Unusable (meta)
confidence intervals (multinomial proportion): [4.28%, 10.53%]
confidence intervals (bootstrap): [4.44%, 10.74%]

+-------------------+----------+-----------+----------+-----------+
| For 270 reviews   |   number |   percent |   ci low |   ci high |
|-------------------+----------+-----------+----------+-----------|
| (+) meta          |      163 |     60.37 |    54.44 |     65.93 |
| (-) meta          |       87 |     32.22 |    26.67 |     38.15 |
| Unusable (meta)   |       20 |      7.41 |     4.44 |     10.74 |
+-------------------+----------+-----

In [31]:
# Agreement
df_agreement = pd.DataFrame(np.zeros((len(agreement_list),4)), index=agreement_list, columns=["number", "percent", "ci low", "ci high"])

column_id = 38
reviews_agreement = df_rating_1.loc[2:, column_id].values.tolist()

print(f"Statistics for agreement (on {len(reviews_agreement)} reviewers):")
print()
N_agreement = len(reviews_agreement)
for rating in agreement_list:
    count_ = reviews_agreement.count(rating)
    percent_ = round(count_ * 100 / N_agreement, 2)
    df_agreement.loc[rating, "number"] = count_
    df_agreement.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of papers ({count_}/{N_agreement}) in category {rating}")

    proportion = count_ / N_agreement
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_agreement)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_agreement, rating)

    df_agreement.loc[rating, "ci low"] = round(ci_bootstrap_low,2)
    df_agreement.loc[rating, "ci high"] = round(ci_bootstrap_high,2)

    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

df_agreement.index.rename(f"For {N_agreement} papers", inplace = True)
print(tabulate(df_agreement, headers='keys', tablefmt='psql'))


Statistics for agreement (on 90 reviewers):

- 35.56% of papers (32/90) in category Agreement
confidence intervals (binomial proportion): [25.67%, 45.45%]
confidence intervals (bootstrap): [25.56%, 45.56%]

- 62.22% of papers (56/90) in category Disagreement
confidence intervals (binomial proportion): [52.21%, 72.24%]
confidence intervals (bootstrap): [52.22%, 72.22%]

- 2.22% of papers (2/90) in category Unusable
confidence intervals (binomial proportion): [-0.82%, 5.27%]
confidence intervals (bootstrap): [0.00%, 5.56%]

+-----------------+----------+-----------+----------+-----------+
| For 90 papers   |   number |   percent |   ci low |   ci high |
|-----------------+----------+-----------+----------+-----------|
| Agreement       |       32 |     35.56 |    25.56 |     45.56 |
| Disagreement    |       56 |     62.22 |    52.22 |     72.22 |
| Unusable        |        2 |      2.22 |     0    |      5.56 |
+-----------------+----------+-----------+----------+-----------+


In [32]:
# kappa fleiss table 

df_kappas = pd.DataFrame(np.zeros((3,3)), index=["Statements", "Comments", "Meta-categories"], columns=["kappa", "ci low", "ci high"])

df_kappas.loc["Statements", "kappa"] = kappa_fleiss_statement
df_kappas.loc["Statements", "ci low"] = ci_low_fleiss_statement
df_kappas.loc["Statements", "ci high"] = ci_high_fleiss_statement

df_kappas.loc["Comments", "kappa"] = kappa_fleiss_comment
df_kappas.loc["Comments", "ci low"] = ci_low_fleiss_comment
df_kappas.loc["Comments", "ci high"] = ci_high_fleiss_comment

df_kappas.loc["Meta-categories", "kappa"] = kappa_fleiss_meta
df_kappas.loc["Meta-categories", "ci low"] = ci_low_fleiss_meta
df_kappas.loc["Meta-categories", "ci high"] = ci_high_fleiss_meta

df_kappas = round(df_kappas,2)
df_kappas.index.rename("kappa fleiss", inplace=True)
print(tabulate(df_kappas, headers='keys', tablefmt='psql'))

+-----------------+---------+----------+-----------+
| kappa fleiss    |   kappa |   ci low |   ci high |
|-----------------+---------+----------+-----------|
| Statements      |   -0.05 |    -0.14 |      0.03 |
| Comments        |    0.05 |    -0.03 |      0.12 |
| Meta-categories |    0.02 |    -0.08 |      0.13 |
+-----------------+---------+----------+-----------+


In [33]:
# Code
column_id = 39
code_link_df = df_rating_1.loc[2:, column_id].values.tolist()
column_id = 40
code_avail_df = df_rating_1.loc[2:, column_id].values.tolist()


print(f"Statistics for code (on {len(code_link_df)} papers):")
print()
df_code = pd.DataFrame(np.zeros((3,3)), index=["0", "1", "total"], columns=["link", "N/A", "total"])
df_code.index.rename("code availableness (in %)", inplace=True)

for i in range(len(code_avail_df)):
    if str(code_link_df[i]).startswith("http"):
        df_code.loc[code_avail_df[i], "link"] +=1
        df_code.loc["total", "link"] += 1
     
    else: 
        df_code.loc[code_avail_df[i], "N/A"] +=1
        df_code.loc["total", "N/A"] += 1

    df_code.loc[code_avail_df[i], "total"] += 1
    df_code.loc["total", "total"] +=1


nb_papers= len(code_avail_df)
print(f" - for {round(df_code.loc['1', 'link'] *100 / nb_papers,2)}% of the papers ({df_code.loc['1', 'link']}/{nb_papers}), the repository exists and is not empty.")
print(f" - for {round(df_code.loc['0', 'link'] *100 / nb_papers,2)}% of the papers ({df_code.loc['0', 'link']}/{nb_papers}), the link to the repository is provided but it is empty or wrong.")
print(f" - for {round(df_code.loc['0', 'N/A'] *100 / nb_papers,2)}% of the papers ({df_code.loc['0', 'N/A']}/{nb_papers}), no link/code was provided.")
print()
print(f" - for {round(100*df_code.loc['0', 'link']/(df_code.loc['0', 'link']+df_code.loc['1', 'link']),2)}% of the papers for which a link was provided ({df_code.loc['0', 'link']}/{df_code.loc['0', 'link']+df_code.loc['1', 'link']}), the link to the repository is provided but it is empty or wrong.")
print()


df_code = round(df_code*100 /len(code_avail_df), 2)

#print(tabulate(df_code, headers='keys', tablefmt='psql'))
#print(tabulate(df_code, headers='keys', tablefmt='latex'))

Statistics for code (on 90 papers):

 - for 47.78% of the papers (43.0/90), the repository exists and is not empty.
 - for 20.0% of the papers (18.0/90), the link to the repository is provided but it is empty or wrong.
 - for 32.22% of the papers (29.0/90), no link/code was provided.

 - for 29.51% of the papers for which a link was provided (18.0/61.0), the link to the repository is provided but it is empty or wrong.



In [34]:
# repository and review 

list_code_1 = df_rating_1.loc[2:, 5].values.tolist()
list_code_2 = df_rating_1.loc[2:, 14].values.tolist()
list_code_3 = df_rating_1.loc[2:, 23].values.tolist()

list_code = []
for i in range(len(list_code_1)):
    x = list_code_1[i]
    if x== '0' and x==list_code_2[i] and x==list_code_3[i] :
        list_code.append(x)
    else :
        list_code.append('1')

print(f"For {round(list_code.count('1')*100/len(list_code), 2)}% of the papers ({round(list_code.count('1'))}/{round(len(list_code), 2)}), at least one of the reviewers said that the code was available.")
print()

df_code_review = pd.DataFrame(np.zeros((3,4)), index=["0", "1", "total"], columns=["good link", "empty link", "no link", "total"])


for i in range(len(list_code)):
    if str(code_link_df[i]).startswith("http"):
        if code_avail_df[i] == '1':
            df_code_review.loc[list_code[i], "good link"] +=1
            df_code_review.loc["total", "good link"] += 1

        elif code_avail_df[i] == '0' :
            df_code_review.loc[list_code[i], "empty link"] +=1
            df_code_review.loc["total", "empty link"] += 1
            
    else: 
        df_code_review.loc[list_code[i], "no link"] +=1
        df_code_review.loc["total", "no link"] += 1

    df_code_review.loc[list_code[i], "total"] += 1
    df_code_review.loc["total", "total"] +=1

nb_code_papers = df_code_review.loc['1', "total"]

print(f"For these {nb_code_papers} papers:")
print(f" - for {round(df_code_review.loc['1', 'good link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'good link'],2)}/{nb_code_papers}), a least one of the reviewers said that the code was available and it was.")
print(f" - for {round(df_code_review.loc['1', 'empty link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'empty link'],2)}/{nb_code_papers}), at least one of the reviewers said that the code was available even if the link was leading to an error message or to an empty repository.")
print(f" - for {round(df_code_review.loc['1', 'no link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'no link'],2)}/{nb_code_papers}), at least one of the reviewers said that the code was available even if no link/code was provided.")
print()
nb_others = len(list_code) - nb_code_papers
print(f"For the {nb_others} others:")
print(f" - for {round(df_code_review.loc['0', 'good link'] *100 / nb_others, 2)}% of the papers ({round(df_code_review.loc['0', 'good link'],2)}/{nb_others}), no one of the reviewers said that the code was available even if it was.")



print()
print("The tables below indicate, for each paper if at least one reviewer said that the \n code was/will be available (1) or not (0), and whether the link was present and non-empty (good link), \n present by with empty repository (empty link) or whether there was no link (no link).")

df_code_review.index.rename("code availableness (in number)", inplace=True)
print(tabulate(df_code_review, headers='keys', tablefmt='psql'))

df_code_review_percent = df_code_review
df_code_review_percent.index.rename("code availableness (in %)", inplace=True)
df_code_review_percent = round(df_code_review_percent*100 /len(list_code), 2)
print(tabulate(df_code_review_percent, headers='keys', tablefmt='psql'))


For 86.67% of the papers (78/90), at least one of the reviewers said that the code was available.

For these 78.0 papers:
 - for 47.44% of the papers (37.0/78.0), a least one of the reviewers said that the code was available and it was.
 - for 20.51% of the papers (16.0/78.0), at least one of the reviewers said that the code was available even if the link was leading to an error message or to an empty repository.
 - for 32.05% of the papers (25.0/78.0), at least one of the reviewers said that the code was available even if no link/code was provided.

For the 12.0 others:
 - for 50.0% of the papers (6.0/12.0), no one of the reviewers said that the code was available even if it was.

The tables below indicate, for each paper if at least one reviewer said that the 
 code was/will be available (1) or not (0), and whether the link was present and non-empty (good link), 
 present by with empty repository (empty link) or whether there was no link (no link).
+----------------------------------

In [35]:


#latex output
file1 = open('../latex/rating_analysis.tex', 'w')
file1.write("\documentclass{article}\n\n")
file1.write("\\usepackage{float}\n\n")
file1.write("\\title{Rating analysis}\n\n")
file1.write("\\begin{document}\n\n")
file1.write("\maketitle\n\n")
file1.write(f"This notebook analyzes the 3 reviews of the {len(df_rating_1)-2} different accepted papers at MICCAI 2023.\n\n")

# categories
file1.write("\section{Categories} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_category, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")


# Statement
file1.write("\section{Statements} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_statement, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 2} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m1_statement, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 2 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m2_statement, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m3_statement, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# Comments 
file1.write("\section{Comments} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_comments, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 2} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m1_comment, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 2 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m2_comment, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m3_comment, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# statement VS comments 
file1.write("\section{Statements VS comments} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_stat_com, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_stat_com_percent, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# Meta 
file1.write("\section{Meta categories} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_meta, headers='keys', tablefmt='latex'))
file1.write("\caption{Percent of reviewers that have commented on at least one of the items of the category}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 2} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m1_meta, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 2 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m2_meta, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\subsection{Review 1 VS review 3} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(m3_meta, headers='keys', tablefmt='latex'))
file1.write("\caption{Confusion matrix in percent}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# Agreement 
file1.write("\section{Agreement} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_agreement, headers='keys', tablefmt='latex'))
file1.write("\caption{Satistics about whether or not the reviewers agreed on the reproducibility of the paper.}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# Kappa Fleiss 
file1.write("\section{Kappa Fleiss} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_kappas, headers='keys', tablefmt='latex'))
file1.write("\caption{Kappa Fleiss for multi rating (3 reviewers).}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

# Code review
file1.write("\section{Code Review} \n\n")
file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_code_review, headers='keys', tablefmt='latex'))
file1.write("\caption{The table above indicates, for each paper if at least one reviewer said that the \n code was/will be available (1) or not (0), and whether the link was present and non-empty (good link), \n present by with empty repository (empty link) or whether there was no link (no link).}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\\begin{table}[H]\n\n")
file1.write("\centering\n\n")
file1.write(tabulate(df_code_review_percent, headers='keys', tablefmt='latex'))
file1.write("\caption{The table above indicates, for each paper if at least one reviewer said that the \n code was/will be available (1) or not (0), and whether the link was present and non-empty (good link), \n present by with empty repository (empty link) or whether there was no link (no link).}\n\n")
file1.write("\end{table}\n\n")
file1.write("\n\n")

file1.write("\end{document}")

file1.close()
