In [267]:
import pandas as pd
import numpy as np
from math import sqrt
import random
from tabulate import tabulate

In [282]:
list_0_1= [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    ]

statement_list = [
    "3. (+) statement",
    "2. (-) statement ",
    "1. (none) statement",
    "0. Unusable (statement)",
]

comments_list = [
    "4. (-/+) comments",
    "3. (+) comments",
    "2. (-) comments",
    "1. (none) comments",
    "0. Unusable (comments)",
]

meta_categories_list = [
    "Reproducible",
    "Irreproducible",
    "Unusable",
]

agreement_list = [
    "Agreement",
    "Disagreement",
    "Unusable",
]

In [283]:
def ci_bp(proportion, N):
    ci_bp_low = (-1.96 * sqrt(proportion*(1-proportion)/N) + proportion) * 100
    ci_bp_high = (1.96 * sqrt(proportion*(1-proportion)/N) + proportion) * 100

    return ci_bp_low, ci_bp_high

In [284]:
# boostrap function for proportion 
def ci_bootstrap(data, val, num_resamples = 999):
    K = len(data)
    outputs = []
    for i in range(num_resamples):
        Y_resample = random.choices(data, k=K)
        out = Y_resample.count(val)
        outputs.append(out*100 / K )

    return np.percentile(outputs, 2.25), np.percentile(outputs, 97.5)


In [285]:
# Enter the path to the tsv file with the rating from the first reviwer
path_tsv = "../rating/rating_90/rating_90_O.tsv"
df_rating_1 = pd.read_csv(path_tsv, sep = "\t", index_col=False, header= None)
# df_rating_1 = df_rating_1.dropna()  

print(f"This notebook analyzes the 3 reviews of the {len(df_rating_1)-2} different papers.")

This notebook analyzes the 3 reviews of the 90 different papers.


In [286]:



for category in range(len(list_0_1)):
    all_reviews_1 = []
    for i in range(3):
        column_id = i*9 + 3 + category
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        all_reviews_1 = all_reviews_1 + list_review_1

    count_ = all_reviews_1.count("1")

    print(f"For category {list_0_1[category]}, {(count_ * 100 / len(all_reviews_1)):.2f}% of reviewers ({count_}/{len(all_reviews_1)}) have commented on at least one of the items of the category.")
    
    proportion = count_ / len(all_reviews_1)
    ci_bp_low, ci_bp_high = ci_bp(proportion, len(all_reviews_1))
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(all_reviews_1, '1')


    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()


For category Models and algorithms, 28.89% of reviewers (78/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [23.48%, 34.30%]
confidence intervals (bootstrap): [23.70%, 34.44%]

For category Datasets, 33.33% of reviewers (90/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [27.71%, 38.96%]
confidence intervals (bootstrap): [27.78%, 39.26%]

For category Code, 46.67% of reviewers (126/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [40.72%, 52.62%]
confidence intervals (bootstrap): [40.91%, 52.59%]

For category Experimental results, 25.56% of reviewers (69/270) have commented on at least one of the items of the category.
confidence intervals (binomial proportion): [20.35%, 30.76%]
confidence intervals (bootstrap): [20.00%, 30.74%]

For category Error bars or statistical significance, 1.85% of reviewers (5

In [287]:
# statements
reviews_statement = []
for i in range(3):
    column_id = i*9 + 9
    list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
    reviews_statement = reviews_statement + list_review_1

print(f"Statistics for statements (on {len(reviews_statement)} reviewers):")
print()
N_statement = len(reviews_statement)

for rating in statement_list:
    count_ = reviews_statement.count(rating)
    print(f"- {(count_ * 100 / N_statement):.2f}% of reviews ({count_}/{N_statement}) in category {rating}")

    proportion = count_ / len(reviews_statement)
    ci_bp_low, ci_bp_high = ci_bp(proportion, len(reviews_statement))
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_statement, rating)


    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()



Statistics for statements (on 270 reviewers):

- 48.89% of reviews (132/270) in category 3. (+) statement
confidence intervals (binomial proportion): [42.93%, 54.85%]
confidence intervals (bootstrap): [42.96%, 54.81%]

- 11.85% of reviews (32/270) in category 2. (-) statement 
confidence intervals (binomial proportion): [8.00%, 15.71%]
confidence intervals (bootstrap): [8.15%, 15.56%]

- 37.04% of reviews (100/270) in category 1. (none) statement
confidence intervals (binomial proportion): [31.28%, 42.80%]
confidence intervals (bootstrap): [31.48%, 42.59%]

- 2.22% of reviews (6/270) in category 0. Unusable (statement)
confidence intervals (binomial proportion): [0.46%, 3.98%]
confidence intervals (bootstrap): [0.74%, 4.07%]



In [288]:
# comments
reviews_comments = []
for i in range(3):
    column_id = i*9 + 10
    list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
    reviews_comments = reviews_comments + list_review_1

print(f"Statistics for comments (on {len(reviews_comments)} reviewers):")
print()
N_comments = len(reviews_comments)
for rating in comments_list:
    count_ = reviews_comments.count(rating)
    print(f"- {(count_ * 100 / N_comments):.2f}% of reviews ({count_}/{N_comments}) in category {rating}")

    proportion = count_ / N_comments
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_comments)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_comments, rating)


    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()


Statistics for comments (on 270 reviewers):

- 22.96% of reviews (62/270) in category 4. (-/+) comments
confidence intervals (binomial proportion): [17.95%, 27.98%]
confidence intervals (bootstrap): [17.78%, 28.52%]

- 30.74% of reviews (83/270) in category 3. (+) comments
confidence intervals (binomial proportion): [25.24%, 36.24%]
confidence intervals (bootstrap): [25.19%, 36.30%]

- 18.52% of reviews (50/270) in category 2. (-) comments
confidence intervals (binomial proportion): [13.89%, 23.15%]
confidence intervals (bootstrap): [14.07%, 22.96%]

- 22.22% of reviews (60/270) in category 1. (none) comments
confidence intervals (binomial proportion): [17.26%, 27.18%]
confidence intervals (bootstrap): [17.41%, 27.04%]

- 5.56% of reviews (15/270) in category 0. Unusable (comments)
confidence intervals (binomial proportion): [2.82%, 8.29%]
confidence intervals (bootstrap): [2.96%, 8.52%]



In [298]:
# statements and comments 

df_stat_com = pd.DataFrame(np.zeros((len(statement_list)+1,len(comments_list)+1)), index=statement_list+["total"], columns=comments_list+["total"])
df_stat_com.index.rename("in proportion", inplace = True)
for i in range (len(reviews_comments)):
    df_stat_com.loc[reviews_statement[i], reviews_comments[i]] +=1
    df_stat_com.loc["total", reviews_comments[i]] +=1
    df_stat_com.loc[reviews_statement[i], "total"] +=1
df_stat_com.loc["total", "total"] = len(reviews_comments)

print(tabulate(df_stat_com, headers='keys', tablefmt='psql'))
print()

df_stat_com.index.rename("in %", inplace = True)
df_stat_com = round(df_stat_com*100 /len(reviews_comments), 2)

print(tabulate(df_stat_com, headers='keys', tablefmt='psql'))


+-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------+
| in proportion           |   4. (-/+) comments |   3. (+) comments |   2. (-) comments |   1. (none) comments |   0. Unusable (comments) |   total |
|-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------|
| 3. (+) statement        |                  21 |                52 |                 7 |                   52 |                        0 |     132 |
| 2. (-) statement        |                   8 |                 0 |                21 |                    3 |                        0 |      32 |
| 1. (none) statement     |                  33 |                31 |                22 |                    5 |                        9 |     100 |
| 0. Unusable (statement) |                   0 |                 0 |                 0 |           

In [290]:
# meta categories
reviews_meta = []
for i in range(3):
    column_id = i + 29
    list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
    reviews_meta = reviews_meta + list_review_1

reviews_meta_python = []
for i in range(len(reviews_statement)):
    if reviews_statement[i] == "3. (+) statement":
        reviews_meta_python.append("Reproducible")
    elif reviews_statement[i] == "2. (-) statement ":
        reviews_meta_python.append("Irreproducible")
    elif reviews_comments[i] == "3. (+) comments":
        reviews_meta_python.append("Reproducible")
    elif reviews_comments[i] == "2. (-) comments" or reviews_comments[i] =="4. (-/+) comments":
        reviews_meta_python.append( "Irreproducible")
    else:
        reviews_meta_python.append("Unusable")

#reviews_meta = reviews_meta_python

print(f"Statistics for meta-categories (on {len(reviews_meta)} reviewers):")
print()
N_meta = len(reviews_meta)
for rating in meta_categories_list:
    count_ = reviews_meta.count(rating)
    print(f"- {(count_ * 100 / N_meta):.2f}% of reviews ({count_}/{N_meta}) in category {rating}")

    proportion = count_ / N_meta
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_meta)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_meta, rating)


    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()


Statistics for meta-categories (on 270 reviewers):

- 60.37% of reviews (163/270) in category Reproducible
confidence intervals (binomial proportion): [54.54%, 66.20%]
confidence intervals (bootstrap): [54.81%, 65.93%]

- 32.22% of reviews (87/270) in category Irreproducible
confidence intervals (binomial proportion): [26.65%, 37.80%]
confidence intervals (bootstrap): [26.67%, 37.78%]

- 7.41% of reviews (20/270) in category Unusable
confidence intervals (binomial proportion): [4.28%, 10.53%]
confidence intervals (bootstrap): [4.44%, 10.74%]



In [291]:
# Agreement
column_id = 38
reviews_agreement = df_rating_1.loc[2:, column_id].values.tolist()

print(f"Statistics for agreement (on {len(reviews_agreement)} reviewers):")
print()
N_agreement = len(reviews_agreement)
for rating in agreement_list:
    count_ = reviews_agreement.count(rating)
    print(f"- {(count_ * 100 / N_agreement):.2f}% of reviews ({count_}/{N_agreement}) in category {rating}")

    proportion = count_ / N_agreement
    ci_bp_low, ci_bp_high = ci_bp(proportion, N_agreement)
    ci_bootstrap_low, ci_bootstrap_high = ci_bootstrap(reviews_agreement, rating)


    print(f"confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]")
    print(f"confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]")
    print()

Statistics for agreement (on 90 reviewers):

- 35.56% of reviews (32/90) in category Agreement
confidence intervals (binomial proportion): [25.67%, 45.45%]
confidence intervals (bootstrap): [25.56%, 45.56%]

- 62.22% of reviews (56/90) in category Disagreement
confidence intervals (binomial proportion): [52.21%, 72.24%]
confidence intervals (bootstrap): [52.22%, 72.22%]

- 2.22% of reviews (2/90) in category Unusable
confidence intervals (binomial proportion): [-0.82%, 5.27%]
confidence intervals (bootstrap): [0.00%, 5.56%]



In [292]:
# Code
column_id = 39
code_link_df = df_rating_1.loc[2:, column_id].values.tolist()
column_id = 40
code_avail_df = df_rating_1.loc[2:, column_id].values.tolist()


print(f"Statistics for code (on {len(code_link_df)} reviewers):")
print()
df_code = pd.DataFrame(np.zeros((3,3)), index=["0", "1", "total"], columns=["link", "N/A", "total"])
df_code.index.rename("code availableness (in %)", inplace=True)

for i in range(len(code_avail_df)):
    if str(code_link_df[i]).startswith("http"):
        df_code.loc[code_avail_df[i], "link"] +=1
        df_code.loc["total", "link"] += 1
     
    else: 
        df_code.loc[code_avail_df[i], "N/A"] +=1
        df_code.loc["total", "N/A"] += 1

    df_code.loc[code_avail_df[i], "total"] += 1
    df_code.loc["total", "total"] +=1


nb_papers= len(code_avail_df)
print(f" - for {round(df_code.loc['1', 'link'] *100 / nb_papers,2)}% of the papers ({df_code.loc['1', 'link']}/{nb_papers}), the repository exists and is not empty.")
print(f" - for {round(df_code.loc['0', 'link'] *100 / nb_papers,2)}% of the papers ({df_code.loc['0', 'link']}/{nb_papers}), the link to the repository is provided but it is empty or wrong.")
print(f" - for {round(df_code.loc['0', 'N/A'] *100 / nb_papers,2)}% of the papers ({df_code.loc['0', 'N/A']}/{nb_papers}), no link/code was provided.")
print()

df_code = round(df_code*100 /len(code_avail_df), 2)

print(tabulate(df_code, headers='keys', tablefmt='psql'))


Statistics for code (on 90 reviewers):

 - for 47.78% of the papers (43.0/90), the repository exists and is not empty.
 - for 20.0% of the papers (18.0/90), the link to the repository is provided but it is empty or wrong.
 - for 32.22% of the papers (29.0/90), no link/code was provided.

+-----------------------------+--------+-------+---------+
| code availableness (in %)   |   link |   N/A |   total |
|-----------------------------+--------+-------+---------|
| 0                           |  20    | 32.22 |   52.22 |
| 1                           |  47.78 |  0    |   47.78 |
| total                       |  67.78 | 32.22 |  100    |
+-----------------------------+--------+-------+---------+


In [293]:
# repository and review 

list_code_1 = df_rating_1.loc[2:, 5].values.tolist()
list_code_2 = df_rating_1.loc[2:, 14].values.tolist()
list_code_3 = df_rating_1.loc[2:, 23].values.tolist()

list_code = []
for i in range(len(list_code_1)):
    x = list_code_1[i]
    if x== '0' and x==list_code_2[i] and x==list_code_3[i] :
        list_code.append(x)
    else :
        list_code.append('1')

print(f"For {round(list_code.count('1')*100/len(list_code), 2)}% of the papers ({round(list_code.count('1'))}/{round(len(list_code), 2)}), at least one of the reviewers said that the code was available.")
print()

df_code_review = pd.DataFrame(np.zeros((3,4)), index=["0", "1", "total"], columns=["good link", "empty link", "no link", "total"])
df_code_review.index.rename("code availableness (in %)", inplace=True)

for i in range(len(list_code)):
    if str(code_link_df[i]).startswith("http"):
        if code_avail_df[i] == '1':
            df_code_review.loc[list_code[i], "good link"] +=1
            df_code_review.loc["total", "good link"] += 1

        elif code_avail_df[i] == '0' :
            df_code_review.loc[list_code[i], "empty link"] +=1
            df_code_review.loc["total", "empty link"] += 1
            
    else: 
        df_code_review.loc[list_code[i], "no link"] +=1
        df_code_review.loc["total", "no link"] += 1

    df_code_review.loc[list_code[i], "total"] += 1
    df_code_review.loc["total", "total"] +=1

nb_code_papers = df_code_review.loc['1', "total"]

print(f"For these {nb_code_papers} papers:")
print(f" - for {round(df_code_review.loc['1', 'good link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'good link'],2)}/{nb_code_papers}), a least one of the reviewers said that the code was available and it was.")
print(f" - for {round(df_code_review.loc['1', 'empty link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'empty link'],2)}/{nb_code_papers}), at least one of the reviewers said that the code was available even if the link was leading to an error message or to an empty repository.")
print(f" - for {round(df_code_review.loc['1', 'no link'] *100 / nb_code_papers,2)}% of the papers ({round(df_code_review.loc['1', 'no link'],2)}/{nb_code_papers}), at least one of the reviewers said that the code was available even if no link/code was provided.")
print()
nb_others = len(list_code) - nb_code_papers
print(f"For the {nb_others} others:")
print(f" - for {round(df_code_review.loc['0', 'good link'] *100 / nb_others, 2)}% of the papers ({round(df_code_review.loc['0', 'good link'],2)}/{nb_others}), no one of the reviewers said that the code was available even if it was.")

df_code_review = round(df_code_review*100 /len(list_code), 2)

print()
print(tabulate(df_code_review, headers='keys', tablefmt='psql'))


For 86.67% of the papers (78/90), at least one of the reviewers said that the code was available.

For these 78.0 papers:
 - for 47.44% of the papers (37.0/78.0), a least one of the reviewers said that the code was available and it was.
 - for 20.51% of the papers (16.0/78.0), at least one of the reviewers said that the code was available even if the link was leading to an error message or to an empty repository.
 - for 32.05% of the papers (25.0/78.0), at least one of the reviewers said that the code was available even if no link/code was provided.

For the 12.0 others:
 - for 50.0% of the papers (6.0/12.0), no one of the reviewers said that the code was available even if it was.

+-----------------------------+-------------+--------------+-----------+---------+
| code availableness (in %)   |   good link |   empty link |   no link |   total |
|-----------------------------+-------------+--------------+-----------+---------|
| 0                           |        6.67 |         2.22 |

In [279]:
# TODO

# Regrouper “unusable” et “none”

# Code
# Repo present et non vide
# Compte
# Cohérence repo / review
# Compte

# add function to save all results in a file
