In [134]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from tabulate import tabulate
import datetime
import json

In [135]:
# List representing binary choices (0 or 1) for specific content categories
list_0_1 = [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    "Code is or will be available"
]

# List representing different levels or types of statements
statement_list = [
    "3. (+) statement",
    "2. (-) statement",
    "1. (none) statement",
    "0. Unusable (statement)",
]

# List representing different levels or types of comments
comments_list = [
    "4. (-/+) comments",
    "3. (+) comments",
    "2. (-) comments",
    "1. (none) comments",
    "0. Unusable (comments)",
]

# List representing meta-categories
meta_categories_list = [
    "(+) meta",
    "(-) meta",
    "Unusable (meta)",
]


In [3]:
N_REVIEWS = 3

# Enter the path to the TSV file with the rating from the first observer
# You can change to analyze the ratings of another observer
path_csv = Path("../human_rating/rating_90/rating_90_O.csv")
df_rating_1 = pd.read_csv(path_csv, sep="\t", index_col=False, header=None)

# Display information about the analysis
print(f"This notebook analyzes the {N_REVIEWS} reviews of {len(df_rating_1) - 2} different papers based on the file {path_csv}")

# Set up the output directory path
output_directory = Path(f"../miccai2023/stats_rating")

# Check if the output directory exists, if not, create it
if not output_directory.is_dir():
    output_directory.mkdir()

# Print the output directory path
print(f"Outputs will be saved in {output_directory}.")


This notebook analyzes the 3 reviews of 90 different papers based on the file ../human_rating/rating_90/rating_90_O.csv
Outputs will be saved in ../miccai2023/stats_rating.


### Statistics on the rating for the 0/1 categories

In [4]:
def compute_ci_bp(observed_proportion: float, number_of_observation: int) -> tuple[float, float]:
    """Return the confidence interval (binomial proportion).
    
    Parameters
    ----------
    proportion: (float) The observed proportion.
    N: (int) The total number of observations.
        
    Returns
    -------
    float :
        The lower bound of the confidence interval.
    float :
        The upper bound of the confidence interval.
    """
    from math import sqrt
    
    ratio = sqrt(observed_proportion * (1 - observed_proportion) / number_of_observation)
    return (
        (-1.96 * ratio + observed_proportion) * 100,
        (1.96 * ratio + proportion) * 100
    )

In [5]:
def compute_ci_bootstrap(data: list, val: float, num_resamples: int = 1000) -> tuple[float, float]:
    """Return the confidence interval (bootstrap).
    
    Parameters
    ----------
    data : list
        The original data.
    val : float
        The value for which the confidence interval is calculated.
    num_resamples : int, optional
        The number of bootstrap resamples, by default 1000.
    
    Returns
    -------
    float :
        The lower bound of the confidence interval.
    float :
        The upper bound of the confidence interval.
    """
    outputs = []
    # Generate bootstrap samples and calculate the statistic of interest.
    for i in range(num_resamples):
        Y_resample = random.choices(data, k=len(data))
        out = Y_resample.count(val)
        outputs.append(out * 100 / len(data))

    # Calculate the percentiles to obtain the confidence interval.
    return np.percentile(outputs, 2.25), np.percentile(outputs, 97.5)

In [6]:
# Initialize the output dataframe to store statistics with zeros
df_category = pd.DataFrame(
    np.zeros((len(list_0_1), 4)),
    index=list_0_1,
    columns=["number", "percent", "ci low", "ci high"],
)

# Iterate over each category and calculate statistics
for category_idx, category in enumerate(list_0_1):
    # List to store all reviews for the current category from all reviewers
    all_reviews_1 = []
    
    # Iterate over each reviewer (3 in total)
    for i in range(N_REVIEWS):
        # Extract the ratings for the current category from the corresponding column
        column_id = i * 9 + 3 + category_idx
        list_review_1 = df_rating_1.loc[2:, column_id].values.tolist()
        all_reviews_1 = all_reviews_1 + list_review_1

    # Calculate the number of '1' and percentage of '1' for the current category
    count_ = all_reviews_1.count("1")
    percent_ = round(count_ * 100 / len(all_reviews_1), 2)

    # Populate the output dataframe with the calculated statistics
    df_category.loc[category, "number"] = count_
    df_category.loc[category, "percent"] = percent_

    # Print information about the category and its statistics
    print(
        f"For category {category}, {percent_}% of reviewers "
        f"({count_}/{len(all_reviews_1)}) have commented on at least one of the items of the category."
    )
    # Calculate confidence intervals using binomial proportion and bootstrap methods
    proportion = count_ / len(all_reviews_1)
    ci_bp_low, ci_bp_high = compute_ci_bp(proportion, len(all_reviews_1))
    ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(all_reviews_1, '1')

    # Populate the output dataframe with confidence interval values (only bootstrap)
    df_category.loc[category, "ci low"] = ci_bootstrap_low
    df_category.loc[category, "ci high"] = ci_bootstrap_high

    # Print confidence intervals
    print(
        f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
        f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
    )

For category Models and algorithms, 28.89% of reviewers (78/270) have commented on at least one of the items of the category.
Confidence intervals (binomial proportion): [23.48%, 34.30%]
Confidence intervals (bootstrap): [23.51%, 34.44%]

For category Datasets, 33.33% of reviewers (90/270) have commented on at least one of the items of the category.
Confidence intervals (binomial proportion): [27.71%, 38.96%]
Confidence intervals (bootstrap): [27.78%, 38.89%]

For category Code, 46.67% of reviewers (126/270) have commented on at least one of the items of the category.
Confidence intervals (binomial proportion): [40.72%, 52.62%]
Confidence intervals (bootstrap): [40.74%, 52.59%]

For category Experimental results, 25.56% of reviewers (69/270) have commented on at least one of the items of the category.
Confidence intervals (binomial proportion): [20.35%, 30.76%]
Confidence intervals (bootstrap): [20.00%, 30.75%]

For category Error bars or statistical significance, 1.85% of reviewers (5

In [140]:
# Rename the index of the dataframe
df_category.index.rename(f"For {len(all_reviews_1)} reviews", inplace=True)
df_category.to_csv(output_directory / "1-category.csv", index=True, sep=";", encoding='utf-8')

# Print the dataframe in tabular format
print(tabulate(df_category, headers='keys', tablefmt='psql'))

+----------------------------------------+----------+-----------+----------+-----------+
| For 270 reviews                        |   number |   percent |   ci low |   ci high |
|----------------------------------------+----------+-----------+----------+-----------|
| Models and algorithms                  |       78 |     28.89 | 23.5102  |   34.4444 |
| Datasets                               |       90 |     33.33 | 27.7778  |   38.8889 |
| Code                                   |      126 |     46.67 | 40.7407  |   52.5926 |
| Experimental results                   |       69 |     25.56 | 20       |   30.75   |
| Error bars or statistical significance |        5 |      1.85 |  0.37037 |    3.7037 |
| Code is or will be available           |       63 |     23.33 | 18.1481  |   28.5185 |
+----------------------------------------+----------+-----------+----------+-----------+


### Statistics on the rating for the statements category

In [141]:
# Initialize the output dataframe to store statistics on statements with zeros.
df_statement = pd.DataFrame(
    np.zeros((len(statement_list), 4)),
    index=statement_list,
    columns=["number", "percent", "ci low", "ci high"],
)

# Extract the statements category for the 3 reviewers
reviews_statement = []
list_statement_1 = df_rating_1.loc[2:, 9].values.tolist()
list_statement_2 = df_rating_1.loc[2:, 18].values.tolist()
list_statement_3 = df_rating_1.loc[2:, 27].values.tolist()
reviews_statement = list_statement_1 + list_statement_2 + list_statement_3

# Display information about the analysis
print(f"Statistics for statements (on {len(reviews_statement)} reviewers):\n")

# Iterate over each rating level in the statements list and calculate statistics
for rating in statement_list:
    # Calculate number and percentage of reviews for the current rating level
    count_ = reviews_statement.count(rating)
    percent_ = round(count_ * 100 / len(reviews_statement), 2)

    # Populate the output dataframe with the calculated statistics
    df_statement.loc[rating, "number"] = count_
    df_statement.loc[rating, "percent"] = percent_

    # Print information about the rating level and its statistics
    print(f"- {percent_}% of reviews ({count_}/{len(reviews_statement)}) in category {rating}")

    # Calculate confidence intervals using binomial proportion and bootstrap methods
    proportion = count_ / len(reviews_statement)
    ci_bp_low, ci_bp_high = compute_ci_bp(proportion, len(reviews_statement))
    ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(reviews_statement, rating)

    # Populate the output dataframe with confidence interval values (only bootstrap)
    df_statement.loc[rating, "ci low"] = round(ci_bootstrap_low, 2)
    df_statement.loc[rating, "ci high"] = round(ci_bootstrap_high, 2)

    # Print confidence intervals
    print(
        f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
        f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
    )

Statistics for statements (on 270 reviewers):

- 48.89% of reviews (132/270) in category 3. (+) statement


Confidence intervals (binomial proportion): [42.93%, 54.85%]
Confidence intervals (bootstrap): [42.59%, 55.19%]

- 11.85% of reviews (32/270) in category 2. (-) statement
Confidence intervals (binomial proportion): [8.00%, 15.71%]
Confidence intervals (bootstrap): [8.15%, 15.93%]

- 37.04% of reviews (100/270) in category 1. (none) statement
Confidence intervals (binomial proportion): [31.28%, 42.80%]
Confidence intervals (bootstrap): [31.48%, 42.59%]

- 2.22% of reviews (6/270) in category 0. Unusable (statement)
Confidence intervals (binomial proportion): [0.46%, 3.98%]
Confidence intervals (bootstrap): [0.74%, 4.07%]



In [9]:
# Calculate percentage and confidence intervals for reviewers that provided a statement
new_reviews_statement = [
    1 if (review_statement == "3. (+) statement" or review_statement == "2. (-) statement ")
    else 0 for review_statement in reviews_statement
]
new_count_ = new_reviews_statement.count(1)
new_percent_ = round(new_count_ * 100 / len(reviews_statement), 2)

# Display information about reviews that provided a statement
print(f"- {new_percent_}% of reviews ({new_count_}/{len(reviews_statement)}) provided a statement")

# Calculate confidence intervals for reviews that provided a statement
proportion = new_count_ / len(reviews_statement)
ci_bp_low, ci_bp_high = compute_ci_bp(proportion, len(reviews_statement))
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(new_reviews_statement, 1)
print(
    f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
    f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
)
# Display statistics dataframe for statements
df_statement.index.rename(f"For {len(reviews_statement)} reviews", inplace=True)
print(tabulate(df_statement, headers='keys', tablefmt='psql'))
print()

- 48.89% of reviews (132/270) provided a statement
Confidence intervals (binomial proportion): [42.93%, 54.85%]
Confidence intervals (bootstrap): [42.59%, 54.44%]

+-------------------------+----------+-----------+----------+-----------+
| For 270 reviews         |   number |   percent |   ci low |   ci high |
|-------------------------+----------+-----------+----------+-----------|
| 3. (+) statement        |      132 |     48.89 |    42.59 |     55.19 |
| 2. (-) statement        |       32 |     11.85 |     8.15 |     15.93 |
| 1. (none) statement     |      100 |     37.04 |    31.48 |     42.59 |
| 0. Unusable (statement) |        6 |      2.22 |     0.74 |      4.07 |
+-------------------------+----------+-----------+----------+-----------+



In [10]:
def compute_kappa_fleiss_three_raters(data_1, data_2, data_3) -> float:
    """Compute Cohen's kappa score for 3 raters (bootstrap).

    Parameters
    ----------
    data_1 : array-like
        Labels from rater 1.
    data_2 : array-like
        Labels from rater 2.
    data_3 : array-like
        Labels from rater 3.

    Returns
    -------
    The mean of the kappa across bootstrap samples.
    """
    from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

    data_fleiss_ = aggregate_raters(np.array([data_1, data_2, data_3]).T)
    return fleiss_kappa(data_fleiss_[0])

In [11]:
def compute_ci_bootstrap_three_raters(data_1, data_2, data_3, num_resamples: int = 1000) -> tuple[float, float]:
    """Compute confidence interval for a 3 raters kappa (bootstrap).

    Parameters
    ----------
    data_1 : array-like
        Labels from rater 1.
    data_2 : array-like
        Labels from rater 2.
    data_3 : array-like
        Labels from rater 3.
    num_resamples : int, optional
        The number of bootstrap resamples, by default 1000.

    Returns
    -------
    float :
        95% CI lower bound (2.5 percentile of the sorted bootstrap distribution).
    float :
        95% CI upper bound (97.5 percentile of the sorted bootstrap distribution).
    """
    Y = np.array([data_1, data_2, data_3]).T
    list_kappa = []

    # Generate bootstrap samples and calculate the kappa for each sample.
    for i in range(num_resamples):
        Y_resample = np.array(random.choices(Y, k=len(Y)))
        list_kappa.append(
            compute_kappa_fleiss_three_raters(
                Y_resample[:, 0].astype(str),
                Y_resample[:, 1].astype(str),
                Y_resample[:, 2].astype(str),
            )
        )
    # Calculate the percentiles to obtain the confidence interval.
    return np.percentile(list_kappa, 2.5), np.percentile(list_kappa, 97.5)

In [12]:
def create_confusion_matrix(list_1: list, list_2: list, list_attributs: list) -> pd.DataFrame:
    """Create a confusion matrix in percentage.
    
    Parameters
    ----------
    list_1: List of labels from reviewer 1.
    list_2: List of labels from reviewer 2.
    list_attributs: List of attribute labels.

    Returns
    -------
    pd.DataFrame :
        The confusion matrix as a DataFrame (in %).
    """
    if len(list_1) != len(list_2):
        raise ValueError("Reviewer 1 and 2 may not have rated the same list of subjects.")
    size = len(list_attributs)    
    # Initialize the confusion matrix with zeros.
    matrix = pd.DataFrame(
        np.zeros((size + 1, size + 1)),
        index=list_attributs + ["total"],
        columns=list_attributs + ["total"],
    )    
    # Populate the confusion matrix based on reviewer ratings.
    for k, att_k in enumerate(list_attributs):
        for l, att_l in enumerate(list_attributs):
             for i, elt in enumerate(list_1):
                if elt == att_k:
                    if list_2[i] == att_l:
                        matrix.loc[att_k, att_l] += 1
        
    # Calculate row and column totals, and total observations.
    for i, att_i in enumerate(list_attributs):
        for j, att_j in enumerate(list_attributs):
            matrix.loc["total", att_i] += matrix.loc[att_j, att_i]
            matrix.loc[att_i, "total"] += matrix.loc[att_i, att_j]
            matrix.loc["total", "total"] += matrix.loc[att_i, att_j]

    # Convert the matrix to percentages if the number of observations matches the total.
    if len(list_1) == matrix.loc["total", "total"]:
        matrix = round(matrix * 100 / len(list_1), 2)
    
    return matrix

In [13]:
df_statement.to_csv(output_directory / '2-statements.csv', index=True, sep=";", encoding='utf-8')

# Calculate Fleiss' Kappa for the statements
kappa_fleiss_statement = compute_kappa_fleiss_three_raters(
    list_statement_1,
    list_statement_2,
    list_statement_3,
)
print(f"Fleiss' Kappa: {kappa_fleiss_statement}")

### Sanity check ###
# from sklearn.metrics import cohen_kappa_score
# kappa_sk_statement = cohen_kappa_score(list_statement_1, list_statement_2)
# print(f"kappa sklearn: {kappa_sk_statement}")

# Calculate confidence intervals for Fleiss' Kappa
ci_low_fleiss_statement, ci_high_fleiss_statement = compute_ci_bootstrap_three_raters(
    list_statement_1,
    list_statement_2,
    list_statement_3,
)
print(f"Confidence intervals (bootstrap): [{ci_low_fleiss_statement:.2f}, {ci_high_fleiss_statement:.2f}]\n")

# Display confusion matrices for pairwise comparisons of reviews
print("Review 1 VS review 2")
m1_statement = create_confusion_matrix(list_statement_1, list_statement_2, statement_list)
print(tabulate(m1_statement, headers='keys', tablefmt='psql'))
print()

print("Review 2 VS review 3")
m2_statement = create_confusion_matrix(list_statement_2, list_statement_3, statement_list)
print(tabulate(m2_statement, headers='keys', tablefmt='psql'))
print()

print("Review 1 VS review 3")
m3_statement = create_confusion_matrix(list_statement_1, list_statement_3, statement_list)
print(tabulate(m3_statement, headers='keys', tablefmt='psql'))
print()


Fleiss' Kappa: -0.051648054755043166
Confidence intervals (bootstrap): [-0.14, 0.04]

Review 1 VS review 2
+-------------------------+--------------------+--------------------+-----------------------+---------------------------+---------+
|                         |   3. (+) statement |   2. (-) statement |   1. (none) statement |   0. Unusable (statement) |   total |
|-------------------------+--------------------+--------------------+-----------------------+---------------------------+---------|
| 3. (+) statement        |              18.89 |              10    |                 18.89 |                      0    |   47.78 |
| 2. (-) statement        |               6.67 |               2.22 |                  4.44 |                      0    |   13.33 |
| 1. (none) statement     |              20    |               0    |                 16.67 |                      1.11 |   37.78 |
| 0. Unusable (statement) |               0    |               1.11 |                  0    |        

### Statistics on the rating for the comments category

In [14]:
# Initialize the output dataframe to store statistics on comments with zeros.
df_comments = pd.DataFrame(
    np.zeros((len(comments_list), 4)),
    index=comments_list,
    columns=["number", "percent", "ci low", "ci high"],
)
# Extract the comments category for the 3 reviews
reviews_comments = []
list_comment_1 = df_rating_1.loc[2:, 10].values.tolist()
list_comment_2 = df_rating_1.loc[2:, 19].values.tolist()
list_comment_3 = df_rating_1.loc[2:, 28].values.tolist()
reviews_comments = list_comment_1 + list_comment_2 + list_comment_3

# Display information about the analysis
print(f"Statistics for comments (on {len(reviews_comments)} reviewers):\n")
N_comments = len(reviews_comments)

# Iterate over each rating level in the comments list and calculate statistics
for rating in comments_list:
    # Calculate number and percentage of reviews for the current rating level
    count_ = reviews_comments.count(rating)
    percent_ = round(count_ * 100 / N_comments, 2)

    # Populate the output dataframe with the calculated statistics
    df_comments.loc[rating, "number"] = count_
    df_comments.loc[rating, "percent"] = percent_

    # Print information about the rating level and its statistics
    print(f"- {percent_}% of reviews ({count_}/{N_comments}) in category {rating}")

    # Calculate confidence intervals using binomial proportion and bootstrap methods
    proportion = count_ / N_comments
    ci_bp_low, ci_bp_high = compute_ci_bp(proportion, N_comments)
    ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(reviews_comments, rating)

    # Populate the output dataframe with confidence interval values
    df_comments.loc[rating, "ci low"] = round(ci_bootstrap_low, 2)
    df_comments.loc[rating, "ci high"] = round(ci_bootstrap_high, 2)

    # Print confidence intervals
    print(
        f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
        f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
    )

Statistics for comments (on 270 reviewers):

- 22.96% of reviews (62/270) in category 4. (-/+) comments
Confidence intervals (binomial proportion): [17.95%, 27.98%]
Confidence intervals (bootstrap): [17.78%, 28.15%]

- 30.74% of reviews (83/270) in category 3. (+) comments
Confidence intervals (binomial proportion): [25.24%, 36.24%]
Confidence intervals (bootstrap): [25.19%, 36.67%]

- 18.52% of reviews (50/270) in category 2. (-) comments
Confidence intervals (binomial proportion): [13.89%, 23.15%]
Confidence intervals (bootstrap): [14.07%, 22.97%]

- 22.22% of reviews (60/270) in category 1. (none) comments
Confidence intervals (binomial proportion): [17.26%, 27.18%]
Confidence intervals (bootstrap): [17.41%, 27.04%]

- 5.56% of reviews (15/270) in category 0. Unusable (comments)
Confidence intervals (binomial proportion): [2.82%, 8.29%]
Confidence intervals (bootstrap): [2.96%, 8.52%]



In [15]:
# Calculate percentage and confidence intervals for reviews that provided a comment
new_reviews_comments = [
    1 if review_comment in ("4. (-/+) comments", "3. (+) comments", "2. (-) comments")
    else 0 for review_comment in reviews_comments
]
new_count_ = new_reviews_comments.count(1)
new_percent_ = round(new_count_ * 100 / N_comments, 2)

# Display information about reviews that provided a comment
print(f"- {new_percent_}% of reviews ({new_count_}/{N_comments}) provided a comment")

# Calculate confidence intervals for reviews that provided a comment
proportion = new_count_ / N_comments
ci_bp_low, ci_bp_high = compute_ci_bp(proportion, N_comments)
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(new_reviews_comments, 1)
print(
    f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
    f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
)

# Save values needed to extract latex data
comments_latex = [ new_percent_, new_count_, N_comments, ci_bootstrap_low, ci_bootstrap_high]

# Display statistics dataframe for comments
df_comments.index.rename(f"For {N_comments} reviews", inplace=True)
print(tabulate(df_comments, headers='keys', tablefmt='psql'))

df_comments.to_csv(output_directory / '3-comments.csv', index=True, sep=";", encoding='utf-8')

# Calculate Fleiss' Kappa for the comments
kappa_fleiss_comment = compute_kappa_fleiss_three_raters(
    list_comment_1,
    list_comment_2,
    list_comment_3,
)
print(f"Fleiss' Kappa: {kappa_fleiss_comment}")

# Calculate confidence intervals for Fleiss' Kappa
ci_low_fleiss_comment, ci_high_fleiss_comment = compute_ci_bootstrap_three_raters(
    list_comment_1,
    list_comment_2,
    list_comment_3,
)
print(f"Confidence intervals (bootstrap): [{ci_low_fleiss_comment:.2f}%, {ci_high_fleiss_comment:.2f}%]\n")

# Display confusion matrices for pairwise comparisons of reviews
print("Review 1 VS review 2")
m1_comment = create_confusion_matrix(list_comment_1, list_comment_2, comments_list)
print(tabulate(m1_comment, headers='keys', tablefmt='psql'))
print()

print("Review 2 VS review 3")
m2_comment = create_confusion_matrix(list_comment_2, list_comment_3, comments_list)
print(tabulate(m2_comment, headers='keys', tablefmt='psql'))
print()

print("Review 1 VS review 3")
m3_comment = create_confusion_matrix(list_comment_1, list_comment_3, comments_list)
print(tabulate(m3_comment, headers='keys', tablefmt='psql'))
print()

- 72.22% of reviews (195/270) provided a comment
Confidence intervals (binomial proportion): [66.88%, 77.56%]
Confidence intervals (bootstrap): [67.04%, 77.41%]

+------------------------+----------+-----------+----------+-----------+
| For 270 reviews        |   number |   percent |   ci low |   ci high |
|------------------------+----------+-----------+----------+-----------|
| 4. (-/+) comments      |       62 |     22.96 |    18.15 |     28.15 |
| 3. (+) comments        |       83 |     30.74 |    25.19 |     36.67 |
| 2. (-) comments        |       50 |     18.52 |    14.07 |     22.97 |
| 1. (none) comments     |       60 |     22.22 |    17.41 |     27.41 |
| 0. Unusable (comments) |       15 |      5.56 |     2.96 |      8.52 |
+------------------------+----------+-----------+----------+-----------+
Fleiss' Kappa: 0.0474911357043086
Confidence intervals (bootstrap): [-0.03%, 0.12%]

Review 1 VS review 2
+------------------------+---------------------+-------------------+-------

### Analysis of statement category VS comment category

In [16]:
# Initialize with zeros a DataFrame to store the cross-tabulation of statements and comments
df_stat_com = pd.DataFrame(
    np.zeros((len(statement_list) + 1, len(comments_list) + 1)),
    index=statement_list + ["total"],
    columns=comments_list + ["total"],
)
df_stat_com.index.rename("number", inplace=True)

# Populate the cross-tabulation DataFrame based on the reviews' statements and comments
for i, review_comment in enumerate(reviews_comments):
    df_stat_com.loc[reviews_statement[i], review_comment] += 1
    df_stat_com.loc["total", review_comment] += 1
    df_stat_com.loc[reviews_statement[i], "total"] += 1

# Set the total count for the total row and column
df_stat_com.loc["total", "total"] = len(reviews_comments)

# Display the cross-tabulation of statements and comments
print("Analysis of statements VS comments")
print(tabulate(df_stat_com, headers='keys', tablefmt='psql'))
print()

# Identify reviewers who made a positive statement but didn't provide a comment
reviews_statement_no_comments = []
for i, review_comment in enumerate(reviews_comments):
    if reviews_statement[i] == "3. (+) statement":
        reviews_statement_no_comments.append(
            1 if review_comment in ("1. (none) comments", "0. (Unusable) comments") else 0
        )
# Calculate percentage and confidence intervals for reviewers 
# who made a positive statement but didn't provide a comment
new_count_ = reviews_statement_no_comments.count(1)
new_percent_ = round(new_count_ * 100 / len(reviews_statement_no_comments), 2)
print(
    f"{new_percent_}% of reviewers ({new_count_}/{len(reviews_statement_no_comments)}) "
    "made a positive statement but didn't provide a comment to substantiate their statement."
)

proportion = new_count_ / len(reviews_statement_no_comments)
ci_bp_low, ci_bp_high = compute_ci_bp(proportion, len(reviews_statement_no_comments))
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(reviews_statement_no_comments, 1)
print(
    f"Confidence intervals (binomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
    f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
)

df_stat_com.to_csv(output_directory / '4-statements_comments.csv', index=True, sep=";", encoding='utf-8')

# Create a new DataFrame to store the percentage representation of statements and comments
df_stat_com_percent = df_stat_com
df_stat_com_percent.index.rename("in %", inplace=True)
df_stat_com_percent = round(df_stat_com * 100 / len(reviews_comments), 2)

# Display the percentage representation of statements and comments
print(tabulate(df_stat_com_percent, headers='keys', tablefmt='psql'))

Analysis of statements VS comments
+-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------+
| number                  |   4. (-/+) comments |   3. (+) comments |   2. (-) comments |   1. (none) comments |   0. Unusable (comments) |   total |
|-------------------------+---------------------+-------------------+-------------------+----------------------+--------------------------+---------|
| 3. (+) statement        |                  21 |                52 |                 7 |                   52 |                        0 |     132 |
| 2. (-) statement        |                   8 |                 0 |                21 |                    3 |                        0 |      32 |
| 1. (none) statement     |                  33 |                31 |                22 |                    5 |                        9 |     100 |
| 0. Unusable (statement) |                   0 |                

### Statistics on the rating for the meta category

In [17]:
# Initialize with zeros a DataFrame to store the statistics on the rating for the meta category
df_meta = pd.DataFrame(
    np.zeros((len(meta_categories_list), 4)),
    index=meta_categories_list,
    columns=["number", "percent", "ci low", "ci high"],
)
# Combine the ratings for the meta category from the three reviews
reviews_meta = []
list_meta_1 = df_rating_1.loc[2:, 29].values.tolist()
list_meta_2 = df_rating_1.loc[2:, 30].values.tolist()
list_meta_3 = df_rating_1.loc[2:, 31].values.tolist()
reviews_meta = list_meta_1 + list_meta_2 + list_meta_3

# Convert reviews_meta to match provided categories for consistency
reviews_meta_python = []
for i, review_statement in enumerate(reviews_statement):
    if review_statement == "3. (+) statement":
        reviews_meta_python.append("(+) meta")
    elif review_statement == "2. (-) statement ":
        reviews_meta_python.append("(-) meta")
    elif reviews_comments[i] == "3. (+) comments":
        reviews_meta_python.append("(+) meta")
    elif reviews_comments[i] in ("2. (-) comments", "4. (-/+) comments"):
        reviews_meta_python.append("(-) meta")
    else:
        reviews_meta_python.append("Unusable (meta)")

# Check and update reviews_meta if necessary
if reviews_meta != reviews_meta_python:
    reviews_meta = reviews_meta_python
    print("Meta categories calculated values aren't the same as in the provided tsv file.")

print(f"Statistics for meta-categories (on {len(reviews_meta)} reviewers):\n")

# Calculate and display statistics for the meta category
N_meta = len(reviews_meta)
for rating in meta_categories_list:
    count_ = reviews_meta.count(rating)
    percent_ = round(count_ * 100 / N_meta, 2)
    df_meta.loc[rating, "number"] = count_
    df_meta.loc[rating, "percent"] = percent_
    print(f"- {percent_}% of reviews ({count_}/{N_meta}) in category {rating}")

    proportion = count_ / N_meta
    ci_bp_low, ci_bp_high = compute_ci_bp(proportion, N_meta)
    ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(reviews_meta, rating)

    df_meta.loc[rating, "ci low"] = round(ci_bootstrap_low, 2)
    df_meta.loc[rating, "ci high"] = round(ci_bootstrap_high, 2)

    print(
        f"Confidence intervals (multinomial proportion): [{ci_bp_low:.2f}%, {ci_bp_high:.2f}%]\n"
        f"Confidence intervals (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]\n"
    )

Meta categories calculated values aren't the same as in the provided tsv file.
Statistics for meta-categories (on 270 reviewers):

- 60.37% of reviews (163/270) in category (+) meta
Confidence intervals (multinomial proportion): [54.54%, 66.20%]
Confidence intervals (bootstrap): [54.07%, 66.30%]

- 31.11% of reviews (84/270) in category (-) meta
Confidence intervals (multinomial proportion): [25.59%, 36.63%]
Confidence intervals (bootstrap): [25.36%, 36.67%]

- 8.52% of reviews (23/270) in category Unusable (meta)
Confidence intervals (multinomial proportion): [5.19%, 11.85%]
Confidence intervals (bootstrap): [5.19%, 11.85%]



In [18]:
# Save the statistics DataFrame to a CSV file
df_meta.index.rename(f"For {N_meta} reviews", inplace=True)
print(tabulate(df_meta, headers='keys', tablefmt='psql'))
df_meta.to_csv(output_directory / '5-meta.csv', index=True, sep=";", encoding='utf-8')

# Calculate Fleiss' Kappa for the meta category
kappa_fleiss_meta = compute_kappa_fleiss_three_raters(
    list_meta_1,
    list_meta_2,
    list_meta_3,
)
print(f"kappa fleiss: {kappa_fleiss_meta}")

# Calculate confidence intervals for Fleiss' Kappa
ci_low_fleiss_meta, ci_high_fleiss_meta = compute_ci_bootstrap_three_raters(
    list_meta_1,
    list_meta_2,
    list_meta_3,
)
print(f"Confidence intervals (bootstrap): [{ci_low_fleiss_meta:.2f}%, {ci_high_fleiss_meta:.2f}%]\n")

# Display confusion matrices for pairwise comparisons of reviews
print("Review 1 VS review 2")
m1_meta = create_confusion_matrix(list_meta_1, list_meta_2, meta_categories_list)
print(tabulate(m1_meta, headers='keys', tablefmt='psql'))
print()

print("Review 2 VS review 3")
m2_meta = create_confusion_matrix(list_meta_2, list_meta_3, meta_categories_list)
print(tabulate(m2_meta, headers='keys', tablefmt='psql'))
print()

print("Review 1 VS review 3")
m3_meta = create_confusion_matrix(list_meta_1, list_meta_3, meta_categories_list)
print(tabulate(m3_meta, headers='keys', tablefmt='psql'))
print()


+-------------------+----------+-----------+----------+-----------+
| For 270 reviews   |   number |   percent |   ci low |   ci high |
|-------------------+----------+-----------+----------+-----------|
| (+) meta          |      163 |     60.37 |    54.07 |     66.3  |
| (-) meta          |       84 |     31.11 |    25.36 |     36.67 |
| Unusable (meta)   |       23 |      8.52 |     5.19 |     11.85 |
+-------------------+----------+-----------+----------+-----------+
kappa fleiss: 0.021688128877534983
Confidence intervals (bootstrap): [-0.08%, 0.12%]

Review 1 VS review 2
+-----------------+------------+------------+-------------------+---------+
|                 |   (+) meta |   (-) meta |   Unusable (meta) |   total |
|-----------------+------------+------------+-------------------+---------|
| (+) meta        |      33.33 |      23.33 |              3.33 |   60    |
| (-) meta        |      22.22 |      10    |              1.11 |   33.33 |
| Unusable (meta) |       3.33 |     

### Creating table of kappa values

In [19]:
# Create a DataFrame to store kappa Fleiss values and confidence intervals for different categories
df_kappas = pd.DataFrame(
    np.zeros((3, 3)),
    index=["Statements", "Comments", "Meta-categories"],
    columns=["kappa", "ci low", "ci high"],
)
# Populate the DataFrame with kappa Fleiss values and confidence intervals for the statements category
df_kappas.loc["Statements", "kappa"] = kappa_fleiss_statement
df_kappas.loc["Statements", "ci low"] = ci_low_fleiss_statement
df_kappas.loc["Statements", "ci high"] = ci_high_fleiss_statement

# Populate the DataFrame with kappa Fleiss values and confidence intervals for the comments category
df_kappas.loc["Comments", "kappa"] = kappa_fleiss_comment
df_kappas.loc["Comments", "ci low"] = ci_low_fleiss_comment
df_kappas.loc["Comments", "ci high"] = ci_high_fleiss_comment

# Populate the DataFrame with kappa Fleiss values and confidence intervals for the meta-categories
df_kappas.loc["Meta-categories", "kappa"] = kappa_fleiss_meta
df_kappas.loc["Meta-categories", "ci low"] = ci_low_fleiss_meta
df_kappas.loc["Meta-categories", "ci high"] = ci_high_fleiss_meta

# Round the values in the DataFrame for better presentation
df_kappas = round(df_kappas, 2)

# Rename the index for clarity
df_kappas.index.rename("kappa fleiss", inplace=True)

# Display the kappa Fleiss values and confidence intervals in tabular format
print(tabulate(df_kappas, headers='keys', tablefmt='psql'))

df_kappas.to_csv(output_directory / '6-kappas.csv', index=True, sep=";", encoding='utf-8')

+-----------------+---------+----------+-----------+
| kappa fleiss    |   kappa |   ci low |   ci high |
|-----------------+---------+----------+-----------|
| Statements      |   -0.05 |    -0.14 |      0.04 |
| Comments        |    0.05 |    -0.03 |      0.12 |
| Meta-categories |    0.02 |    -0.08 |      0.12 |
+-----------------+---------+----------+-----------+


### Statistics on the rating for the code category

In [20]:
# Define column indices for code-related information
column_id_link = 38
code_link_df = df_rating_1.loc[2:, column_id_link].values.tolist()

column_id_avail = 39
code_avail_df = df_rating_1.loc[2:, column_id_avail].values.tolist()

# Define code categories
code_list = [
    "bad link",
    "good link",
    "no link",
]

# Display statistics for code availability on papers
print(f"Statistics for code (on {len(code_link_df)} papers):\n")

# Create a list to store code categories for each paper
list_code = []

# Determine code categories based on code links and availability
for i in range(len(code_avail_df)):
    if str(code_link_df[i]).startswith("http"):
        if code_avail_df[i] == "1":
            list_code.append("good link")
        if code_avail_df[i] == "0":
            list_code.append("bad link")
    else:
        list_code.append("no link")

# Calculate the number of papers
nb_papers = len(list_code)

# Display statistics for each code category
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code, "good link")
print(
    f" - for {(list_code.count('good link') * 100 / nb_papers):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('good link')}/{nb_papers}), the repository exists and is not empty."
)

ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code, "bad link")
print(
    f" - for {(list_code.count('bad link') * 100 / nb_papers):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('bad link')}/{nb_papers}), the link to the repository is provided but it is empty or wrong."
)

ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code, "no link")
print(
    f" - for {(list_code.count('no link') * 100 / nb_papers):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code.count('no link')}/{nb_papers}), no link/code was provided.\n"
)

# Create a binary list to represent the presence or absence of code links
list_code_binary = [1 if x in ("good link", "bad link") else 0 for x in list_code]

# Display statistics for the presence of code links
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code_binary, 1)
print(
    f" - for {(100 * list_code_binary.count(1) / len(list_code_binary)):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code_binary.count(1)}/{len(list_code_binary)}), an associated repository for the code was provided.\n"
)


# Filter the code list to include only "good link" and "bad link" categories
list_code_bis = [x for x in list_code if x in ("good link", "bad link")]

# Display statistics for the "bad link" category among papers with provided links
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code_bis, "bad link")
print(
    f" - for {(100 * list_code_bis.count('bad link') / len(list_code_bis)):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers for which a link was provided ({list_code.count('bad link')}/{len(list_code_bis)}), the link to the repository is provided but it is empty or wrong.\n"
)

Statistics for code (on 90 papers):

 - for 47.78% (95% CI (bootstrap): [37.78%, 58.89%]) of the papers (43/90), the repository exists and is not empty.
 - for 20.00% (95% CI (bootstrap): [12.22%, 27.78%]) of the papers (18/90), the link to the repository is provided but it is empty or wrong.
 - for 32.22% (95% CI (bootstrap): [23.33%, 42.22%]) of the papers (29/90), no link/code was provided.

 - for 67.78% (95% CI (bootstrap): [57.78%, 76.67%]) of the papers (61/90), an associated repository for the code was provided.

 - for 29.51% (95% CI (bootstrap): [19.67%, 40.98%]) of the papers for which a link was provided (18/61), the link to the repository is provided but it is empty or wrong.



### Statistics on the rating for the code availibility 

In [21]:
# Define columns for code availability in different reviews
list_code_1 = df_rating_1.loc[2:, 5].values.tolist()
list_code_2 = df_rating_1.loc[2:, 14].values.tolist()
list_code_3 = df_rating_1.loc[2:, 23].values.tolist()

# Create a list to store the final code review categories
list_code_review = []

# Determine code review categories based on code availability in different reviews
# (code promised: At least 1 reviewer said the code will be made available)
for i in range(len(list_code_1)):
    x = list_code_1[i]
    if x == '0' and x == list_code_2[i] and x == list_code_3[i]:
        list_code_review.append("no info")
    else:
        list_code_review.append('code promised')

# Calculate and display statistics for code availability based on reviews
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(list_code_review, "code promised")
print(
    f"For {(list_code_review.count('code promised')*100/len(list_code_review)):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({list_code_review.count('code promised')}/{len(list_code_review)}), at least one of the reviewers said that the code was available.\n"
)

# Create a DataFrame to store code availability and review information
df_code_reviews = pd.DataFrame(
    np.zeros((3, 4)),
    index=["no info", "code promised", "total"],
    columns=code_list + ["total"],
)

# Further categorize code reviews and update the DataFrame
list_code_final = []

# Iterate over each paper's code information and respective reviews
for code, code_review in zip(list_code, list_code_review):
    # Check if the code link is labeled as "good link" and the review indicates "code promised"
    if code == "good link" and code_review == "code promised":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["code promised", "good link"] += 1
        list_code_final.append("code promised good link")
    
    # Check if the code link is labeled as "good link" but the review has "no info"
    if code == "good link" and code_review == "no info":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["no info", "good link"] += 1
        list_code_final.append("no info good link")

    # Check if the code link is labeled as "bad link" and the review indicates "code promised"
    if code == "bad link" and code_review == "code promised":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["code promised", "bad link"] += 1
        list_code_final.append("code promised bad link")
    
    # Check if the code link is labeled as "bad link" but the review has "no info"
    if code == "bad link" and code_review == "no info":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["no info", "bad link"] += 1
        list_code_final.append("no info bad link")

    # Check if the code link is labeled as "no link" and the review indicates "code promised"
    if code == "no link" and code_review == "code promised":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["code promised", "no link"] += 1
        list_code_final.append("code promised no link")
    
    # Check if the code link is labeled as "no link" but the review has "no info"
    if code == "no link" and code_review == "no info":
        # Update the DataFrame and append the categorization to the list
        df_code_reviews.loc["no info", "no link"] += 1
        list_code_final.append("no info no link")

# Update totals in the DataFrame
df_code_reviews.loc["code promised", "total"] = (
    df_code_reviews.loc["code promised", "good link"] +
    df_code_reviews.loc["code promised", "bad link"] +
    df_code_reviews.loc["code promised", "no link"]
)
df_code_reviews.loc["no info", "total"] = (
    df_code_reviews.loc["no info", "good link"] +
    df_code_reviews.loc["no info", "bad link"] +
    df_code_reviews.loc["no info", "no link"]
)
df_code_reviews.loc["total", "good link"] = (
    df_code_reviews.loc["code promised", "good link"] +
    df_code_reviews.loc["no info", "good link"]
)
df_code_reviews.loc["total", "bad link"] = (
    df_code_reviews.loc["code promised", "bad link"] +
    df_code_reviews.loc["no info", "bad link"]
)
df_code_reviews.loc["total", "no link"] = (
    df_code_reviews.loc["code promised", "no link"] +
    df_code_reviews.loc["no info", "no link"]
)
df_code_reviews.loc["total", "total"] = (
    df_code_reviews.loc["code promised", "total"] +
    df_code_reviews.loc["no info", "total"]
)

# Calculate statistics for good reviews and bad reviews
# Filter out reviews categorized as "code promised"
good_review_list = [x for x in list_code_final if x.startswith("code promised")]
nb_good_reviews = len(good_review_list)

# Display the number of papers with at least one reviewer stating that the code is available
print(f"For these {nb_good_reviews} papers:")

# Calculate and display the percentage of papers with "code promised" for each sub-category
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(good_review_list, "code promised good link")
print(
    f" - for {(df_code_reviews.loc['code promised', 'good link']*100 / nb_good_reviews):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({df_code_reviews.loc['code promised', 'good link']}/{nb_good_reviews}), at least one reviewer stated that the code was available and the link was valid."
)
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(good_review_list, "code promised bad link")
print(
    f" - for {(df_code_reviews.loc['code promised', 'bad link']*100 / nb_good_reviews):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({df_code_reviews.loc['code promised', 'bad link']}/{nb_good_reviews}), at least one reviewer stated that the code was available, even if the link led to an error message or an empty repository."
)
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(good_review_list, "code promised no link")
print(
    f" - for {(df_code_reviews.loc['code promised', 'no link']*100 / nb_good_reviews):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({df_code_reviews.loc['code promised', 'no link']}/{nb_good_reviews}), at least one reviewer stated that the code was available, even if no link/code was provided.\n"
)

# Calculate statistics for papers with misleading information about code availability
good_reviews_list_bis = [1 if x == "code promised good link" else 0 for x in good_review_list]

# Display the percentage of papers where reviewers falsely claimed code availability
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(good_reviews_list_bis, 0)
print(
    f" - for {(good_reviews_list_bis.count(0)*100 / nb_good_reviews):.2f}% (95% CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({good_reviews_list_bis.count(0)}/{nb_good_reviews}), at least one reviewer stated that the code was available, even if the code was actually missing in the published version (no link, broken link, or empty repository)\n"
)

# Calculate statistics for papers with misleading information about code availability
# Filter out reviews categorized as "no info"
bad_review_list = [x for x in list_code_final if x.startswith("no info")]
nb_bad_reviews = len(bad_review_list)

# Commented out to avoid displaying an incomplete statement; consider uncommenting as needed
ci_bootstrap_low, ci_bootstrap_high = compute_ci_bootstrap(bad_review_list, "no info good link")
# print(f" - for {(df_code_reviews.loc['no info', 'good link']*100 / nb_bad_reviews):.2f}% (95%CI (bootstrap): [{ci_bootstrap_low:.2f}%, {ci_bootstrap_high:.2f}%]) of the papers ({df_code_reviews.loc['no info', 'good link']}/{nb_bad_reviews}), the reviewers said that the code wasn't available even if it was.")

# Display information about the tables and the code reviews
print(
    "The tables below indicate, for each paper, whether at least one reviewer stated that the \n"
    "code was/will be available (1) or not (0), and whether the link was present and non-empty (good link), \n"
    "present but with an empty repository (empty link), or whether there was no link (no link).\n"
)
# Rename the index for clarity
df_code_reviews.index.rename("code availableness (in number)", inplace=True)

# Display the table showing the number of papers in each category
print(tabulate(df_code_reviews, headers='keys', tablefmt='psql'))

df_code_reviews.to_csv(output_directory / '7-code_number.csv', index=True, sep=";", encoding='utf-8')

# Create a DataFrame representing percentages and rename the index for clarity
df_code_review_percent = df_code_reviews
df_code_review_percent.index.rename("code availableness (in %)", inplace=True)
df_code_review_percent = df_code_review_percent * 100 / len(list_code)

df_code_review_percent.to_csv(output_directory / '8-code_percent.csv', index=True, sep=";", encoding='utf-8')

# Display the table showing the percentage of papers in each category
print(tabulate(round(df_code_review_percent,2), headers='keys', tablefmt='psql'))

For 86.67% (95% CI (bootstrap): [80.00%, 93.33%]) of the papers (78/90), at least one of the reviewers said that the code was available.

For these 78 papers:
 - for 47.44% (95% CI (bootstrap): [35.90%, 57.72%]) of the papers (37.0/78), at least one reviewer stated that the code was available and the link was valid.
 - for 20.51% (95% CI (bootstrap): [11.54%, 29.49%]) of the papers (16.0/78), at least one reviewer stated that the code was available, even if the link led to an error message or an empty repository.
 - for 32.05% (95% CI (bootstrap): [23.08%, 42.31%]) of the papers (25.0/78), at least one reviewer stated that the code was available, even if no link/code was provided.

 - for 52.56% (95% CI (bootstrap): [40.36%, 64.10%]) of the papers (41/78), at least one reviewer stated that the code was available, even if the code was actually missing in the published version (no link, broken link, or empty repository)

The tables below indicate, for each paper, whether at least one rev

In [22]:
# Create a dictionary 'data' to store relevant information
data = {}

# Store the number of papers in the dictionnary
data["nb_papers"] = len(df_rating_1) - 2  # Subtracting 2 to exclude header rows

# Calculate the total number of reviews (assuming 3 reviews per paper)
data["nb_reviews"] = (len(df_rating_1) - 2) * 3

# Store the current date and time using datetime module
data["date"] = str(datetime.date.today())  # Store the current date
data["time"] = str(datetime.datetime.utcnow())  # Store the current UTC time

# Store the path of the CSV file used for ratings
data["path_rating"] = str(path_csv)

# Convert the 'data' dictionary to a JSON-formatted string with indentation
json_data = json.dumps(data, skipkeys=True, indent=4)

with open(output_directory / "data.json", "w") as f:
    f.write(json_data)

In [156]:
# Generate Latex Paragraph 
latex_directory = Path(output_directory) / ".." / "latex"
if not latex_directory.is_dir():
    os.mkdir(latex_directory)


tex_file_=latex_directory / 'paragraph_3.tex'
with open(tex_file_, 'w') as f_:

    df_cat_bis = df_category

    max_df = pd.DataFrame(df_cat_bis.sort_values('percent')[-1:])
    max_cat = max_df.index[0]

    print(f"The checklist category that was most often commented upon was '{max_cat}' (${max_df.loc[max_cat]['percent']:.0f}\\%$ of reviews $({max_df.loc[max_cat]['number']:.2f}/{nb_papers})$, $95\\% \\text{{CI}} [{max_df.loc[max_cat]['ci low']:.0f}\\%, {max_df.loc[max_cat]['ci high']:.0f}\\%]$).", file=f_)
    
    min_df = df_cat_bis.sort_values('percent')[:1]
    min_cat = min_df.index[0]

    print(f"The lowest frequency was found for the '{min_cat}' category (${min_df.loc[min_cat]['percent']:.0f}\\%$ of reviews $({min_df.loc[min_cat]['number']:.2f}/{nb_papers})$, $95\\% \\text{{CI}} [{min_df.loc[min_cat]['ci low']:.0f}\\%, {min_df.loc[min_cat]['ci high']:.0f}\\%]$).", file=f_)
    
    print(f"Around {df_category.loc['Error bars or statistical significance']['percent']:.0f}\\% of reviews commented upon “Error bars and/or statistical significance”.", file=f_)
    print("", file=f_)

    print(f"${statement_latex[0]}\\%$ of reviews provided a statement ({statement_latex[1]}/{statement_latex[2]}, $95\\% \\text{{CI }}$ $[{statement_latex[3]:.0f}\\%, {statement_latex[4]:.0f}\\%]$) and ", file=f_)
    print(f"${comments_latex[0]}\\%$ came with a comments ({comments_latex[1]}/{comments_latex[2]}, $95\\% \\text{{CI }}$ $[{comments_latex[3]:.0f}\\%, {comments_latex[4]:.0f}\\%]$)\\footnote{{'Unusable' statements and comments are not taken into account.}}. ", file=f_)
    print(f"Of note, ${stat_com_latex[0]}\\%$ of reviewers which had made a positive statement regarding the reproducibility (indicating that they found that the reproducibility of the paper was overall satisfactory) provided no comments to substantiate their statement ({stat_com_latex[1]}/{stat_com_latex[2]}, $95\\% \\text{{CI }}$ $[{stat_com_latex[3]:.0f}\\%, {stat_com_latex[4]:.0f}\\%]$).", file=f_)
    print("", file=f_)

    print(f"Importantly, there was no agreement between reviewers with respective Fleiss'$\kappa$ values of ${df_kappas.loc['Statements']['kappa']}$ ($95\\% \\text{{CI }} [{df_kappas.loc['Statements']['ci low']:.0f}\\%, {df_kappas.loc['Statements']['ci high']:.0f}\\%]$) for statement ", file=f_)
    print(f"and ${df_kappas.loc['Meta-categories']['kappa']:.0f}$ ($95\\% \\text{{CI }} [{df_kappas.loc['Meta-categories']['ci low']:.0f}\\%, {df_kappas.loc['Meta-categories']['ci high']:.0f}\\%]$) for meta-category.", file=f_)
    print("", file=f_)

    print(f"For {code_avail_list[0]:.0f}\\% of papers, at least one of the reviewers said that the code was or will be available ({code_avail_list[1]}/{code_avail_list[2]}, $95\\%\\text{{CI }} [{code_avail_list[3]:.0f}\\%, {code_avail_list[4]:.0f}\\%]$). ", file=f_)
    print(f"However, for {code_avail_false_list[0]:.0f}\\% of these,  the code was actually missing in the published version (no link, broken link or empty repository) ({code_avail_false_list[1]}/{code_avail_false_list[2]}, $95\\%\\text{{CI }} [{code_avail_false_list[3]:.0f}\\%, {code_avail_false_list[4]:.0f}\\%]$). ", file=f_)
    print("", file=f_)
    
    print(f"Finally, {code_link_list[0]:.0f}\\% of published papers provided an associated repository for the code ({code_link_list[1]}/{code_link_list[2]}, $95\\%\\text{{CI }} [{code_link_list[3]:.0f}\\%, {code_link_list[4]:.0f}\\%]$). ", file=f_)
    print(f"However, for {code_link_binary_list[0]:.0f}\\% of these, the link was broken or the repository was empty ({code_link_binary_list[1]}/{code_link_binary_list[2]}, $95\\%\\text{{CI }} [{code_link_binary_list[3]:.0f}\\%, {code_link_binary_list[4]:.0f}\\%]$). ", file=f_)


print(f"Wrote tex file {tex_file_}")

Wrote tex file ../miccai2023/stats_rating/../latex/paragraph_3.tex
