# Inter-Rater Agreement Analysis

In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np
from math import sqrt
import random
from pathlib import Path
import os
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
import datetime
import json 

In [14]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from sklearn.metrics import cohen_kappa_score
from dataclasses import dataclass

from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

In [2]:
def get_review_tuple(path_review_1: Path, path_review_2: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the two reviewers' rating files as DataFrames.

    Both files are parsed as tab-separated text without a header row, so the
    columns are labelled with integers. Missing cells are replaced by the
    string '0'.

    Parameters
    ----------
    path_review_1: Path
        Rating file of the first reviewer.
    path_review_2: Path
        Rating file of the second reviewer.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame] :
        The ratings of the first and of the second reviewer, in that order.
    """
    def _load(path: Path) -> pd.DataFrame:
        # Header-less, tab-separated file; empty cells become the string '0'.
        frame = pd.read_csv(path, sep="\t", index_col=False, header=None)
        return frame.fillna('0')

    return _load(path_review_1), _load(path_review_2)

In [3]:
# Paths to the two reviewers' rating files.
# NOTE(review): the files carry a .csv extension but get_review_tuple parses
# them as tab-separated text.
path1_tsv = Path("../human_rating/rating_90/rating_90_O.csv")
path2_tsv = Path("../human_rating/rating_90/rating_90_E.csv")
# Tuple of two DataFrames: ratings read from path1_tsv, then from path2_tsv.
df_ratings = get_review_tuple(path1_tsv, path2_tsv)

In [4]:
# Categories for which inter-rater statistics are computed.
# The order matters: the position of a category in this list is added to the
# column offset when the per-category ratings are read from the rating sheets.
CATEGORIES = [
    "Models and algorithms",
    "Datasets",
    "Code",
    "Experimental results",
    "Error bars or statistical significance",
    "Code is or will be available",
    "Statement",
    "Comments",
]

# Number of review column groups read per rating sheet (used as range(N_REVIEWS)).
N_REVIEWS = 3

In [5]:
def initialize_dataframe(list_stats: list[str]) -> pd.DataFrame:
    """Build the empty results table.

    Parameters
    ----------
    list_stats: list[str]
        Names of the statistics, used as column labels.

    Returns
    -------
    pd.DataFrame :
        An empty DataFrame with one row per entry of CATEGORIES plus the rows
        "Meta-categories" and "Repo provided", and one column per statistic.
    """
    row_labels = pd.Index(CATEGORIES + ["Meta-categories", "Repo provided"])
    column_labels = pd.Index(list_stats)
    return pd.DataFrame(index=row_labels, columns=column_labels)

# Results table filled in by the cells below: one row per item, one column per statistic.
df_final = initialize_dataframe(["kappa score", "ci low", "ci high", "se"])

In [6]:
def create_confusion_matrix(list_1: list, list_2: list) -> pd.DataFrame:
    """Create a normalized confusion matrix comparing two lists of attributes.

    Parameters
    ----------
    list_1: list
        Attributes given by the first rater (matrix rows).
    list_2: list
        Attributes given by the second rater (matrix columns).

    Returns
    -------
    pd.DataFrame :
        A (k + 1) x (k + 1) matrix of proportions, where k is the number of
        distinct attributes found in either list (sorted order). Cell (i, j)
        is the proportion of subjects rated with attribute i in list_1 and
        attribute j in list_2. The last column holds the row totals, the last
        row holds the column totals and the bottom-right cell holds the
        overall total. Rows and columns are labelled with integer positions.

    Raises
    ------
    ValueError
        If list_1 and list_2 do not have the same length.

    Notes
    -----
    - Attributes must be hashable and mutually sortable.
    - Every cell is divided by the number of ratings, len(list_1).

    Example
    -------
    - create_confusion_matrix(['A', 'B', 'A', 'C'], ['B', 'B', 'A', 'C'])
    """
    if len(list_1) != len(list_2):
        raise ValueError("Reviewer 1 and 2 may not have rated the same list of subjects.")

    list_attributes = sorted(set(list_1) | set(list_2))
    size = len(list_attributes)
    position = {attribute: rank for rank, attribute in enumerate(list_attributes)}

    # Count each (rater 1, rater 2) pair in a single pass over the ratings.
    counts = np.zeros((size + 1, size + 1))
    for att_1, att_2 in zip(list_1, list_2):
        counts[position[att_1], position[att_2]] += 1

    # Margins: column totals in the last row, row totals in the last column,
    # and the overall total in the bottom-right corner.
    counts[size, :size] = counts[:size, :size].sum(axis=0)
    counts[:size, size] = counts[:size, :size].sum(axis=1)
    counts[size, size] = counts[:size, size].sum()

    # Normalize by the total number of ratings to obtain proportions.
    return pd.DataFrame(counts) / len(list_1)

In [7]:
def compute_kappa_score(observed_proportion: float, expected_proportion: float) -> float:
    """Return Cohen's kappa from the observed and the chance-expected agreement.

    The score is the agreement achieved beyond chance divided by the maximum
    agreement achievable beyond chance.
    """
    beyond_chance = observed_proportion - expected_proportion
    max_beyond_chance = 1 - expected_proportion
    return beyond_chance / max_beyond_chance

In [8]:
def expected_proportion(matrix: pd.DataFrame) -> float:
    """Compute the proportion of agreement expected by chance.

    Parameters
    ----------
    matrix: pd.DataFrame
        Normalized confusion matrix whose last row and last column hold the
        marginal totals, labelled with integer positions.

    Returns
    -------
    The sum, over all categories, of the product of the category's row total
    and column total.

    Example
    -------
    - expected_proportion(create_confusion_matrix(['A', 'B', 'A', 'C'], ['B', 'B', 'A', 'C']))
    """
    # Position of the margin row/column, which is also the number of categories.
    margin = len(matrix) - 1
    return sum(
        matrix.loc[category, margin] * matrix.loc[margin, category]
        for category in range(margin)
    )


def observed_proportion(matrix: pd.DataFrame) -> float:
    """Compute the proportion of observed agreement.

    Parameters
    ----------
    matrix: pd.DataFrame
        Normalized confusion matrix whose last row and last column hold the
        marginal totals, labelled with integer positions.

    Returns
    -------
    The sum of the diagonal cells, excluding the bottom-right total.

    Example
    -------
    - observed_proportion(create_confusion_matrix(['A', 'B', 'A', 'C'], ['B', 'B', 'A', 'C']))
    """
    n_categories = len(matrix) - 1
    return sum(matrix.loc[category, category] for category in range(n_categories))


In [9]:
@dataclass
class KappaStats:
    """Summary of the distribution of a kappa score over bootstrap samples.

    Attributes
    ----------
    mean : float
        Mean of the kappa values across the bootstrap samples.
    std : float
        Standard deviation of the kappa values across the bootstrap samples,
        used as the standard error of kappa.
    low_ci : float
        Lower bound of the 95% confidence interval (2.5th percentile).
    high_ci : float
        Upper bound of the 95% confidence interval (97.5th percentile).
    """
    mean: float
    std: float
    low_ci: float
    high_ci: float

    @property
    def ci(self) -> tuple[float, float]:
        """Return the confidence interval as a (low_ci, high_ci) pair."""
        return (self.low_ci, self.high_ci)

    @classmethod
    def from_weighted_kappas(cls, weighted_kappas: list[float]):
        """Summarize a list of kappa values, one per bootstrap sample."""
        lower, upper = (np.percentile(weighted_kappas, q) for q in (2.5, 97.5))
        return cls(
            mean=np.mean(weighted_kappas),
            std=np.std(weighted_kappas),
            low_ci=lower,
            high_ci=upper,
        )


def bootstrap_cohen_quadratic_kappa_score(
    y_true, y_pred, quad: bool = False, num_resamples: int = 1000, seed=None,
) -> KappaStats:
    """Bootstrap the distribution of Cohen's kappa between two raters.

    The rated subjects are resampled with replacement (keeping each subject's
    pair of ratings together) and Cohen's kappa is recomputed on every
    resample.

    Parameters
    ----------
    y_true: array-like
        Labels given by the first rater.
    y_pred: array-like
        Labels given by the second rater, in the same subject order.
    quad: bool, optional (default=False)
        If True, use quadratic weighting; otherwise compute the unweighted
        Cohen's kappa (no weights are passed to cohen_kappa_score).
    num_resamples: int, optional (default=1000)
        Number of bootstrap resamples.
    seed: int, optional (default=None)
        If given, the resampling uses a dedicated random.Random(seed) so the
        result is reproducible. If None, the module-level `random` generator
        is used, exactly as before this parameter existed.

    Returns
    -------
    KappaStats :
        The Kappa statistics across bootstrap samples.

    Notes
    -----
    Labels are converted to strings before scoring.
    NOTE(review): with quad=True the weights therefore depend on how
    cohen_kappa_score orders string labels - confirm this matches the
    intended ordinal order.
    """
    # Pair the ratings row-wise: pairs[i] holds (y_true[i], y_pred[i]).
    pairs = np.array([y_true, y_pred]).T
    n_subjects = len(pairs)

    weights = "quadratic" if quad else None
    rng = random if seed is None else random.Random(seed)

    weighted_kappas = []
    for _ in range(num_resamples):
        # Draw subject positions with replacement, then gather their rating pairs.
        drawn = rng.choices(range(n_subjects), k=n_subjects)
        resample = pairs[drawn].astype(str)
        weighted_kappas.append(
            cohen_kappa_score(resample[:, 0], resample[:, 1], weights=weights)
        )

    # Mean, standard error and confidence interval of the kappa scores
    return KappaStats.from_weighted_kappas(weighted_kappas)

In [10]:
@dataclass
class KappaScore:
    """A kappa score together with the statistics of its bootstrap distribution."""
    # Kappa value of the item.
    score: float
    # Mean, standard error and confidence-interval bounds attached to the score.
    stats: KappaStats

def write_stat(df_final: pd.DataFrame, category: str, kappa_score: KappaScore):
    """Store a kappa score and its statistics in the row `category` of `df_final`.

    The DataFrame is modified in place: the columns "kappa score", "ci low",
    "ci high" and "se" receive the score, the lower and upper confidence
    bounds and the standard error respectively.
    """
    stats = kappa_score.stats
    values_by_column = {
        "kappa score": kappa_score.score,
        "ci low": stats.low_ci,
        "ci high": stats.high_ci,
        "se": stats.std,
    }
    for column, value in values_by_column.items():
        df_final.loc[category, column] = value

def compute_kappa_score_from_lists(lists: tuple[list, list]) -> KappaScore:
    """Compute Cohen's kappa and its bootstrap statistics for two raters.

    Parameters
    ----------
    lists: tuple[list, list]
        The ratings of the first and of the second rater, in the same order.

    Returns
    -------
    KappaScore :
        The kappa computed from the confusion matrix of the two lists, with
        statistics estimated by bootstrap. When kappa equals 1 no bootstrap is
        run and the statistics are fixed to mean 1, standard error 0 and a
        confidence interval of (1, 1).
    """
    ratings_1, ratings_2 = lists[0], lists[1]

    # Observed and chance-expected agreement both come from the confusion matrix
    matrix = create_confusion_matrix(list_1=ratings_1, list_2=ratings_2)
    p_observed = observed_proportion(matrix)
    p_expected = expected_proportion(matrix)
    kappa = compute_kappa_score(p_observed, p_expected)

    # Perfect agreement: use fixed statistics instead of resampling
    if kappa == 1:
        return KappaScore(kappa, KappaStats(1.0, 0.0, 1.0, 1.0))

    bootstrap_stats = bootstrap_cohen_quadratic_kappa_score(y_true=ratings_1, y_pred=ratings_2)
    return KappaScore(kappa, bootstrap_stats)

In [16]:
# For every category: pool both raters' ratings over all reviews, compute
# Cohen's kappa with its bootstrap statistics, store them in df_final and print
# them next to two sanity-check values (sklearn's Cohen kappa and Fleiss' kappa).
for idx_category, category in enumerate(CATEGORIES):    
    # One list of pooled ratings per rater: all_reviews[0] is filled from
    # df_ratings[0] and all_reviews[1] from df_ratings[1].
    all_reviews = ([], [])
    
    # Iterate over the N_REVIEWS reviews for the current category
    for idx_review in range(N_REVIEWS):
        # Column of the current category within the current review: reviews are
        # 9 columns apart and the first category sits at column 3.
        # NOTE(review): layout taken from this formula only - confirm against the rating sheets.
        column_id = idx_review * 9 + 3 + idx_category

        for j, df_rating in enumerate(df_ratings):
            # Ratings are read from row label 2 downwards.
            all_reviews[j].extend(df_rating.loc[2:, column_id].values.tolist())

    # column_id still holds the column of the last review here; the cell at row
    # label 1 of that column is printed as the item name.
    print(f"For \'{df_ratings[0].loc[1, column_id]}\' item (over {len(all_reviews[0])} reviews):")
    
    kappa_score = compute_kappa_score_from_lists(all_reviews)
    write_stat(df_final, category, kappa_score)

    # Print the results
    print(f"Cohen's kappa = {kappa_score.score}")
    print(f"Standard error (bootstrap) = {kappa_score.stats.std}")
    print(f"CI bootstrap = {kappa_score.stats.ci}")

    # ######## For sanity check: Cohen's kappa as computed by scikit-learn
    kappa_sklearn = cohen_kappa_score(*all_reviews)
    print(f"kappa cohen sklearn = {kappa_sklearn}")
    # NOTE(review): df_final has flat string columns, so confirm how pandas
    # interprets this tuple key (the table printed at the end of the notebook
    # shows no extra column).
    df_final.loc[category, ("kappa score", "sklearn")] = kappa_sklearn

    # ######## For sanity check (disabled; kappa_btp is not defined in this cell)
    # print(f"kappa cohen bootstrap = {kappa_btp}")

    # ######## For sanity check: Fleiss' kappa from statsmodels.
    # The ratings are arranged with one row per rated item and one column per
    # rater before being aggregated into per-category counts.
    data = [all_reviews[0], all_reviews[1]]
    data_T = np.array(data).T
    data_fleiss_ = aggregate_raters(data_T)
    kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
    # NOTE(review): same tuple-key caveat as for the sklearn value.
    df_final.loc[category, ("kappa score", "fleiss")] = kappa_fleiss_
    print(f"kappa fleiss statsmodels = {kappa_fleiss_}")

    # ######## For sanity check (disabled; sd_cohen_ and all_reviews_1 are not defined in this cell)
    # print(f"standard error (cohen) = {sd_cohen_ / sqrt(len(all_reviews_1))}")
    
    # ######## For sanity check (disabled: parametric confidence interval from
    # Cohen's standard error; po_, pe_ and kappa_ are not defined in this cell)
    # sd_cohen_ = sd_cohen(po_, pe_)
    # se_cohen = sd_cohen_ / sqrt(len(all_reviews_1))
    # low_parametric_cohen=-1.96 * se_cohen + kappa_
    # high_parametric_cohen=1.96 * se_cohen + kappa_
    # write_stat(df_final, category, "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen )

    # print(f"CI parametric from Cohen's SE = [{low_parametric_cohen}, {high_parametric_cohen}]")
    print("**************************************************")
 
    

For 'Models and algorithms' item (over 270 reviews):
Cohen's kappa = 0.7544080604534007
Standard error (bootstrap) = 0.04414755323717636
CI bootstrap = (0.6677945752385603, 0.8412028086758719)
kappa cohen sklearn = 0.7544080604534005
kappa fleiss statsmodels = 0.7537878787878788
**************************************************
For 'Datasets' item (over 270 reviews):
Cohen's kappa = 0.9085872576177285
Standard error (bootstrap) = 0.02745110370119088
CI bootstrap = (0.8539060089062436, 0.9579839444859959)
kappa cohen sklearn = 0.9085872576177285
kappa fleiss statsmodels = 0.9085858508133396
**************************************************
For 'Code' item (over 270 reviews):
Cohen's kappa = 0.9107142857142856
Standard error (bootstrap) = 0.024752865333389745
CI bootstrap = (0.8647837903095287, 0.9553129951994158)
kappa cohen sklearn = 0.9107142857142857
kappa fleiss statsmodels = 0.9107142857142858
**************************************************
For 'Experimental results' item (ove

## Function definitions

In [5]:
def sd_cohen(observed_proportion: float, expected_proportion: float) -> float:
    """Return Cohen's standard deviation of kappa.

    Computed as sqrt(po * (1 - po) / (1 - pe)^2), where po is the observed
    agreement and pe the agreement expected by chance.
    """
    chance_gap = 1 - expected_proportion
    variance = (observed_proportion * (1 - observed_proportion)) / (chance_gap * chance_gap)
    return sqrt(variance)

In [17]:
# Pooled meta-category ratings, one list per rater (same order as df_ratings)
list_meta = ([], [])

# Iterate over the N_REVIEWS reviews for meta-categories
for idx_review in range(N_REVIEWS):
    # The meta-category columns are consecutive and start at column 29
    # (columns 29, 30 and 31 with N_REVIEWS = 3).
    column_id = idx_review + 29
    
    for j, df_rating in enumerate(df_ratings):
        # Ratings are read from row label 2 downwards.
        list_meta[j].extend(df_rating.loc[2:, column_id].values.tolist())

# Count the occurrences of "Unusable (meta)" for each rater.
# `test` is only referenced by the commented-out sanity checks below.
test = tuple(l.count("Unusable (meta)") for l in list_meta)

# Cohen's kappa between the two raters, with bootstrap statistics
kappa_score = compute_kappa_score_from_lists(list_meta)

# Write the calculated statistics to the DataFrame for meta-categories
write_stat(df_final, "Meta-categories", kappa_score)

# Print the results for meta-categories
print(f"For \'Meta-category\' item (over {len(list_meta[0])} reviews):")
print(f"Cohen's kappa = {kappa_score.score}")
print(f"Standard error (bootstrap) = {kappa_score.stats.std}")
print(f"CI bootstrap = {kappa_score.stats.ci}")
print("**************************************************")

# ######## For sanity check
# print(f"We can count {test} reviews unusable for the first rater and {test2} reviews unusable for the second.")

# ######## For sanity check
# kappa_sklearn = cohen_kappa_score(all_reviews_1, all_reviews_2)
# print(f"kappa cohen sklearn = {kappa_sklearn}")
# kappa_sklearn = cohen_kappa_score(all_reviews_1, all_reviews_2)
# df_final.loc[list_categories[category], ("kappa score", "sklearn")]=kappa_sklearn

# ######## For sanity check
# print(f"kappa cohen bootstrap = {kappa_btp}")

# ######## For sanity check
# data = [all_reviews_1, all_reviews_2]
# data_T = np.array(data).T
# data_fleiss_ = aggregate_raters(data_T)
# kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
# df_final.loc[list_categories[category], ("kappa score", "fleiss")]=kappa_fleiss_
# print(f"kappa fleiss statsmodels = {kappa_fleiss_}")

# ######## For sanity check
# sd_cohen_ = sd_cohen(po_, pe_)
# print(f"standard error (cohen) = {sd_cohen_ / sqrt(len(list_meta_1))}")

# ######## For sanity check
# se_cohen = sd_cohen_ / sqrt(len(list_meta_1))
# low_parametric_cohen=-1.96 * se_cohen + kappa_
# high_parametric_cohen=1.96 * se_cohen + kappa_
# write_stat(df_final, list_categories[category], "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen )

# print(f"CI parametric from Cohen's SE = [{low_parametric_cohen}, {high_parametric_cohen}]")

For 'Meta-category' item (over 270 reviews):
Cohen's kappa = 0.8014027898179524
Standard error (bootstrap) = 0.035861760223139603
CI bootstrap = (0.728436860376502, 0.8680258232176784)
**************************************************


### Add repo provided review

In [18]:
# "Repo provided" ratings of each rater: column 39, from row label 2 downwards.
# This column is read once per sheet, not once per review.
list_repo = tuple(df.loc[2:, 39].values.tolist() for df in df_ratings)

# Cohen's kappa between the two raters, with bootstrap statistics
kappa_score = compute_kappa_score_from_lists(list_repo)

# Write statistical results to the "Repo provided" row of the results table
write_stat(df_final, "Repo provided", kappa_score)

# Print results for 'Repo provided' item
print(f"For 'Repo provided' item (over {len(list_repo[0])} reviews):")
print(f"Cohen's kappa = {kappa_score.score}")
print(f"Standard error (bootstrap) = {kappa_score.stats.std}")
print(f"CI bootstrap = {kappa_score.stats.ci}")
print("**************************************************")


######### For sanity check
# print(f"We can count {test} reviews unusable for the first rater and {test2} reviews unusable for the second.")

######### For sanity check
# print(f"kappa cohen sklearn = {kappa_sklearn}")
# kappa_sklearn = cohen_kappa_score(all_reviews_1, all_reviews_2)
# df_final.loc[list_categories[category], ("kappa score", "sklearn")]=kappa_sklearn

######### For sanity check
# print(f"kappa cohen bootstrap = {kappa_btp}")

######### For sanity check
# data = [all_reviews_1, all_reviews_2]
# data_T = np.array(data).T
# data_fleiss_ = aggregate_raters(data_T)
# kappa_fleiss_ = fleiss_kappa(data_fleiss_[0])
# df_final.loc[list_categories[category], ("kappa score", "fleiss")]=kappa_fleiss_
# print(f"kappa fleiss statsmodels = {kappa_fleiss_}")

######### For sanity check
# print(f"standard error (cohen) = {sd_cohen_ / sqrt(len(list_repo_1))}")

######### For sanity check
# sd_cohen_ = sd_cohen(po_, pe_)
# se_cohen = sd_cohen_ / sqrt(N)
# low_parametric_cohen=-1.96 * se_cohen + kappa_
# high_parametric_cohen=1.96 * se_cohen + kappa_
#write_stat(df_final, list_categories[category], "cohen", kappa_, -1.96 * se_cohen + kappa_, 1.96 * se_cohen + kappa_, se_cohen )

#print(f"CI parametric from Cohen's SE = [{low_parametric_cohen}, {high_parametric_cohen}]")


For 'Repo provided' item (over 90 reviews):
Cohen's kappa = 1.0
Standard error (bootstrap) = 0.0
CI bootstrap = (1.0, 1.0)
**************************************************


### Save final dataframe to a CSV file

In [15]:
# Directory that receives the inter-rater statistics
output_directory = Path("../miccai2023/stats_inter_rater")

# Create the directory together with any missing parent directories;
# nothing happens if it already exists.
output_directory.mkdir(parents=True, exist_ok=True)

# Save the final dataframe as tab-separated text (the file keeps its .csv name)
df_final.to_csv(output_directory / 'inter_rater_stats.csv', index=True, sep="\t", encoding='utf-8')

In [16]:
# Metadata describing this analysis run
data = {}

# `all_reviews` is left over from the per-category loop: it holds the pooled
# ratings of the last category, i.e. N_REVIEWS ratings per paper for each rater.
# Integer division keeps the number of papers an integer in the JSON output.
data["nb_papers"] = len(all_reviews[0]) // N_REVIEWS

# Store the total number of reviews
data["nb_reviews"] = len(all_reviews[0])

# Store the current date and time.
# The time is timezone-aware UTC (datetime.utcnow() is deprecated and naive).
# NOTE(review): the date is the local date while the time is UTC.
data["date"] = str(datetime.date.today())
data["time"] = str(datetime.datetime.now(datetime.timezone.utc))

# Store the file paths of the input data
data["data_path1"] = str(path1_tsv)
data["data_path2"] = str(path2_tsv)

# Convert the dictionary to a JSON-formatted string with indentation for readability
json_data = json.dumps(data, skipkeys=True, indent=4)

# Write the JSON data next to the statistics table
with open(output_directory / "data.json", "w", encoding="utf-8") as f:
    f.write(json_data)

In [19]:
# tabulate is a third-party package, used here only to pretty-print the results
from tabulate import tabulate
# Print df_final as a psql-style table, row labels included, floats with two decimals
print(tabulate(df_final, headers='keys', tablefmt='psql', floatfmt=".2f"))

+----------------------------------------+---------------+----------+-----------+------+
|                                        |   kappa score |   ci low |   ci high |   se |
|----------------------------------------+---------------+----------+-----------+------|
| Models and algorithms                  |          0.75 |     0.66 |      0.84 | 0.05 |
| Datasets                               |          0.91 |     0.85 |      0.96 | 0.03 |
| Code                                   |          0.91 |     0.86 |      0.96 | 0.03 |
| Experimental results                   |          0.86 |     0.79 |      0.93 | 0.04 |
| Error bars or statistical significance |          1.00 |     1.00 |      1.00 | 0.00 |
| Code is or will be available           |          0.89 |     0.81 |      0.95 | 0.04 |
| Statement                              |          0.74 |     0.67 |      0.81 | 0.04 |
| Comments                               |          0.82 |     0.76 |      0.87 | 0.03 |
| Meta-categories    