In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Apply matplotlib's built-in 'ggplot' style sheet to every figure created below.
plt.style.use('ggplot')

def contrast_specific_pwd_violin(dataframe, methods, contrasts, ref_contrast="t2w"):
    """Plots one violin chart per contrast, showing for each method the
    Pair-Wise Difference (PWD) between that contrast's CSA and the
    reference contrast's CSA.

    The figure is a two-column grid holding one subplot per entry of
    ``contrasts``. When the number of contrasts is odd, the unused last
    cell of the grid is hidden.

    Args:
        dataframe (pandas.DataFrame): contains PWD values for each contrast
            across patients, in columns named ``[method]_[contrast]``.
        methods (list): list of method names. Their order drives the
            colours: the first two entries are drawn as benchmarks, the
            following ones alternate between "Single GT" (even position)
            and "Mean GT" (odd position).
        contrasts (list): contrast names excluding the reference contrast.
        ref_contrast (str): Reference contrast (only used in the title and
            the y-axis label).

    Raises:
        ValueError: if ``contrasts`` is empty (there is nothing to plot).
    """
    if len(contrasts) == 0:
        # plt.subplots(nrows=0) would fail with a much less helpful message.
        raise ValueError("contrasts must contain at least one contrast name")

    bench_col = "#989e9a"
    cols_cat = ["#ff6767", "#8edba3"]  # [Single GT, Mean GT]

    # Two plots per row; an odd number of contrasts needs one extra row.
    quotient, remainder = divmod(len(contrasts), 2)
    nrows = quotient + remainder

    _, axs = plt.subplots(ncols=2, nrows=nrows, figsize=(12, nrows * 8))
    flat_axs = axs.reshape(-1)
    if remainder == 1:
        # Hide the empty trailing cell of the grid.
        flat_axs[-1].set_axis_off()

    # The legend content is identical for every subplot: build it only once.
    legend_handles = [
        mpatches.Patch(color=bench_col, label='Benchmark'),
        mpatches.Patch(color=cols_cat[0], label='Single GT'),
        mpatches.Patch(color=cols_cat[1], label='Mean GT'),
    ]

    for contrast, ax in zip(contrasts, flat_axs):
        models_contrast = [method + "_" + contrast for method in methods]
        # First two methods are benchmarks, the rest alternate by parity.
        cols_dic = {name: bench_col if k < 2 else cols_cat[k % 2]
                    for k, name in enumerate(models_contrast)}
        sns.violinplot(data=dataframe[models_contrast],
                       ax=ax,
                       inner="box",
                       palette=cols_dic)
        ax.set_title(
            f"{contrast} CSA % difference across methods w.r.t to {ref_contrast}")
        ax.xaxis.set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticks(np.arange(0, len(models_contrast)), models_contrast,
                      rotation=45, ha='right')
        ax.set_xlabel('Methods')
        ax.set_ylabel(f'% Difference in CSA values w.r.t {ref_contrast}')
        # Make the y-axis symmetric around 0 so the sign of the bias is
        # directly readable.
        yabs_max = abs(max(ax.get_ylim(), key=abs))
        ax.set_ylim(ymin=-yabs_max, ymax=yabs_max)
        ax.legend(title="Method type", handles=legend_handles)

    plt.tight_layout()
    
def macro_pwd_violin(df, methods, ref_contrast="t2w"):
    """Draws a single violin chart summarising, for every method, its
    overall performance across contrasts. Performance here is the CSA
    pair-wise difference (PWD) with respect to a reference contrast.

    Args:
        df (pandas.Dataframe): contains PWD values for each contrast
            across patients.
        methods (list): performance column names, one per method,
            formatted as [method_name]_perf_[measure_type]. The first two
            are coloured as benchmarks; the others alternate between the
            "Single GT" colour (even position) and the "Mean GT" colour
            (odd position).
        ref_contrast (str): Reference contrast (used in the y-axis label)
    """
    benchmark_colour = "#989e9a"
    alternating_colours = ("#ff6767", "#8edba3")

    # Map every plotted column to its colour.
    palette = {}
    for position, column in enumerate(methods):
        if position < 2:
            palette[column] = benchmark_colour
        else:
            palette[column] = alternating_colours[position % 2]

    _, axis = plt.subplots(ncols=1, nrows=1, figsize=(9, 9))
    sns.violinplot(data=df[methods], ax=axis, inner="box", palette=palette)

    axis.set_title(" % Difference in CSA across all contrasts")
    axis.set_xlabel('Methods')
    axis.set_ylabel(f'CSA % difference w.r.t {ref_contrast}')

    axis.xaxis.set_tick_params(direction='out')
    axis.xaxis.set_ticks_position('bottom')
    tick_positions = np.arange(len(methods))
    axis.set_xticks(tick_positions, methods, rotation=45, ha='right')

    # Symmetric y-range centred on zero.
    half_range = max(abs(bound) for bound in axis.get_ylim())
    axis.set_ylim(ymin=-half_range, ymax=half_range)

    legend_entries = (
        ("#989e9a", 'Benchmark'),
        ("#ff6767", 'Single GT'),
        ("#8edba3", 'Mean GT'),
    )
    axis.legend(
        title="Method type",
        handles=[mpatches.Patch(color=colour, label=text)
                 for colour, text in legend_entries])
    
def macro_sd_violin(df, methods):
    """Draws a single violin chart summarising, for every method, its
    overall performance across contrasts. Performance here is the
    standard deviation of the CSA across contrasts.

    Args:
        df (pandas.Dataframe): contains SD values for each contrast
            across patients.
        methods (list): performance column names, one per method,
            formatted as [method_name]_perf_[measure_type]. The first two
            are coloured as benchmarks; the others alternate between the
            "Single GT" colour (even position) and the "Mean GT" colour
            (odd position).
    """
    benchmark_colour = "#989e9a"
    alternating_colours = ("#ff6767", "#8edba3")

    # Map every plotted column to its colour.
    palette = {}
    for position, column in enumerate(methods):
        if position < 2:
            palette[column] = benchmark_colour
        else:
            palette[column] = alternating_colours[position % 2]

    _, axis = plt.subplots(ncols=1, nrows=1, figsize=(9, 9))
    sns.violinplot(data=df[methods], ax=axis, inner="box", palette=palette)

    axis.set_title("CSA's standard deviation across contrasts for each method")
    axis.set_xlabel('Methods')
    axis.set_ylabel('Standard deviation')

    axis.xaxis.set_tick_params(direction='out')
    axis.xaxis.set_ticks_position('bottom')
    tick_positions = np.arange(len(methods))
    axis.set_xticks(tick_positions, methods, rotation=45, ha='right')
    # The y-limits are deliberately left to matplotlib's autoscaling.

    legend_entries = (
        ("#989e9a", 'Benchmark'),
        ("#ff6767", 'Single GT'),
        ("#8edba3", 'Mean GT'),
    )
    axis.legend(
        title="Method type",
        handles=[mpatches.Patch(color=colour, label=text)
                 for colour, text in legend_entries])

In [None]:
# Creating mock data
n_samples = 15

# Offset added to the standard-normal noise of each method, listed in the
# contrast order (t1w, t2w, t2star). Row order defines the column order.
_mock_offsets = {
    "manual_gt": (73, 74, 76),
    "deepseg": (73, 74, 76),
    "singleGT_hard": (73, 74, 76),
    "singleGT_hard_all": (73, 74, 76),
    "singleGT_soft": (73.3, 74.2, 75.8),
    "singleGT_soft_all": (73.3, 74.2, 75.8),
    "meanGT_hard": (74.4, 74.7, 75.1),
    "meanGT_hard_all": (74.2, 74.8, 75.7),
    "meanGT_soft": (75, 75, 75),
    "meanGT_soft_all": (74.6, 75, 75.4),
}

# One column per (method, contrast) pair, preceded by the patient identifier.
_mock_columns = {"patient_number": np.arange(n_samples)}
_mock_columns.update(
    (f"{_name}_{_contrast}", _offset + np.random.randn(n_samples))
    for _name, _offsets in _mock_offsets.items()
    for _contrast, _offset in zip(("t1w", "t2w", "t2star"), _offsets)
)
df = pd.DataFrame(_mock_columns)

def merge_csv(csv_folder):
    """Placeholder for building the main dataframe from several csv files.

    The expected layout of ``csv_folder`` is one csv per contrast.

    TODO: Implement the merge of all sub-csv files into the main dataframe.

    Raises:
        NotImplementedError: always, since the function is still a stub.
    """
    raise NotImplementedError()
    
    
# Preview the first five rows of the mock dataframe.
df.head(5)

# Dataframe transformations
We create two types of dataframes, one for each of the Pair-Wise Difference (PWD) and Standard Deviation (SD) methods:
* **PWD**: We compute the pair-wise difference between each individual contrast and the reference contrast for each method and for each patient, then store the result in a dedicated column.
* **SD**: We compute the standard deviation of all contrast CSA values for each method and for each patient.

In [None]:
def create_perf_df_pwd(dataframe, methods, contrasts, ref_contrast="t2w", perf_suffix="_perf_pwd"):
    """Creates a copy of the original dataframe containing 2
    transformations:
        1) Transforms raw CSA into the CSA percentage difference with
            ref_contrast: 100 * (ref - contrast) / ref
        2) Creates new columns averaging performance across contrasts
            for each method.
    Args:
        dataframe (pandas.Dataframe): Original dataframe, with one column
            per [method_name]_[contrast]. It is left untouched.
        methods (list): list of method names
        contrasts (list): list of contrast names to compare against the
            reference contrast
        ref_contrast (str): reference contrast
        perf_suffix (str): suffix to use for performance column creation
    Returns:
        df_copy (pandas.Dataframe): Modifies dataframe contrast columns
            with the PWD against the ref_contrast.
            Also, method performance columns are added following this
            format [method_name]_perf_[measure_type]
        macro_perf_pwd_names (list): list of method performance
            column names
    """
    df_copy = dataframe.copy()
    macro_perf_pwd_names = [method + perf_suffix for method in methods]

    for model_prefix in methods:
        agg_cols = [model_prefix + "_" + contrast for contrast in contrasts]
        # Snapshot the reference CSA before any column is overwritten. If
        # ref_contrast is also listed in contrasts, its column becomes 0 and
        # re-reading it from df_copy would make every later contrast divide
        # by zero.
        ref_values = df_copy[model_prefix + "_" + ref_contrast].copy()
        for model_contrast in agg_cols:
            df_copy[model_contrast] = 100 * (ref_values -
                df_copy[model_contrast]) / ref_values
        # Patient-wise mean of the signed differences across contrasts.
        df_copy[model_prefix + perf_suffix] = np.mean(
            [df_copy[col] for col in agg_cols], axis=0)

    return df_copy, macro_perf_pwd_names

def create_perf_df_sd(dataframe, methods, contrasts, perf_suffix="_perf_sd"):
    """Returns a copy of the input dataframe extended with one performance
    column per method. Each performance column holds, for every patient
    (row), the standard deviation of that method's values across contrasts.
    Args:
        dataframe (pandas.Dataframe): Original dataframe, with one column
            per [method_name]_[contrast]. It is left untouched.
        methods (list): list of method names
        contrasts (list): list of all contrasts
        perf_suffix (str): suffix to use for performance column creation
    Returns:
        result (pandas.Dataframe): copy of dataframe with new performance
            columns named [method_name]_perf_[measure_type]
        perf_names (list): list of method performance column names
    """
    result = dataframe.copy()
    perf_names = [name + perf_suffix for name in methods]

    for name, perf_col in zip(methods, perf_names):
        # Stack the method's contrast columns as rows: (n_contrasts, n_patients).
        stacked = np.array(
            [result["_".join((name, contrast))] for contrast in contrasts])
        # Reduce over the contrast axis (numpy default ddof, i.e. ddof=0).
        result[perf_col] = stacked.std(axis=0)

    return result, perf_names

# NOTE: the order of this list matters for the plots. The plotting functions
# colour the first two entries as benchmarks, then alternate by position:
# even index -> "Single GT" colour, odd index -> "Mean GT" colour.
methods = ["manual_gt", "deepseg", "singleGT_hard", "meanGT_hard", "singleGT_soft", 
          "meanGT_soft", "singleGT_hard_all", "meanGT_hard_all", "singleGT_soft_all",
          "meanGT_soft_all"]
# Contrasts compared against the reference contrast (reference excluded).
contrasts = ["t1w", "t2star"]
ref_contrast = "t2w"
# PWD: each listed contrast is expressed relative to the reference contrast.
perf_df_pwd, macro_perf_pwd_names = create_perf_df_pwd(df, methods, contrasts, ref_contrast)
# SD: computed across all contrasts, reference contrast included.
perf_df_sd, macro_perf_sd_names = create_perf_df_sd(df, methods, contrasts+[ref_contrast])

# Preview the first three rows of the SD dataframe.
perf_df_sd.head(3)

# Individual contrast performance against reference contrast
We visualize contrast-specific CSA w.r.t the reference contrast, which is **t2w** here. Pair-wise difference is preferred here to grasp any positive or negative bias of a contrast's CSA against the reference contrast's CSA. 

More specifically, the **SD** method measures the overall dispersion of a patient's CSA values across contrasts, whereas **PWD** provides bias information regarding a contrast's CSA against the reference contrast's CSA.


In [None]:
# Forward the configured reference contrast so that the titles and axis labels
# stay in sync with the data instead of relying on the function's default.
contrast_specific_pwd_violin(perf_df_pwd, methods, contrasts, ref_contrast=ref_contrast)

# Macro performance overview
We want a chart that would allow us to evaluate the performance of a model or group of models (e.g. contrast specific) in CSA robustness. 
In order to do that, we create a violin plot aggregating CSA dispersion for each method.

The chosen approach here is the SD method. We compute a measure of dispersion of a patient's CSA values across contrasts. This measure of dispersion is the standard deviation in this example.  

More specifically, the **Average-PWD** approach could not be used here as it introduces bias: the CSA difference of a contrast w.r.t. the reference contrast can be either negative or positive, leading to potential cancellation effects.

As for the measure of dispersion, another approach would have been to use the Mean Absolute Deviation or RMSE. 

---
The following cell showcases both the Average-PWD and SD methods. Notice the bias introduced by the former.

In [None]:
# Forward the configured reference contrast so that the y-axis label stays in
# sync with the data instead of relying on the function's default.
macro_pwd_violin(perf_df_pwd, methods=macro_perf_pwd_names, ref_contrast=ref_contrast)
macro_sd_violin(perf_df_sd, methods=macro_perf_sd_names)