# Predicting Unstable Software Benchmarks Using Static Source Code Features
## Classification study

The following Python Jupyter Notebook can be used to interactively reproduce the study we performed
in our paper with the title *Predicting Unstable Software Benchmarks Using Static Source Code Features*.

### Initialization

We import the needed Python modules.

In [None]:
%reload_ext autoreload
%autoreload 2

from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import plotnine as p9
import scikit_posthocs as sp
from scipy import stats

from study_conf_labels import *
from study_conf_palettes import *
from study_data_utils import *
from study_plots_utils import *
from study_stats_utils import *
from utils import apply_binary_threshold, approximate_zeros, remove_negative_values

### Configuration

First, we configure some parameters for the script.

`DATA_CSV_FILE_PATHS` maps each number of benchmark iterations to the path of the data `CSV` file analyzed by the notebook.
`METRICS` is the list of metrics considered by the study.

`BASELINE_MODELS` and `COMPARED_MODELS` differentiate, respectively, the list of models used as baseline and those for comparison.

`ITERATIONS` and `THRESHOLDS` represent the values considered for the respective parameters.

In [None]:
# --- Input and output locations ---------------------------------------------
# Data CSV files, keyed by the number of benchmark iterations.
DATA_CSV_FILE_PATHS = {
    5: 'resources/variabilities_5_iterations.csv',
    10: 'resources/variabilities_10_iterations.csv',
    20: 'resources/variabilities_20_iterations.csv',
    30: 'resources/variabilities_30_iterations.csv',
}
CLASSIFICATION_RESULTS_CSV_FILE_PATH = 'resources/classification_results.csv.xz'
PLOTS_OUTPUT_DIRECTORY_PATH = 'resources/output/plots'

# --- Metrics and models -----------------------------------------------------
METRICS = ['precision', 'recall', 'fmeasure', 'auc', 'mcc']

# Models used as baseline.
BASELINE_MODELS = [
    "DummyClassifier(strategy='most_frequent')",
    "DummyClassifier(strategy='prior')",
    "DummyClassifier(strategy='stratified')",
    "DummyClassifier(strategy='uniform')",
]

# Models compared in the study.
COMPARED_MODELS = [
    'GaussianNB()',
    'KNeighborsClassifier()',
    'LogisticRegression()',
    'MLPClassifier()',
    'LinearDiscriminantAnalysis()',
    'DecisionTreeClassifier()',
    "SVC(kernel='linear')",
    "SVC(kernel='rbf')",
    'RandomForestClassifier()',
    'AdaBoostClassifier()',
    'GradientBoostingClassifier()',
]

# --- Preprocessing ----------------------------------------------------------
# The FOCUS_* lists hold the subset of values iterated by the analysis cells.
SELECTORS = ['None', 'AutoSpearmanSelector()']
FOCUS_SELECTORS = ['None']

SAMPLERS = ['None', 'SMOTE()']
FOCUS_SAMPLERS = ['None']

PREPROCESSING_GROUPS = [
    'None/None',
    'None/SMOTE()',
    'AutoSpearmanSelector()/None',
    'AutoSpearmanSelector()/SMOTE()',
]

# --- Study dimensions -------------------------------------------------------
ITERATIONS = [5, 10, 20, 30]
THRESHOLDS = [1, 3, 5, 10]

# --- Cross validation -------------------------------------------------------
CROSS_VALIDATION_FOLDS = 10
CROSS_VALIDATION_REPETITIONS = 30
# Number of folds expected for every experiment (folds times repetitions).
TOTAL_CROSS_VALIDATION_FOLDS = CROSS_VALIDATION_REPETITIONS * CROSS_VALIDATION_FOLDS

# --- Dependent variables ----------------------------------------------------
DEPENDENT_VARIABLES = ['rciw99', 'rciw99mjhd', 'rmadhd']
FOCUS_DEPENDENT_VARIABLES = ['rciw99mjhd']

# --- Statistics -------------------------------------------------------------
# Threshold applied to the p-values of the statistical tests.
SIGNIFICANCE_LEVEL = 0.01

### Study

#### Distributions of stable and unstable values

First of all, we read from the CSV files the data to plot the distributions considering the iterations and used thresholds.

In [None]:
# Open the CSV files, tagging the rows of each one with its number of iterations.
# The frames are collected first and concatenated in a single call: growing
# `iterations_df` with one `pd.concat` per file (starting from an empty
# dataframe) copies all the rows accumulated so far at every step.
iterations_frames = []
for iterations_number in ITERATIONS:
    temp_df = pd.read_csv(DATA_CSV_FILE_PATHS[iterations_number])
    temp_df['iterations'] = iterations_number
    iterations_frames.append(temp_df)
iterations_df = pd.concat(iterations_frames)

# Transform the iterations column to categorical type for easy sorting.
iterations_df['iterations'] = pd.Categorical(iterations_df['iterations'], categories=ITERATIONS)

# Print the dataframe.
display(iterations_df)

# Print some statistics.
print(f"Number of functions: {iterations_df['function'].unique().shape[0]}")

Clean the data.

In [None]:
# Sanitize every dependent variable in turn: the negative values are handled
# first, then the values equal to 0.
for dep_var in DEPENDENT_VARIABLES:
    without_negatives_df = remove_negative_values(iterations_df, dep_var)
    iterations_df = approximate_zeros(without_negatives_df, dep_var)

# Show the cleaned dataframe.
display(iterations_df)

Apply the thresholds to prepare the data for plotting.

In [None]:
# Apply the binarization for all the dependent variables, once per threshold.
thresholds_frames = []

for threshold in THRESHOLDS:
    # Work on an explicit copy: assigning columns to a bare column selection of
    # `iterations_df` is a chained assignment (SettingWithCopyWarning).
    temp_df = iterations_df[['iterations', 'project_name', 'function', *DEPENDENT_VARIABLES]].copy()

    for dep_var in DEPENDENT_VARIABLES:
        temp_df[dep_var] = apply_binary_threshold(temp_df[dep_var], threshold)

    temp_df['threshold'] = threshold
    thresholds_frames.append(temp_df)

# Concatenate once instead of growing an initially empty dataframe step by step.
thresholds_df = pd.concat(thresholds_frames)

# Transform the values into categorical.
thresholds_df['iterations'] = pd.Categorical(thresholds_df['iterations'], categories=ITERATIONS)
thresholds_df['threshold'] = pd.Categorical(thresholds_df['threshold'], categories=THRESHOLDS)

# Convert the dataframe from wide to long.
thresholds_df = pd.melt(thresholds_df, id_vars=['iterations', 'project_name', 'function', 'threshold'], value_vars=DEPENDENT_VARIABLES)
thresholds_df['value'] = pd.Categorical(thresholds_df['value'], categories=[0, 1])

display(thresholds_df)

# `observed=False` is spelled out so that the combinations of categories with
# no rows stay in the table (count 0) regardless of the pandas default.
display(thresholds_df.groupby(['iterations', 'threshold', 'variable', 'value'], observed=False).count())

We create and print the distribution plots.

In [None]:
# Draw one grid of bar plots per focused dependent variable.
for dep_var in FOCUS_DEPENDENT_VARIABLES:
    # Rows of the long dataframe that belong to the current dependent variable.
    dep_var_thresholds_df = thresholds_df.query(f'variable == "{dep_var}"')
    distributions_barplot = plot_distributions_bar_plot_grid(
        dep_var_thresholds_df,
        figure_size=(14, 1.5),
    )
    print(dep_var)
    display(distributions_barplot)

#### Classifiers results

We read the data from the `CSV` file, extract some elements, and compute some basic descriptive statistics.

In [None]:
# Open the CSV.
# NOTE(review): recent pandas versions list the string 'None' among the default
# NA markers of `read_csv`; if the CSV stores selectors/samplers as the text
# 'None', confirm that those rows are not read as NaN and then dropped by the
# filters below.
df = pd.read_csv(CLASSIFICATION_RESULTS_CSV_FILE_PATH)

# Values admitted by the configuration for each filtered column.
configured_values = {
    'model': BASELINE_MODELS + COMPARED_MODELS,
    'iterations': ITERATIONS,
    'threshold': THRESHOLDS,
    'dependent_variable': DEPENDENT_VARIABLES,
    'selector': SELECTORS,
    'sampler': SAMPLERS,
}

# Select according to the given configuration.
for column, values in configured_values.items():
    df = df[df[column].isin(values)]

# Transform some of the columns to categorical type for easy sorting.
for column, values in configured_values.items():
    df[column] = pd.Categorical(df[column], categories=values)

# Create the subdataframes. They are explicit copies: assigning a column to a
# bare slice of `df` is a chained assignment (SettingWithCopyWarning).
baseline_df = df[df['model'].isin(BASELINE_MODELS)].copy()
baseline_df['model'] = pd.Categorical(baseline_df['model'], categories=BASELINE_MODELS)
compared_df = df[df['model'].isin(COMPARED_MODELS)].copy()
compared_df['model'] = pd.Categorical(compared_df['model'], categories=COMPARED_MODELS)

# Print the dataframe.
display(df)

# Print some statistics.
print(f"Number of experiments: {df.shape[0]}")
print(f"Number of folds per combination: {df['fold'].unique().shape[0]}")
print(f"Models: {list(df['model'].unique())}")
print(f"Benchmark iterations: {list(df['iterations'].unique())}")
print(f"Stability thresholds: {list(df['threshold'].unique())}")
print(f"Dependent variables: {list(df['dependent_variable'].unique())}")
print(f"Selectors: {list(df['selector'].unique())}")
print(f"Samplers: {list(df['sampler'].unique())}")

We check whether there are missing experiments.

In [None]:
# Count the folds available for every combination of the experiment dimensions.
experiment_dimensions = ['dependent_variable', 'iterations', 'threshold', 'selector', 'sampler', 'model']
experiments_df = df.groupby(experiment_dimensions, as_index=False)['fold'].count()

# Show, without truncation, the combinations whose count differs from the expected one.
has_unexpected_folds = experiments_df['fold'] != TOTAL_CROSS_VALIDATION_FOLDS
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(experiments_df[has_unexpected_folds])

#### Normality test

We test if the distributions for **all** the combinations (model, iterations, and threshold) are normal.
To do that, we apply the *D’Agostino's K^2 Test*.

> *Null hypothesis*: the observations come from a normal distribution.
>
> `p-value >= 0.01`: accept the null hypothesis, normal
>
> `p-value < 0.01`: reject the null hypothesis, not normal

If not all are normal, i.e., the tests reject the null hypothesis in some cases, then we use non-parametric tests and plot the median.

In [None]:
def _normality_cell_style(pvalue):
    """Return the cell CSS: green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    # Observations of the current dependent variable only.
    dep_var_df = df.query('dependent_variable == @dep_var')

    # Build the table holding the p-value of the normality test of each group.
    normal_test_df = pivot_table_grouping(
        dep_var_df,
        index=['dependent_variable', 'model', 'iterations', 'selector', 'sampler'],
        columns='threshold',
        metrics=METRICS,
        index_sort=[DEPENDENT_VARIABLES, BASELINE_MODELS + COMPARED_MODELS, ITERATIONS, SELECTORS, SAMPLERS],
        columns_sort=[METRICS, THRESHOLDS],
        aggfunc=lambda observations: stats.normaltest(observations).pvalue,
    )

    print(dep_var)

    # Short model names, p-values with four decimal digits, colored background.
    normal_test_styler = normal_test_df.rename(index=MODELS_LABELS).style
    normal_test_styler = normal_test_styler.format('{:.4f}')
    normal_test_styler = normal_test_styler.applymap(_normality_cell_style)
    display(normal_test_styler)

#### Median values comparison

Since there are many non-normal groups of observations, we apply non-parametric tests that compare median values.

We produce a pivot table for comparing the median of each combination of model, iterations, and threshold.

In [None]:
# Pivot table with the median of every metric for each combination.
median_pivot_index = ['dependent_variable', 'model', 'iterations', 'selector', 'sampler']
median_pivot_index_sort = [DEPENDENT_VARIABLES, BASELINE_MODELS + COMPARED_MODELS, ITERATIONS, SELECTORS, SAMPLERS]
median_pivot_df = pivot_table_grouping(
    df,
    index=median_pivot_index,
    columns='threshold',
    metrics=METRICS,
    index_sort=median_pivot_index_sort,
    columns_sort=[METRICS, THRESHOLDS],
    aggfunc=np.median,
)

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    print(dep_var)

    # Keep the rows of the current dependent variable and drop that index level.
    dep_var_median_df = median_pivot_df.query('dependent_variable == @dep_var').droplevel(0)

    # Short model names, four decimal digits, and a bar in the background.
    median_styler = dep_var_median_df.rename(index=MODELS_LABELS).style
    median_styler = median_styler.format('{:.4f}')
    median_styler = median_styler.bar(vmin=0.0, vmax=1.0, color='#5fba7d')
    display(median_styler)

We plot the median values for each of the target metrics.
We vary the models on the columns.
In the rows, we report the different benchmark iterations.
For each model and iterations number, we draw a lineplot with all the stability thresholds.

In [None]:
# One grid of lineplots of the baseline models per focused combination.
for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Observations of the current combination.
            focus_df = df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')
            baseline_models_median_df = median_long_dataframe(focus_df, models=BASELINE_MODELS, metrics=METRICS)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            lineplot_options = dict(
                models_labels=MODELS_LABELS,
                metrics_labels=METRICS_LABELS,
                figure_size=(7, 5),
            )
            baseline_models_median_lineplot = plot_metrics_comparison_lineplot_grid(baseline_models_median_df, **lineplot_options)
            display(baseline_models_median_lineplot)

Plot the compared models grid.

In [None]:
# One grid of lineplots of the compared models per focused combination.
for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Observations of the current combination.
            focus_df = df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')
            compared_models_median_df = median_long_dataframe(focus_df, models=COMPARED_MODELS, metrics=METRICS)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            lineplot_options = dict(
                models_labels=MODELS_LABELS,
                metrics_labels=METRICS_LABELS,
                figure_size=(14, 5),
            )
            compared_models_median_lineplot = plot_metrics_comparison_lineplot_grid(compared_models_median_df, **lineplot_options)
            display(compared_models_median_lineplot)

In the following we inspect the trends of performance from different points of view.
In particular, we investigate considering the three dimensions involved in our study: *thresholds*, *iterations*, and *models*.
In each of the following sections, we fix two of the dimensions and vary the other.

#### Thresholds comparison

For each combination of model and iterations value, we are interested in seeing if there is any significant difference when the threshold value increases.
To do so, we test the metric values with a statistical test on median values with multiple samples of observations.
Namely, we apply the *Kruskal-Wallis Test*, which is the non-parametric version of the *ANOVA* test.

> *Null hypothesis*: the population medians of all the groups are equal.
>
> `p-value >= 0.01`: accept the null hypothesis, equal
>
> `p-value < 0.01`: reject the null hypothesis, not equal

Therefore, we build and display the table with the p-values.

In [None]:
def _kruskal_cell_style(pvalue):
    """Return the cell CSS: green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Kruskal-Wallis test with (model, iterations) as first grouping
            # and the threshold as second grouping.
            thresholds_kruskal_test_df = multiple_groups_test_dataframe(
                focus_df,
                group_1=['model', 'iterations'],
                group_2='threshold',
                metrics=METRICS,
                testfunc=stats.kruskal,
                check_identical=True,
            )

            # Pivot the p-values, then sort models, iterations, and metrics.
            thresholds_kruskal_test_df = thresholds_kruskal_test_df.pivot_table(index=['model', 'iterations'], columns=['metric'], values=['pvalue'])
            thresholds_kruskal_test_df = thresholds_kruskal_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            thresholds_kruskal_test_df = thresholds_kruskal_test_df.reindex(ITERATIONS, level=1)
            thresholds_kruskal_test_df = thresholds_kruskal_test_df.reindex(METRICS, axis=1, level=1)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, four decimal digits, colored background.
            kruskal_styler = thresholds_kruskal_test_df.rename(index=MODELS_LABELS).style
            kruskal_styler = kruskal_styler.format('{:.4f}')
            kruskal_styler = kruskal_styler.applymap(_kruskal_cell_style)
            display(kruskal_styler)

Once we confirm that the median values of the groups are statistically different, we can apply a pairwise post-hoc test to understand where this is valid. We apply the *Dunn's Test*.

> *Null hypothesis*: there is no difference between the two compared groups.
>
> `p-value >= 0.01`: accept the null hypothesis, equal
>
> `p-value < 0.01`: reject the null hypothesis, not equal

In [None]:
def _dunn_cell_style(pvalue):
    """Return the cell CSS: none for negative values, green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < 0:
        return ''
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


# Pivoted tables of the post-hoc test, keyed by (dep_var, selector, sampler).
thresholds_dunn_test_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise post-hoc test (Dunn) between the thresholds.
            thresholds_dunn_test_df = pairwise_multiple_groups_posthoc_test_dataframe(
                focus_df,
                group_1=['model', 'iterations'],
                group_2='threshold',
                metrics=METRICS,
                testfunc=sp.posthoc_dunn,
            )

            # Make both sides of the comparison categorical over the thresholds.
            for column in ('threshold', 'comparison'):
                thresholds_dunn_test_df[column] = pd.Categorical(thresholds_dunn_test_df[column], categories=THRESHOLDS)

            # Pivot the p-values, then sort rows (models, iterations, thresholds)
            # and columns (metrics, compared thresholds).
            thresholds_dunn_test_df = thresholds_dunn_test_df.pivot_table(index=['model', 'iterations', 'threshold'], columns=['metric', 'comparison'], values=['pvalue'])
            thresholds_dunn_test_df = thresholds_dunn_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            thresholds_dunn_test_df = thresholds_dunn_test_df.reindex(ITERATIONS, level=1)
            thresholds_dunn_test_df = thresholds_dunn_test_df.reindex(THRESHOLDS, level=2)
            thresholds_dunn_test_df = thresholds_dunn_test_df.reindex(METRICS, axis=1, level=1)
            thresholds_dunn_test_df = thresholds_dunn_test_df.reindex(THRESHOLDS, axis=1, level=2)

            # Keep the table for later use.
            thresholds_dunn_test_dfs[(dep_var, selector, sampler)] = thresholds_dunn_test_df

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, four decimal digits, colored background.
            dunn_styler = thresholds_dunn_test_df.rename(index=MODELS_LABELS).style
            dunn_styler = dunn_styler.format('{:.4f}')
            dunn_styler = dunn_styler.applymap(_dunn_cell_style)
            display(dunn_styler)

Finally, we measure the effect size of the statistical differences by using a function that represents the *Vargha-Delaney A* test:

> `A = 0.5`: the group `x` performs equal to the group `y`
>
> `A < 0.5`: the group `x` performs worse than the group `y`
>
> `A > 0.5`: the group `x` performs better than the group `y`

In [None]:
def _vda_cell_text(value):
    """Render strings through MAGNITUDE_LABELS and any other value with four decimal digits."""
    if isinstance(value, str):
        return MAGNITUDE_LABELS[value]
    return '{:.4f}'.format(value)


def _vda_cell_style(value):
    """Return the background CSS of the values listed in MAGNITUDE_PALETTE, nothing otherwise."""
    if value in MAGNITUDE_PALETTE:
        return f'background-color: {MAGNITUDE_PALETTE[value]}'
    return ''


# Pivoted effect size tables, keyed by (dep_var, selector, sampler).
thresholds_vda_test_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise effect size between the thresholds.
            thresholds_vda_test_df = pairwise_multiple_groups_vda_dataframe(
                focus_df,
                group_1=['model', 'iterations'],
                group_2='threshold',
                metrics=METRICS,
            )

            # Pivot, move "a" and "magnitude" to the index, then sort rows
            # (models, iterations, thresholds) and columns (metrics, compared thresholds).
            thresholds_vda_test_df = thresholds_vda_test_df.pivot_table(index=['model', 'iterations', 'threshold'], columns=['metric', 'comparison'], values=['a', 'magnitude'], aggfunc='first')
            thresholds_vda_test_df = thresholds_vda_test_df.stack(level=0)
            thresholds_vda_test_df = thresholds_vda_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            thresholds_vda_test_df = thresholds_vda_test_df.reindex(ITERATIONS, level=1)
            thresholds_vda_test_df = thresholds_vda_test_df.reindex(THRESHOLDS, level=2)
            thresholds_vda_test_df = thresholds_vda_test_df.reindex(METRICS, axis=1, level=0)
            thresholds_vda_test_df = thresholds_vda_test_df.reindex(THRESHOLDS, axis=1, level=1)

            # Keep the table for later use.
            thresholds_vda_test_dfs[(dep_var, selector, sampler)] = thresholds_vda_test_df

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, short magnitude labels, colored magnitude levels.
            vda_styler = thresholds_vda_test_df.rename(index=MODELS_LABELS).style
            vda_styler = vda_styler.format(_vda_cell_text)
            vda_styler = vda_styler.applymap(_vda_cell_style)
            display(vda_styler)

#### Iterations comparison

For each combination of model and threshold value, we are interested in seeing if there is any significant difference when the iterations value increases.
Thus, we apply the *Kruskal-Wallis Test*.

In [None]:
def _kruskal_cell_style(pvalue):
    """Return the cell CSS: green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Kruskal-Wallis test with (model, threshold) as first grouping
            # and the iterations as second grouping.
            kruskal_options = dict(
                group_1=['model', 'threshold'],
                group_2='iterations',
                metrics=METRICS,
                testfunc=stats.kruskal,
                check_identical=True,
            )
            iterations_kruskal_test_df = multiple_groups_test_dataframe(focus_df, **kruskal_options)

            # Pivot the p-values, then sort models, thresholds, and metrics.
            iterations_kruskal_test_df = iterations_kruskal_test_df.pivot_table(index=['model', 'threshold'], columns=['metric'], values=['pvalue'])
            iterations_kruskal_test_df = iterations_kruskal_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            iterations_kruskal_test_df = iterations_kruskal_test_df.reindex(THRESHOLDS, level=1)
            iterations_kruskal_test_df = iterations_kruskal_test_df.reindex(METRICS, axis=1, level=1)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, four decimal digits, colored background.
            kruskal_styler = iterations_kruskal_test_df.rename(index=MODELS_LABELS).style
            kruskal_styler = kruskal_styler.format('{:.4f}')
            kruskal_styler = kruskal_styler.applymap(_kruskal_cell_style)
            display(kruskal_styler)

Then, we apply the *Dunn's Test* as a pairwise post-hoc test to understand for which groups the median values are statistically different.

In [None]:
def _dunn_cell_style(pvalue):
    """Return the cell CSS: none for negative values, green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < 0:
        return ''
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


# Pivoted tables of the post-hoc test, keyed by (dep_var, selector, sampler).
iterations_dunn_test_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise post-hoc test (Dunn) between the iterations values.
            iterations_dunn_test_df = pairwise_multiple_groups_posthoc_test_dataframe(
                focus_df,
                group_1=['model', 'threshold'],
                group_2='iterations',
                metrics=METRICS,
                testfunc=sp.posthoc_dunn,
            )

            # Make both sides of the comparison categorical over the iterations.
            for column in ('iterations', 'comparison'):
                iterations_dunn_test_df[column] = pd.Categorical(iterations_dunn_test_df[column], categories=ITERATIONS)

            # Pivot the p-values, then sort rows (models, thresholds, iterations)
            # and columns (metrics, compared iterations).
            iterations_dunn_test_df = iterations_dunn_test_df.pivot_table(index=['model', 'threshold', 'iterations'], columns=['metric', 'comparison'], values=['pvalue'])
            iterations_dunn_test_df = iterations_dunn_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            iterations_dunn_test_df = iterations_dunn_test_df.reindex(THRESHOLDS, level=1)
            iterations_dunn_test_df = iterations_dunn_test_df.reindex(ITERATIONS, level=2)
            iterations_dunn_test_df = iterations_dunn_test_df.reindex(METRICS, axis=1, level=1)
            iterations_dunn_test_df = iterations_dunn_test_df.reindex(ITERATIONS, axis=1, level=2)

            # Keep the table for later use.
            iterations_dunn_test_dfs[(dep_var, selector, sampler)] = iterations_dunn_test_df

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, four decimal digits, colored background.
            dunn_styler = iterations_dunn_test_df.rename(index=MODELS_LABELS).style
            dunn_styler = dunn_styler.format('{:.4f}')
            dunn_styler = dunn_styler.applymap(_dunn_cell_style)
            display(dunn_styler)

Finally, we measure the effect size with the *Vargha-Delaney A* test.

In [None]:
def _vda_cell_text(value):
    """Render strings through MAGNITUDE_LABELS and any other value with four decimal digits."""
    if isinstance(value, str):
        return MAGNITUDE_LABELS[value]
    return '{:.4f}'.format(value)


def _vda_cell_style(value):
    """Return the background CSS of the values listed in MAGNITUDE_PALETTE, nothing otherwise."""
    if value in MAGNITUDE_PALETTE:
        return f'background-color: {MAGNITUDE_PALETTE[value]}'
    return ''


# Pivoted effect size tables, keyed by (dep_var, selector, sampler).
iterations_vda_test_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise effect size between the iterations values.
            iterations_vda_test_df = pairwise_multiple_groups_vda_dataframe(
                focus_df,
                group_1=['model', 'threshold'],
                group_2='iterations',
                metrics=METRICS,
            )

            # Pivot, move "a" and "magnitude" to the index, then sort rows
            # (models, thresholds, iterations) and columns (metrics, compared iterations).
            iterations_vda_test_df = iterations_vda_test_df.pivot_table(index=['model', 'threshold', 'iterations'], columns=['metric', 'comparison'], values=['a', 'magnitude'], aggfunc='first')
            iterations_vda_test_df = iterations_vda_test_df.stack(level=0)
            iterations_vda_test_df = iterations_vda_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
            iterations_vda_test_df = iterations_vda_test_df.reindex(THRESHOLDS, level=1)
            iterations_vda_test_df = iterations_vda_test_df.reindex(ITERATIONS, level=2)
            iterations_vda_test_df = iterations_vda_test_df.reindex(METRICS, axis=1, level=0)
            iterations_vda_test_df = iterations_vda_test_df.reindex(ITERATIONS, axis=1, level=1)

            # Keep the table for later use.
            iterations_vda_test_dfs[(dep_var, selector, sampler)] = iterations_vda_test_df

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names, short magnitude labels, colored magnitude levels.
            vda_styler = iterations_vda_test_df.rename(index=MODELS_LABELS).style
            vda_styler = vda_styler.format(_vda_cell_text)
            vda_styler = vda_styler.applymap(_vda_cell_style)
            display(vda_styler)

#### Models comparison

For each combination of iterations and threshold value, we are interested in seeing if there is any significant difference when the model changes.
Thus, we apply the *Kruskal-Wallis Test*.

In [None]:
def _kruskal_cell_style(pvalue):
    """Return the cell CSS: green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Kruskal-Wallis test with (iterations, threshold) as first grouping
            # and the model as second grouping.
            kruskal_options = dict(
                group_1=['iterations', 'threshold'],
                group_2='model',
                metrics=METRICS,
                testfunc=stats.kruskal,
                check_identical=True,
            )
            models_kruskal_test_df = multiple_groups_test_dataframe(focus_df, **kruskal_options)

            # Pivot the p-values, then sort iterations, thresholds, and metrics.
            models_kruskal_test_df = models_kruskal_test_df.pivot_table(index=['iterations', 'threshold'], columns=['metric'], values=['pvalue'])
            models_kruskal_test_df = models_kruskal_test_df.reindex(ITERATIONS, level=0)
            models_kruskal_test_df = models_kruskal_test_df.reindex(THRESHOLDS, level=1)
            models_kruskal_test_df = models_kruskal_test_df.reindex(METRICS, axis=1, level=1)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Four decimal digits and colored background.
            kruskal_styler = models_kruskal_test_df.style.format('{:.4f}')
            kruskal_styler = kruskal_styler.applymap(_kruskal_cell_style)
            display(kruskal_styler)

Then, we apply the *Dunn's Test* as a pairwise post-hoc test to understand for which groups the median values are statistically different.

In [None]:
def _dunn_cell_style(pvalue):
    """Return the cell CSS: none for negative values, green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < 0:
        return ''
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise post-hoc test (Dunn) between the models.
            models_dunn_test_df = pairwise_multiple_groups_posthoc_test_dataframe(
                focus_df,
                group_1=['iterations', 'threshold'],
                group_2='model',
                metrics=METRICS,
                testfunc=sp.posthoc_dunn,
            )

            # Pivot the p-values, then sort rows (iterations, thresholds, models)
            # and columns (metrics, compared models).
            models_dunn_test_df = models_dunn_test_df.pivot_table(index=['iterations', 'threshold', 'model'], columns=['metric', 'comparison'], values=['pvalue'])
            models_dunn_test_df = models_dunn_test_df.reindex(ITERATIONS, level=0)
            models_dunn_test_df = models_dunn_test_df.reindex(THRESHOLDS, level=1)
            models_dunn_test_df = models_dunn_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=2)
            models_dunn_test_df = models_dunn_test_df.reindex(METRICS, axis=1, level=1)
            models_dunn_test_df = models_dunn_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, axis=1, level=2)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names on both axes, four decimal digits, colored background.
            dunn_styler = models_dunn_test_df.rename(index=MODELS_LABELS, columns=MODELS_LABELS).style
            dunn_styler = dunn_styler.format('{:.4f}')
            dunn_styler = dunn_styler.applymap(_dunn_cell_style)
            display(dunn_styler)

Finally, we measure the effect size with the *Vargha-Delaney A* test.

In [None]:
def _vda_cell_text(value):
    """Render strings through MAGNITUDE_LABELS and any other value with four decimal digits."""
    if isinstance(value, str):
        return MAGNITUDE_LABELS[value]
    return '{:.4f}'.format(value)


def _vda_cell_style(value):
    """Return the background CSS of the values listed in MAGNITUDE_PALETTE, nothing otherwise."""
    if value in MAGNITUDE_PALETTE:
        return f'background-color: {MAGNITUDE_PALETTE[value]}'
    return ''


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            # Compared models restricted to the current combination.
            focus_df = compared_df.query('dependent_variable == @dep_var and selector == @selector and sampler == @sampler')

            # Pairwise effect size between the models.
            models_vda_test_df = pairwise_multiple_groups_vda_dataframe(
                focus_df,
                group_1=['iterations', 'threshold'],
                group_2='model',
                metrics=METRICS,
            )

            # Pivot, move "a" and "magnitude" to the index, then sort rows
            # (iterations, thresholds, models) and columns (metrics, compared models).
            models_vda_test_df = models_vda_test_df.pivot_table(index=['iterations', 'threshold', 'model'], columns=['metric', 'comparison'], values=['a', 'magnitude'], aggfunc='first')
            models_vda_test_df = models_vda_test_df.stack(level=0)
            models_vda_test_df = models_vda_test_df.reindex(ITERATIONS, level=0)
            models_vda_test_df = models_vda_test_df.reindex(THRESHOLDS, level=1)
            models_vda_test_df = models_vda_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=2)
            models_vda_test_df = models_vda_test_df.reindex(METRICS, axis=1, level=0)
            models_vda_test_df = models_vda_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, axis=1, level=1)

            print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}')

            # Short model names on both axes, short magnitude labels, colored levels.
            vda_styler = models_vda_test_df.rename(index=MODELS_LABELS, columns=MODELS_LABELS).style
            vda_styler = vda_styler.format(_vda_cell_text)
            vda_styler = vda_styler.applymap(_vda_cell_style)
            display(vda_styler)

#### Best models comparison.

Rank the top 10 models by MCC.

In [None]:
# Metrics used for ranking, in priority order (MCC first).
ranking_metrics = ['mcc', 'auc', 'fmeasure', 'precision', 'recall']

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    # Create the dataframe: keep the ranking metrics of the current dependent
    # variable, move the innermost column level into the rows, and rank.
    best_mcc_df = (
        median_pivot_df[ranking_metrics]
        .query('dependent_variable == @dep_var')
        .droplevel(0)
        .stack()
        .sort_values(by=ranking_metrics, ascending=False)
        .reindex(columns=ranking_metrics)
        .reset_index()
    )

    # Short model names, median values with reduced decimal digits.
    best_mcc_formats = {'model': lambda x: MODELS_LABELS[x]}
    best_mcc_formats.update({metric: '{:.4f}' for metric in METRICS})

    # Print the top 10 rows showing the bars in the background.
    print(f'dep_var={dep_var}')
    best_mcc_styler = best_mcc_df.head(10).style.format(best_mcc_formats)
    best_mcc_styler = best_mcc_styler.bar(vmin=0.0, vmax=1.0, color='#5fba7d')
    display(best_mcc_styler)

Best model metrics.

In [None]:
# Show the median metrics of the Random Forest model.
# `dep_var` is bound explicitly by this loop: the cell previously reused the
# value leaked by the loop of whichever cell was executed last, so its output
# depended on the notebook execution order.
for dep_var in FOCUS_DEPENDENT_VARIABLES:
    rf_median_pivot_df = median_pivot_df.query('dependent_variable == @dep_var and model == "RandomForestClassifier()"')

    # Print the dataframe showing the bars in the background.
    print(f'dep_var={dep_var}')
    display(rf_median_pivot_df
    # Rename all the model names into the shortest version.
    .rename(index=MODELS_LABELS)
    # Show the median values with reduced decimal digits.
    .style.format('{:.4f}')
    # Show a background bar as indication.
    .bar(vmin=0.0, vmax=1.0, color='#5fba7d')
    )

##### Threshold comparison

Verify the Dunn's pairs for threshold values.

In [None]:
def _threshold_pvalue_css(pvalue):
    """Green background below SIGNIFICANCE_LEVEL, red background otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


# Pairs of thresholds to display: the adjacent pairs first, then the extremes.
threshold_dunn_pair_sets = ([(1, 3), (3, 5), (5, 10)], [(1, 10)])

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            for metric in ['mcc', 'auc']:
                # Long-format p-values of the current metric.
                pairs_thresholds_dunn_test_df = thresholds_dunn_test_dfs[(dep_var, selector, sampler)]['pvalue', metric].stack().to_frame('pvalue').reset_index()
                # Categorical models, so that sorting follows the configured model order.
                pairs_thresholds_dunn_test_df['model'] = pd.Categorical(pairs_thresholds_dunn_test_df['model'], categories=BASELINE_MODELS + COMPARED_MODELS)

                print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}, metric={metric}')

                for threshold_pairs in threshold_dunn_pair_sets:
                    # Keep only the rows of the requested (threshold, comparison) pairs.
                    wanted_pairs_df = pd.DataFrame(columns=['threshold', 'comparison'], data=threshold_pairs)
                    selected_pairs_df = pd.merge(pairs_thresholds_dunn_test_df, wanted_pairs_df, on=['threshold', 'comparison'])
                    selected_pairs_df = selected_pairs_df.sort_values(['model', 'iterations', 'threshold', 'comparison'])

                    # Short model names, two-decimal p-values, green/red significance coloring.
                    selected_pairs_styler = selected_pairs_df.style.format({'model': lambda x: MODELS_LABELS[x], 'pvalue': '{:.2f}'})
                    selected_pairs_styler = selected_pairs_styler.applymap(_threshold_pvalue_css, subset='pvalue')
                    display(selected_pairs_styler)

Verify the Vargha-Delaney effect sizes of the threshold pairs.

In [None]:
def _threshold_vda_value_text(cell):
    """Render a cell of the mixed ``value`` column.

    Strings are mapped through MAGNITUDE_LABELS (kept as they are when no
    label is registered), floats are shown with four decimals, and anything
    else is returned unchanged.
    """
    if isinstance(cell, str):
        return MAGNITUDE_LABELS.get(cell, cell)
    if isinstance(cell, float):
        return '{:.4f}'.format(cell)
    return cell


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            for metric in ['mcc', 'auc']:
                # Long format: one row per (model, iterations, threshold, comparison, variable).
                pairs_thresholds_vda_test_df = thresholds_vda_test_dfs[(dep_var, selector, sampler)][metric].stack().to_frame('value').reset_index().rename(columns={'level_3': 'variable'})[
                    ['model', 'iterations', 'threshold', 'comparison', 'variable', 'value']]
                pairs_thresholds_vda_test_df['model'] = pd.Categorical(pairs_thresholds_vda_test_df['model'], categories=BASELINE_MODELS + COMPARED_MODELS)

                # Sets a column for growing values: True where the "a" value is below 0.5.
                # Rows of the other variables get no value (index alignment leaves them missing).
                pairs_thresholds_vda_test_df['growing'] = pairs_thresholds_vda_test_df.loc[pairs_thresholds_vda_test_df['variable'] == 'a']['value'] < 0.5

                print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}, metric={metric}')

                # Show the adjacent pairs (1, 3), (3, 5), (5, 10) first, then the extreme pair (1, 10).
                for threshold_pairs in ([(1, 3), (3, 5), (5, 10)], [(1, 10)]):
                    # Print the dataframe showing the magnitude levels with the palette colors,
                    # and the growing flag as green (True) or red (False).
                    display(pd.merge(pairs_thresholds_vda_test_df, pd.DataFrame(columns=['threshold', 'comparison'], data=threshold_pairs), on=['threshold', 'comparison'])
                    # Sort the values of the columns.
                    .sort_values(['model', 'iterations', 'threshold', 'comparison'])
                    # Format cells. The formatter targets the "value" column: this frame has no
                    # "pvalue" column, so the former 'pvalue' entry never matched anything.
                    .style.format({'model': lambda x: MODELS_LABELS[x], 'value': _threshold_vda_value_text})
                    # Apply the color filtering.
                    .applymap(lambda x: f'background-color: {MAGNITUDE_PALETTE[x]}' if x in MAGNITUDE_PALETTE else '', subset='value')
                    .applymap(lambda x: 'background-color: #5fba7d' if x is True else ('background-color: #d65f5f' if x is False else ''), subset='growing')
                    )

##### Iterations comparison

Verify the Dunn's pairs for iterations values.

In [None]:
def _iterations_pvalue_css(pvalue):
    """Green background below SIGNIFICANCE_LEVEL, red background otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


# Pairs of iterations to display: the adjacent pairs first, then the extremes.
iterations_dunn_pair_sets = ([(5, 10), (10, 20), (20, 30)], [(5, 30)])

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            for metric in ['mcc', 'auc']:
                # Long-format p-values of the current metric.
                pairs_iterations_dunn_test_df = iterations_dunn_test_dfs[(dep_var, selector, sampler)]['pvalue', metric].stack().to_frame('pvalue').reset_index()
                # Categorical models, so that sorting follows the configured model order.
                pairs_iterations_dunn_test_df['model'] = pd.Categorical(pairs_iterations_dunn_test_df['model'], categories=BASELINE_MODELS + COMPARED_MODELS)

                print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}, metric={metric}')

                for iterations_pairs in iterations_dunn_pair_sets:
                    # Keep only the rows of the requested (iterations, comparison) pairs.
                    wanted_pairs_df = pd.DataFrame(columns=['iterations', 'comparison'], data=iterations_pairs)
                    selected_pairs_df = pd.merge(pairs_iterations_dunn_test_df, wanted_pairs_df, on=['iterations', 'comparison'])
                    selected_pairs_df = selected_pairs_df.sort_values(['model', 'threshold', 'iterations', 'comparison'])

                    # Short model names, two-decimal p-values, green/red significance coloring.
                    selected_pairs_styler = selected_pairs_df.style.format({'model': lambda x: MODELS_LABELS[x], 'pvalue': '{:.2f}'})
                    selected_pairs_styler = selected_pairs_styler.applymap(_iterations_pvalue_css, subset='pvalue')
                    display(selected_pairs_styler)

Verify the Vargha-Delaney effect sizes of the iterations pairs.

In [None]:
def _iterations_vda_value_text(cell):
    """Render a cell of the mixed ``value`` column.

    Strings are mapped through MAGNITUDE_LABELS (kept as they are when no
    label is registered), floats are shown with four decimals, and anything
    else is returned unchanged.
    """
    if isinstance(cell, str):
        return MAGNITUDE_LABELS.get(cell, cell)
    if isinstance(cell, float):
        return '{:.4f}'.format(cell)
    return cell


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    for selector in FOCUS_SELECTORS:
        for sampler in FOCUS_SAMPLERS:
            for metric in ['mcc', 'auc']:
                # Long format: one row per (model, threshold, iterations, comparison, variable).
                pairs_iterations_vda_test_df = iterations_vda_test_dfs[(dep_var, selector, sampler)][metric].stack().to_frame('value').reset_index().rename(columns={'level_3': 'variable'})[
                    ['model', 'threshold', 'iterations', 'comparison', 'variable', 'value']]
                pairs_iterations_vda_test_df['model'] = pd.Categorical(pairs_iterations_vda_test_df['model'], categories=BASELINE_MODELS + COMPARED_MODELS)

                # Sets a column for growing values: True where the "a" value is below 0.5.
                # Rows of the other variables get no value (index alignment leaves them missing).
                pairs_iterations_vda_test_df['growing'] = pairs_iterations_vda_test_df.loc[pairs_iterations_vda_test_df['variable'] == 'a']['value'] < 0.5

                print(f'dep_var={dep_var}, selector={selector}, sampler={sampler}, metric={metric}')

                # Show the adjacent pairs (5, 10), (10, 20), (20, 30) first, then the extreme pair (5, 30).
                for iterations_pairs in ([(5, 10), (10, 20), (20, 30)], [(5, 30)]):
                    # Print the dataframe showing the magnitude levels with the palette colors,
                    # and the growing flag as green (True) or red (False).
                    display(pd.merge(pairs_iterations_vda_test_df, pd.DataFrame(columns=['iterations', 'comparison'], data=iterations_pairs), on=['iterations', 'comparison'])
                    # Sort the values of the columns.
                    .sort_values(['model', 'threshold', 'iterations', 'comparison'])
                    # Format cells. The formatter targets the "value" column: this frame has no
                    # "pvalue" column, so the former 'pvalue' entry never matched anything.
                    .style.format({'model': lambda x: MODELS_LABELS[x], 'value': _iterations_vda_value_text})
                    # Apply the color filtering.
                    .applymap(lambda x: f'background-color: {MAGNITUDE_PALETTE[x]}' if x in MAGNITUDE_PALETTE else '', subset='value')
                    .applymap(lambda x: 'background-color: #5fba7d' if x is True else ('background-color: #d65f5f' if x is False else ''), subset='growing')
                    )

#### Preprocessing comparison

We now analyze the impact of the use of some preprocessing steps on the classification performance.
We first plot the overall preprocessing comparison plot.
Then, we plot the comparison plot by model.

In [None]:
# Per dependent variable: the pivoted metric values of every preprocessing group,
# reused by the statistical tests in the following cells.
preprocessing_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    # Transform the dataframe from wide to long.
    preprocessing_df = pd.melt(df.query('dependent_variable == @dep_var'), id_vars=['dependent_variable', 'model', 'iterations', 'threshold', 'selector', 'sampler', 'fold'])

    # Create a column for the selector/sampler combinations.
    preprocessing_df['group'] = preprocessing_df['selector'].astype(str) + '/' + preprocessing_df['sampler'].astype(str)

    # Filter the data: compared models, studied metrics, and configured groups only.
    keep_rows = (
        preprocessing_df['model'].isin(COMPARED_MODELS)
        & preprocessing_df['variable'].isin(METRICS)
        & preprocessing_df['group'].isin(PREPROCESSING_GROUPS)
    )
    # Take an explicit copy: the columns assigned below must be written to an
    # independent frame, not to a filtered selection (avoids SettingWithCopyWarning).
    preprocessing_df = preprocessing_df[keep_rows].copy()

    # Transform the values into categorical.
    preprocessing_df['variable'] = pd.Categorical(preprocessing_df['variable'], categories=METRICS)
    preprocessing_df['group'] = pd.Categorical(preprocessing_df['group'], categories=PREPROCESSING_GROUPS)

    print(f'dep_var={dep_var}')

    # Comparison plot over all the models.
    preprocessing_boxplot_overall = plot_preprocessing_boxplot_overall(
        preprocessing_df,
        metrics_labels=METRICS_LABELS,
        groups_labels=PREPROCESSING_GROUPS_LABELS,
        figure_size=(6, 10),
    )
    display(preprocessing_boxplot_overall)

    # Comparison plot split by model.
    preprocessing_boxplot_bymodel = plot_preprocessing_boxplot_bymodel(
        preprocessing_df,
        models_labels=MODELS_LABELS,
        metrics_labels=METRICS_LABELS,
        groups_labels=PREPROCESSING_GROUPS_LABELS,
        figure_size=(6, 20),
    )
    display(preprocessing_boxplot_bymodel)

    # Store the dataframe, back in wide format with one column per metric.
    preprocessing_df = preprocessing_df.pivot(index=['dependent_variable', 'iterations', 'threshold', 'model', 'group', 'fold'], columns='variable', values='value')
    preprocessing_dfs[dep_var] = preprocessing_df
    display(preprocessing_df)

We do the same, but show the pairwise differences with respect to the baseline (no preprocessing) instead of the absolute values.

In [None]:
# Per dependent variable: the pivoted pairwise differences between the
# "None/None" group and every other preprocessing group.
differences_preprocessing_dfs = {}

for dep_var in FOCUS_DEPENDENT_VARIABLES:
    # Work on an explicit copy: a column is added below, and assigning it on the
    # selection returned by query() would trigger SettingWithCopyWarning.
    preprocessing_df = df.query('dependent_variable == @dep_var').copy()

    # Create a column for the selector/sampler combinations.
    preprocessing_df['group'] = preprocessing_df['selector'].astype(str) + '/' + preprocessing_df['sampler'].astype(str)

    # Filter the data.
    preprocessing_df = preprocessing_df[preprocessing_df['model'].isin(COMPARED_MODELS)]
    preprocessing_df = preprocessing_df[preprocessing_df['group'].isin(PREPROCESSING_GROUPS)]

    # Separate the baseline from the comparison rows.
    baseline_preprocessing_df = preprocessing_df.query('group == "None/None"')
    comparison_preprocessing_df = preprocessing_df.query('group != "None/None"')

    # Merge on the combinations, except for selector and sampler.
    # Columns present on both sides get the "_x" suffix for the comparison rows
    # (left) and the "_y" suffix for the baseline rows (right).
    differences_preprocessing_df = comparison_preprocessing_df.merge(baseline_preprocessing_df, on=['dependent_variable', 'iterations', 'threshold', 'model', 'fold'])

    # Adjust the dataframe and compute the differences.
    differences_preprocessing_df['group'] = differences_preprocessing_df['group_x']
    for metric in METRICS:
        # NOTE(review): this is baseline minus preprocessed, so a positive value means
        # the "None/None" configuration scored higher - confirm this is the intended sign.
        differences_preprocessing_df[metric] = differences_preprocessing_df[f'{metric}_y'] - differences_preprocessing_df[f'{metric}_x']

    # Transform the dataframe from wide to long. Only the metric columns are melted,
    # so no further filtering on "variable" is needed.
    differences_preprocessing_df = pd.melt(differences_preprocessing_df, id_vars=['dependent_variable', 'model', 'iterations', 'threshold', 'fold', 'group'], value_vars=METRICS)

    # Transform the values into categorical.
    differences_preprocessing_df['variable'] = pd.Categorical(differences_preprocessing_df['variable'], categories=METRICS)
    differences_preprocessing_df['group'] = pd.Categorical(differences_preprocessing_df['group'], categories=[x for x in PREPROCESSING_GROUPS if x != 'None/None'])

    print(f'dep_var={dep_var}')

    # Differences plot over all the models.
    differences_preprocessing_boxplot_overall = plot_preprocessing_boxplot_overall(
        differences_preprocessing_df,
        metrics_labels=METRICS_LABELS,
        groups_labels=PREPROCESSING_GROUPS_LABELS,
        figure_size=(6, 10),
    )
    display(differences_preprocessing_boxplot_overall)

    # Differences plot split by model.
    differences_preprocessing_boxplot_bymodel = plot_preprocessing_boxplot_bymodel(
        differences_preprocessing_df,
        models_labels=MODELS_LABELS,
        metrics_labels=METRICS_LABELS,
        groups_labels=PREPROCESSING_GROUPS_LABELS,
        figure_size=(6, 20),
    )
    display(differences_preprocessing_boxplot_bymodel)

    # Store the dataframe, back in wide format with one column per metric.
    differences_preprocessing_df = differences_preprocessing_df.pivot(index=['dependent_variable', 'iterations', 'threshold', 'model', 'group', 'fold'], columns='variable', values='value')
    differences_preprocessing_dfs[dep_var] = differences_preprocessing_df
    display(differences_preprocessing_df)

We print the difference values.

In [None]:
def _median_differences_by(differences_df, index, index_sort):
    """Aggregate the differences with the median, one column per metric and group.

    `index` and `index_sort` are forwarded to `pivot_table_grouping`; the
    columns are always the preprocessing groups.
    """
    return pivot_table_grouping(
        differences_df,
        index=index,
        columns='group',
        metrics=METRICS,
        index_sort=index_sort,
        columns_sort=[METRICS, PREPROCESSING_GROUPS],
        aggfunc=np.median,
    )


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    differences_preprocessing_df = differences_preprocessing_dfs[dep_var]

    # Medians over all the models.
    overall_median_df = _median_differences_by(differences_preprocessing_df, index='dependent_variable', index_sort=None)
    display(overall_median_df)

    # Medians split by model.
    bymodel_median_df = _median_differences_by(
        differences_preprocessing_df,
        index=['dependent_variable', 'model'],
        index_sort=[DEPENDENT_VARIABLES, BASELINE_MODELS + COMPARED_MODELS],
    )
    display(bymodel_median_df)

    # Focus on the Random Forest rows only.
    rf_rows_query = 'model == "RandomForestClassifier()"'
    display(bymodel_median_df.query(rf_rows_query))

We verify the normality of the distributions by using the *D’Agostino's K^2 Test*.

In [None]:

def _normality_pvalue_css(pvalue):
    """Green background below SIGNIFICANCE_LEVEL, red background otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    preprocessing_df = preprocessing_dfs[dep_var]

    # Normality test p-values over all the models, one column per metric and group.
    overall_normal_test_df = pivot_table_grouping(
        preprocessing_df,
        index='dependent_variable',
        columns='group',
        metrics=METRICS,
        index_sort=None,
        columns_sort=[METRICS, PREPROCESSING_GROUPS],
        aggfunc=lambda x: stats.normaltest(x)[1],
    )

    # Print the dataframe showing the acceptance of the alternative hypothesis as green, and reject as red.
    print(f'dep_var={dep_var}')
    # Reduced decimal digits, then the color filtering.
    overall_normal_styler = overall_normal_test_df.style.format('{:.4f}')
    overall_normal_styler = overall_normal_styler.applymap(_normality_pvalue_css)
    display(overall_normal_styler)

    # Normality test p-values split by model.
    bymodel_normal_test_df = pivot_table_grouping(
        preprocessing_df,
        index=['dependent_variable', 'model'],
        columns='group',
        metrics=METRICS,
        index_sort=[DEPENDENT_VARIABLES, BASELINE_MODELS + COMPARED_MODELS],
        columns_sort=[METRICS, PREPROCESSING_GROUPS],
        aggfunc=lambda x: stats.normaltest(x)[1],
    )

    # Same rendering, with the model names in their shortest version.
    bymodel_normal_styler = bymodel_normal_test_df.rename(index=MODELS_LABELS).style.format('{:.4f}')
    bymodel_normal_styler = bymodel_normal_styler.applymap(_normality_pvalue_css)
    display(bymodel_normal_styler)

We test whether there are any statistically significant differences between the groups by using the *Kruskal-Wallis Test*.

In [None]:
def _kruskal_pvalue_css(pvalue):
    """Green background below SIGNIFICANCE_LEVEL, red background otherwise."""
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    preprocessing_df = preprocessing_dfs[dep_var]

    # Test the preprocessing groups against each other, over all the models.
    overall_kruskal_test_df = multiple_groups_test_dataframe(
        preprocessing_df,
        group_1='dependent_variable',
        group_2='group',
        metrics=METRICS,
        testfunc=stats.kruskal,
        check_identical=True,
    )

    # Pivot the dataframe for better visualization.
    overall_kruskal_test_df = overall_kruskal_test_df.pivot_table(index=['dependent_variable'], columns=['metric'], values=['pvalue'])
    # Sort the dependent variables (rows), then the metrics (second column level).
    overall_kruskal_test_df = overall_kruskal_test_df.reindex(FOCUS_DEPENDENT_VARIABLES)
    overall_kruskal_test_df = overall_kruskal_test_df.reindex(METRICS, level=1, axis=1)

    # Print the dataframe showing the acceptance of the alternative hypothesis as green, and reject as red.
    print(f'dep_var={dep_var}')
    overall_kruskal_styler = overall_kruskal_test_df.style.format('{:.4f}')
    overall_kruskal_styler = overall_kruskal_styler.applymap(_kruskal_pvalue_css)
    display(overall_kruskal_styler)

    # Same test, separately for each model.
    bymodel_kruskal_test_df = multiple_groups_test_dataframe(
        preprocessing_df,
        group_1=['dependent_variable', 'model'],
        group_2='group',
        metrics=METRICS,
        testfunc=stats.kruskal,
        check_identical=True,
    )

    # Pivot the dataframe for better visualization.
    bymodel_kruskal_test_df = bymodel_kruskal_test_df.pivot_table(index=['dependent_variable', 'model'], columns=['metric'], values=['pvalue'])
    # Sort the dependent variables and the models (row levels), then the metrics.
    bymodel_kruskal_test_df = bymodel_kruskal_test_df.reindex(FOCUS_DEPENDENT_VARIABLES, level=0)
    bymodel_kruskal_test_df = bymodel_kruskal_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=1)
    bymodel_kruskal_test_df = bymodel_kruskal_test_df.reindex(METRICS, level=1, axis=1)

    # Same rendering, with the model names in their shortest version.
    print(f'dep_var={dep_var}')
    bymodel_kruskal_styler = bymodel_kruskal_test_df.rename(index=MODELS_LABELS).style.format('{:.4f}')
    bymodel_kruskal_styler = bymodel_kruskal_styler.applymap(_kruskal_pvalue_css)
    display(bymodel_kruskal_styler)

We apply the *Dunn's Test*.

In [None]:
def _dunn_pvalue_css(pvalue):
    """Leave negative entries unstyled; green below SIGNIFICANCE_LEVEL, red otherwise."""
    if pvalue < 0:
        return ''
    if pvalue < SIGNIFICANCE_LEVEL:
        return 'background-color: #5fba7d'
    return 'background-color: #d65f5f'


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    preprocessing_df = preprocessing_dfs[dep_var]

    # Pairwise post-hoc test between the groups, over all the models.
    # The pivoted frame is rebuilt from its records so that the index levels become columns.
    overall_dunn_test_df = pairwise_multiple_groups_posthoc_test_dataframe(
        pd.DataFrame(preprocessing_df.to_records()),
        group_1='dependent_variable',
        group_2='group',
        metrics=METRICS,
        testfunc=sp.posthoc_dunn,
    )

    # Fix the columns: both sides of each pair become categoricals over the groups.
    for pair_column in ('group', 'comparison'):
        overall_dunn_test_df[pair_column] = pd.Categorical(overall_dunn_test_df[pair_column], categories=PREPROCESSING_GROUPS)

    # Pivot the dataframe for better visualization.
    overall_dunn_test_df = overall_dunn_test_df.pivot_table(index=['group'], columns=['metric', 'comparison'], values=['pvalue'])
    # Sort the groups (rows), the metrics and the comparison (column levels 1 and 2).
    overall_dunn_test_df = overall_dunn_test_df.reindex(PREPROCESSING_GROUPS)
    overall_dunn_test_df = overall_dunn_test_df.reindex(METRICS, axis=1, level=1)
    overall_dunn_test_df = overall_dunn_test_df.reindex(PREPROCESSING_GROUPS, axis=1, level=2)

    # Print the dataframe showing the acceptance of the alternative hypothesis as green, and reject as red.
    print(f'dep_var={dep_var}')
    overall_dunn_styler = overall_dunn_test_df.style.format('{:.4f}')
    overall_dunn_styler = overall_dunn_styler.applymap(_dunn_pvalue_css)
    display(overall_dunn_styler)

    # Same test, separately for each model.
    bymodel_dunn_test_df = pairwise_multiple_groups_posthoc_test_dataframe(
        pd.DataFrame(preprocessing_df.to_records()),
        group_1='model',
        group_2='group',
        metrics=METRICS,
        testfunc=sp.posthoc_dunn,
    )

    # Fix the columns.
    for pair_column in ('group', 'comparison'):
        bymodel_dunn_test_df[pair_column] = pd.Categorical(bymodel_dunn_test_df[pair_column], categories=PREPROCESSING_GROUPS)

    # Pivot the dataframe for better visualization.
    bymodel_dunn_test_df = bymodel_dunn_test_df.pivot_table(index=['model', 'group'], columns=['metric', 'comparison'], values=['pvalue'])
    # Sort the models and the groups (row levels), then the metrics and the comparison.
    bymodel_dunn_test_df = bymodel_dunn_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
    bymodel_dunn_test_df = bymodel_dunn_test_df.reindex(PREPROCESSING_GROUPS, level=1)
    bymodel_dunn_test_df = bymodel_dunn_test_df.reindex(METRICS, axis=1, level=1)
    bymodel_dunn_test_df = bymodel_dunn_test_df.reindex(PREPROCESSING_GROUPS, axis=1, level=2)

    # Same rendering, with the model names in their shortest version.
    print(f'dep_var={dep_var}')
    bymodel_dunn_styler = bymodel_dunn_test_df.rename(index=MODELS_LABELS).style.format('{:.4f}')
    bymodel_dunn_styler = bymodel_dunn_styler.applymap(_dunn_pvalue_css)
    display(bymodel_dunn_styler)

We use the *Vargha-Delaney Test*.

In [None]:
def _preprocessing_vda_text(cell):
    """Return the short magnitude label for strings, a 4-decimal number otherwise."""
    if isinstance(cell, str):
        return MAGNITUDE_LABELS[cell]
    return '{:.4f}'.format(cell)


def _preprocessing_vda_css(cell):
    """Return the palette background for values that are keys of MAGNITUDE_PALETTE."""
    if cell in MAGNITUDE_PALETTE:
        return f'background-color: {MAGNITUDE_PALETTE[cell]}'
    return ''


for dep_var in FOCUS_DEPENDENT_VARIABLES:
    preprocessing_df = preprocessing_dfs[dep_var]

    # Pairwise Vargha-Delaney comparison between the groups, over all the models.
    # The pivoted frame is rebuilt from its records so that the index levels become columns.
    overall_vda_test_df = pairwise_multiple_groups_vda_dataframe(
        pd.DataFrame(preprocessing_df.to_records()),
        group_1='dependent_variable',
        group_2='group',
        metrics=METRICS,
    )

    # Pivot the dataframe for better visualization, then use "a" and "magnitude" as an index.
    overall_vda_test_df = overall_vda_test_df.pivot_table(index=['group'], columns=['metric', 'comparison'], values=['a', 'magnitude'], aggfunc='first')
    overall_vda_test_df = overall_vda_test_df.stack(level=0)
    # Sort the groups (row level 0), then the metrics and the comparison (column levels 0 and 1).
    overall_vda_test_df = overall_vda_test_df.reindex(PREPROCESSING_GROUPS, level=0)
    overall_vda_test_df = overall_vda_test_df.reindex(METRICS, axis=1, level=0)
    overall_vda_test_df = overall_vda_test_df.reindex(PREPROCESSING_GROUPS, axis=1, level=1)

    # Print the dataframe showing the colored magnitude levels.
    print(f'dep_var={dep_var}')
    overall_vda_styler = overall_vda_test_df.style.format(_preprocessing_vda_text)
    overall_vda_styler = overall_vda_styler.applymap(_preprocessing_vda_css)
    display(overall_vda_styler)

    # Same comparison, separately for each model.
    bymodel_vda_test_df = pairwise_multiple_groups_vda_dataframe(
        pd.DataFrame(preprocessing_df.to_records()),
        group_1='model',
        group_2='group',
        metrics=METRICS,
    )

    # Pivot the dataframe for better visualization, then use "a" and "magnitude" as an index.
    bymodel_vda_test_df = bymodel_vda_test_df.pivot_table(index=['model', 'group'], columns=['metric', 'comparison'], values=['a', 'magnitude'], aggfunc='first')
    bymodel_vda_test_df = bymodel_vda_test_df.stack(level=0)
    # Sort the models and the groups (row levels), then the metrics and the comparison.
    bymodel_vda_test_df = bymodel_vda_test_df.reindex(BASELINE_MODELS + COMPARED_MODELS, level=0)
    bymodel_vda_test_df = bymodel_vda_test_df.reindex(PREPROCESSING_GROUPS, level=1)
    bymodel_vda_test_df = bymodel_vda_test_df.reindex(METRICS, axis=1, level=0)
    bymodel_vda_test_df = bymodel_vda_test_df.reindex(PREPROCESSING_GROUPS, axis=1, level=1)

    # Show every model first, then the Random Forest rows only; both use the
    # short model names and the colored magnitude levels.
    rf_only_vda_test_df = bymodel_vda_test_df.query('model == "RandomForestClassifier()"')
    for displayed_vda_df in (bymodel_vda_test_df, rf_only_vda_test_df):
        displayed_vda_styler = displayed_vda_df.rename(index=MODELS_LABELS).style.format(_preprocessing_vda_text)
        displayed_vda_styler = displayed_vda_styler.applymap(_preprocessing_vda_css)
        display(displayed_vda_styler)

#### Dependent variables comparison

We now analyze the differences in predictive performance with the other dependent variables.
For each of the variables, we print the top 10 models with the highest MCC score.

In [None]:
# Metrics used for ranking, in priority order (MCC first).
dependent_ranking_metrics = ['mcc', 'auc', 'fmeasure', 'precision', 'recall']

for dep_var in DEPENDENT_VARIABLES:
    # Create the dataframe: keep the ranking metrics of the current dependent
    # variable, move the innermost column level into the rows, and rank.
    best_mcc_df = (
        median_pivot_df[dependent_ranking_metrics]
        .query('dependent_variable == @dep_var')
        .droplevel(0)
        .stack()
        .sort_values(by=dependent_ranking_metrics, ascending=False)
        .reindex(columns=dependent_ranking_metrics)
        .reset_index()
    )

    # Short model names, median values with reduced decimal digits.
    dependent_best_formats = {'model': lambda x: MODELS_LABELS[x]}
    dependent_best_formats.update({metric: '{:.4f}' for metric in METRICS})

    # Print the top 10 rows showing the bars in the background.
    print(f'dep_var={dep_var}')
    dependent_best_styler = best_mcc_df.head(10).style.format(dependent_best_formats)
    dependent_best_styler = dependent_best_styler.bar(vmin=0.0, vmax=1.0, color='#5fba7d')
    display(dependent_best_styler)