# Predicting Unstable Software Benchmarks Using Static Source Code Features
## Group importance study

The following Python Jupyter Notebook can be used to interactively reproduce the study we performed
in our paper with the title *Predicting Unstable Software Benchmarks Using Static Source Code Features*.

### Initialization

We import the needed Python modules.

In [None]:
%reload_ext autoreload
%autoreload 2

import itertools

from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import plotnine as p9
import scikit_posthocs as sp
from scipy import stats

from conf_independent_variables import *
from study_conf_labels import *
from study_conf_palettes import *
from study_data_utils import *
from study_plots_utils import *
from study_stats_utils import *
from utils import apply_binary_threshold, approximate_zeros, remove_negative_values

### Configuration

First, we configure some parameters for the script.

`GROUP_IMPORTANCE_RESULTS_CSV_FILE_PATH` specifies the path of the `CSV` file with the group importance results analyzed by the notebook.
`METRICS` is the list of metrics considered by the study.

`BASELINE_MODELS` and `COMPARED_MODELS` differentiate, respectively, the list of models used as baseline and those for comparison.

`ITERATIONS` and `THRESHOLDS` represent the values considered for the respective parameters.

In [None]:
# Input CSV file analyzed by the notebook and output directory for the plots.
GROUP_IMPORTANCE_RESULTS_CSV_FILE_PATH = 'resources/group_importance_mcc_results.csv'
PLOTS_OUTPUT_DIRECTORY_PATH = 'resources/output/plots'

# Metrics considered by the study.
METRICS = ['precision', 'recall', 'fmeasure', 'auc', 'mcc']

# Models used as baseline.
BASELINE_MODELS = [
    "DummyClassifier(strategy='most_frequent')",
    "DummyClassifier(strategy='prior')",
    "DummyClassifier(strategy='stratified')",
    "DummyClassifier(strategy='uniform')",
]

# Models compared against the baseline ones.
COMPARED_MODELS = [
    'GaussianNB()',
    'KNeighborsClassifier()',
    'LogisticRegression()',
    'MLPClassifier()',
    'LinearDiscriminantAnalysis()',
    'DecisionTreeClassifier()',
    "SVC(kernel='linear')",
    "SVC(kernel='rbf')",
    'RandomForestClassifier()',
    'AdaBoostClassifier()',
    'GradientBoostingClassifier()',
]

# The FOCUS_* lists select the combinations iterated by the study cells.
FOCUS_MODELS = ['RandomForestClassifier()']

# Values considered for the iterations parameter.
ITERATIONS = [5, 10, 20, 30]
FOCUS_ITERATIONS = [30]

# Values considered for the threshold parameter.
THRESHOLDS = [1, 3, 5, 10]
FOCUS_THRESHOLDS = [10]

# Cross-validation layout: folds per repetition, repetitions, and their product.
CROSS_VALIDATION_FOLDS = 10
CROSS_VALIDATION_REPETITIONS = 30
TOTAL_CROSS_VALIDATION_FOLDS = CROSS_VALIDATION_REPETITIONS * CROSS_VALIDATION_FOLDS

# Dependent variables kept when filtering the results.
DEPENDENT_VARIABLES = ['rciw99', 'rciw99mjhd', 'rmadhd']
FOCUS_DEPENDENT_VARIABLES = ['rciw99mjhd']

# p-values below this level are highlighted as significant in the result tables.
SIGNIFICANCE_LEVEL = 0.01

# Single table pairing every feature-group definition with the label shown for
# it. The group identifier used throughout the notebook is the first element
# of each IV_GROUP_* definition.
_GROUP_DEFINITIONS_AND_LABELS = [
    (IV_GROUP_NONE, 'All features'),
    (IV_GROUP_BENCH, 'bench'),
    (IV_GROUP_CODE, 'code'),
    (IV_GROUP_META, 'meta'),
    (IV_GROUP_PL, 'pl'),
    (IV_GROUP_PL_CF, 'pl cf'),
    (IV_GROUP_PL_DATA, 'pl data'),
    (IV_GROUP_PL_CONC, 'pl conc'),
    (IV_GROUP_LIB, 'lib'),
    (IV_GROUP_IO, 'io'),
    (IV_GROUP_LIB_CONC, 'lib conc'),
    (IV_GROUP_MATH, 'math'),
    (IV_GROUP_STR, 'str'),
    (IV_GROUP_OS, 'os'),
]

# Excluded-group value that the other groups are compared against.
BASELINE_GROUP = IV_GROUP_NONE[0]

# Ordered group identifiers, used as categories and as sort order.
GROUPS = [definition[0] for definition, _ in _GROUP_DEFINITIONS_AND_LABELS]

# Mapping from group identifier to its label.
GROUPS_LABELS = {definition[0]: label for definition, label in _GROUP_DEFINITIONS_AND_LABELS}

### Study

In [None]:
# Open the CSV.
df = pd.read_csv(GROUP_IMPORTANCE_RESULTS_CSV_FILE_PATH)

# Keep only the columns used by the study.
df = df[['dependent_variable', 'iterations', 'threshold', 'model', 'fold', 'excluded_group', *METRICS]]

# Select the rows according to the given configuration. The explicit copy
# detaches the selection from the frame read from disk, so the column
# assignments below act on an independent frame and do not trigger pandas'
# SettingWithCopyWarning.
df = df[
    df['model'].isin(BASELINE_MODELS + COMPARED_MODELS)
    & df['iterations'].isin(ITERATIONS)
    & df['threshold'].isin(THRESHOLDS)
    & df['dependent_variable'].isin(DEPENDENT_VARIABLES)
].copy()

# Transform some of the columns to categorical type for easy sorting.
df['model'] = pd.Categorical(df['model'], categories=BASELINE_MODELS + COMPARED_MODELS)
df['iterations'] = pd.Categorical(df['iterations'], categories=ITERATIONS)
df['threshold'] = pd.Categorical(df['threshold'], categories=THRESHOLDS)
df['dependent_variable'] = pd.Categorical(df['dependent_variable'], categories=DEPENDENT_VARIABLES)
df['excluded_group'] = pd.Categorical(df['excluded_group'], categories=GROUPS)

# Show the selected dataframe.
display(df)

# Print some statistics.
print(f"Number of experiments: {df.shape[0]}")
print(f"Number of folds per combination: {df['fold'].unique().shape[0]}")
print(f"Models: {list(df['model'].unique())}")
print(f"Benchmark iterations: {list(df['iterations'].unique())}")
print(f"Stability thresholds: {list(df['threshold'].unique())}")
print(f"Dependent variables: {list(df['dependent_variable'].unique())}")

We verify the normality of the distributions by using the *D’Agostino's K^2 Test*.

In [None]:
# Visit every focus combination in the same order as four nested loops would.
# The loop variable names are referenced by the "@" placeholders of the query.
for dep_var, iterations, threshold, model in itertools.product(
    FOCUS_DEPENDENT_VARIABLES, FOCUS_ITERATIONS, FOCUS_THRESHOLDS, FOCUS_MODELS
):
    # Rows belonging to the current combination.
    focus_rows_df = df.query('dependent_variable == @dep_var and iterations == @iterations and threshold == @threshold and model == @model')

    # Table of normality-test p-values: the aggregation function returns the
    # second element of the scipy normaltest result for each group of values.
    normal_test_df = pivot_table_grouping(
        focus_rows_df,
        index=['dependent_variable', 'iterations', 'threshold', 'model'],
        columns='excluded_group',
        metrics=METRICS,
        index_sort=[DEPENDENT_VARIABLES, ITERATIONS, THRESHOLDS, BASELINE_MODELS + COMPARED_MODELS],
        columns_sort=[METRICS, GROUPS],
        aggfunc=lambda x: stats.normaltest(x)[1],
    )

    print(f'dep_var={dep_var}, iterations={iterations}, threshold={threshold}, model={model}')

    # Relabel the index through MODELS_LABELS, show four decimal digits, and
    # color a cell green when its p-value is below SIGNIFICANCE_LEVEL, red otherwise.
    normal_test_styler = (
        normal_test_df
        .rename(index=MODELS_LABELS)
        .style.format('{:.4f}')
        .applymap(lambda x: 'background-color: #5fba7d' if x < SIGNIFICANCE_LEVEL else 'background-color: #d65f5f')
    )
    display(normal_test_styler)

We test whether there are any statistically significant differences between the baseline and the comparison by using the *Wilcoxon Test*.

In [None]:
# Visit every focus combination in the same order as four nested loops would.
# The loop variable names are referenced by the "@" placeholders of the query.
for dep_var, iterations, threshold, model in itertools.product(
    FOCUS_DEPENDENT_VARIABLES, FOCUS_ITERATIONS, FOCUS_THRESHOLDS, FOCUS_MODELS
):
    # Rows belonging to the current combination.
    focus_rows_df = df.query('dependent_variable == @dep_var and iterations == @iterations and threshold == @threshold and model == @model')

    # Pairwise test results between excluded groups, using scipy's Wilcoxon
    # test as the test function.
    wilcoxon_test_df = pairwise_multiple_groups_test_dataframe(
        focus_rows_df,
        group_1=['iterations', 'threshold', 'model'],
        group_2='excluded_group',
        metrics=METRICS,
        testfunc=stats.wilcoxon,
    )

    # Make both group columns categorical over the configured groups.
    for group_column in ('excluded_group', 'comparison'):
        wilcoxon_test_df[group_column] = pd.Categorical(wilcoxon_test_df[group_column], categories=GROUPS)

    # Pivot the p-values: one row per (iterations, threshold, model, excluded
    # group) and one column per (metric, comparison).
    wilcoxon_pivot_df = wilcoxon_test_df.pivot_table(
        index=['iterations', 'threshold', 'model', 'excluded_group'],
        columns=['metric', 'comparison'],
        values=['pvalue'],
    )

    # Sort the row levels: iterations, threshold, models, excluded groups.
    row_orders = [ITERATIONS, THRESHOLDS, BASELINE_MODELS + COMPARED_MODELS, GROUPS]
    for row_level, row_order in enumerate(row_orders):
        wilcoxon_pivot_df = wilcoxon_pivot_df.reindex(row_order, level=row_level)

    # Sort the column levels: metrics (level 1) and compared groups (level 2).
    for column_level, column_order in ((1, METRICS), (2, GROUPS)):
        wilcoxon_pivot_df = wilcoxon_pivot_df.reindex(column_order, axis=1, level=column_level)

    wilcoxon_test_df = wilcoxon_pivot_df

    print(f'dep_var={dep_var}, iterations={iterations}, threshold={threshold}, model={model}')

    # Relabel the index through MODELS_LABELS, show four decimal digits, and
    # color a cell green when its p-value is below SIGNIFICANCE_LEVEL, red otherwise.
    wilcoxon_test_styler = (
        wilcoxon_test_df
        .rename(index=MODELS_LABELS)
        .style.format('{:.4f}')
        .applymap(lambda x: 'background-color: #5fba7d' if x < SIGNIFICANCE_LEVEL else 'background-color: #d65f5f')
    )
    display(wilcoxon_test_styler)

We measure the effect size using the *Vargha-Delaney A* test.

In [None]:
# Visit every focus combination in the same order as four nested loops would.
# The loop variable names are referenced by the "@" placeholders of the query.
for dep_var, iterations, threshold, model in itertools.product(
    FOCUS_DEPENDENT_VARIABLES, FOCUS_ITERATIONS, FOCUS_THRESHOLDS, FOCUS_MODELS
):
    # Rows belonging to the current combination.
    focus_rows_df = df.query('dependent_variable == @dep_var and iterations == @iterations and threshold == @threshold and model == @model')

    # Pairwise effect-size results between excluded groups.
    vda_test_df = pairwise_multiple_groups_vda_dataframe(
        focus_rows_df,
        group_1=['iterations', 'threshold', 'model'],
        group_2='excluded_group',
        metrics=METRICS,
    )

    # Pivot the "a" and "magnitude" values, then move them from the outermost
    # column level to the innermost row level.
    vda_pivot_df = vda_test_df.pivot_table(
        index=['iterations', 'threshold', 'model', 'excluded_group'],
        columns=['metric', 'comparison'],
        values=['a', 'magnitude'],
        aggfunc='first',
    )
    vda_pivot_df = vda_pivot_df.stack(level=0)

    # Sort the row levels: iterations, threshold, models, excluded groups.
    row_orders = [ITERATIONS, THRESHOLDS, BASELINE_MODELS + COMPARED_MODELS, GROUPS]
    for row_level, row_order in enumerate(row_orders):
        vda_pivot_df = vda_pivot_df.reindex(row_order, level=row_level)

    # Sort the column levels: metrics (level 0) and compared groups (level 1).
    for column_level, column_order in enumerate([METRICS, GROUPS]):
        vda_pivot_df = vda_pivot_df.reindex(column_order, axis=1, level=column_level)

    vda_test_df = vda_pivot_df

    print(f'dep_var={dep_var}, iterations={iterations}, threshold={threshold}, model={model}')

    # Relabel the index through MODELS_LABELS; string cells are shown through
    # MAGNITUDE_LABELS, other cells with four decimal digits; cells whose value
    # is a MAGNITUDE_PALETTE key get the matching background color.
    vda_test_styler = (
        vda_test_df
        .rename(index=MODELS_LABELS)
        .style.format(lambda x: MAGNITUDE_LABELS[x] if isinstance(x, str) else '{:.4f}'.format(x))
        .applymap(lambda x: f'background-color: {MAGNITUDE_PALETTE[x]}' if x in MAGNITUDE_PALETTE else '')
    )
    display(vda_test_styler)

We prepare the data for the comparison.

In [None]:
# Separate the baseline from the comparison instances rows.
# The baseline label is passed to the query through an "@" reference instead
# of being interpolated into the expression, so a label containing quotes
# cannot break the query syntax.
baseline_group_df = df.query('excluded_group == @BASELINE_GROUP')
comparison_groups_df = df.query('excluded_group != @BASELINE_GROUP')

# Merge on the combinations: every comparison row is paired with the baseline
# row of the same dependent variable, iterations, threshold, model, and fold.
# Overlapping baseline columns receive the "_none" suffix.
merge_df = comparison_groups_df.merge(
    baseline_group_df,
    on=['dependent_variable', 'iterations', 'threshold', 'model', 'fold'],
    suffixes=(None, '_none'),
)

display(merge_df)

In [None]:
# Add the baseline data to the output dataframe. Baseline rows have no
# "<metric>_none" columns, so those are missing for them after the concat.
groups_table_df = pd.concat([baseline_group_df, merge_df], ignore_index=True)

# Column lists that do not depend on the focus combination.
diff_columns = [f'diff_{x}' for x in METRICS]
display_columns = list(itertools.chain.from_iterable((x, f'diff_{x}') for x in METRICS))

# Visit every focus combination in the same order as four nested loops would.
# The loop variable names are referenced by the "@" placeholders of the query.
for dep_var, iterations, threshold, model in itertools.product(
    FOCUS_DEPENDENT_VARIABLES, FOCUS_ITERATIONS, FOCUS_THRESHOLDS, FOCUS_MODELS
):
    # Median per excluded group. The frame also holds categorical columns
    # (e.g. model, dependent_variable); numeric_only=True restricts the median
    # to numeric columns, which avoids the TypeError raised by pandas >= 2.0
    # and matches the older behavior of silently dropping such columns.
    focus_median_values_df = (
        groups_table_df
        .query('dependent_variable == @dep_var and iterations == @iterations and threshold == @threshold and model == @model')
        .groupby(['excluded_group'])
        .median(numeric_only=True)
    )

    # Compute the differences between each group and the baseline values.
    for metric in METRICS:
        focus_median_values_df[f'diff_{metric}'] = focus_median_values_df[metric] - focus_median_values_df[f'{metric}_none']

    # Adjust the dataframe.
    focus_median_values_df = focus_median_values_df[['fold', *METRICS, *diff_columns]]

    # Sort the comparison rows by their differences and keep the baseline row on top.
    is_baseline_row = focus_median_values_df.index.isin([BASELINE_GROUP])
    sorted_comparison_df = focus_median_values_df.loc[~is_baseline_row].sort_values(
        by=['diff_mcc', 'diff_auc', 'diff_fmeasure', 'diff_precision', 'diff_recall'],
        ascending=True,
    )
    focus_median_table_df = pd.concat([focus_median_values_df.loc[is_baseline_row], sorted_comparison_df])

    # Print the dataframe showing the bars in the background.
    print(f'dep_var={dep_var}, iterations={iterations}, threshold={threshold}, model={model}')
    display(
        focus_median_table_df[display_columns]
        # Show the median values with reduced decimal digits.
        .style.format({x: '{:.8f}' for x in METRICS})
        # Show a background bar as indication.
        .bar(vmin=0.0, vmax=1.0, color='#5fba7d')
    )