In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.stats.multicomp import MultiComparison

In [2]:
# Read the consolidated model-performance table.
# The CSV is the output of the preceding step, which merges the per-model evaluation results.
df = pd.read_csv(
    'all_evaluations.csv',
    encoding="utf-8",
    low_memory=False,
)

# Print a summary of the columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4554 entries, 0 to 4553
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   resampling  4554 non-null   object 
 1   mmr         4554 non-null   float64
 2   eml         4554 non-null   object 
 3   gmean       4554 non-null   float64
 4   mcc         4554 non-null   float64
 5   ap          4554 non-null   float64
 6   auc         4554 non-null   float64
dtypes: float64(5), object(2)
memory usage: 249.2+ KB


In [3]:
# Model specification for G-mean.
# 'gmean' is the response; 'mmr', 'eml' and 'resampling' each enter as a categorical
# main effect (the C(...) wrapper). No interaction terms are included.
formula = 'gmean ~ C(mmr) + C(eml) + C(resampling)'

# Estimate the model by Ordinary Least Squares on the full dataset.
model = ols(formula=formula, data=df).fit()

# ANOVA table (typ=2) for the fitted model: one row per factor plus the residual,
# giving the sum of squares, degrees of freedom, F statistic and p-value.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq      df           F         PR(>F)
C(mmr)         0.487982    94.0    7.136639   1.680386e-80
C(eml)         0.214865     5.0   59.076168   9.828152e-60
C(resampling)  1.268650     7.0  249.150258  9.154552e-314
Residual       3.234817  4447.0         NaN            NaN


In [4]:
# The following sections repeat the analysis for different performance metrics.
# We will use the same approach to model and analyze 'mcc', 'ap', and 'auc',
# using Ordinary Least Squares regression and ANOVA to assess the impact of 'mmr', 'eml', and 'resampling'.

In [5]:
# MCC as the response: same three categorical main effects, fitted on the full dataset.
formula = 'mcc ~ C(mmr) + C(eml) + C(resampling)'
model = ols(formula=formula, data=df).fit()

# ANOVA table (typ=2) for the fitted MCC model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq      df           F         PR(>F)
C(mmr)         0.214040    94.0    7.456464   2.115303e-85
C(eml)         0.058157     5.0   38.088578   2.143815e-38
C(resampling)  0.502400     7.0  235.026976  2.887366e-298
Residual       1.358005  4447.0         NaN            NaN


In [6]:
# AP as the response: same three categorical main effects, fitted on the full dataset.
formula = 'ap ~ C(mmr) + C(eml) + C(resampling)'
model = ols(formula=formula, data=df).fit()

# ANOVA table (typ=2) for the fitted AP model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq      df           F         PR(>F)
C(mmr)         0.526416    94.0   15.413608  1.219034e-204
C(eml)         0.777392     5.0  427.930815   0.000000e+00
C(resampling)  0.553344     7.0  217.570803  9.207737e-279
Residual       1.615711  4447.0         NaN            NaN


In [7]:
# AUC as the response: same three categorical main effects, fitted on the full dataset.
formula = 'auc ~ C(mmr) + C(eml) + C(resampling)'
model = ols(formula=formula, data=df).fit()

# ANOVA table (typ=2) for the fitted AUC model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq      df            F        PR(>F)
C(mmr)         0.158653    94.0     7.000310  2.053583e-78
C(eml)         1.210595     5.0  1004.212753  0.000000e+00
C(resampling)  0.793041     7.0   469.888064  0.000000e+00
Residual       1.072186  4447.0          NaN           NaN


In [8]:
# Assessing the importance of resampling strategies
# Given the significant F-statistic and extremely low p-value in ANOVA results for 'resampling',
# it is crucial to further investigate which specific resampling strategies perform best.

# Refit the 'gmean' model inside this cell instead of reading the shared `aov_table`:
# that variable is overwritten by every later ANOVA cell, so by the time this cell runs
# it holds the table of the most recently analysed metric, not the one for 'gmean'.
gmean_formula = 'gmean ~ C(mmr) + C(eml) + C(resampling)'
gmean_model = ols(gmean_formula, data=df).fit()
gmean_aov = sm.stats.anova_lm(gmean_model, typ=2)

# Only run the pairwise comparisons when 'resampling' is significant for 'gmean' at the 5% level.
if gmean_aov.loc['C(resampling)', 'PR(>F)'] < 0.05:
    print("Resampling strategies have a significant impact on gmean. Proceeding with multiple comparisons to identify the most effective method.")

    # Conducting multiple comparisons among resampling strategies
    # (MultiComparison is already imported in the notebook's import cell).
    mult_comp = MultiComparison(df['gmean'], df['resampling'])
    tukey_result = mult_comp.tukeyhsd()

    # Convert the Tukey HSD results into a DataFrame for easier manipulation.
    # summary() is the public accessor for the results table; its first data row is the header.
    tukey_table = tukey_result.summary().data
    tukey_df = pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])

    # Keep the significant pairs whose mean difference is positive.
    # 'meandiff' is the mean 'gmean' of group2 minus that of group1, so each row kept here is a
    # pair in which group2 performs significantly better than group1. Significant pairs with a
    # negative 'meandiff' (group1 better) are excluded by this filter.
    higher_resampling = tukey_df[tukey_df['reject'].eq(True) & (tukey_df['meandiff'] > 0)]
    print(higher_resampling)

Resampling strategies have a significant impact on gmean. Proceeding with multiple comparisons to identify the most effective method.
             group1           group2  meandiff   p-adj   lower   upper  reject
0            ADASYN  BorderlineSMOTE    0.0235  0.0000  0.0182  0.0288    True
1            ADASYN            IWGMM    0.0496  0.0000  0.0443  0.0549    True
3            ADASYN              ROS    0.0208  0.0000  0.0155  0.0261    True
4            ADASYN            SMOTE    0.0063  0.0077  0.0010  0.0116    True
6            ADASYN         SVMSMOTE    0.0263  0.0000  0.0210  0.0316    True
7   BorderlineSMOTE            IWGMM    0.0261  0.0000  0.0208  0.0314    True
18      KMeansSMOTE              ROS    0.0166  0.0000  0.0113  0.0219    True
21      KMeansSMOTE         SVMSMOTE    0.0221  0.0000  0.0168  0.0274    True
24              ROS         SVMSMOTE    0.0055  0.0339  0.0002  0.0108    True
26            SMOTE         SVMSMOTE    0.0200  0.0000  0.0148  0.0253    Tr

In [9]:
# Given the results, the Inversely Weighted Gaussian Mixture Model (IWGMM) resampling strategy
# shows significant improvements over ADASYN and BorderlineSMOTE, and none of the listed comparisons
# shows another strategy significantly exceeding IWGMM. This highlights IWGMM as the most
# effective resampling method in our analysis.

In [10]:
# Narrow the analysis to the rows whose 'resampling' value is 'IWGMM', so that the
# remaining factors can be studied within this single resampling strategy.

df_filtered1 = df.loc[df['resampling'].eq('IWGMM')]

# Print a summary of the subset (row count, columns, dtypes).
df_filtered1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 570 entries, 3984 to 4553
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   resampling  570 non-null    object 
 1   mmr         570 non-null    float64
 2   eml         570 non-null    object 
 3   gmean       570 non-null    float64
 4   mcc         570 non-null    float64
 5   ap          570 non-null    float64
 6   auc         570 non-null    float64
dtypes: float64(5), object(2)
memory usage: 35.6+ KB


In [11]:
# Model specification for the IWGMM subset.
# 'gmean' is the response; only 'mmr' and 'eml' remain as categorical main effects,
# because the subset was filtered to a single 'resampling' value.
formula = 'gmean ~ C(mmr) + C(eml)'

# Estimate the model by Ordinary Least Squares (OLS) on the filtered data.
model = ols(formula=formula, data=df_filtered1).fit()

# ANOVA table (typ=2) for the fitted model, used to judge which of 'mmr' and 'eml'
# has a statistically significant effect on 'gmean'.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

            sum_sq     df           F        PR(>F)
C(mmr)    0.051831   94.0    0.994740  4.989661e-01
C(eml)    0.279607    5.0  100.883832  3.960856e-72
Residual  0.260528  470.0         NaN           NaN


In [12]:
# MCC as the response on the IWGMM subset, with 'mmr' and 'eml' as categorical main effects.
formula = 'mcc ~ C(mmr) + C(eml)'
model = ols(formula=formula, data=df_filtered1).fit()

# ANOVA table (typ=2) for the fitted MCC model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

            sum_sq     df          F        PR(>F)
C(mmr)    0.021512   94.0   0.864899  8.043113e-01
C(eml)    0.098714    5.0  74.615389  1.874960e-57
Residual  0.124359  470.0        NaN           NaN


In [13]:
# AP as the response on the IWGMM subset, with 'mmr' and 'eml' as categorical main effects.
formula = 'ap ~ C(mmr) + C(eml)'
model = ols(formula=formula, data=df_filtered1).fit()

# ANOVA table (typ=2) for the fitted AP model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

            sum_sq     df          F        PR(>F)
C(mmr)    0.022762   94.0   1.431515  8.964919e-03
C(eml)    0.079275    5.0  93.730521  2.460667e-68
Residual  0.079503  470.0        NaN           NaN


In [14]:
# AUC as the response on the IWGMM subset, with 'mmr' and 'eml' as categorical main effects.
formula = 'auc ~ C(mmr) + C(eml)'
model = ols(formula=formula, data=df_filtered1).fit()

# ANOVA table (typ=2) for the fitted AUC model.
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

            sum_sq     df           F         PR(>F)
C(mmr)    0.019424   94.0    0.982780   5.290294e-01
C(eml)    0.177118    5.0  168.473141  2.234757e-102
Residual  0.098823  470.0         NaN            NaN


In [15]:
# Following the ANOVA analysis which highlighted significant differences among 'eml' categories,
# we now conduct a detailed comparison to pinpoint which specific machine learning algorithms
# significantly outperform others within the IWGMM resampling strategy context.

# Conducting multiple comparisons among machine learning algorithms on the IWGMM subset
mult_comp = MultiComparison(df_filtered1['gmean'], df_filtered1['eml'])

# Performing the Tukey HSD test to statistically compare all pairs of algorithms
tukey_result = mult_comp.tukeyhsd()

# Converting the results of Tukey's test into a DataFrame for easier analysis and interpretation.
# summary() is the public accessor for the results table; its first data row is the header.
tukey_rows = tukey_result.summary().data
tukey_df = pd.DataFrame(data=tukey_rows[1:], columns=tukey_rows[0])

# Significant pairs with a positive mean difference, i.e. pairs in which group2 has the
# higher mean 'gmean'. Kept as a convenience subset; it is not what gets printed below.
higher_strategies = tukey_df[tukey_df['reject'].eq(True) & (tukey_df['meandiff'] > 0)]

# Print the full comparison table so that both directions (positive and negative
# 'meandiff') are visible when ranking the algorithms.
print(tukey_df)

      group1    group2  meandiff   p-adj   lower   upper  reject
0   AdaBoost  CatBoost   -0.0092  0.0753 -0.0190  0.0005   False
1   AdaBoost      GBDT    0.0208  0.0000  0.0110  0.0306    True
2   AdaBoost  LightGBM   -0.0213  0.0000 -0.0310 -0.0115    True
3   AdaBoost        RF   -0.0506  0.0000 -0.0604 -0.0409    True
4   AdaBoost   XGBoost   -0.0249  0.0000 -0.0347 -0.0152    True
5   CatBoost      GBDT    0.0300  0.0000  0.0203  0.0398    True
6   CatBoost  LightGBM   -0.0120  0.0061 -0.0218 -0.0023    True
7   CatBoost        RF   -0.0414  0.0000 -0.0511 -0.0316    True
8   CatBoost   XGBoost   -0.0157  0.0001 -0.0255 -0.0059    True
9       GBDT  LightGBM   -0.0421  0.0000 -0.0518 -0.0323    True
10      GBDT        RF   -0.0714  0.0000 -0.0812 -0.0617    True
11      GBDT   XGBoost   -0.0457  0.0000 -0.0555 -0.0360    True
12  LightGBM        RF   -0.0293  0.0000 -0.0391 -0.0196    True
13  LightGBM   XGBoost   -0.0037  0.8922 -0.0134  0.0061   False
14        RF   XGBoost   

In [16]:
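# Reviewing the results of Tukey's multiple comparisons
# We have identified several significant comparisons indicating superior performance of certain algorithms.
# In particular, 'GBDT' shows a significant improvement over every other algorithm: the mean difference is
# positive where 'GBDT' is group2 (vs. 'AdaBoost' and 'CatBoost') and negative where 'GBDT' is group1
# (vs. 'LightGBM', 'RF' and 'XGBoost'), and 'reject' is True in all five comparisons.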

In [17]:
# Within the IWGMM subset, look at the rows where 'GBDT' is the ensemble method and
# pull out the configurations with the best 'gmean' scores.

# Row selection: resampling strategy 'IWGMM' and ML algorithm 'GBDT', both evaluated on df_filtered1.
df_filtered2 = df_filtered1.loc[
    lambda frame: frame['resampling'].eq('IWGMM') & frame['eml'].eq('GBDT')
]

# The 10 rows with the largest 'gmean', in descending order of 'gmean'.
top_10_df = df_filtered2.nlargest(n=10, columns='gmean')
print(top_10_df)

     resampling   mmr   eml     gmean       mcc        ap       auc
4178      IWGMM  0.09  GBDT  0.669374  0.212361  0.168155  0.776938
4200      IWGMM  0.31  GBDT  0.667517  0.205352  0.151108  0.773669
4188      IWGMM  0.19  GBDT  0.667088  0.203796  0.171474  0.767442
4190      IWGMM  0.21  GBDT  0.664637  0.227340  0.159483  0.769225
4202      IWGMM  0.33  GBDT  0.658332  0.200762  0.160595  0.764569
4192      IWGMM  0.23  GBDT  0.657909  0.199198  0.175665  0.781634
4191      IWGMM  0.22  GBDT  0.657627  0.198167  0.171752  0.771618
4205      IWGMM  0.36  GBDT  0.656499  0.194140  0.170120  0.767827
4183      IWGMM  0.14  GBDT  0.654764  0.220890  0.172151  0.768489
4175      IWGMM  0.06  GBDT  0.649665  0.198784  0.179060  0.787162
