In [None]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the four result CSVs (test1-test4) from ./data.
test1, test2, test3, test4 = (
    pd.read_csv('./data/test1_results.csv'),
    pd.read_csv('./data/test2_results.csv'),
    pd.read_csv('./data/test3_results.csv'),
    pd.read_csv('./data/test4_results.csv'),
)

In [None]:
# Distinct values of the 'model' column in test4, in order of first appearance.
model_list = pd.unique(test4['model'])

##### First, we check the maximum value for each method

In [None]:
# Keep only the IGNnet rows of test1, then list those with ACC >= 88,
# highest AUC first (ties ordered by F1_score, also descending).
IGNNet1 = test1.loc[test1['model'].eq('IGNnet')]
IGNNet1.loc[IGNNet1['ACC'].ge(88)].sort_values(by=['AUC', 'F1_score'], ascending=False)

In [None]:
# Keep only the IGNnet rows of test2, then list those with ACC >= 88,
# highest AUC first (ties ordered by F1_score, also descending).
IGNNet2 = test2.loc[test2['model'].eq('IGNnet')]
IGNNet2.loc[IGNNet2['ACC'].ge(88)].sort_values(by=['AUC', 'F1_score'], ascending=False)

In [None]:
# Keep only the IGNnet rows of test3, then list those with ACC >= 90,
# highest AUC first (ties ordered by F1_score, also descending).
IGNNet3 = test3.loc[test3['model'].eq('IGNnet')]
IGNNet3.loc[IGNNet3['ACC'].ge(90)].sort_values(by=['AUC', 'F1_score'], ascending=False)

In [None]:
# Keep only the IGNnet rows of test4, then list those with ACC >= 90,
# highest AUC first (ties ordered by F1_score, also descending).
IGNNet4 = test4.loc[test4['model'].eq('IGNnet')]
IGNNet4.loc[IGNNet4['ACC'].ge(90)].sort_values(by=['AUC', 'F1_score'], ascending=False)

##### Retrieve hyperparameters for reproduction.

In [None]:
# Print the best_params entry of the first IGNNet4 row whose AUC equals 82.
# Taking the first element raises IndexError when no row matches.
print(IGNNet4.loc[IGNNet4['AUC'] == 82, 'best_params'].iloc[0])

In [None]:
# Rank the ACC >= 90 rows of IGNNet4 by AUC, then F1_score (both descending),
# and keep the index label of the row at position 1, i.e. the second-ranked row.
ranked_ignnet4 = IGNNet4.loc[IGNNet4['ACC'] >= 90].sort_values(by=['AUC', 'F1_score'], ascending=False)
num_index = ranked_ignnet4.index[1]

In [None]:
# Number of distinct model names found in test4 (model_list is built from test4['model'].unique()).
# NOTE(review): this counts every model in that column, not only the traditional ones — confirm that is intended.
len_traditional = len(model_list)
len_traditional

Here, we compare the performance of IGNnet with traditional methods (e.g., LR, DT, SVM, ...).

In [None]:
# num_index is an index *label* (it was read from a DataFrame's .index), while
# .iloc is purely positional. Translate the label into its row position first so
# the slice stays correct even when test4 does not carry the default 0..n-1 index.
start_pos = test4.index.get_loc(num_index)

# Take len_traditional consecutive rows starting at that position and drop the
# first column.
get_perform = test4.iloc[start_pos:start_pos + len_traditional, 1:]
get_perform

In [None]:
# Show the best_params column of every IGNnet row taken from test4.
IGNNet4.loc[:, 'best_params']

##### Now, we compare the performance of the oversampling methods

In [None]:
# Oversampling strategies compared below; each name is looked up inside the
# best_params text of the result rows.
sampling_methods = [
    "borderline-smote",
    "smote",
    "adasyn",
    "over-random",
]

##### test1

In [None]:
# For each oversampling method, collect the IGNNet1 rows whose best_params
# contain "sampling:<method>" (rows with missing best_params never match).
over_data1 = {}
for method in sampling_methods:
    uses_method = IGNNet1["best_params"].str.contains(f"sampling:{method}", na=False)
    over_data1[method] = IGNNet1.loc[uses_method]

In [None]:
# Best IGNNet1 row per oversampling method: sort by AUC, then F1_score
# (both descending) and keep the top row. Raises IndexError if a method has no rows.
perform_df = [
    over_data1[sampling].sort_values(by=['AUC', 'F1_score'], ascending=False).iloc[0]
    for sampling in sampling_methods
]

In [None]:
# Turn the list of per-method rows into a DataFrame with a fresh 0..n-1 index.
# perform_df is rebound from a list to a DataFrame here, so this cell must not be
# re-run without first re-running the cell that builds the list.
records = [row.to_dict() for row in perform_df]
perform_df = pd.DataFrame(records)

In [None]:
# Display the best test1 row for each oversampling method.
perform_df

##### test2

In [None]:
# Split the IGNNet2 rows by the oversampling method named in best_params
# (rows with missing best_params never match).
over_data2 = {}
for method in sampling_methods:
    uses_method = IGNNet2["best_params"].str.contains(f"sampling:{method}", na=False)
    over_data2[method] = IGNNet2.loc[uses_method]

# For every method keep the top row after sorting by AUC, then F1_score
# (both descending). Raises IndexError if a method has no rows.
best_rows2 = [
    over_data2[sampling].sort_values(by=['AUC', 'F1_score'], ascending=False).iloc[0]
    for sampling in sampling_methods
]

# One row per method, with a fresh 0..n-1 index.
perform_df2 = pd.DataFrame([row.to_dict() for row in best_rows2])
perform_df2

##### test3

In [None]:
# Split the IGNNet3 rows by the oversampling method named in best_params
# (rows with missing best_params never match).
over_data3 = {}
for method in sampling_methods:
    uses_method = IGNNet3["best_params"].str.contains(f"sampling:{method}", na=False)
    over_data3[method] = IGNNet3.loc[uses_method]

# For every method keep the top row after sorting by AUC, then F1_score
# (both descending). Raises IndexError if a method has no rows.
best_rows3 = [
    over_data3[sampling].sort_values(by=['AUC', 'F1_score'], ascending=False).iloc[0]
    for sampling in sampling_methods
]

# One row per method, with a fresh 0..n-1 index.
perform_df3 = pd.DataFrame([row.to_dict() for row in best_rows3])
perform_df3

##### test4

In [None]:
# Split the IGNNet4 rows by the oversampling method named in best_params
# (rows with missing best_params never match).
over_data4 = {}
for method in sampling_methods:
    uses_method = IGNNet4["best_params"].str.contains(f"sampling:{method}", na=False)
    over_data4[method] = IGNNet4.loc[uses_method]

# For every method keep the top row after sorting by AUC, then F1_score
# (both descending). Raises IndexError if a method has no rows.
best_rows4 = [
    over_data4[sampling].sort_values(by=['AUC', 'F1_score'], ascending=False).iloc[0]
    for sampling in sampling_methods
]

# One row per method, with a fresh 0..n-1 index.
perform_df4 = pd.DataFrame([row.to_dict() for row in best_rows4])
perform_df4

In [None]:
# Stack the four per-test summaries into one frame with a fresh 0..n-1 index.
per_test_frames = [perform_df, perform_df2, perform_df3, perform_df4]
perform_all = pd.concat(per_test_frames, ignore_index=True)

In [None]:
# Display the stacked per-test, per-method summary.
perform_all

#### ALL Performance

In [None]:
def _sampling_of(params):
    """Return the first method whose "sampling:<method>" tag occurs in params, else None."""
    text = str(params)
    return next((m for m in sampling_methods if f"sampling:{m}" in text), None)


# Tag every row with the oversampling method named in its best_params.
# Matching the full "sampling:<method>" fragment (the same fragment the per-test
# filters look for) keeps "smote" from being confused with "borderline-smote"
# regardless of the order of sampling_methods.
perform_all['sampling_method'] = perform_all['best_params'].apply(_sampling_of)

# Per method, locate the row with the highest AUC and the row with the highest F1_score.
best_auc_idx = perform_all.groupby("sampling_method")["AUC"].idxmax()
best_f1_idx = perform_all.groupby("sampling_method")["F1_score"].idxmax()
best_auc = perform_all.loc[best_auc_idx]
best_f1 = perform_all.loc[best_f1_idx]

# One or two candidate rows per method (one when the same row wins both metrics).
merged_perform_all = pd.concat([best_auc, best_f1]).drop_duplicates().reset_index(drop=True)

# Choose one row per method. When two candidates remain, the best-AUC row cannot
# also have a strictly higher F1_score than the best-F1 row (and vice versa), so
# neither candidate can dominate the other and the choice always comes down to
# the mean of AUC and F1_score. Rank by that mean directly; a single candidate
# is trivially its own winner.
final_rows = []
for method in sampling_methods:
    subset = merged_perform_all[merged_perform_all['sampling_method'] == method]
    if subset.empty:
        # No result rows for this method: nothing to report.
        continue
    scored = subset.assign(mean_score=(subset["AUC"] + subset["F1_score"]) / 2)
    final_rows.append(scored.sort_values("mean_score", ascending=False).iloc[0])

# mean_score is now present on every selected row, so dropping it no longer
# raises KeyError when no method needed a tie-break. errors='ignore' additionally
# keeps the cell from failing when final_rows is empty and the frame has no columns.
final_perform_all = (
    pd.DataFrame(final_rows)
    .reset_index(drop=True)
    .drop(columns=['dataset', 'best_params', 'mean_score'], errors='ignore')
)

In [None]:
# Display the single selected row per oversampling method.
final_perform_all