### Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier

from imblearn.metrics import geometric_mean_score
from imblearn.metrics import specificity_score
from sklearn.model_selection import  cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer

import dataframe_image as dfi

In [2]:
def evaluate_performance(X, y, features, model=None):
    """Cross-validate ``model`` on a subset of columns and return mean test scores.

    The estimator is placed behind a ``MinMaxScaler`` in a pipeline and scored
    with repeated stratified k-fold cross-validation (10 splits, 3 repeats,
    ``random_state=42``). Because the whole pipeline is handed to
    ``cross_validate``, the scaler is part of what gets fitted per split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; it is indexed with ``features`` to select the columns.
    y : array-like
        Target labels forwarded to ``cross_validate``.
    features : list
        Column labels of ``X`` to evaluate.
    model : estimator, optional
        Classifier to evaluate. When omitted, a fresh
        ``RandomForestClassifier(random_state=42)`` is created for this call.

    Returns
    -------
    dict
        Mean test score per metric, keyed (in this order) by ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'roc_auc'``, ``'gmean'``,
        ``'specificity'`` and ``'mcc'``.
    """
    # Build the default per call: a default expression in the signature is
    # evaluated once at definition time and would be shared by every call.
    if model is None:
        model = RandomForestClassifier(random_state=42)

    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', model)
    ])

    X_selected = X[features]

    kfold = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42)

    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'roc_auc': 'roc_auc',
        'g_mean': make_scorer(geometric_mean_score),
        'specificity': make_scorer(specificity_score),
        'mcc': 'matthews_corrcoef'
    }

    results = cross_validate(pipeline, X_selected, y,
                             cv=kfold,
                             scoring=scoring_metrics,
                             n_jobs=-1,
                             return_train_score=False)

    # The geometric mean is scored as 'g_mean' but reported as 'gmean';
    # every other metric keeps its scoring name in the returned dict.
    result = {}
    for metric_name in scoring_metrics:
        output_key = 'gmean' if metric_name == 'g_mean' else metric_name
        result[output_key] = np.mean(results[f'test_{metric_name}'])

    return result

def load_data(dataset):
    """Read the encoded table for ``dataset`` and split it into features and target.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder under ``../Data & Outputs``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``'churn'`` column and ``X`` is every
        other column of ``df_encoded.csv``.
    """
    target_column = 'churn'
    encoded = pd.read_csv(f'../Data & Outputs/{dataset}/df_encoded.csv')

    target = encoded[target_column]
    predictors = encoded.drop(columns=[target_column])
    return predictors, target
    
def create_model(classifier):
    """Return a new classifier instance for the given short code.

    Parameters
    ----------
    classifier : str
        ``'RF'`` selects ``RandomForestClassifier`` and ``'BRF'`` selects
        ``BalancedRandomForestClassifier``. Any other value (including
        ``'XGB'``) falls through to ``XGBClassifier``.

    Returns
    -------
    estimator
        The selected classifier built with ``n_jobs=-1, random_state=42``.
    """
    # Choose the class first, then instantiate once with the shared settings.
    if classifier == 'RF':
        estimator_cls = RandomForestClassifier
    elif classifier == 'BRF':
        estimator_cls = BalancedRandomForestClassifier
    else:
        estimator_cls = XGBClassifier
    return estimator_cls(n_jobs=-1, random_state=42)

def load_uni_selected_features(dataset):
    """Load the two univariate feature selections pickled for ``dataset``.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder under ``../Data & Outputs``; it is also the
        prefix of the pickle file name.

    Returns
    -------
    tuple
        ``(corr_features, chi_sq_features)`` exactly as stored in the pickle,
        which must hold two items.
    """
    pickle_path = f'../Data & Outputs/{dataset}/{dataset}_uni_selected_features.pkl'
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(pickle_path, 'rb') as handle:
        stored = pickle.load(handle)

    corr_features, chi_sq_features = stored
    return corr_features, chi_sq_features

def load_mul_selected_features(dataset, classifier):
    """Load the two multivariate feature selections pickled for a dataset/classifier pair.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder under ``../Data & Outputs``.
    classifier : str
        Name of the classifier sub-folder; together with ``dataset`` it forms
        the pickle file name.

    Returns
    -------
    tuple
        ``(seq_fea_sel_features, rfe_cv_features)`` exactly as stored in the
        pickle, which must hold two items.
    """
    pickle_path = (
        f'../Data & Outputs/{dataset}/{classifier}/'
        f'{dataset}_{classifier}_mul_selected_features.pkl'
    )
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(pickle_path, 'rb') as handle:
        stored = pickle.load(handle)

    seq_fea_sel_features, rfe_cv_features = stored
    return seq_fea_sel_features, rfe_cv_features

def create_results(X, y, model, corr_features, chi_sq_features, seq_fea_sel_features, rfe_cv_features):
    """Evaluate ``model`` on each candidate feature set and tabulate the scores.

    Parameters
    ----------
    X, y
        Feature table and target, forwarded to ``evaluate_performance``.
    model : estimator
        Classifier forwarded to ``evaluate_performance``.
    corr_features, chi_sq_features, seq_fea_sel_features, rfe_cv_features
        Column subsets of ``X`` to compare against the full column set.

    Returns
    -------
    pandas.DataFrame
        One row per feature set, indexed ``'base'``, ``'correlation'``,
        ``'chi2'``, ``'sfs'``, ``'rfe'``; one column per metric returned by
        ``evaluate_performance``.
    """
    # Row label -> feature subset, in the order the rows appear in the table.
    feature_sets = {
        'base': list(X.columns),
        'correlation': corr_features,
        'chi2': chi_sq_features,
        'sfs': seq_fea_sel_features,
        'rfe': rfe_cv_features,
    }

    score_rows = []
    for subset in feature_sets.values():
        score_rows.append(evaluate_performance(X, y, subset, model=model))

    return pd.DataFrame(score_rows, index=list(feature_sets))

def save_results(df_result, dataset, classifier):
    """Export the results table as a PNG with the per-column maximum highlighted.

    Parameters
    ----------
    df_result : pandas.DataFrame
        Table to render.
    dataset : str
        Dataset folder name under ``../Data & Outputs``.
    classifier : str
        Classifier sub-folder name; together with ``dataset`` it forms the
        output file name.
    """
    # Alternative outputs, currently disabled:
    # df_result.to_csv(f'../Data & Outputs/{dataset}/{classifier}/{dataset}_{classifier}_results.csv')
    # df_result.style.to_latex(f'../Data & Outputs/{dataset}/{classifier}/{dataset}_{classifier}_results.tex')
    # df_result.style.background_gradient()
    output_png = f'../Data & Outputs/{dataset}/{classifier}/{dataset}_{classifier}_results.png'

    # axis=0 applies the highlight to the maximum of each column.
    highlighted = df_result.style.highlight_max(color='lightgreen', axis=0)
    dfi.export(highlighted, output_png, dpi=300)

def main():
    """Evaluate every dataset/classifier combination and save one results image each.

    For each dataset ('Orange', 'IBM') the encoded data and the univariate
    feature selections are loaded once. For each classifier ('XGB', 'RF',
    'BRF') the classifier-specific selections are then loaded, all feature
    sets are scored and the table is exported via ``save_results``.
    """
    for dataset in ['Orange', 'IBM']:
        print("Dataset: ", dataset)
        X, y = load_data(dataset)

        # The univariate selections depend only on the dataset, so read the
        # pickle once here rather than once per classifier.
        corr_features, chi_sq_features = load_uni_selected_features(dataset)

        for classifier in ['XGB', 'RF', 'BRF']:
            print("Classifier: ", classifier)

            model = create_model(classifier)

            seq_fea_sel_features, rfe_cv_features = load_mul_selected_features(dataset, classifier)

            df_result = create_results(X, y, model,
                                       corr_features, chi_sq_features,
                                       seq_fea_sel_features, rfe_cv_features)

            save_results(df_result, dataset, classifier)

In [3]:
# Entry point: run the full evaluation when this code is executed directly
# (the module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Dataset:  Orange
Classifier:  XGB
Classifier:  RF
Classifier:  BRF
Dataset:  IBM
Classifier:  XGB
Classifier:  RF
Classifier:  BRF
