In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import time
import logging

## Feature-scaling stack
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, OneHotEncoder, FunctionTransformer

## Dimesionality reduction
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

## Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN

## Machine-learning stack
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier 
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

## Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_curve, roc_auc_score, auc, fbeta_score, f1_score

## Model saving
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

# Control Variables

In [2]:
ENABLE = {
    'rf':     {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'xgb':    {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'lgb':    {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'cb':     {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'ada':    {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'svc':    {'cross-val':    0,
               'compute-pauc': 0,
               'final-train':  0,
               'save-model':   0,
              },
    'soft-v': {'cross-val':    0,
               'compute-pauc': 1,
               'final-train':  1,
               'save-model':   1,
              },
    'lr-v':   {'cross-val':    1,
               'compute-pauc': 1,
               'final-train':  1,
               'save-model':   1,
              },
}

# Evaluation Functions

In [3]:
def partial_auc_score(y_actual, y_scores, tpr_threshold=0.80):
    max_fpr = 1 - tpr_threshold

    # create numpy arrays
    y_actual = np.asarray(y_actual)
    y_scores = np.asarray(y_scores)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_actual, y_scores)

    # Find the index where fpr exceeds max_fpr
    stop_index = np.searchsorted(fpr, max_fpr, side='right')

    if stop_index < len(fpr):
        # Interpolate to find the TPR at max_fpr
        fpr_interp_points = [fpr[stop_index - 1], fpr[stop_index]]
        tpr_interp_points = [tpr[stop_index - 1], tpr[stop_index]]
        tpr = np.append(tpr[:stop_index], np.interp(max_fpr, fpr_interp_points, tpr_interp_points))
        fpr = np.append(fpr[:stop_index], max_fpr)
    else:
        tpr = np.append(tpr, 1.0)
        fpr = np.append(fpr, max_fpr)

    # Calculate partial AUC
    partial_auc_value = auc(fpr, tpr)

    return partial_auc_value

def cross_val_partial_auc_score(X, y, model, n_splits):

     # Setup cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pauc_scores = []
    cont = 1
    for train_idx, val_idx in skf.split(X, y):

        print(f'Processing fold {cont} of {n_splits}... ', end='', flush=True)
        
        # Create the folds
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                
        # Train the model
        model.fit(X_train_fold, y_train_fold)
    
        # Predict on the validation set
        preds = model.predict_proba(X_val_fold)[:,1]
   
        # Calculate partical AUC and store it
        pauc = partial_auc_score(y_val_fold, preds)
        pauc_scores.append(pauc)

        print(f'pAUC: {pauc}', flush=True)
        
        cont = cont + 1

    #print("\n")
    
    # Return the average
    return np.mean(pauc_scores)

# Model Loading and Data Analysis

In [4]:
# Read the metadata-based feature set
ROOT_DATASET_DIR = "./"
file_name_data = os.path.join(ROOT_DATASET_DIR,"train-metadata-eda-fe.csv")
df_data = pd.read_csv(file_name_data)

# Read the image (pixel)-based feature set
file_name_img = os.path.join(ROOT_DATASET_DIR,"train-cnn-features-rn152v2.csv")
df_img = pd.read_csv(file_name_img)

In [5]:
df_data.drop(columns=['isic_id'], inplace=True)
df_data['anatom_site_general'] = pd.Categorical(df_data['anatom_site_general'])
df_data['tbp_lv_location'] = pd.Categorical(df_data['tbp_lv_location'])
df_data['tbp_lv_location_simple'] = pd.Categorical(df_data['tbp_lv_location_simple'])
df_data['sex'] = pd.Categorical(df_data['sex'])
df_img.drop(columns=['target'], inplace=True)
df = pd.concat([df_data, df_img], axis=1)

In [6]:
df.head(5)

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,...,im_feature_54,im_feature_55,im_feature_56,im_feature_57,im_feature_58,im_feature_59,im_feature_60,im_feature_61,im_feature_62,im_feature_63
0,0,60.0,0,lower extremity,3.04,0,20.244422,16.261975,26.922447,23.954773,...,3.312,2.363,6.31,0.0,0.0,0.832,2.627,3.842,2.242,0.0
1,0,60.0,0,head/neck,1.1,0,31.71257,25.36474,26.331,24.54929,...,3.277,2.197,5.42,0.0,0.0,0.6885,2.225,2.885,1.667,0.0
2,0,60.0,0,posterior torso,3.4,1,22.57583,17.12817,37.97046,33.48541,...,3.844,2.775,6.72,0.0,0.0,1.07,2.955,3.877,2.416,0.0
3,0,65.0,0,anterior torso,3.22,1,14.242329,12.164757,21.448144,21.121356,...,3.65,2.547,6.395,0.0,0.0,0.919,2.78,3.803,2.271,0.0
4,0,55.0,0,anterior torso,2.73,0,24.72552,20.05747,26.4649,25.71046,...,3.328,2.33,5.574,0.0,0.0,0.8813,2.488,3.332,2.027,0.0


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,395303.0,0.000994,0.031515,0.000000,0.000000,0.00000,0.000000,1.00000
age_approx,395303.0,57.946816,13.546105,5.000000,50.000000,60.00000,70.000000,85.00000
clin_size_long_diam_mm,395303.0,3.930147,1.741947,1.000000,2.840000,3.37000,4.380000,28.40000
tbp_tile_type,395303.0,0.710966,0.453314,0.000000,0.000000,1.00000,1.000000,1.00000
tbp_lv_A,395303.0,19.961896,3.993054,-2.487115,17.325051,19.79291,22.289341,48.18961
...,...,...,...,...,...,...,...,...
im_feature_59,395303.0,0.719197,0.257739,0.000000,0.587000,0.76000,0.893600,1.73500
im_feature_60,395303.0,2.097273,0.760955,0.000000,1.753000,2.23400,2.600000,4.73400
im_feature_61,395303.0,2.809590,1.064116,0.000000,2.287000,2.97500,3.518000,7.11300
im_feature_62,395303.0,1.712804,0.642774,0.000000,1.404000,1.83200,2.146000,3.98800


# Feature Preparation

Some new features are from other notebooks at: https://www.kaggle.com/competitions/isic-2024-challenge/code

In [8]:
# Original feature names
features_to_be_logtr = ['clin_size_long_diam_mm',
                        'tbp_lv_areaMM2',
                        'tbp_lv_area_perim_ratio',
                        'tbp_lv_color_std_mean',
                        'tbp_lv_deltaLB',
                        'tbp_lv_deltaLBnorm',
                        'tbp_lv_minorAxisMM',
                        'tbp_lv_norm_border',
                        'tbp_lv_norm_color',
                        'tbp_lv_perimeterMM',
                        'tbp_lv_radial_color_std_max',
                        'tbp_lv_stdL',
                        'tbp_lv_stdLExt',
                        'tbp_lv_symm_2axis']
features_to_be_sqrtr = ['tbp_lv_eccentricity']

# Modified original feature names
log_features = ['log_' + col for col in features_to_be_logtr]
sqr_features = ['sqr_' + col for col in features_to_be_sqrtr]

# New feature names
new_features_to_be_logtr = ['hue_contrast',
                            'luminance_contrast',
                            'lesion_color_difference',
                            'border_complexity',
                            'perimeter_to_area_ratio',
                            'area_to_perimeter_ratio',
                            'lesion_visibility_score',
                            'symmetry_border_consistency',
                            'consistency_symmetry_border',
                            'consistency_color',
                            'size_age_interaction',
                            'lesion_severity_index',
                            'shape_complexity_index',
                            'std_dev_contrast',
                            'color_shape_composite_index',
                            'symmetry_perimeter_interaction',
                            'comprehensive_lesion_index',
                            'border_color_interaction',
                            'size_color_contrast_ratio',
                            'age_normalized_nevi_confidence',
                            'volume_approximation_3d',
                            'color_range',
                            'age_size_symmetry_index',
                            'index_age_size_symmetry']
new_features_to_be_sqrtr = ['lesion_shape_index',
                            'position_distance_3d']
new_features_to_be_sqrttr = ['color_consistency',
                             'hue_color_std_interaction',
                             'normalized_lesion_size',                            
                             'color_variance_ratio',
                             'color_asymmetry_index',
                             'shape_color_consistency']

# Modify the column names
log_new_features = ['log_' + col for col in new_features_to_be_logtr]
sqr_new_features = ['sqr_' + col for col in new_features_to_be_sqrtr]
sqrt_new_features = ['sqrt_' + col for col in new_features_to_be_sqrttr]

# Train-test Split

In [9]:
# Train-Test split
# Drop non-used features for the baseline + target
X = df.drop(['target']
             + features_to_be_logtr + features_to_be_sqrtr                                         # drop original features with skeweness (no transformation)                          
             + new_features_to_be_logtr + new_features_to_be_sqrtr + new_features_to_be_sqrttr,    # drop new features with skeweness (no transformation)             
             axis=1)
y = df['target']
#X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=TRAIN_TEST_SPLIT, random_state=42, stratify=y)

In [10]:
numerical_features = X.select_dtypes(include=['float64','int64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()
print(f"Numerical features: {numerical_features} - Length: {len(numerical_features)}")
print(f"Categorical features: {categorical_features} - Length: {len(categorical_features)}")

Numerical features: ['age_approx', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_nevi_confidence', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'log_clin_size_long_diam_mm', 'log_tbp_lv_areaMM2', 'log_tbp_lv_area_perim_ratio', 'log_tbp_lv_color_std_mean', 'log_tbp_lv_deltaLB', 'log_tbp_lv_deltaLBnorm', 'log_tbp_lv_minorAxisMM', 'log_tbp_lv_norm_border', 'log_tbp_lv_norm_color', 'log_tbp_lv_perimeterMM', 'log_tbp_lv_radial_color_std_max', 'log_tbp_lv_stdL', 'log_tbp_lv_stdLExt', 'log_tbp_lv_symm_2axis', 'sqr_tbp_lv_eccentricity', 'lesion_size_ratio', 'color_contrast_index', 'log_lesion_area', 'mean_hue_difference', 'lesion_orientation_3d', 'overall_color_difference', 'border_color_interaction_2', 'age_normalized_nevi_confidence_2', 'border_length_ratio', 'log_hue_contrast', 'log_luminance_contrast', 'log_le

# Dimensionality Reduction

In [11]:
# Use SelectKBest (f_classif) for numerical features
Kbest_numerical = SelectKBest(score_func=f_classif, k='all')
Kbest_numerical.fit(X[numerical_features], y)

# Extract feature scores and p-values
scores = Kbest_numerical.scores_
pvalues = Kbest_numerical.pvalues_

# Create a DataFrame to save feature names, scores, and p-values
feature_scores = pd.DataFrame({
    'Feature': numerical_features,
    'Score': scores,
    'P-Value': pvalues
})

# Sort features by 'Score'
best_feature_scores = feature_scores[feature_scores['P-Value'] < 0.05]
sorted_features = best_feature_scores.sort_values(by='Score', ascending=False)

# Display the sorted features
KBEST_NUM = sorted_features.shape[0]
print(f"Number of relevant numerical features: {KBEST_NUM}\n")
print(sorted_features.set_index('Feature'))

# Use SelectKBest (chi2) for categorical features

# Build a pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

pipeline = Pipeline(steps=[
    ('onehot', preprocessor),
    ('kbest', SelectKBest(score_func=chi2, k='all'))
])

cat_transformed = pipeline.fit_transform(X, y)
Kbest_categorical = pipeline.named_steps['kbest']

# Extract feature scores and p-values
scores = Kbest_categorical.scores_
pvalues = Kbest_categorical.pvalues_

# Extract feature names after one-hot encoding
one_hot_feature_names = pipeline.named_steps['onehot'].transformers_[0][1].get_feature_names_out(categorical_features)

# Create a DataFrame to hold feature names, scores, and p-values
feature_scores = pd.DataFrame({
    'Feature': one_hot_feature_names,
    'Score': scores,
    'P-Value': pvalues
})

# Sort features by their scores
best_feature_scores = feature_scores[feature_scores['P-Value'] < 0.05]
sorted_features = best_feature_scores.sort_values(by='Score', ascending=False)

# Display the sorted features
KBEST_CAT = sorted_features.shape[0]
print(f"Number of relevant categorical features: {KBEST_CAT}\n")
print(sorted_features.set_index('Feature'))

Number of relevant numerical features: 131

                                        Score   P-Value
Feature                                                
im_feature_28                    38498.512139  0.000000
im_feature_17                    37569.437479  0.000000
im_feature_10                    37105.093383  0.000000
im_feature_29                    37097.916715  0.000000
im_feature_19                    36514.406202  0.000000
...                                       ...       ...
lesion_size_ratio                   10.627077  0.001115
log_symmetry_border_consistency      8.738379  0.003116
tbp_lv_L                             6.699025  0.009647
sqrt_hue_color_std_interaction       6.449304  0.011100
sqrt_color_variance_ratio            3.877523  0.048937

[131 rows x 2 columns]
Number of relevant categorical features: 8

                                              Score       P-Value
Feature                                                          
anatom_site_general_head/nec

# Preprocessing Pipeline 

In [12]:
# Pipeline
from sklearn.preprocessing import FunctionTransformer
   
pipe_num = Pipeline([
    ('scaler', RobustScaler()),
    ('kbest', SelectKBest(score_func=f_classif, k=KBEST_NUM)),   
])

pipe_cat = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)),
    ('kbest', SelectKBest(score_func=chi2, k=KBEST_CAT)),    
])

preprocessing = ColumnTransformer(transformers=[
    ('numerical', pipe_num, numerical_features),
    ('categorical',pipe_cat, categorical_features)
])

# Balanced Random Forest

### Cross-validation

In [13]:
if ENABLE['rf']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 100, 400)
        max_depth = trial.suggest_int('max_depth', 10, 30)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10) #7)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10) #5)   
        bootstrap=True
        class_weight='balanced_subsample'
    
        pauc_scores = []
    
        for train_idx, val_idx in skf.split(X, y):
            
            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                        
            # Pipeline                           
            pipe_rf = ImbPipeline([
                ('preprocessing', preprocessing),
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
                ('RF', BalancedRandomForestClassifier(random_state=42,
                                                      n_estimators=n_estimators,
                                                      max_depth=max_depth,
                                                      min_samples_split=min_samples_split,
                                                      min_samples_leaf=min_samples_leaf,
                                                      bootstrap=bootstrap,
                                                      class_weight=class_weight
                                                     )
                )
            ])
    
            # Train the model
            pipe_rf.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_rf.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')
    
#Best trial number: 18
#Best value (partial auc - 0.8): 0.19099394435542486
#Best hyperparameters: {'n_estimators': 138, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 8}

### Cross-validation Partial AUC Score

In [14]:
param_rf = {
        'random_state': 42,
        'n_estimators': 138,
        'max_depth': 12,
        'min_samples_split': 5,
        'min_samples_leaf': 8,
        'bootstrap': True,
        'class_weight': 'balanced_subsample',
        'n_jobs': -1
}

model_rf_cv = ImbPipeline([    
    ('preprocessing', preprocessing),
    ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
    ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
    ('RF',  BalancedRandomForestClassifier(**param_rf))
])

if ENABLE['rf']['compute-pauc'] == 1:
    pauc_rf_cv = cross_val_partial_auc_score(X, y, model_rf_cv, n_splits=5)    
    print(f"CV Partial AUC Score: {pauc_rf_cv}")

### Final Training

In [15]:
if ENABLE['rf']['final-train'] == 1:
    
    model_rf_fe139_rsmpl = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('RF',  BalancedRandomForestClassifier(**param_rf))
    ])
    
    model_rf_fe139_rsmpl.fit(X, y)

### Model Saving

In [16]:
if ENABLE['rf']['save-model'] == 1:
    dump(model_rf_fe139_rsmpl, 'model_rf_fe139_rsmpl.pkl')

# XGBoost

### Cross-validation

In [17]:
if ENABLE['xgb']['cross-val'] == 1:

    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):

        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 200, 400)
        learning_rate = trial.suggest_float('learning_rate', 0.080, 0.090)
        reg_lambda = trial.suggest_float('reg_lambda', 5.0, 10.0)
        alpha = trial.suggest_float('alpha', 0.5, 0.8)
        max_depth = trial.suggest_int('max_depth', 10, 31)
        subsample = trial.suggest_float('subsample', 0.45, 0.70)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.80, 1.0)
        colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.4, 0.7)
        scale_pos_weight = trial.suggest_float('scale_pos_weight', 1, 10)
        eval_metric = 'logloss'
        enable_categorical = True
    
        pauc_scores = []
    
        for train_idx, val_idx in skf.split(X, y):
            
            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                    
            # Pipeline                           
            pipe_xgb = ImbPipeline([
                ('preprocessing', preprocessing),
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
                ('XGB', XGBClassifier(random_state=42,
                                      enable_categorical=enable_categorical,
                                      eval_metric=eval_metric,
                                      n_estimators=n_estimators,
                                      learning_rate=learning_rate,
                                      reg_lambda=reg_lambda,
                                      alpha=alpha,
                                      max_depth=max_depth,
                                      subsample=subsample,
                                      colsample_bytree=colsample_bytree,
                                      colsample_bylevel=colsample_bylevel,                             
                                      scale_pos_weight=scale_pos_weight,
                                    )
                )
            ])
    
            # Train the model
            pipe_xgb.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_xgb.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')
    
#Best trial number: 80
#Best value (partial auc - 0.8): 0.1917259326323112
#Best hyperparameters: {'n_estimators': 208, 'learning_rate': 0.08256404336718554, 'reg_lambda': 6.1374965163887305, 'alpha': 0.6026870768041418, 'max_depth': 21, 'subsample': 0.47434308951793386, 'colsample_bytree': 0.9081510010036246, 'colsample_bylevel': 0.5171030271682259, 'scale_pos_weight': 9.454864886835315}

### Cross-validation Partial AUC Score

In [18]:
param_xgb = {
        'random_state':      42,
        'n_estimators':      208,
        'learning_rate':     0.08256404336718554,
        'reg_lambda':        6.1374965163887305,
        'alpha':             0.6026870768041418,
        'max_depth':         21,
        'subsample':         0.47434308951793386,
        'colsample_bytree':  0.9081510010036246,
        'colsample_bylevel': 0.5171030271682259,
        'scale_pos_weight':  9.454864886835315
    }

model_xgb_cv = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('RF',  XGBClassifier(**param_xgb))
    ])

if ENABLE['xgb']['compute-pauc'] == 1:
    pauc_xgb_cv = cross_val_partial_auc_score(X, y, model_xgb_cv, n_splits=5)    
    print(f"CV Partial AUC Score: {pauc_xgb_cv}")

### Final Training

In [19]:
if ENABLE['xgb']['final-train'] == 1:
                           
    model_xgb_fe139_rsmpl = ImbPipeline([
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('XGB', XGBClassifier(**param_xgb))
    ])
    
    model_xgb_fe139_rsmpl.fit(X, y)

### Model Saving

In [20]:
if ENABLE['xgb']['save-model'] == 1:
    dump(model_xgb_fe139_rsmpl, 'model_xgb_fe139_rsmpl.pkl')

# LightGBM

### Cross-validation

In [21]:
if ENABLE['lgb']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):
        
        # Suggest values for the hyperparameters
        random_state = 42
        objective = 'binary'
        boosting_type = 'gbdt'
        verbosity = -1
        n_estimators = trial.suggest_int('n_estimators', 200, 500)    
        lambda_l1 = trial.suggest_float('lambda_l1', 0.05, 0.10)
        lambda_l2 = trial.suggest_float('lambda_l2', 0.001, 0.010)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0)
        max_depth = trial.suggest_int('max_depth', 20, 50)    
        num_leaves = trial.suggest_int('num_leaves', 20, 20)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1.0)
        colsample_bynode = trial.suggest_float('colsample_bynode', 0.1, 1.0)
        bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0) 
        bagging_freq = trial.suggest_int('bagging_freq', 0, 15)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 10)
        scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 10.0)
    
        pauc_scores = []
    
        for train_idx, val_idx in skf.split(X, y):

            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                        
            # Pipeline
            pipe_lgb = ImbPipeline([
                ('preprocessing', preprocessing),
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
                ('LGB', LGBMClassifier(random_state=random_state,
                                       verbosity=verbosity,
                                       objective=objective,
                                       boosting_type=boosting_type,                                   
                                       n_estimators=n_estimators,
                                       lambda_l1=lambda_l1,
                                       lambda_l2=lambda_l2,
                                       learning_rate=learning_rate,
                                       max_depth=max_depth,
                                       num_leaves=num_leaves,
                                       colsample_bytree=colsample_bytree,
                                       colsample_bynode=colsample_bynode,
                                       bagging_fraction=bagging_fraction,
                                       bagging_freq=bagging_freq,
                                       min_data_in_leaf=min_data_in_leaf,
                                       scale_pos_weight=scale_pos_weight
                                      )         
                )
            ])
    
            # Train the model
            pipe_lgb.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_lgb.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')
        
#Best value (partial auc - 0.8): 0.19165167013143908
#Best hyperparameters: {'n_estimators': 390, 'lambda_l1': 0.05127158993500609, 'lambda_l2': 0.002856268451232568, 'learning_rate': 0.016316428473444125, 'max_depth': 32, 'num_leaves': 20, 'colsample_bytree': 0.501841654985379, 'colsample_bynode': 0.3253277554214308, 'bagging_fraction': 0.22054551329713584, 'bagging_freq': 5, 'min_data_in_leaf': 9, 'scale_pos_weight': 1.0154421072327424}

### Cross-validation Partial AUC Score

In [22]:
param_lgb = {
        'random_state':     42,
        'objective':        'binary',
        'boosting_type':    'gbdt',
        'verbosity':        -1,
        'n_estimators':     390,
        'lambda_l1':        0.05127158993500609,
        'lambda_l2':        0.002856268451232568,
        'learning_rate':    0.016316428473444125,
        'max_depth':        32,
        'num_leaves':       20,
        'colsample_bytree': 0.501841654985379, 
        'colsample_bynode': 0.3253277554214308,
        'bagging_fraction': 0.22054551329713584,
        'bagging_freq':     5,
        'min_data_in_leaf': 9,
        'scale_pos_weight': 1.0154421072327424,
    }


model_lgb_cv = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('LGB',  LGBMClassifier(**param_lgb))
    ])

if ENABLE['lgb']['compute-pauc'] == 1:
    pauc_lgb_cv = cross_val_partial_auc_score(X, y, model_lgb_cv, n_splits=5)
    print(f"CV Partial AUC Score: {pauc_lgb_cv}")

### Final Training

In [23]:
if ENABLE['lgb']['final-train'] == 1:
    
    # Pipeline                          
    model_lgb_fe139_rsmpl = ImbPipeline([
        ('preprocessing', preprocessing),  
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),    
        ('LGB', LGBMClassifier(**param_lgb))
    ])
    
    model_lgb_fe139_rsmpl.fit(X, y)

### Model Saving

In [24]:
if ENABLE['lgb']['save-model'] == 1:
    dump(model_lgb_fe139_rsmpl, 'model_lgb_fe139_rsmpl.pkl')

# CatBoost

### Cross-validation

In [25]:
if ENABLE['cb']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):
            
        # Suggest values for the hyperparameters
        random_state = 42
        loss_function = 'Logloss'
        verbose = False
        n_estimators = trial.suggest_int('n_estimators', 200, 400)    
        max_depth = trial.suggest_int('max_depth', 1, 16)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0)    
        scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 10.0) #4.0)
        l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1.0, 10.0) #8.0)
        subsample = trial.suggest_float('subsample', 0.1, 1.0) #0.8)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 35) # 15, 35)
        
        pauc_scores = []
        
        for train_idx, val_idx in skf.split(X, y):

            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                    
            # Pipeline
            pipe_cb = ImbPipeline([
                ('preprocessing', preprocessing),  
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),    
                ('CAT', CatBoostClassifier(random_state=random_state,                               
                                           loss_function=loss_function,
                                           verbose=verbose,
                                           n_estimators=n_estimators,                               
                                           max_depth=max_depth,
                                           learning_rate=learning_rate,
                                           scale_pos_weight=scale_pos_weight,
                                           l2_leaf_reg=l2_leaf_reg,
                                           subsample=subsample,
                                           min_data_in_leaf=min_data_in_leaf,
                                        )         
                )
            ])
    
            # Train the model
            pipe_cb.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_cb.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')
    
# TRIAL
#Trial 30 finished with value: 0.19182318666176065
#and parameters: {'n_estimators': 352, 'max_depth': 12, 'learning_rate': 0.010916211896675203, 'scale_pos_weight': 1.1308423310589069, 'l2_leaf_reg': 9.938943422516182, 'subsample': 0.48191020552292485, 'min_data_in_leaf': 35}. Best is trial 30 with value: 0.19182318666176065.

### Cross-validation Partial AUC Score

In [26]:
param_cb = {
        'random_state':     42,
        'loss_function':    'Logloss',
        'verbose':          False,
        'n_estimators':     352,
        'max_depth':        12,
        'learning_rate':    0.010916211896675203,
        'scale_pos_weight': 1.1308423310589069,
        'l2_leaf_reg':      9.938943422516182,
        'subsample':        0.48191020552292485, 
        'min_data_in_leaf': 35
    }

model_cb_cv = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('CB',  CatBoostClassifier(**param_cb))
    ])

if ENABLE['cb']['compute-pauc'] == 1:
    pauc_cb_cv = cross_val_partial_auc_score(X, y, model_cb_cv, n_splits=5)
    print(f"CV Partial AUC Score: {pauc_cb_cv}")

### Final Training

In [27]:
if ENABLE['cb']['final-train'] == 1:
        
    model_cb_fe139_rsmpl = ImbPipeline([
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('CAT', CatBoostClassifier(**param_cb)         
        )
    ])
    
    model_cb_fe139_rsmpl.fit(X, y)

### Model Saving

In [28]:
if ENABLE['cb']['save-model'] == 1:
    dump(model_cb_fe139_rsmpl, 'model_cb_fe139_rsmpl.pkl')

# AdaBoost

### Cross-validation

In [29]:
if ENABLE['ada']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):
            
        # Suggest values for the hyperparameters
        random_state = 42
        n_estimators = trial.suggest_int('n_estimators', 50, 400)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0) 
        
        pauc_scores = []

        for train_idx, val_idx in skf.split(X, y):
            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            # Pipeline                                       
            pipe_ada = ImbPipeline([
                ('preprocessing', preprocessing),
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
                ('ADA', AdaBoostClassifier(random_state=42,                             
                                           n_estimators=n_estimators,
                                           learning_rate=learning_rate                             
                                          )
                )
            ])
    
            # Train the model
            pipe_ada.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_ada.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')

#Best trial number: 53
#Best value (partial auc - 0.8): 0.19173484739529267
#Best hyperparameters: {'n_estimators': 352, 'learning_rate': 0.022798770211700257}

#Trial 38 finished with value: 0.19175458060851266 and parameters: {'n_estimators': 276, 'learning_rate': 0.02915033101191371}. Best is trial 38 with value: 0.19175458060851266.

### Cross-validaton Partial AUC Score

In [30]:
param_ada = {
        'random_state':  42,
        'n_estimators':  276, #352,  
        'learning_rate': 0.02915033101191371 #0.022798770211700257
    }

model_ada_cv = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('ADA', AdaBoostClassifier(**param_ada))
    ])

if ENABLE['ada']['compute-pauc'] == 1:
    pauc_ada_cv = cross_val_partial_auc_score(X, y, model_ada_cv, n_splits=5)    
    print(f"CV Partial AUC Score: {pauc_ada_cv}")

### Final Training

In [31]:
if ENABLE['ada']['final-train'] == 1:
    
    # Pipeline                              
    model_ada_fe139_rsmpl = ImbPipeline([
        ('preprocessing', preprocessing),  
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),    
        ('ADA', AdaBoostClassifier(**param_ada)
        )
    ])
    
    model_ada_fe139_rsmpl.fit(X, y)

### Model Saving

In [32]:
if ENABLE['ada']['save-model'] == 1:
    dump(model_ada_fe139_rsmpl, 'model_ada_fe139_rsmpl.pkl')

# SVC

In [33]:
if ENABLE['svc']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function
    def objective(trial):
            
        # Suggest values for the hyperparameters
        random_state = 42
        probability = True
        kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        C = trial.suggest_loguniform('C', 0.0001, 10.0)
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        #class_weight = trial.suggest_categorical('class_weight', [None, 'balanced']) 
        class_weight = 'balanced'
        pauc_scores = []
        
        for train_idx, val_idx in skf.split(X, y):
            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                    
            # Pipeline
            pipe_svc = ImbPipeline([
                ('preprocessing', preprocessing),  
                ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
                ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),    
                ('SVC', SVC(random_state=random_state,
                            probability=probability,
                            kernel=kernel,
                            C=C,
                            gamma=gamma,
                            class_weight=class_weight
                           )
                )
            ])
    
            # Train the model
            pipe_svc.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_svc.predict_proba(X_val_fold)[:,1]
        
            # Calculate partical AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')

#Best value (partial auc - 0.8): 0.19117447674037716
#Best hyperparameters: {'kernel': 'poly', 'C': 0.0061928828007692584, 'gamma': 'scale', 'class_weight': 'balanced'}.

#Trial 27 finished with value: 0.19127118606665136 and parameters: {'kernel': 'linear', 'C': 0.00012545092811368545, 'gamma': 'auto'}. Best is trial 27 with value: 0.19127118606665136.

### Cross-validation Partial AUC Score

In [34]:
param_svc = {
        'random_state': 42,
        'probability': True,
        'kernel': 'linear',
        'C': 0.00012545092811368545,
        'gamma': 'auto',
        'class_weight': 'balanced'
}

model_svc_cv = ImbPipeline([    
        ('preprocessing', preprocessing),
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),
        ('SVC', SVC(**param_svc))
])

if ENABLE['svc']['compute-pauc'] == 1:
    pauc_svc_cv = cross_val_partial_auc_score(X, y, model_svc_cv, n_splits=5)
    print(f"CV Partial AUC Score: {pauc_svc_cv}")

### Final Training

In [35]:
if ENABLE['svc']['final-train'] == 1:
    
    # Pipeline
    model_svc_fe139_rsmpl = ImbPipeline([
        ('preprocessing', preprocessing),  
        ('undersample', RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)),
        ('oversample', SMOTE(sampling_strategy={1: 4000}, random_state=42)),    
        ('SVC', SVC(**param_svc)
        )
    ])
    
    model_svc_fe139_rsmpl.fit(X, y)

### Model Saving

In [36]:
if ENABLE['svc']['save-model'] == 1:
    dump(model_svc_fe139_rsmpl, 'model_svc_fe139_rsmpl.pkl')

# Ensemble: Soft Voting

### Cross-validation Partial AUC Score

In [38]:
# Build the soft-voting ensemble archicecture
model_soft_cv = VotingClassifier(estimators=[
        ('RF',  model_rf_cv),
        ('XGB', model_xgb_cv),
        ('LGB', model_lgb_cv),
        ('CB',  model_cb_cv),
        ('ADA', model_ada_cv),
        #('SVC', model_svc_cv)
], voting='soft')

if ENABLE['soft-v']['compute-pauc'] == 1:
    pauc_sft_cv = cross_val_partial_auc_score(X, y, model_soft_cv, n_splits=5)
    print(f"CV Partial AUC Score: {pauc_sft_cv}")

Processing fold 1 of 5... pAUC: 0.19137194855164882
Processing fold 2 of 5... pAUC: 0.1926880631991458
Processing fold 3 of 5... pAUC: 0.18859894050527132
Processing fold 4 of 5... pAUC: 0.1934111894368661
Processing fold 5 of 5... pAUC: 0.1932032550097425
CV Partial AUC Score: 0.1918546793405349


### Final Training

In [39]:
if ENABLE['soft-v']['final-train'] == 1:
    model_soft_cv_fe130_rsmpl = VotingClassifier(estimators=[
        ('RF',  model_rf_cv),
        ('XGB', model_xgb_cv),
        ('LGB', model_lgb_cv),
        ('CB',  model_cb_cv),
        ('ADA', model_ada_cv),
        ('SVC', model_svc_cv)
        ], voting='soft')

    # In case that the individual models have been trained before
    
    #model_rf_fe139_rsmpl = load('model_rf_fe139_rsmpl.pkl')
    #model_xgb_fe139_rsmpl = load('model_xgb_fe139_rsmpl.pkl')
    #model_lgb_fe139_rsmpl = load('model_lgb_fe139_rsmpl.pkl')
    #model_cb_fe139_rsmpl = load('model_cb_fe139_rsmpl.pkl')
    #model_cb_fe139_rsmpl = load('model_ada_fe139_rsmpl.pkl')
    #model_svc_fe139_rsmpl = load('model_svc_fe139_rsmpl.pkl')
    #model_soft_cv_fe130_rsmpl = VotingClassifier(estimators=[
    #    ('RF',  model_rf_fe139_rsmpl),
    #    ('XGB', model_xgb_fe139_rsmpl),
    #    ('LGB', model_lgb_fe139_rsmpl),
    #    ('CB',  model_cb_fe139_rsmpl),
    #    ('ADA', model_cb_fe139_rsmpl),
    #    ('SVC', model_svc_fe139_rsmpl)
    #], voting='soft')

    model_soft_cv_fe130_rsmpl.fit(X, y)

### Model Saving

In [40]:
if ENABLE['soft-v']['save-model'] == 1:
    dump(model_soft_cv_fe130_rsmpl, 'model_soft_cv_fe130_rsmpl.pkl')

# Ensemble: Logistic Regression

In [88]:
if ENABLE['lr-v']['cross-val'] == 1:
    
estimators=[
        ('RF',  model_rf_cv),
        ('XGB', model_xgb_cv),
        ('LGB', model_lgb_cv),
        ('CB',  model_cb_cv),
        ('ADA', model_ada_cv),
        ('SVC', model_svc_cv)

stacking_lr = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Construct a pipeline with StackingClassifier
pipe_lr_soft = Pipeline([
    ('stacking_clf', stacking_lr)
])


# Define hyperparameters only for LogisticRegression()
hyperparams = {
    'stacking_clf__final_estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'stacking_clf__final_estimator__penalty': ['l1', 'l2'],
    'stacking_clf__final_estimator__solver': ['liblinear', 'saga']
}

#meta = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())


IndentationError: expected an indented block after 'if' statement on line 1 (2417230151.py, line 3)

In [37]:
if ENABLE['lr-v']['cross-val'] == 1:
    
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Define the objective function for Optuna
    def objective(trial):
        
        # Suggest values for the hyperparameters of Logistic Regression
        C = trial.suggest_loguniform('C', 0.1, 10.0)
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
        solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
        
        pauc_scores = []
        
        for train_idx, val_idx in skf.split(X, y):
            # Create the folds
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
            
            # Pipeline with preprocessing and logistic regression
            pipe_lr = Pipeline([
                ('preprocessing', preprocessing),
                ('logreg', LogisticRegression(
                    C=C,
                    penalty=penalty,
                    solver=solver,
                    max_iter=1000,
                    random_state=42
                ))
            ])
            
            # Train the model
            pipe_lr.fit(X_train_fold, y_train_fold)
        
            # Predict on the validation set
            preds = pipe_lr.predict_proba(X_val_fold)[:, 1]
        
            # Calculate partial AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)
        
        # Return the average
        return np.mean(pauc_scores)
    
    # Create a study object with 'maximize' direction
    study = optuna.create_study(direction='maximize')
    
    # Start the optimization
    study.optimize(objective, n_trials=3, n_jobs=1)
    
    # Get the best trial
    best_trial = study.best_trial
    
    print(f'Best trial number: {best_trial.number}')
    print(f'Best value (partial auc - 0.8): {best_trial.value}')
    print(f'Best hyperparameters: {best_trial.params}')

[I 2024-08-28 11:08:59,632] A new study created in memory with name: no-name-127e983a-a0be-4b1c-abf0-0f194cf16ce2
[I 2024-08-28 11:39:29,162] Trial 0 finished with value: 0.18996824374324256 and parameters: {'C': 0.4454395767141481, 'penalty': 'l1', 'solver': 'saga'}. Best is trial 0 with value: 0.18996824374324256.
[I 2024-08-28 12:11:51,379] Trial 1 finished with value: 0.18976532074970778 and parameters: {'C': 3.3930740912105417, 'penalty': 'l1', 'solver': 'saga'}. Best is trial 0 with value: 0.18996824374324256.
[I 2024-08-28 12:33:37,795] Trial 2 finished with value: 0.19212974289656715 and parameters: {'C': 0.37206196533421637, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 2 with value: 0.19212974289656715.


Best trial number: 2
Best value (partial auc - 0.8): 0.19212974289656715
Best hyperparameters: {'C': 0.37206196533421637, 'penalty': 'l1', 'solver': 'liblinear'}


In [38]:
# Trial 7 finished with value: 0.19230752612288346 and parameters: {'C': 1.4669795429403636, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 7 with value: 0.19230752612288346.
# Trial 4 finished with value: 0.19185952072646328 and parameters: {'C': 9.5120708822337, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 4 with value: 0.19185952072646328
# Trial 3 finished with value: 0.19208920555615153 and parameters: {'C': 0.25814369656714853, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 3 with value: 0.19208920555615153.

### Cross-validation Partial AUC Score

In [None]:
if ENABLE['lr-v']['compute-pauc'] == 1:
    # Separate numerical and categorical columns
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Preprocessing for numerical data: Standard Scaler
    numerical_transformer = StandardScaler()

    # Preprocessing for categorical data: OneHot Encoder
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Combine preprocessing steps using ColumnTransformer
    preprocessing = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Define the best parameters directly from Trial 7
    param_lr_v = {
        'C': 1.4669795429403636,  
        'penalty': 'l1',         
        'solver': 'liblinear'     
    }

    # Build the Logistic Regression model with the best parameters in a pipeline
    model_lr_cv = Pipeline([
        ('preprocessing', preprocessing),
        ('logreg', LogisticRegression(**param_lr_v, max_iter=1000, random_state=42))
    ])

    # Function for cross-validation to calculate the partial AUC score
    def cross_val_partial_auc_score(X, y, model, n_splits=5):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        pauc_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            # Train the model
            model.fit(X_train_fold, y_train_fold)

            # Predict on the validation set
            preds = model.predict_proba(X_val_fold)[:, 1]

            # Calculate partial AUC and store it
            pauc = partial_auc_score(y_val_fold, preds)
            pauc_scores.append(pauc)

        return np.mean(pauc_scores)

    # Perform cross-validation to calculate the partial AUC score
    pauc_lr_cv = cross_val_partial_auc_score(X, y, model_lr_cv, n_splits=5)
    print(f"CV Partial AUC Score for Logistic Regression: {pauc_lr_cv}")

### Final Training

In [None]:
if ENABLE['lr-v']['final-train'] == 1:

    # Use the best parameters found from Optuna for Logistic Regression
    param_lr_v = {
        'C': 1.4669795429403636,  # Value from best trial
        'penalty': 'l1',          # Value from best trial
        'solver': 'liblinear'     # Value from best trial
    }

    # Define the base estimators for stacking
    estimators = [
        ('RF',  model_rf_cv),   # Random Forest Classifier model
        ('XGB', model_xgb_cv),  # XGBoost Classifier model
        ('LGB', model_lgb_cv),  # LightGBM Classifier model
        ('CB',  model_cb_cv),   # CatBoost Classifier model
        ('ADA', model_ada_cv),  # AdaBoost Classifier model
        ('SVC', model_svc_cv)   # Support Vector Classifier model
    ]

    # Define the stacking classifier with Logistic Regression as the final estimator
    stacking_lr = StackingClassifier(
        estimators=estimators, 
        final_estimator=LogisticRegression(**param_lr_v, max_iter=1000, random_state=42)
    )

    # Construct a pipeline with StackingClassifier
    model_lr_cv_fe130_rsmpl = Pipeline([('stacking_clf', stacking_lr)])

    # Fit the pipeline on the entire dataset
    model_lr_cv_fe130_rsmpl.fit(X, y)

    print("Stacking Classifier trained successfully.")

# Summary

In [45]:
print("Model Performance Scores:\n")
print(f"{'Model':<15} {'Partial AUC Score':<20}")
print("-" * 35)
if 'pauc_rf_cv' in locals():
    print(f"{'Random Forest':<15} {pauc_rf_cv:<20.4f}")
if 'pauc_xgb_cv' in locals():    
    print(f"{'XGBoost':<15} {pauc_xgb_cv:<20.4f}")
if 'pauc_lgb_cv' in locals():    
    print(f"{'LightGBM':<15} {pauc_lgb_cv:<20.4f}")
if 'pauc_cb_cv' in locals():        
    print(f"{'CatBoost':<15} {pauc_cb_cv:<20.4f}")
if 'pauc_ada_cv' in locals():    
    print(f"{'AdaBoost':<15} {pauc_ada_cv:<20.4f}")
if 'pauc_svc_cv' in locals():        
    print(f"{'SVC':<15} {pauc_svc_cv:<20.4f}")
if 'pauc_sft_cv' in locals():        
    print(f"{'Soft Voting':<15} {pauc_sft_cv:<20.4f}")

Model Performance Scores:

Model           Partial AUC Score   
-----------------------------------
Soft Voting     0.1920              


In [41]:
# Initialize cross-validation

#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#pauc_scores = []

# Cross-validation loop
#for train_idx, val_idx in skf.split(X, y):
            
    # Create the folds
#    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
#    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Create the ensemble model. Assumed these models already pretrained
#    soft_voting_model = VotingClassifier(estimators=[
#        ('RF',  model_rf_fe139_rsmpl),
#        ('XGB', model_xgb_fe139_rsmpl),
#        ('LGB', model_lgb_fe139_rsmpl),
#        ('CB',  model_cb_fe139_rsmpl),
#        ('ADA', model_ada_fe139_rsmpl),
#        ('SVC', model_svc_fe139_rsmpl),
        
    # Make predictions with the validation set
    #preds1 = model_rf_fe139_rsmpl.predict_proba(X_val_fold)[:,1]
    #preds2 = model_xgb_fe139_rsmpl.predict_proba(X_val_fold)[:,1]
    #preds3 = model_lgb_fe139_rsmpl.predict_proba(X_val_fold)[:,1]
    #preds4 = model_cb_fe139_rsmpl.predict_proba(X_val_fold)[:,1]
    #preds5 = model_ada_fe139_rsmpl.predict_proba(X_val_fold)[:,1]
    #preds6 = model_svc_fe139_rsmpl.predict_proba(X_val_fold)[:,1]

    #pauc1 = partial_auc_score(y_val_fold, preds1)
    #pauc2 = partial_auc_score(y_val_fold, preds2)
    #pauc3 = partial_auc_score(y_val_fold, preds3)
    #pauc4 = partial_auc_score(y_val_fold, preds4)
    #pauc5 = partial_auc_score(y_val_fold, preds5)
    #pauc6 = partial_auc_score(y_val_fold, preds6)
    
    # Combine the partial AUC scores using the average
    #pauc_ave = np.mean([preds1, preds2, preds3, preds4, preds5, preds6])

    # Predict on the validation set
#    preds = soft_voting_model.predict_proba(X_val_fold)[:,1]
        
    # Calculate partical AUC and store it
#    pauc = partial_auc_score(y_val_fold, preds)
#    pauc_scores.append(pauc)

# Average the scores across all folds
#average_ensemble_score = np.mean(pauc_scores)
#print(f'Average partical AUC score - Soft Voting: {average_ensemble_score}')

In [42]:
# Feature Importance

In [43]:
#importances = model.feature_importances_
#feature_importance_df = pd.DataFrame({
#    'feature': selected_features,  # This should correspond to your final set of features after KBest
#    'importance': importances
#}).sort_values(by='importance', ascending=False)

# Plot feature importance
#sns.barplot(x='importance', y='feature', data=feature_importance_df)
#plt.title('Feature Importance')
#plt.show()