# Imports

In [2]:
train_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/hackerearth/amexpert_codelab/data/processed/processed_train.csv'
test_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/hackerearth/amexpert_codelab/data/processed/processed_test.csv'

In [3]:
!pip install --quiet optuna

[K     |████████████████████████████████| 308 kB 4.0 MB/s 
[K     |████████████████████████████████| 209 kB 44.5 MB/s 
[K     |████████████████████████████████| 80 kB 5.8 MB/s 
[K     |████████████████████████████████| 75 kB 4.0 MB/s 
[K     |████████████████████████████████| 49 kB 5.2 MB/s 
[K     |████████████████████████████████| 112 kB 42.9 MB/s 
[K     |████████████████████████████████| 149 kB 42.5 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [33]:
import time
import imblearn

import pandas as pd
import numpy as np

import optuna
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier

from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from scipy.stats import mode

SEED = 2311

In [5]:
# processed train and test
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

In [6]:
train.shape, test.shape

((36743, 19), (11383, 18))

In [8]:
features = [f for f in train.columns if f not in ('customer_id', 'name', 'credit_card_default')]

In [9]:
cat_features = ['gender', 'owns_car', 'owns_house', 'occupation_type', 'migrant_worker', 'default_in_last_6months']

# Class imbalance

In [10]:
train.credit_card_default.value_counts()

0    33528
1     3215
Name: credit_card_default, dtype: int64

In [11]:
oversample = imblearn.over_sampling.SMOTE()

In [12]:
train_oversampled, y = oversample.fit_resample(train[features + ['credit_card_default']], train.credit_card_default)

In [13]:
train_oversampled.shape, y.shape

((67056, 17), (67056,))

# Folds for cross-validation

In [14]:
N_SPLITS = 5

train['fold'] = -1

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (_, id) in enumerate(skf.split(X=train, y=train.credit_card_default)):
  train.loc[id, 'fold'] = fold

In [15]:
train_oversampled['fold'] = -1

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (_, id) in enumerate(skf.split(X=train_oversampled, y=train_oversampled.credit_card_default)):
  train_oversampled.loc[id, 'fold'] = fold

# Model 1 - train; all features

In [16]:
xtrain, xval, ytrain, yval = train_test_split(train[features], train.credit_card_default, 
                                              stratify=train.credit_card_default,
                                              test_size=0.2,
                                              random_state=SEED)

In [49]:
base_params1 = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'n_estimators': 5000,
    'use_label_encoder': False,
    'scale_pos_weight': 10.45, #ratio of negative class to positive class = 33528 / 3215
    'enable_categorical': True,
    'verbosity': 0,
    'random_state': SEED
}

In [50]:
def objective(trial, xtrain, ytrain, xval, yval, base_params):
        
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }
    
    model = XGBClassifier(**base_params, **param_grid)
    
    model.fit(
        xtrain, ytrain,
        eval_set=[(xval, yval)],
        eval_metric='aucpr',
        early_stopping_rounds=100,
        callbacks=[
            XGBoostPruningCallback(trial, 'validation_0-aucpr')
            #need to include index of validation set from eval_set for scikit-learn API
        ],
        verbose=200
    )
    
    predictions = model.predict_proba(xval)[:, 1]
    return roc_auc_score(yval, predictions)

In [None]:
%%time
study1 = optuna.create_study(direction='maximize', study_name='model1')

study1.optimize(
    lambda trial: objective(trial, xtrain, ytrain, xval, yval, base_params1), 
    n_trials=50
)

In [52]:
print(f'Best value (AUC-PR): {study1.best_value:.5f}')

best_params1 = study1.best_params
print('Best params:')
for key, value in best_params1.items():
    print(f'\t{key}: {value}')

Best value (AUC): 0.99520
Best params:
	learning_rate: 0.2758303432184364
	subsample: 0.8508705080169281
	reg_lambda: 0.001299324214395094
	reg_alpha: 0.00020563736367449882
	max_depth: 5


In [53]:
def custom_cross_val_predict(train, test, features, model):
    test_preds = []
    auc_scores = []
    f1_scores = []
    
    cv_start = time.time()
    
    for fold in range(N_SPLITS):
        print('-' * 40)
        
        xtrain = train[train.fold != fold].reset_index(drop=True)

        xval = train[train.fold == fold].reset_index(drop=True)    
        
        fold_start = time.time()
        
        model.fit(
            xtrain[features], xtrain.credit_card_default,
            eval_set=[(xval[features], xval.credit_card_default)],
            eval_metric='aucpr',
            early_stopping_rounds=100,
            verbose=200
        )
        val_probs = model.predict_proba(xval[features])[:,1] #out-of-fold predictions
        val_preds = model.predict(xval[features])
        
        auc = roc_auc_score(xval.credit_card_default, val_probs)
        auc_scores.append(auc)

        f1 = f1_score(xval.credit_card_default, val_preds)
        f1_scores.append(f1)

        
        fold_end = time.time()
        
        print(f'Fold #{fold}: AUC = {auc:.5f}, f1_score = {f1:5f}\t[Time: {fold_end - fold_start:.2f} secs]')
        
        test_preds.append(model.predict(test[features]))
        
    cv_end = time.time()
    print(f'Average AUC = {np.mean(auc_scores):.5f} with std. dev. = {np.std(auc_scores):.5f}')
    print(f'Average f1_score = {np.mean(f1_scores):.5f} with std. dev. = {np.std(f1_scores):.5f}')
    print(f'[Total time: {cv_end - cv_start:.2f} secs]')
    
    return mode(np.column_stack(test_preds), axis=1).mode

In [54]:
model1 = XGBClassifier(**base_params1, **best_params1)

In [55]:
test_preds1 = custom_cross_val_predict(train, test, features, model1)

----------------------------------------
[0]	validation_0-aucpr:0.949031
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[27]	validation_0-aucpr:0.954335

Fold #0: AUC = 0.99463, f1_score = 0.791980	[Time: 1.53 secs]
----------------------------------------
[0]	validation_0-aucpr:0.955755
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[14]	validation_0-aucpr:0.957395

Fold #1: AUC = 0.99490, f1_score = 0.792826	[Time: 1.37 secs]
----------------------------------------
[0]	validation_0-aucpr:0.945885
Will train until validation_0-aucpr hasn't improved in 100 rounds.
[200]	validation_0-aucpr:0.949327
Stopping. Best iteration:
[153]	validation_0-aucpr:0.95153

Fold #2: AUC = 0.99417, f1_score = 0.837736	[Time: 2.91 secs]
----------------------------------------
[0]	validation_0-aucpr:0.956101
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[7]	validation

In [56]:
sub1 = pd.DataFrame(columns=['customer_id', 'credit_card_default'])
sub1.customer_id = test.customer_id
sub1.credit_card_default = test_preds1

sub1.to_csv('sub1.csv', index=False)

# Model 2 - train; important features

In [42]:
train[features + ['credit_card_default']].corr()['credit_card_default']

age                        0.000455
gender                     0.052901
owns_car                  -0.026598
owns_house                 0.001690
no_of_children             0.012714
net_yearly_income         -0.035228
no_of_days_employed       -0.077430
occupation_type            0.002322
total_family_members       0.001516
migrant_worker             0.025963
yearly_debt_payments      -0.013671
credit_limit              -0.024024
credit_limit_used(%)       0.336084
credit_score              -0.555296
prev_defaults              0.771174
default_in_last_6months    0.774399
credit_card_default        1.000000
Name: credit_card_default, dtype: float64

In [43]:
mask = [(f in cat_features) for f in features]

In [44]:
%%time
mi_scores = mutual_info_classif(train[features], train.credit_card_default,
                                discrete_features=mask, random_state=SEED)

CPU times: user 2.08 s, sys: 3.67 ms, total: 2.08 s
Wall time: 2.08 s


In [47]:
dict(zip(features, mi_scores))

{'age': 0.000981779735959698,
 'credit_limit': 0.0006799572997060466,
 'credit_limit_used(%)': 0.09655786669031174,
 'credit_score': 0.23195443172139185,
 'default_in_last_6months': 0.15316637548702577,
 'gender': 0.0013729419586149911,
 'migrant_worker': 0.0003263585295365523,
 'net_yearly_income': 0.0011010409644265362,
 'no_of_children': 0.0,
 'no_of_days_employed': 0.00502548879955933,
 'occupation_type': 0.002781599338767377,
 'owns_car': 0.0003595133655748539,
 'owns_house': 1.4299786092880629e-06,
 'prev_defaults': 0.168398737370832,
 'total_family_members': 0.0030217512849008976,
 'yearly_debt_payments': 0.0}

From the correlation and mutual-information data, we will choose the important features.

In [48]:
imp_features = ['credit_limit_used(%)', 'credit_score', 'prev_defaults', 'default_in_last_6months']

In [57]:
xtrain, xval, ytrain, yval = train_test_split(train[imp_features], train.credit_card_default, 
                                              stratify=train.credit_card_default,
                                              test_size=0.2,
                                              random_state=SEED)

In [None]:
%%time
study2 = optuna.create_study(direction='maximize', study_name='model2')

study2.optimize(
    lambda trial: objective(trial, xtrain, ytrain, xval, yval, base_params1), 
    n_trials=50
)

In [59]:
print(f'Best value (AUC-PR): {study2.best_value:.5f}')

best_params2 = study2.best_params
print('Best params:')
for key, value in best_params2.items():
    print(f'\t{key}: {value}')

Best value (AUC-PR): 0.99419
Best params:
	learning_rate: 0.027812315553777913
	subsample: 0.5095780195212785
	reg_lambda: 0.08765516793830402
	reg_alpha: 0.005952428807138894
	max_depth: 4


In [60]:
model2 = XGBClassifier(**base_params1, **best_params2)

In [61]:
test_preds2 = custom_cross_val_predict(train, test, imp_features, model2)

----------------------------------------
[0]	validation_0-aucpr:0.944914
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[11]	validation_0-aucpr:0.945262

Fold #0: AUC = 0.99346, f1_score = 0.781763	[Time: 1.19 secs]
----------------------------------------
[0]	validation_0-aucpr:0.952405
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[4]	validation_0-aucpr:0.955095

Fold #1: AUC = 0.99463, f1_score = 0.790898	[Time: 1.18 secs]
----------------------------------------
[0]	validation_0-aucpr:0.943593
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[4]	validation_0-aucpr:0.944829

Fold #2: AUC = 0.99342, f1_score = 0.793827	[Time: 1.18 secs]
----------------------------------------
[0]	validation_0-aucpr:0.947253
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[41]	validation_0-aucpr:0.947304

Fold #3: AUC = 0

In [62]:
sub2 = pd.DataFrame(columns=['customer_id', 'credit_card_default'])
sub2.customer_id = test.customer_id
sub2.credit_card_default = test_preds2

sub2.to_csv('sub2.csv', index=False)

# Model 3 - train_oversampled; all features

In [63]:
xtrain, xval, ytrain, yval = train_test_split(train_oversampled[features], 
                                              train_oversampled.credit_card_default, 
                                              stratify=train_oversampled.credit_card_default,
                                              test_size=0.2,
                                              random_state=SEED)

In [64]:
base_params3 = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'n_estimators': 5000,
    'use_label_encoder': False,
    # 'scale_pos_weight': 10.45, #ratio of negative class to positive class = 33528 / 3215
    'enable_categorical': True,
    'verbosity': 0,
    'random_state': SEED
}

In [None]:
%%time
study3 = optuna.create_study(direction='maximize', study_name='model3')

study3.optimize(
    lambda trial: objective(trial, xtrain, ytrain, xval, yval, base_params3), 
    n_trials=50
)

In [66]:
print(f'Best value (AUC-PR): {study3.best_value:.5f}')

best_params3 = study3.best_params
print('Best params:')
for key, value in best_params3.items():
    print(f'\t{key}: {value}')

Best value (AUC-PR): 0.99936
Best params:
	learning_rate: 0.060348024522222786
	subsample: 0.9154376034848039
	reg_lambda: 0.000267697763636681
	reg_alpha: 0.10909969956659828
	max_depth: 7


In [67]:
model3 = XGBClassifier(**base_params3, **best_params3)

In [68]:
test_preds3 = custom_cross_val_predict(train, test, features, model3)

----------------------------------------
[0]	validation_0-aucpr:0.94534
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[65]	validation_0-aucpr:0.953242

Fold #0: AUC = 0.99459, f1_score = 0.848858	[Time: 2.23 secs]
----------------------------------------
[0]	validation_0-aucpr:0.955275
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[3]	validation_0-aucpr:0.957907

Fold #1: AUC = 0.99499, f1_score = 0.867303	[Time: 1.28 secs]
----------------------------------------
[0]	validation_0-aucpr:0.947576
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[38]	validation_0-aucpr:0.953467

Fold #2: AUC = 0.99451, f1_score = 0.850267	[Time: 1.69 secs]
----------------------------------------
[0]	validation_0-aucpr:0.952533
Will train until validation_0-aucpr hasn't improved in 100 rounds.
[200]	validation_0-aucpr:0.955598
Stopping. Best iteration:
[106]	validation

In [69]:
sub3 = pd.DataFrame(columns=['customer_id', 'credit_card_default'])
sub3.customer_id = test.customer_id
sub3.credit_card_default = test_preds3

sub3.to_csv('sub3.csv', index=False)

# Model 4 - train_oversampled; important features

In [70]:
xtrain, xval, ytrain, yval = train_test_split(train_oversampled[imp_features], 
                                              train_oversampled.credit_card_default, 
                                              stratify=train_oversampled.credit_card_default,
                                              test_size=0.2,
                                              random_state=SEED)

In [None]:
%%time
study4 = optuna.create_study(direction='maximize', study_name='model4')

study4.optimize(
    lambda trial: objective(trial, xtrain, ytrain, xval, yval, base_params3), 
    n_trials=50
)

In [72]:
print(f'Best value (AUC-PR): {study4.best_value:.5f}')

best_params4 = study4.best_params
print('Best params:')
for key, value in best_params4.items():
    print(f'\t{key}: {value}')

Best value (AUC-PR): 0.99683
Best params:
	learning_rate: 0.161035720350742
	subsample: 0.7040293077768522
	reg_lambda: 0.0003887230399109898
	reg_alpha: 6.7065841527997545e-06
	max_depth: 6


In [73]:
model4 = XGBClassifier(**base_params3, **best_params4)

In [74]:
test_preds4 = custom_cross_val_predict(train, test, imp_features, model4)

----------------------------------------
[0]	validation_0-aucpr:0.944963
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[0]	validation_0-aucpr:0.944963

Fold #0: AUC = 0.99340, f1_score = 0.852863	[Time: 2.11 secs]
----------------------------------------
[0]	validation_0-aucpr:0.956128
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[2]	validation_0-aucpr:0.957675

Fold #1: AUC = 0.99493, f1_score = 0.871930	[Time: 1.99 secs]
----------------------------------------
[0]	validation_0-aucpr:0.938803
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[6]	validation_0-aucpr:0.948122

Fold #2: AUC = 0.99388, f1_score = 0.847670	[Time: 2.08 secs]
----------------------------------------
[0]	validation_0-aucpr:0.948483
Will train until validation_0-aucpr hasn't improved in 100 rounds.
Stopping. Best iteration:
[39]	validation_0-aucpr:0.954932

Fold #3: AUC = 0.

In [75]:
sub4 = pd.DataFrame(columns=['customer_id', 'credit_card_default'])
sub4.customer_id = test.customer_id
sub4.credit_card_default = test_preds4

sub4.to_csv('sub4.csv', index=False)