In [None]:
## IMPORT LIBRARIES
# os level
import warnings
import os

# data handling
import pandas as pd
import numpy as np
import pickle

# visulization
import matplotlib.pyplot as plt
import seaborn as sns

# feature engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

In [None]:
df = pickle.load(open('artifacts\processed_data.pkl','rb'))

In [None]:
df.head()

Unnamed: 0,Total_TL_opened_L6M,pct_tl_closed_L6M,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0,0.0,0.0,0.0,0,0,0,4,1,4,...,False,False,True,False,False,False,False,False,True,False
1,0,0.0,1.0,0.0,0,0,0,0,0,1,...,True,False,False,False,False,False,True,False,False,False
2,1,0.0,0.25,0.0,1,0,0,0,2,6,...,True,False,False,False,False,False,False,False,False,True
4,0,0.0,0.0,0.0,0,0,0,0,3,0,...,False,False,False,False,True,False,False,False,False,False
5,0,0.0,0.0,0.167,0,0,0,0,6,0,...,True,False,False,False,False,False,False,False,True,False


In [None]:
print(df.shape)

(42064, 62)


In [None]:
df.columns

Index(['Total_TL_opened_L6M', 'pct_tl_closed_L6M', 'pct_tl_open_L12M',
       'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Home_TL', 'PL_TL',
       'Secured_TL', 'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL',
       'Age_Newest_TL', 'time_since_recent_payment',
       'time_since_recent_deliquency', 'num_times_delinquent',
       'max_delinquency_level', 'num_deliq_6_12mts', 'max_deliq_6mts',
       'max_deliq_12mts', 'num_times_60p_dpd', 'num_sub', 'num_sub_6mts',
       'num_sub_12mts', 'num_dbt', 'num_dbt_6mts', 'num_lss', 'num_lss_6mts',
       'recent_level_of_deliq', 'CC_enq', 'CC_enq_L6m', 'PL_enq_L12m',
       'time_since_recent_enq', 'enq_L3m', 'NETMONTHLYINCOME',
       'Time_With_Curr_Empr', 'pct_currentBal_all_TL', 'CC_Flag', 'PL_Flag',
       'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag',
       'GL_Flag', 'EDUCATION', 'PROSPECTID', 'Approved_Flag',
       'MARITALSTATUS_Married', 'MARITALSTATUS_Single', 'GENDER_F', 'GENDER_M',
       'last_prod_enq2_A

In [None]:
df_model, df_unseen = train_test_split(df, test_size=0.01, random_state=42)

In [None]:
pickle.dump(df_model, open('artifacts\unseen_data.pkl','wb'))

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 9-10: truncated \uXXXX escape (<ipython-input-7-e31ff670a16b>, line 1)

In [None]:
pickle.dump(df_model, open(r'artifacts\unseen_data.pkl','wb'))

In [None]:
# Feature Engineering
X_train, X_test, y_train, y_test = train_test_split(df_model.drop(['Approved_Flag', 'PROSPECTID'], axis=1), 
                                                    df_model['Approved_Flag'], test_size=0.2, 
                                                    random_state=42)

df_model['Approved_Flag'].value_counts()
df_unseen['Approved_Flag'].value_counts()

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [None]:
## Base Model
xgb = XGBClassifier(n_estimators=200, n_jobs=-1, verbosity=2, random_state=42)
xgb.fit(X_train, y_train_enc)
print('accuracy', xgb.score(X_test, y_test_enc))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_enc, xgb.predict(X_test))
# base performance
for i, label in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

accuracy 0.764437507503902
p1 class:
precision: 0.8007075471698113
recall: 0.6886409736308317
f1score: 0.7404580152671756
p2 class:
precision: 0.8135654046294635
recall: 0.9108075532342306
f1score: 0.8594446024073548
p3 class:
precision: 0.4330357142857143
recall: 0.29573170731707316
f1score: 0.3514492753623188
p4 class:
precision: 0.7569169960474308
recall: 0.7274453941120608
f1score: 0.7418886198547215


In [None]:
# P2 vs Rest data
y_train_p2rest = (y_train_enc == 2).astype(int)
y_test_p2rest = (y_test_enc == 2).astype(int)

In [None]:
# xgb without weights for feature
xgb = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb.fit(X_train, y_train_p2rest)
xgb.score(X_test, y_test_p2rest)
print('P3 vs Rest model accuracy:', xgb.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8317925321167007
Rest class:
precision: 0.8576158940397351
recall: 0.9596693743765142
f1score: 0.905777120182931
P3 class:
precision: 0.40670859538784065
recall: 0.14786585365853658
f1score: 0.21688093907210731


In [None]:
# P2 vs Rest data
y_train_p2rest = (y_train_enc == 1).astype(int)
y_test_p2rest = (y_test_enc == 1).astype(int)

In [None]:
# P2 vs Rest data
y_train_p2rest = (y_train == 'P2').astype(int)
y_test_p2rest = (y_test == 'P2').astype(int)

In [None]:
# xgb without weights for feature
xgb = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb.fit(X_train, y_train_p2rest)
xgb.score(X_test, y_test_p2rest)
print('P3 vs Rest model accuracy:', xgb.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8226677872493696
Rest class:
precision: 0.821551132463967
recall: 0.7144136078782453
f1score: 0.7642458100558658
P3 class:
precision: 0.8232686980609418
recall: 0.8955403776617116
f1score: 0.8578851149812374


In [None]:
np.unique(y_train_p2rest, return_counts=True)

(array([0, 1]), array([13100, 20214], dtype=int64))

In [None]:
# %%
np.unique(y_test_p2rest, return_counts=True)

(array([0, 1]), array([3351, 4978], dtype=int64))

In [None]:
4978/3351

1.4855267084452402

In [None]:
20214/13100

1.5430534351145038

In [None]:
print('f1score:', f1[i])

f1score: 0.8578851149812374


In [None]:
# xgb with custom weights for feature
weights = y_test_p2rest.map({0: 1, 1: 1.5})  # Custom weights
scale_weights = (weights.sum() / weights[weights == 1].sum())

xgb_p2rest_weight = XGBClassifier(scale_pos_weight=scale_weights, n_estimators=200, 
                    n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_weight.fit(X_train, y_train_p2rest)
print('P3 vs Rest model accuracy:', xgb_p2rest_weight.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_weight.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8062192340016808
Rest class:
precision: 0.8725868725868726
recall: 0.6069829901521934
f1score: 0.7159450897571277
P3 class:
precision: 0.7804268089363121
recall: 0.9403374849337083
f1score: 0.8529518950437318


In [None]:
# xgb without weights for feature
xgb_p2rest_unweigh = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_unweigh.fit(X_train, y_train_p2rest)
print('P3 vs Rest model accuracy:', xgb_p2rest_unweigh.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_unweigh.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8226677872493696
Rest class:
precision: 0.821551132463967
recall: 0.7144136078782453
f1score: 0.7642458100558658
P3 class:
precision: 0.8232686980609418
recall: 0.8955403776617116
f1score: 0.8578851149812374


In [None]:
# svc
svc = SVC()
svc.fit(X_train, y_train_p2rest)
print(svc.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, svc.predict(X_test))
print('P3 vs Rest model accuracy:', accuracy_score(y_test_p2rest, svc.predict(X_test)))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

0.5976707888101813
P3 vs Rest model accuracy: 0.5976707888101813
Rest class:
precision: 0.5
recall: 0.003282602208296031
f1score: 0.006522383634746516
P3 class:
precision: 0.5979294570843866
recall: 0.997790277219767
f1score: 0.7477606322920587


In [None]:
from imblearn.over_sampling import BorderlineSMOTE

In [None]:
borderline_smote = BorderlineSMOTE(random_state=42)

In [None]:
X_resampled, y_resampled = borderline_smote.fit_resample(X_train, y_train_p2rest)

In [None]:
# xgb without weights for feature
xgb_p2rest_bal = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_bal.fit(X_resampled, y_resampled)
print('P3 vs Rest model accuracy:', xgb_p2rest_bal.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_bal.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8259094729259214
Rest class:
precision: 0.8184254606365159
recall: 0.7290361086242912
f1score: 0.77114898989899
P3 class:
precision: 0.8300898203592815
recall: 0.8911209321012454
f1score: 0.8595233481883355


In [None]:
# xgb on balanced data
xgb_p2rest_bal = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_bal.fit(X_resampled, y_resampled)
print('P3 vs Rest model accuracy:', xgb_p2rest_bal.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_bal.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8259094729259214
Rest class:
precision: 0.8184254606365159
recall: 0.7290361086242912
f1score: 0.77114898989899
P3 class:
precision: 0.8300898203592815
recall: 0.8911209321012454
f1score: 0.8595233481883355


In [None]:
from imblearn.over_sampling import ADASYN

# Apply ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train_p2rest)

print("After ADASYN:", dict(zip(*np.unique(y_resampled, return_counts=True))))


After ADASYN: {0: 19872, 1: 20214}


In [None]:
# xgb on ADASYN balanced data
xgb_p2rest_bal = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_bal.fit(X_resampled, y_resampled)
print('P3 vs Rest model accuracy:', xgb_p2rest_bal.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_bal.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8256693480609917
Rest class:
precision: 0.8202360876897133
recall: 0.7257535064159952
f1score: 0.7701076630778974
P3 class:
precision: 0.8286726323639075
recall: 0.8929288871032544
f1score: 0.8596016244440147


In [None]:
np.unique(y_resampled, return_counts=True)

(array([0, 1]), array([19872, 20214], dtype=int64))

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train_p2rest)

In [None]:
# xgb on SMOTE balanced data
xgb_p2rest_bal = XGBClassifier(n_jobs=-1, verbosity=2, random_state=42)
xgb_p2rest_bal.fit(X_resampled, y_resampled)
print('P3 vs Rest model accuracy:', xgb_p2rest_bal.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_p2rest_bal.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8260295353583864
Rest class:
precision: 0.8228105906313645
recall: 0.7233661593554163
f1score: 0.769890424011434
P3 class:
precision: 0.8277911945012075
recall: 0.8951386098834874
f1score: 0.8601486343017083


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
xgb_p2rest_unweigh.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'eval_metric': None,
 'gamma': None,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_jobs': -1,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': 2}

In [None]:
## hyperparameter tuning for xgb without weights and imabalanced data
param_grid = {
    'n_estimators': [None, 50, 100, 150],
    'learning_rate': [None, 0.01, 0.1, 0.2],
    'max_depth': [None, 3, 5, 7],
    'subsample': [None, 0.6, 0.8, 1.0],
    'colsample_bytree': [None, 0.6, 0.8, 1.0],
}

xgb_finetune = GridSearchCV(estimator=XGBClassifier(objective='multi:softmax', use_label_encoder=False, 
                                                    eval_metric='mlogloss', n_jobs=-1, random_state=42),
                            param_grid=param_grid, scoring='f1_weighted', cv=3, verbose=1, n_jobs=-1)

xgb_finetune.fit(X_train, y_train_p2rest)

Fitting 3 folds for each of 1024 candidates, totalling 3072 fits


ValueError: 
All the 3072 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3072 fits failed with the following error:
Traceback (most recent call last):
  File "z:\Google\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "z:\Google\Anaconda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "z:\Google\Anaconda\lib\site-packages\xgboost\sklearn.py", line 1519, in fit
    self._Booster = train(
  File "z:\Google\Anaconda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "z:\Google\Anaconda\lib\site-packages\xgboost\training.py", line 181, in train
    bst.update(dtrain, i, obj)
  File "z:\Google\Anaconda\lib\site-packages\xgboost\core.py", line 2050, in update
    _check_call(
  File "z:\Google\Anaconda\lib\site-packages\xgboost\core.py", line 282, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [None]:
xgb_finetune = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, 
                                                    eval_metric='error', n_jobs=-1, random_state=42),
                            param_grid=param_grid, scoring='f1_weighted', cv=3, verbose=1, n_jobs=-1)

xgb_finetune.fit(X_train, y_train_p2rest)

Fitting 3 folds for each of 1024 candidates, totalling 3072 fits


In [None]:
print("Best Parameters:", xgb_finetune.best_params_)
print("Best Score:", xgb_finetune.best_score_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': None, 'subsample': None}
Best Score: 0.821785968976252


In [None]:
print('P3 vs Rest model accuracy:', xgb_finetune.best_estimator_.score(X_test, y_test_p2rest))
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, xgb_finetune.best_estimator_.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

P3 vs Rest model accuracy: 0.8239884740064833
Rest class:
precision: 0.8294302691366655
recall: 0.7081468218442256
f1score: 0.7640051513200258
P3 class:
precision: 0.8211411850768106
recall: 0.9019686621132985
f1score: 0.8596591996936627


In [None]:
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42).fit(X_train, y_train_p2rest)

In [None]:
rf.score(X_test, y_test_p2rest)

0.8147436667066875

In [None]:
precision, recall, f1, _ = precision_recall_fscore_support(y_test_p2rest, rf.predict(X_test))
for i, label in enumerate(['Rest', 'P3']):
    print(label, 'class:')
    print('precision:', precision[i])
    print('recall:', recall[i])
    print('f1score:', f1[i])

Rest class:
precision: 0.8074829931972789
recall: 0.708445240226798
f1score: 0.7547289779049436
P3 class:
precision: 0.8187047689738356
recall: 0.8862997187625552
f1score: 0.8511623420468795
