In [None]:
#pip install imblearn

In [1]:
%%time
# Data exploration
import matplotlib.pyplot as plt


# Data processing
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from scipy.stats.mstats import winsorize

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier

# Experimental setup
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate, GridSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
#from imblearn.over_sampling import SMOTE, RandomOverSampler
#from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

CPU times: total: 2.58 s
Wall time: 8.51 s


In [2]:
# Read train, test
train = pd.read_csv('./credit_default_train.csv', low_memory=False)
test = pd.read_csv('./credit_default_test.csv', low_memory=False)

In [3]:
# Print out to check the data
print(train.dtypes.value_counts())
train.head()

float64    23
int64       2
dtype: int64


Unnamed: 0,cust_id,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,18895,70000.0,1.0,3.0,2.0,34.0,0.0,0.0,0.0,0.0,...,25559.0,26134.0,26715.0,1700.0,1500.0,2000.0,1000.0,1000.0,2000.0,0
1,25102,390000.0,2.0,2.0,2.0,26.0,2.0,2.0,2.0,0.0,...,140387.0,128112.0,115514.0,5000.0,3000.0,5000.0,4548.0,4100.0,3300.0,0
2,28867,60000.0,1.0,1.0,2.0,27.0,0.0,0.0,0.0,0.0,...,26038.0,28607.0,27997.0,1378.0,1406.0,3000.0,3000.0,0.0,923.0,1
3,1842,140000.0,2.0,2.0,1.0,55.0,0.0,0.0,0.0,0.0,...,72391.0,61298.0,62193.0,4200.0,2822.0,2336.0,2588.0,2250.0,2491.0,0
4,3371,50000.0,1.0,1.0,2.0,29.0,2.0,2.0,2.0,0.0,...,1047.0,0.0,0.0,3000.0,0.0,1000.0,0.0,0.0,0.0,1


In [4]:
#check Target distribution

np.bincount(train["default.payment.next.month"])

array([15586,  4414], dtype=int64)

In [5]:
#Code used from Mihn Phan notebooks
# General list of variables
id_var = ["cust_id"]  # ID
target_var = ["default.payment.next.month"]  # Target get variable
predictors = [v for v in train.columns if v not in id_var + target_var]

# List of numerical and catergorical variables
num_vars = ['LIMIT_BAL', 'AGE',
            'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
            'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
cat_vars = ['SEX', 'EDUCATION', 'MARRIAGE',
            'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# Double check the list of variables
assert(len(predictors) == len(num_vars) + len(cat_vars))

In [6]:
# Here, we test the effect of dropping variables with high missing percentage (>25%)
na_threshold = 0.25

# Drop num variables with more than 25% missing values
num_na_pct = train[num_vars].isnull().mean()
num_vars = num_na_pct[num_na_pct <= na_threshold].index.tolist()
print("Drop num variables with high missing pct:", num_na_pct[num_na_pct > na_threshold].tolist())

# Drop cat variables with more than 25% missing values
cat_na_pct = train[cat_vars].isnull().mean()
cat_vars = cat_na_pct[cat_na_pct <= 0.25].index.tolist()
print("Drop cat variables with high missing pct:", cat_na_pct[cat_na_pct > na_threshold].tolist())

# Update train, test
train = train[id_var + num_vars + cat_vars + target_var]
# test = test[id_var + num_vars + cat_vars]

Drop num variables with high missing pct: []
Drop cat variables with high missing pct: []


In [7]:
train["EDUCATION"].unique()

array([ 3.,  2.,  1., nan,  5.,  4.,  6.,  0.])

In [8]:
#since we have 2 unknown variables, we filled Nas, and merge the 2 unknowns
train["EDUCATION"][train["EDUCATION"]==0]=6
train["EDUCATION"][train["EDUCATION"].isnull()] = 6
train["EDUCATION"][train["EDUCATION"]==6]=5

test["EDUCATION"][test["EDUCATION"]==0]=6
test["EDUCATION"][test["EDUCATION"].isnull()] = 6
test["EDUCATION"][test["EDUCATION"]==6]=5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["EDUCATION"][train["EDUCATION"].isnull()] = 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["EDUCATION"][train["EDUCATION"]==6]=5
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["EDUCATION"][test["EDUCATION"]==0]=6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["EDUCATION"][test["EDUCAT

In [9]:
train["MARRIAGE"].unique()


array([ 2.,  1.,  3., nan,  0.])

In [10]:
train["MARRIAGE"][train["MARRIAGE"]==0]=4
train["MARRIAGE"][train["MARRIAGE"].isnull()] = 4

test["MARRIAGE"][test["MARRIAGE"]==0]=4
test["MARRIAGE"][test["MARRIAGE"].isnull()] = 4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["MARRIAGE"][train["MARRIAGE"].isnull()] = 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["MARRIAGE"][test["MARRIAGE"]==0]=4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["MARRIAGE"][test["MARRIAGE"].isnull()] = 4


In [11]:
# Categorical variables
# Impute missing value using a new category "-1"
# Note: If the categorical vars are imputed by most_frequent, the indicators should be added
train[cat_vars] = train[cat_vars].fillna(-1)
test[cat_vars] = test[cat_vars].fillna(-1)

In [12]:
# Select a cat variable
v = "SEX"

enc = OneHotEncoder(handle_unknown="error")
enc.fit(pd.concat([train[[v]], test[[v]]], axis=0))
#dummy_vars = enc.get_feature_names().tolist()

# Transform train, test
dummy_vars= []
for i in enc.categories_[0]:
    dummy_vars.append(v + str(i) + "_Dummy")
train[dummy_vars] = enc.transform(train[[v]]).toarray()
train.drop(columns=v,inplace= True)
test[dummy_vars] = enc.transform(test[[v]]).toarray()
test.drop(columns=v,inplace= True)

In [13]:
# Select a cat variable
v = "MARRIAGE"

enc = OneHotEncoder(handle_unknown="error")
enc.fit(pd.concat([train[[v]], test[[v]]], axis=0))
#dummy_vars = enc.get_feature_names().tolist()

# Transform train, test
dummy_vars= []
for i in enc.categories_[0]:
    dummy_vars.append(v + str(i) + "_Dummy")
train[dummy_vars] = enc.transform(train[[v]]).toarray()
train.drop(columns=v,inplace= True)
test[dummy_vars] = enc.transform(test[[v]]).toarray()
test.drop(columns=v,inplace= True)

In [14]:
# Select a cat variable
v = "EDUCATION"

enc = OneHotEncoder(handle_unknown="error")
enc.fit(pd.concat([train[[v]], test[[v]]], axis=0))
#dummy_vars = enc.get_feature_names().tolist()

# Transform train, test
dummy_vars= []
for i in enc.categories_[0]:
    dummy_vars.append(v + str(i) + "_Dummy")
train[dummy_vars] = enc.transform(train[[v]]).toarray()
train.drop(columns=v,inplace= True)
test[dummy_vars] = enc.transform(test[[v]]).toarray()
test.drop(columns=v,inplace= True)

In [15]:
inci_vars = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for v in inci_vars:
# Find the incidence rates per category of a categorical variable
    tb = pd.pivot_table(train, values=target_var, index=v, aggfunc=np.mean).reset_index()
    icd_var = v + "_icd"
#     repr_vars.append(icd_var)
    tb.columns = [v, icd_var]
    # Add the incidence column to train, test
    train[icd_var] = pd.merge(train[[v]], tb, on=v)[icd_var]
    test[icd_var] = pd.merge(test[[v]], tb, on=v)[icd_var]
test.drop(columns=inci_vars,inplace= True)
train.drop(columns=inci_vars,inplace= True)

In [16]:
len(train.columns)

34

In [17]:
len(test.columns)

33

In [18]:
# List dummary variables to track missing values imputation
na_vars = []

# Numerical variables
# Build the missing value imputor using the mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
imp.fit(train[num_vars])

# Reconstruct the list of vars + indicators
na_vars = na_vars + [num_vars[v] + "_na" for v in imp.indicator_.features_]
impute_vars = num_vars + na_vars

# Apply on train, test
train[impute_vars] = pd.DataFrame(imp.transform(train[num_vars]), columns=impute_vars)
test[impute_vars] = pd.DataFrame(imp.transform(test[num_vars]), columns=impute_vars)

In [19]:

# Check the outliers on train, test
for v in num_vars:
    # Calculate the boundaries on train [mean-3*sd, mean+3*sd]
    mu = np.mean(train[v])
    sd = np.std(train[v])
    lower = mu - 3*sd
    upper = mu + 3*sd
    # Check outliers using the boundaries
    train_out = (train[v] < lower) | (train[v] > upper)
    test_out = (test[v] < lower) | (test[v] > upper)
    if np.sum(train_out) + np.sum(test_out) > 0:
        print(v, "has # outliers on train, test :",
              np.sum(train_out), "[", np.round(100*np.mean(train_out), 2), "% ]",
              np.sum(test_out), "[", np.round(100*np.mean(test_out), 2), "% ]")

LIMIT_BAL has # outliers on train, test : 92 [ 0.46 % ] 37 [ 0.37 % ]
AGE has # outliers on train, test : 89 [ 0.44 % ] 51 [ 0.51 % ]
BILL_AMT1 has # outliers on train, test : 458 [ 2.29 % ] 225 [ 2.25 % ]
BILL_AMT2 has # outliers on train, test : 457 [ 2.28 % ] 208 [ 2.08 % ]
BILL_AMT3 has # outliers on train, test : 439 [ 2.2 % ] 220 [ 2.2 % ]
BILL_AMT4 has # outliers on train, test : 454 [ 2.27 % ] 228 [ 2.28 % ]
BILL_AMT5 has # outliers on train, test : 434 [ 2.17 % ] 216 [ 2.16 % ]
BILL_AMT6 has # outliers on train, test : 437 [ 2.18 % ] 212 [ 2.12 % ]
PAY_AMT1 has # outliers on train, test : 284 [ 1.42 % ] 159 [ 1.59 % ]
PAY_AMT2 has # outliers on train, test : 199 [ 1.0 % ] 118 [ 1.18 % ]
PAY_AMT3 has # outliers on train, test : 253 [ 1.26 % ] 111 [ 1.11 % ]
PAY_AMT4 has # outliers on train, test : 266 [ 1.33 % ] 133 [ 1.33 % ]
PAY_AMT5 has # outliers on train, test : 278 [ 1.39 % ] 132 [ 1.32 % ]
PAY_AMT6 has # outliers on train, test : 302 [ 1.51 % ] 131 [ 1.31 % ]


In [20]:
for j in num_vars:
    winsorize(train[j], limits=[0.05, 0.05])
    winsorize(test[j], limits=[0.05, 0.05])

In [21]:
for v in num_vars:
        # Build the normalizer on train
    scaler = MinMaxScaler().fit(train[[v]])
        # Apply on train, test
    train[v] = scaler.transform(train[[v]])
    test[v] = scaler.transform(test[[v]])

### MODELING

In [22]:
#Split into train and set
X, y = train.drop(["cust_id", "default.payment.next.month"], axis=1), train["default.payment.next.month"]
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
#define which models are best suited
log_reg_params = [{"C":0.01}, {"C":0.1}, {"C":1}, {"C":10}]
dec_tree_params = [{"criterion": "gini"}, {"criterion": "entropy"}]
rand_for_params = [{"criterion": "gini"}, {"criterion": "entropy"}]
kneighbors_params = [{"n_neighbors":3}, {"n_neighbors":5}]
naive_bayes_params = [{}]
svc_params = [{"C":0.01}, {"C":0.1}, {"C":1}, {"C":10}]

In [None]:
modelclasses = [
    ["log regression", LogisticRegression, log_reg_params],
    ["decision tree", DecisionTreeClassifier, dec_tree_params],
    ["random forest", RandomForestClassifier, rand_for_params],
    ["k neighbors", KNeighborsClassifier, kneighbors_params],
    ["naive bayes", GaussianNB, naive_bayes_params],
    ["support vector machines", SVC, svc_params]
]

In [None]:
insights = []
for modelname, Model, params_list in modelclasses:
    for params in params_list:
        model = Model(**params)
        model.fit(X_train, y_train)
        score = model.score(X_val, y_val)
        insights.append((modelname, model, params, score))

In [None]:
insights.sort(key=lambda x:x[-1], reverse=True)
for modelname, model, params, score in insights:
    print(modelname, params, score)

#### Logistic Regression

In [None]:
logreg =LogisticRegression(random_state=123, max_iter=500, class_weight="balanced")

In [None]:
# Cross validation on Log Reg
cv_lr = cross_val_score(logreg, X, y, scoring="roc_auc", cv=3,n_jobs= -2)

In [None]:
cv_lr.mean()

In [None]:
#hyperparameter tuning
# define model
model = LogisticRegression()
# define evaluation
cv = StratifiedKFold(n_splits=10)
# define search space
grid = dict()
grid['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
grid['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
grid['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
Lrgrid = GridSearchCV(model, grid, scoring='roc_auc', n_jobs=-2, cv=cv)
# execute search
Lrgrid = Lrgrid.fit(X, y)

In [None]:

log_pred_prob = Lrgrid.predict_proba(X_val)[:,1]
log_pred_prob_train = Lrgrid.predict_proba(X_train)[:,1]

log_pred = Lrgrid.predict(X_val)
log_pred_train = Lrgrid.predict(X_train)

In [None]:
accuracy_val = accuracy_score(y_val, log_pred)
accuracy_train = accuracy_score(y_train, log_pred_train)
print(f"Accuracy Train: {accuracy_train} \nAccuracy Val: {accuracy_val}")

In [None]:
auc_val = roc_auc_score(y_val, log_pred_prob)
auc_train = roc_auc_score(y_train, log_pred_prob_train)

print(f"AUC Train: {auc_train} \nAUC Val: {auc_val}")

#### SVC

In [None]:
param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf',"poly",'sigmoid']}

In [None]:
from sklearn.svm import SVC
grid = GridSearchCV(SVC(),param_grid,refit = True, verbose=2, n_jobs=-2)

In [None]:
#Will take ages to run
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
svc = LinearSVC(random_state=123, tol=1e-5)

In [None]:
cv_svc = cross_val_score(svc, X, y, scoring="roc_auc", cv=5)

In [None]:
cv_svc.mean()

#### Random Forest

In [None]:
rfc=RandomForestClassifier(random_state=42)

In [None]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' :['gini', 'entropy']
}

In [None]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,n_jobs= -2)
CV_rfc.fit(X_train, y_train)

In [None]:
CV_rfc.best_params_

In [23]:
rf = RandomForestClassifier(criterion="gini", n_estimators=200, random_state=42, max_depth=10,max_features = "auto")

In [24]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42)

In [None]:
rfpred = rf.predict_proba(X_val)[:,1]

In [None]:
rfpred_train = rf.predict_proba(X_train)[:,1]

In [None]:
roc_auc_score(y_train, rfpred_train )

In [None]:
roc_auc_score(y_val, rfpred)

In [None]:
importances = rf.feature_importances_

In [None]:
len(importances)

In [None]:
len()

In [None]:
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

forest_importances = pd.Series(importances, index=X_train.columns)
forest_importances = forest_importances[forest_importances>= forest_importances.mean()]
forest_importances

In [None]:
fig, ax = plt.subplots()
forest_importances.plot.bar( ax=ax)
ax.set_title("Feature importances using RF")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

#### XGB

In [None]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.02, 0.05]    
        }

In [None]:

from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
folds = 3
param_comb = 100

#Gridsearch
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

xgb = XGBClassifier(learning_rate=0.02, n_estimators=1000, objective='binary:logistic',
                    silent=True, nthread=6, tree_method='gpu_hist', eval_metric='auc')

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=-2, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )

random_search.fit(X_train, y_train)


In [None]:
random_search.best_params_

In [25]:
from xgboost import XGBClassifier
x_grad = XGBClassifier(subsample = 0.6, min_child_weight =  1, max_depth= 10,learning_rate = 0.01,gamma= 5,colsample_bytree= 0.6, eval_metric='auc')

  from pandas import MultiIndex, Int64Index


In [26]:
x_grad.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6,
              enable_categorical=False, eval_metric='auc', gamma=5, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.6, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [None]:
x_grad

In [None]:

xcgb_pred_prob = x_grad.predict_proba(X_val)[:,1]
xcgb_pred_prob_train = x_grad.predict_proba(X_train)[:,1]

xcgb_pred = x_grad.predict(X_val)
xcgb_pred_train = x_grad.predict(X_train)

In [None]:
accuracy_val = accuracy_score(y_val, xcgb_pred)
accuracy_train = accuracy_score(y_train, xcgb_pred_train)
print(f"Accuracy Train: {accuracy_train} \nAccuracy Val: {accuracy_val}")

In [None]:
auc_val = roc_auc_score(y_val, xcgb_pred_prob)
auc_train = roc_auc_score(y_train, xcgb_pred_prob_train)

print(f"AUC Train: {auc_train} \nAUC Val: {auc_val}")

In [None]:

xgb_importances = x_grad.feature_importances_

In [None]:

gb_importances = pd.Series(xgb_importances, index=X_train.columns)
gb_importances = gb_importances[gb_importances>= gb_importances.mean()]
gb_importances

In [None]:
fig, ax = plt.subplots()
gb_importances.plot.bar( ax=ax)
ax.set_title("Feature importances using XGBoosting")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

#### Gradient Boosting

In [None]:
from sklearn.metrics import roc_auc_score
scoring= ['roc_auc']
parameters = {#'nthread':[3,4], #when use hyperthread, xgboost may become slower
               "criterion": ["friedman_mse",  "mae"],
              "loss":["deviance","exponential"],
              "max_features":["log2","sqrt"],
              'learning_rate': [0.01,0.05,0.1,1,0.5], #so called `eta` value
              'max_depth': [3,4,5],
              'min_samples_leaf': [4,5,6],

              'subsample': [0.6,0.7,0.8],
              'n_estimators': [50,100,150,200]#number of trees, change it to 1000 for better results
              

              }

In [None]:
clf = GridSearchCV(GradientBoostingClassifier(), parameters,scoring=scoring,refit=False,cv=3)


In [None]:
clf.fit(X_train, y_train)

In [None]:
gb = GradientBoostingClassifier(random_state= 123)

In [None]:
gb.fit(X_train, y_train)

In [None]:
#estimator.get_params().keys()

In [None]:
gb_pred_prob = gb.predict_proba(X_val)[:,1]
gb_pred_prob_train = gb.predict_proba(X_train)[:,1]

gb_pred = gb.predict(X_val)
gb_pred_train = gb.predict(X_train)

In [None]:
accuracy_val = accuracy_score(y_val, gb_pred)
accuracy_train = accuracy_score(y_train, gb_pred_train)
print(f"Accuracy Train: {accuracy_train} \nAccuracy Val: {accuracy_val}")

In [None]:
auc_val = roc_auc_score(y_val, gb_pred_prob)
auc_train = roc_auc_score(y_train, gb_pred_prob_train)

print(f"AUC Train: {auc_train} \nAUC Val: {auc_val}")

##### Cross Validation Gradient Boosting

In [None]:
cv_gb = cross_val_score(gb, X, y, scoring="roc_auc", cv=5)

In [None]:
cv_gb.mean()

#### NEURAL NETWORK

In [None]:
X_train

In [None]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras

In [None]:
def model_builder(hp):
  '''
  Args:
    hp - Keras tuner object
  '''
  # Initialize the Sequential API and start stacking the layers
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape=(46, 1)))
  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
  model.add(keras.layers.Dense(units=hp_units, activation='relu', name='dense_1'))
  # Add next layers
  model.add(keras.layers.Dropout(0.2))
  model.add(keras.layers.Dense(2, activation='softmax'))
  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(),
                metrics=['accuracy'])
  return model

#https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html

In [None]:
import kerastuner as kt

# Instantiate the tuner
tuner = kt.Hyperband(model_builder, # the hypermodel
                     objective='val_accuracy', # objective to optimize
max_epochs=10,
factor=3, # factor which you have seen above 
directory='dir', # directory to save logs 
project_name='khyperband')

In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# Perform hypertuning
tuner.search(X_train, y_train, epochs=10, validation_split=0.3, callbacks=[stop_early])

In [None]:
best_hp=tuner.get_best_hyperparameters()[0]
# Build the model with the optimal hyperparameters


In [None]:
h_model = tuner.hypermodel.build(best_hps)
h_model.summary()
h_model.fit(X_train, y_train, epochs=10, validation_split=0.2)

In [None]:
NN_pred_prob = h_model.predict(X_val)[:,1]
NN_pred_prob_train = h_model.predict(X_train)[:,1]
auc_val = roc_auc_score(y_val, NN_pred_prob)
auc_train = roc_auc_score(y_train, NN_pred_prob_train)

print(f"AUC Train: {auc_train} \nAUC Val: {auc_val}")


#### HYBRID MODEL

In [27]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
hybrid = VotingClassifier(estimators=[('rf', rf), ('x_grad', x_grad)], voting='soft')
hybrid = hybrid.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [28]:
hybrid_pred_prob = hybrid.predict_proba(X_val)[:,1]
hybrid_prob_train = hybrid.predict_proba(X_train)[:,1]

hybrid_pred = hybrid.predict(X_val)
hybrid_pred_train = hybrid.predict(X_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [29]:
accuracy_val = accuracy_score(y_val, hybrid_pred)
accuracy_train = accuracy_score(y_train, hybrid_pred_train)
print(f"Accuracy Train: {accuracy_train} \nAccuracy Val: {accuracy_val}")

Accuracy Train: 0.816 
Accuracy Val: 0.7885


In [30]:
auc_val = roc_auc_score(y_val, hybrid_pred_prob)
auc_train = roc_auc_score(y_train, hybrid_prob_train)

print(f"AUC Train: {auc_train} \nAUC Val: {auc_val}")


AUC Train: 0.8834097751832439 
AUC Val: 0.730128085434655


##### Plotting ROC Curve

In [None]:

plot_roc_curve(gb, X_val, y_val)
plt.plot([0, 1], [0, 1],'r--')
plt.show()

In [None]:
len(X_train.columns)

#### Predicting Test Set

In [31]:
X_test=  test.drop(["cust_id"], axis=1)
rf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [32]:
test_predict = test[["cust_id"]]
test_predict["default.payment.next.month"] = hybrid.predict_proba(X_test)[:,1]

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_predict["default.payment.next.month"] = hybrid.predict_proba(X_test)[:,1]


In [38]:
test_sorted = test_predict.sort_values(by="default.payment.next.month", ascending = False)

### Profiling

In [63]:
test_original = pd.read_csv('./credit_default_test.csv', low_memory=False)

In [47]:
top_10_cust = test_sorted[test_sorted["default.payment.next.month"]>=0.41]["cust_id"]

In [64]:
top_10_default = test_original[test_original["cust_id"].isin(top_10_cust)]

In [66]:
top_10_default["age_group"] = (top_10_default["AGE"]//10)*10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_default["age_group"] = (top_10_default["AGE"]//10)*10


In [76]:
test_original["age_group"] = (test_original["AGE"]//10)*10

In [67]:
top_10_default

Unnamed: 0,cust_id,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,age_group
0,18847,130000.0,1.0,1.0,2.0,33.0,1.0,2.0,2.0,2.0,...,39526.0,41346.0,40630.0,0.0,6000.0,0.0,3000.0,1000.0,1100.0,30.0
5,10205,400000.0,2.0,1.0,1.0,44.0,4.0,3.0,2.0,0.0,...,269278.0,269278.0,1976.0,0.0,0.0,150.0,0.0,3355.0,5688.0,40.0
10,27114,50000.0,2.0,2.0,1.0,34.0,2.0,2.0,2.0,2.0,...,45290.0,48322.0,49094.0,2000.0,1700.0,0.0,3760.0,1700.0,0.0,30.0
17,41,20000.0,1.0,3.0,2.0,28.0,1.0,2.0,2.0,2.0,...,19492.0,19888.0,14087.0,0.0,2860.0,0.0,549.0,441.0,502.0,20.0
21,25759,460000.0,1.0,1.0,1.0,39.0,2.0,2.0,5.0,5.0,...,2495.0,2495.0,2495.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9970,27011,30000.0,2.0,2.0,1.0,67.0,2.0,2.0,0.0,0.0,...,29499.0,30124.0,26855.0,0.0,1500.0,2182.0,1200.0,0.0,2302.0,60.0
9976,26815,20000.0,1.0,2.0,2.0,42.0,1.0,2.0,0.0,0.0,...,13923.0,13405.0,13683.0,0.0,1600.0,2726.0,0.0,643.0,646.0,40.0
9979,3608,20000.0,1.0,3.0,1.0,53.0,3.0,4.0,3.0,2.0,...,15116.0,14582.0,14206.0,0.0,0.0,1200.0,0.0,0.0,2230.0,50.0
9983,13758,360000.0,1.0,1.0,1.0,32.0,-1.0,-1.0,-2.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0


In [87]:
top_10_default.groupby(["age_group", "SEX"])["cust_id"].count()/test_original.groupby(["age_group","SEX"])["cust_id"].count()

age_group  SEX
20.0       1.0    0.140301
           2.0    0.101435
30.0       1.0    0.086635
           2.0    0.071141
40.0       1.0    0.117965
           2.0    0.089438
50.0       1.0    0.126761
           2.0    0.126263
60.0       1.0    0.160714
           2.0    0.115385
70.0       1.0         NaN
           2.0         NaN
Name: cust_id, dtype: float64

In [92]:
top_10_default.groupby(["MARRIAGE"])["cust_id"].count()

MARRIAGE
0.0      2
1.0    485
2.0    484
3.0      9
Name: cust_id, dtype: int64

In [90]:
top_10_default.groupby(["age_group","SEX"])["cust_id"].count()

age_group  SEX
20.0       1.0    149
           2.0    205
30.0       1.0    129
           2.0    159
40.0       1.0    109
           2.0    105
50.0       1.0     45
           2.0     50
60.0       1.0      9
           2.0      6
Name: cust_id, dtype: int64

In [80]:
test_original.groupby(["SEX"])["cust_id"].count()

SEX
1.0    3936
2.0    5953
Name: cust_id, dtype: int64

In [None]:
test_predict.to_csv("./predict_rf.csv", index= False)

In [None]:
test_predict