In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. deprecation or convergence
# notices) that may be worth seeing while developing.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so wide/long frames are shown in full.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)


In [None]:
# Load the bankruptcy dataset and preview its first rows.
csv_path = '/kaggle/input/company-bankruptcy-prediction/data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Separate the target column from the features.
# The guard makes this cell safe to re-run: once 'Bankrupt?' has been removed from
# `data`, executing the cell again would otherwise raise a KeyError.
target_col = 'Bankrupt?'
if target_col in data.columns:
    Y = data[target_col]
    data = data.drop(columns=[target_col])

In [None]:
# Number of missing values per feature column, as a plain array.
data.isna().sum().to_numpy()

# 1) Feature Selection

# 1.1) Remove low-variance features

In [None]:
# Divide every feature by its own mean so variances are comparable across columns,
# then rank the features by the variance of that mean-scaled version.
data_mean = data.mean(axis=0)
mean_scaled = data / data_mean
variance = mean_scaled.var(axis=0, ddof=0)  # ddof=0: population variance, matching np.var
var_df = variance.to_frame(name='variance')

green_cmap = sns.light_palette('green', as_cmap=True)
var_df.sort_values(by='variance').style.background_gradient(green_cmap)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter on the mean-scaled features and keep only the columns
# whose scaled variance exceeds the threshold of 5.0.
data_over_mean = data / data_mean
vt = VarianceThreshold(threshold=5.0)
vt.fit(data_over_mean)

vt_transform = vt.transform(data_over_mean)

In [None]:
# Boolean mask over the original columns: True where the variance filter kept the feature.
kept_mask = vt.get_support()
high_var_cols = data.columns[kept_mask]
low_var_cols = data.columns[~kept_mask]

print("Removed cols :", len(low_var_cols))
print("Remaining cols :", len(high_var_cols))

# 1.2) Inspect Correlation

In [None]:
# Pairwise Pearson correlation between the features that survived the variance filter.
# rowvar=False: each column of vt_transform is a variable, each row an observation.
# The matrix is labelled with the surviving column names (instead of bare integer
# positions) so that highly correlated pairs can be identified from the heatmap.
kept_feature_names = data.columns[vt.get_support()]
corr_mat = pd.DataFrame(
    np.corrcoef(vt_transform, rowvar=False),
    index=kept_feature_names,
    columns=kept_feature_names,
)

corr_mat.style.background_gradient(sns.light_palette('blue', as_cmap=True))

# 1.3) Recursive Feature Elimination <br>
Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by random-forest feature importances,
# repeated until 18 features remain.
# class_weight gives class 1 (bankrupt) twice the weight of class 0.
# random_state is fixed so the selected feature set is the same on every run;
# without it the forest (and therefore the selection) changes between executions.
# NOTE(review): the selector is fitted on the full dataset, before the train/test
# split performed later in the notebook, so the test rows influence which features
# are kept -- consider fitting it on the training split only.
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=300, class_weight={0:1,1:2}, random_state=42),
    n_features_to_select=18,
    verbose=1,
).fit(vt_transform, Y.values)

In [None]:
# Keep only the columns RFE marked as selected, then confirm the resulting shape.
selected_mask = rfe.support_
vt_transform_rfe = vt_transform[:, selected_mask]
vt_transform_rfe.shape

# 2) Predictive models

This dataset is **highly imbalanced** because only a small fraction of the companies went bankrupt. <br>
We therefore care most about the **recall** score, because we do not want to misclassify companies that actually go bankrupt.<br>

"This company went bankrupt, but we predicted that it was safe" -> We don't want this to happen.

In [None]:
# Number of rows in each class of the target, to show how imbalanced it is.
Y.value_counts()

# 2.1) Build the models

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import xgboost  

# Fixed seed so the split and every fitted model are reproducible across runs.
SEED = 42

# Hold out 25% of the rows; stratify so both splits keep the same class ratio.
X_train, X_test,y_train, y_test = train_test_split(
    vt_transform_rfe, Y.values, test_size=0.25, stratify=Y.values, random_state=SEED
)
# Standardization statistics are learned from the training split only.
scaler = StandardScaler().fit(X_train)

models = dict()

# class_weight={0:1,1:3} makes errors on class 1 (bankrupt) three times as costly.
models['Random Forest'] = RandomForestClassifier(n_estimators=300, class_weight={0:1,1:3}, random_state=SEED)
models['Logreg'] = LogisticRegression(penalty='elasticnet',  class_weight={0:1,1:3}, solver='saga', l1_ratio=0.7, random_state=SEED)
models['GradientBoost'] = GradientBoostingClassifier(n_estimators=300, random_state=SEED)
models['AdaBoost'] = AdaBoostClassifier(n_estimators=300, random_state=SEED)
models['XGBoost'] = xgboost.XGBClassifier(random_state=SEED)

for model in models:
    # Only the logistic regression is trained on standardized features.
    # Always transform X_train itself: the previous code transformed `train`, which
    # only existed because an earlier loop iteration happened to assign it.
    if model == 'Logreg':
        train = scaler.transform(X_train)
    else:
        train = X_train
    models[model].fit(train, y_train)
    print(model + ' : fit')


# 2.2) Performance in train set

In [None]:

# Confusion matrix and classification report of every model on the training split.
banner = '------------------------'
for x in models:
    model = models[x]
    # The logistic regression was fitted on standardized inputs; the others on raw ones.
    train = scaler.transform(X_train) if x == 'Logreg' else X_train

    print(banner+x+banner)
    y_train_pred = model.predict(train)
    print(confusion_matrix(y_true=y_train, y_pred=y_train_pred))
    print(classification_report(y_true=y_train, y_pred=y_train_pred))
    

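We see that XGBoost does a very good job on the training set!! (Training scores alone do not show how well a model generalizes, so the test-set results below are what matter.)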

# 2.3) Performance in test set

In [None]:
# Confusion matrix and classification report of every model on the held-out test split.
divider = '------------------------'
for x in models:
    # Standardize the inputs only for the model that was trained on standardized data.
    test = scaler.transform(X_test) if x == 'Logreg' else X_test
    print(divider+x+divider)
    model = models[x]
    y_test_pred = model.predict(test)
    arg_test = dict(y_true=y_test, y_pred=y_test_pred)
    print(confusion_matrix(**arg_test))
    print(classification_report(**arg_test))

XGBoost has the highest recall!!

In [None]:
# English translation of the note below (the string itself is left unchanged):
#   Precision = we want our predictions of 1 to be correct / the model to predict 1 accurately.
#   Recall    = we do not want to get the actual 1s wrong.
#   "This company will go bankrupt (1) but we wrongly predicted it will not (0)"
#     = a missed 1 = what we want to avoid = we need high recall.
#   This model has very low recall = it is a poor model.
'''
Precision = เราอยากทาย 1 ให้ถูก/ ให้โมเดลทาย 1 แม่นๆ <br>
Recall = เราไม่อยากทาย 1 ผิด

บริษัทนี้จะล้มละลาย(1) แต่เราทายผิดว่ามันไม่ล้ม(0) = ทาย 1 ผิด = ไม่อยากให้เกิดสิ่งนี้ขึ้น = ต้องการ recall สูงๆ 

โมเดลนี้มี recall ต่ำมาก = เป็นโมเดลที่ไม่ดี
'''

# 2.4) Take care of XGB

### Lower the probability threshold to improve Recall score

In [None]:
# Re-evaluate XGBoost with a lowered decision threshold: a row is flagged as
# class 1 as soon as its predicted probability for class 1 exceeds 0.1.
model = models['XGBoost']

for heading, features, labels in (("TEST\n", X_test, y_test), ("TRAIN\n", X_train, y_train)):
    class1_prob = model.predict_proba(features)[:, 1]
    flagged = class1_prob > 0.1

    print(heading)
    print(confusion_matrix(y_true=labels, y_pred=flagged))
    print(classification_report(y_true=labels, y_pred=flagged))

In [None]:
# Elimination rank of each feature that was passed to RFE.
rfe.ranking_ 
# Sample output pasted from an earlier run:
#array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1])
#--> 1 means the feature was kept (not eliminated), 4 means it was eliminated first, 3 means it was eliminated next

# Further pasted outputs (presumably the support mask and ranking from a run that kept only two features -- TODO confirm):
#array([False, False, False, False, False, False,  True, False, False, False, False, False, False, False, False, False,  True, False, False, False, False])
#array([10, 17, 14, 15, 11,  9,  1, 16, 19,  2, 13, 20, 18,  4,  7, 12,  1,  3,  5,  6,  8])

### Cross validation score on Recall score

In [None]:
from sklearn.metrics import make_scorer, recall_score

# 5-fold cross-validated recall of the XGBoost model on the training split.
recall_scorer = make_scorer(recall_score)
xgb_estimator = models['XGBoost']
cv_score = cross_val_score(
    estimator=xgb_estimator,
    X=X_train,
    y=y_train,
    scoring=recall_scorer,
    cv=5,
)

In [None]:
# Show the per-fold recall scores followed by their average.
for caption, shown_value in (('cv_score :', cv_score), ('mean :', cv_score.mean())):
    print(caption, shown_value)

# 2.5) ROC curve

In [None]:
from sklearn.metrics import roc_curve

# One ROC curve per fitted model, all evaluated on the held-out test split.
fig, ax = plt.subplots(figsize=(13, 6))

for m in models:
    # The logistic regression was trained on standardized features, so it has to be
    # scored on standardized features too; feeding it raw X_test distorts its curve.
    roc_inputs = scaler.transform(X_test) if m == 'Logreg' else X_test
    class1_scores = models[m].predict_proba(roc_inputs)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, class1_scores)
    ax.plot(fpr, tpr, label=m)

ax.set_xlabel('False-Positive rate')
ax.set_ylabel('True-Positive rate')
ax.set_title('ROC curves on the test set')
ax.legend()
plt.show()

# 2.6) Randomized search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate XGBoost hyper-parameters; 50 combinations are sampled from this grid.
# NOTE(review): per the XGBoost parameter docs, 'gradient_based' sampling is limited to
# GPU histogram training -- confirm that those candidates actually fit in this environment.
params = {'eta':[0.2,0.3,0.4],
         'max_depth':[5,6,7],
         'sampling_method':['uniform','gradient_based'],
         'lambda':[1,1.5],
         'alpha':[0,0.5],
         }

# Each candidate is scored by 5-fold cross-validated recall.
# random_state fixes which candidates are drawn, so the reported best score and
# best parameters can be reproduced on a re-run.
search = RandomizedSearchCV(
    estimator=models['XGBoost'],
    param_distributions=params,
    n_iter=50,
    scoring=recall_scorer,
    cv=5,
    verbose=1,
    random_state=42,
)

In [None]:
# Run the randomized search on the training split.
search.fit(X=X_train, y=y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
for caption, found in (("Best cv score :", search.best_score_), ("Best params :", search.best_params_)):
    print(caption, found)

We see that the cross-validated recall score has improved.

In [None]:
# Register the best estimator found by the search and evaluate it on both splits,
# using the same lowered decision threshold (probability of class 1 > 0.1).
searched_name = 'XGB_searched'
models[searched_name] = search.best_estimator_

model = models[searched_name]
for split_label, features, labels in (('TEST', X_test, y_test), ('TRAIN', X_train, y_train)):
    print(split_label)
    # The banner names the model explicitly; it previously printed `x`, a loop variable
    # leaked from an earlier cell, which labelled these results as a different model.
    print('------------------------' + searched_name + '------------------------')
    flagged = model.predict_proba(features)[:, 1] > 0.1
    print(confusion_matrix(y_true=labels, y_pred=flagged))
    print(classification_report(y_true=labels, y_pred=flagged))

Recall on the test set has also improved.