In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1>Bankruptcy Prediction</h1>

<img src= "https://images.cnbctv18.com/wp-content/uploads/2019/06/bankruptcy-768x512.jpg">

For this notebook, we'll use a simple workflow: variable transformation, variable selection with a voting ensemble, and then a grid search with cross-validation over 3 classification models (logistic regression, random forest and AdaBoost). We then select the best model based on its cross-validated F1 score and apply that model to our test data.

Loading the data and libraries

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_auc_score,roc_curve,accuracy_score,precision_score,recall_score
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import seaborn as sns
import numpy as np
from scipy.spatial.distance import pdist
import math
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import time
import matplotlib.patches as mpatches
import random
from tensorflow import keras
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_validate
from sklearn.model_selection import permutation_test_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import SelectFwe
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge
df=pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv')


<h1>Very quick EDA</h1>

In [None]:
df.head(10)

In [None]:
df.describe()

In [None]:
df.hist(figsize = (35,30), bins = 50)
plt.show()

As we can see in the histograms above, some variables are very skewed, so we'll use a log transform to try to remedy that. First, let's check whether we have any N/As or negative values in our data (log transforms don't handle negative values well).

In [None]:
# Sanity checks before the log transform.
# No N/As in the data
print("N/As: "+str(df.isna().sum().sum()))
# Are there negative values? Summing the boolean mask counts the offending
# cells; summing the selected values themselves would report their total instead.
print("Negative values: "+str((df<0).sum().sum()))

As we can see below, there is severe class imbalance in the data, with positive cases representing only 3.23% of all cases, so we'll use the Synthetic Minority Oversampling Technique (SMOTE).

In [None]:
# Separating dependent and independent variables
TARGET_COL = 'Bankrupt?'
y = df[TARGET_COL]
# Select the features by dropping the target by name (not by column position) and
# take an explicit copy, so later in-place transforms of X can never write back
# into df or trigger SettingWithCopyWarning.
X = df.drop(columns=TARGET_COL).copy()
# y.mean() is a fraction; format it as an actual percentage to match the label.
print("Percentage of positive cases: {:.2%}".format(y.mean()))

We will need to transform skewed variables with a log transform and standardize the data before proceeding to the next steps.

# **LOG TRANSFORM AND STANDARDISATION**

If we look at, say, Total Asset Turnover, we can see that the data is skewed:

In [None]:
# Histogram and skewness of one example feature, taken from X as it stands at this point.
turnover = X[' Total Asset Turnover']
turnover.hist()
plt.show()
turnover_skew = turnover.skew()
print("Skewness score: "+str(turnover_skew))

In [None]:
# credit goes to user Marto93 for this handy function
def log_trans(data, skew_threshold=0.5):
    """Return a copy of ``data`` with strongly skewed columns log1p-transformed.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table. It is left unmodified.
    skew_threshold : float, default 0.5
        A column is transformed when the absolute value of its sample
        skewness is strictly greater than this threshold.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``data``.

    Notes
    -----
    Columns containing any value <= -1 are left untouched, because
    ``np.log1p`` would turn those entries into NaN or -inf.
    """
    # Work on a copy so the caller's frame (possibly a view of a larger
    # frame) is never altered behind their back.
    transformed = data.copy()

    for col in transformed.columns:
        column = transformed[col]
        skew = column.skew()
        # A NaN skew fails the comparison, so such columns are simply skipped.
        if abs(skew) > skew_threshold and column.min() > -1:
            transformed[col] = np.log1p(column)

    return transformed

In [None]:
X=log_trans(X)

In [None]:
# Same histogram and skewness read-out again, now that X has been passed through log_trans.
turnover = X[' Total Asset Turnover']
turnover.hist()
plt.show()
turnover_skew = turnover.skew()
print("Skewness score: "+str(turnover_skew))

Not perfect, but it looks less skewed. Let's now create our train and test sets.

In [None]:
# Hold out a third of the data for the final evaluation.
# Stratify on the target: with so few positive cases, a purely random split can
# leave the train and test sets with noticeably different bankruptcy rates.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=8, stratify=y
)


<h1>Variable Selection - part 1: removing correlated variables</h1>

We will now look at correlated variables and remove variables that are highly (linearly) correlated

In [None]:
# Let's look at correlation between variables and remove highly correlated variables with r>0.97 (more thorough variable selection will be done in the next step)
CORR_THRESHOLD = 0.97
correlations = X_train.corr()
feature_names = list(correlations.columns)

# Visit every unordered pair of features exactly once (each column against the
# columns that come after it) and keep the pairs above the threshold.
# The records are collected in a list and turned into a DataFrame in one go:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas 2.x. This also avoids rebinding the builtin name `round` as a counter.
correlated_records = [
    {'var1': first, 'var2': second, 'corr': correlations.loc[second, first]}
    for position, first in enumerate(feature_names)
    for second in feature_names[position + 1:]
    # Showing variables with correlation higher than 97%
    if correlations.loc[second, first] > CORR_THRESHOLD
]
correlated_vars = pd.DataFrame(correlated_records, columns=['var1', 'var2', 'corr'])
print(correlated_vars)

In [None]:
# Let's drop some variables
# Hand-picked columns to discard (presumably one member of each highly
# correlated pair printed by the previous cell - verify against that output).
REDUNDANT_COLUMNS = [
    ' ROA(C) before interest and depreciation before interest',
    ' ROA(A) before interest and % after tax',
    ' Realized Sales Gross Margin',
    ' Pre-tax net Interest Rate',
    ' Current Liabilities/Equity',
    ' Current Liabilities/Liability',
    ' Net Value Growth Rate',
    ' Continuous interest rate (after tax)',
    ' Net Value Per Share (B)',
    ' Net Value Per Share (C)',
    ' Gross Profit to Sales',
    ' Operating Profit Per Share (Yuan ¥)',
    ' Per Share Net profit before tax (Yuan ¥)',
    ' After-tax Net Profit Growth Rate',
    ' After-tax net Interest Rate',
    ' Borrowing dependency',
]
# One drop per frame instead of one per column: no intermediate copies, and a
# misspelled name raises before either frame has been partially modified.
X_train = X_train.drop(columns=REDUNDANT_COLUMNS)
X_test = X_test.drop(columns=REDUNDANT_COLUMNS)
print(X_train.shape)
print(X_test.shape)

<h1>Variable Selection - part 2: a voting ensemble</h1>

In [None]:
# Score sheet for the feature-selection vote: one column per feature in X_train,
# plus a trailing 'method' column naming the selection method each row belongs to.
vote=pd.DataFrame(columns=X_train.columns)
# Assigning the six method names adds the 'method' column as the last column.
vote['method']=['mutual_info','rfe','ctree','rforest','Fdr','permutation']
# Initialise every cell except the last column ('method') to 0.
vote.iloc[:,:-1]=0

In [None]:
# Each method below scores every feature; the scores are turned into ranks
# (1 = most important) so that all rows of `vote` are on the same scale as
# RFE's `ranking_`, where 1 is also the best feature.
#
# np.argsort(scores) on its own returns the *positions* that would sort the
# array, not the rank of each feature. Applying argsort twice to the negated
# scores gives, for every feature in its original column order, its rank with
# the highest score ranked 1. NaN scores sort last and get the worst ranks.
feature_cols = vote.columns[:-1]  # every column except the trailing 'method'

print('mutual_info')
# Mutual information between each feature and the bankruptcy flag
fs = SelectKBest(score_func=mutual_info_classif,k=20)
fs.fit(X_train, y_train==1)
vote.loc[vote.method=='mutual_info', feature_cols] = np.argsort(np.argsort(-fs.scores_)) + 1

print('Fdr')
# SelectFdr is used with its default score function; only the fitted scores are
# needed here, so fit() is enough (the transformed matrix was never used).
fs = SelectFdr(alpha=0.01)
fs.fit(X_train, y_train)
vote.loc[vote.method=='Fdr', feature_cols] = np.argsort(np.argsort(-fs.scores_)) + 1

print('permutation')
# Permutation importance of a ridge regression fitted on the 0/1 target
model = Ridge(alpha=1e-2)
model.fit(X_train, (y_train==1))
# Seeded so that the permutation ranking is reproducible between runs.
results = permutation_importance(model, X_train, y_train, random_state=8)
vote.loc[vote.method=='permutation', feature_cols] = np.argsort(np.argsort(-results.importances_mean)) + 1

print('rfe')
# Recursive feature elimination down to a single feature; ranking_ is already
# expressed as 1 = best, so it is stored unchanged.
model = LogisticRegression(max_iter=20000)
rfe = RFE(model, n_features_to_select=1)
rfe = rfe.fit(X_train, y_train)
vote.loc[vote.method=='rfe', feature_cols] = rfe.ranking_

In [None]:
# Random-forest vote: rank the features by impurity-based importance.
model = RandomForestClassifier(max_depth=10, random_state=0)
# fit the model
model.fit(X_train, y_train)
# get importance
importance = model.feature_importances_
# Convert importances to ranks (1 = most important). A single argsort would
# return sort positions rather than per-feature ranks, so argsort is applied
# twice to the negated importances. All columns but the last ('method') are set.
vote.loc[vote.method=='rforest', vote.columns[:-1]] = np.argsort(np.argsort(-importance)) + 1

In [None]:
# Decision-tree vote: rank the features by impurity-based importance.
model = DecisionTreeClassifier(random_state=0)
# fit the model
model.fit(X_train, y_train)
# get importance
importance = model.feature_importances_
# Convert importances to ranks (1 = most important). A single argsort would
# return sort positions rather than per-feature ranks, so argsort is applied
# twice to the negated importances. All columns but the last ('method') are set.
vote.loc[vote.method=='ctree', vote.columns[:-1]] = np.argsort(np.argsort(-importance)) + 1

In [None]:
vote

**FINAL VOTE, AND THE WINNER IS...**

We will use the Nauru variant of a Borda count to choose our top 9 features out of 79: for each method, a feature ranked $r$ receives $1/r$ points, and the points are summed across methods.

In [None]:
# Number of features kept for the modelling stage.
N_SELECTED_FEATURES = 9

# Each method awards 1/rank points to every feature; the points are summed per
# feature over all methods and the highest totals win.
# The vote table holds its ranks in object-dtype columns, so cast to float first:
# otherwise the summed Series can stay object-typed and nlargest() rejects it.
final_vote = 1 / vote.iloc[:, :-1].astype(float)
vote_results = final_vote.sum(axis=0).nlargest(N_SELECTED_FEATURES)
print(vote_results)

In [None]:
selection=vote_results.index
print(selection)

<h1>Defining the pipeline</h1>

<h2>Logistic Regression</h2>

In [None]:
from sklearn.model_selection import cross_val_score
import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import make_pipeline

# Hyper-parameter grid for the logistic-regression pipeline.
C_GRID = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# 'elasticnet' is only valid together with an l1_ratio. Without one every
# elastic-net candidate fails to fit and is scored NaN, which the warning filter
# below hides. It therefore gets its own sub-grid that supplies l1_ratio.
params = [
    {"logisticregression__penalty": ['l2','l1'],
     'logisticregression__C': C_GRID},
    {"logisticregression__penalty": ['elasticnet'],
     'logisticregression__l1_ratio': [0.5],
     'logisticregression__C': C_GRID},
]
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including fit-failure warnings from the grid searches; consider narrowing it.
warnings.filterwarnings("ignore")
# SMOTE sits inside the pipeline so it only resamples the training part of each
# CV fold; it is seeded so the cross-validation scores are reproducible.
pipe = make_pipeline(SMOTE(sampling_strategy=0.5, random_state=8), StandardScaler(), LogisticRegression(solver='saga',max_iter=5000))
grid_search_logit = GridSearchCV(pipe, param_grid=params,cv=3,scoring='f1')
grid_search_logit.fit(X_train[selection],y_train)

In [None]:
grid_search_logit.best_score_ 

<h2>Random Forest</h2>

In [None]:
# Hyper-parameter grid for the random-forest pipeline.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated candidates) and newer scikit-learn releases
# reject it.
params = {"randomforestclassifier__max_depth": [10,20,40,80],
          'randomforestclassifier__criterion':['entropy','gini'],
          'randomforestclassifier__max_features': ['sqrt', 'log2'],
          'randomforestclassifier__n_estimators': [5,10,15],
          'randomforestclassifier__class_weight' : ['balanced','balanced_subsample'],
          'randomforestclassifier__bootstrap' : [True,False]
         }
# Both stochastic steps are seeded; otherwise the CV scores (and therefore the
# comparison between models) change from run to run.
pipe = make_pipeline(SMOTE(sampling_strategy=0.5, random_state=8), StandardScaler(), RandomForestClassifier(random_state=8))
grid_search_rf = GridSearchCV(pipe, param_grid=params,cv=3,scoring='f1')
grid_search_rf.fit(X_train[selection],y_train)

In [None]:
grid_search_rf.best_score_

# **ADABoost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameter grid for the AdaBoost pipeline.
params = {'adaboostclassifier__learning_rate' : [0.1,0.01,0.001],
          'adaboostclassifier__n_estimators' : range(1,10)}

# SMOTE is seeded like the classifier, so the cross-validation scores are
# reproducible and comparable with the other grid searches.
pipe = make_pipeline(SMOTE(sampling_strategy=0.5, random_state=8), StandardScaler(), AdaBoostClassifier(random_state=8))
grid_search_ada = GridSearchCV(pipe, param_grid=params,cv=3,scoring='f1')
grid_search_ada.fit(X_train[selection],y_train)

In [None]:
grid_search_ada.best_score_

As we can see above, we got the highest cross-validation score for our Random Forest model. Let's see how it performs on our test set.

In [None]:
# Evaluate the grid-searched Random Forest pipeline on the held-out test set.
y_pred = grid_search_rf.predict(X_test[selection])
# Predicted probability of the positive class (second column of predict_proba)
y_pred_proba = grid_search_rf.predict_proba(X_test[selection])[:, 1]

# showing output:
cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print(classification_report(y_test, y_pred))

# create heatmap
class_names = [0, 1]  # name of classes
fig, ax = plt.subplots()
# Labelling the frame with the class names lets seaborn draw the tick labels;
# ticks set on the axes before the heatmap call would just be overwritten.
sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
            annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

print("Accuracy:",accuracy_score(y_test, y_pred))
print("Precision:",precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
# F1 is the metric the models were selected on, so report it explicitly.
print("F1:",f1_score(y_test, y_pred))

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc="+str(auc))
ax.set_title('ROC curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

# precision recall curve
# NOTE(review): plot_precision_recall_curve no longer exists in recent
# scikit-learn releases (PrecisionRecallDisplay.from_estimator replaces it);
# switch both this call and its import when upgrading.
disp = plot_precision_recall_curve(grid_search_rf, X_test[selection], y_test)
# Fill in the AP placeholder; previously the raw '{0:0.2f}' ended up in the title.
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(disp.average_precision))
plt.show()

# TODO: add Moody's accuracy ratio (AR). Reference: "Rating Methodology: Moody's
# Public Firm Risk Model: A Hybrid Approach to Modeling Short Term Default Risk",
# Moody's Investors Service, March 2000. The AC ratio is somewhat related to the
# Kolmogorov-Smirnov test. See also (Vassalou and Xing, 2004).
# Pair each predicted probability with its true label. The probabilities come
# back as a plain array, so give them y_test's index; with a default 0..n-1
# index the two columns would be aligned on mismatched labels and filled with NaN.
y_pred_proba = pd.Series(y_pred_proba, index=y_test.index)
data = pd.DataFrame({'y_pred_proba': y_pred_proba,'y_test':y_test})

# Printing the mean and median of probabilities versus real incidence
print('Mean probability: '+str(np.mean(y_pred_proba)))
print('Median probability: '+str(np.median(y_pred_proba)))
print('Real incidence: '+str(y_test.sum()/len(y_test)))

We get a final F1 score of 0.43 on our test set.