## 0. Notebook Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from sklearn.metrics import matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Silence every warning emitted from this point on.
warnings.filterwarnings("ignore")

# Path of the dataset CSV that is loaded in the data inspection section.
datafile="/kaggle/input/company-bankruptcy-prediction/data.csv"

## 1. Data inspection

In [None]:
# Load the whole CSV into a DataFrame; the bare name on the last line displays it.
rawdata=pd.read_csv(datafile)
rawdata

In [None]:
# Display the summary statistics of the raw dataset.
rawdata.describe()

## 2. Data cleaning

### 2.1 Undersampling

Despite being a huge dataset, it's very imbalanced.

In [None]:
# Number of rows for each value of the target column "Bankrupt?".
counts=rawdata.groupby("Bankrupt?")["Bankrupt?"].count()
counts

This will make the positive bankrupt cases more difficult to identify. To deal with it, undersampling is applied to the dataset: All the positives are preserved and most of the negatives are dropped so we can set the fraction of positives of the new balanced dataset. In this case, we set the positive fraction to 40%. 

In [None]:
def undersample(data, class_column, minor_class_frac, random_state=None):
    """Undersample the largest class so the smallest one reaches a given share.

    Every row of the least frequent class is kept, and rows of the most
    frequent class are sampled without replacement so that the least frequent
    class makes up ``minor_class_frac`` of the returned rows. Rows belonging
    to any other class are not part of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to rebalance.
    class_column : str
        Name of the column holding the class labels.
    minor_class_frac : float
        Desired share of the least frequent class in the result.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``DataFrame.sample`` so the draw can be reproduced.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Rows of the least frequent class followed by the sampled rows of the
        most frequent class; original index labels are preserved.

    Raises
    ------
    ValueError
        If ``class_column`` holds fewer than two distinct classes.
    """
    counter = data.groupby(class_column)[class_column].count()
    if len(counter) < 2:
        raise ValueError(
            "undersample needs at least two classes in column %r" % (class_column,)
        )

    minor_class = counter.idxmin()
    # Look for the largest class among the remaining labels so that tied
    # class sizes still yield two distinct classes instead of the same one twice.
    major_class = counter.drop(minor_class).idxmax()

    minor_class_data = data[data[class_column] == minor_class]
    major_class_data = data[data[class_column] == major_class]

    ratio = counter[minor_class] / counter[major_class]

    # Share of the majority rows to keep. It is capped at 1 because sampling
    # without replacement cannot return more rows than exist; in that case the
    # minority share already exceeds the request and every row is kept.
    major_class_undersampling = min((1. / minor_class_frac - 1.) * ratio, 1.)

    major_class_data = major_class_data.sample(frac=major_class_undersampling,
                                               random_state=random_state)

    return pd.concat([minor_class_data, major_class_data])

balancedata=undersample(rawdata,"Bankrupt?", minor_class_frac=0.4)
# The discarded rows are the raw rows whose index label was not sampled.
# Selecting them by index (instead of concatenating and dropping duplicated
# rows) does not depend on row values, so genuine duplicate rows in the raw
# data are not lost.
discardata=rawdata.drop(index=balancedata.index)

# Class counts of the balanced dataset.
balancedata.groupby("Bankrupt?")["Bankrupt?"].count()

In [None]:
# Display the balanced dataset.
balancedata

### 2.2 Logarithmic transformation to variables with extreme magnitude order differences

Some variables take values spanning very different orders of magnitude: most of them have some values around 10^(-4) and others around 10^9. Both ranges contain details in their distributions that we don't want to miss. The boxplots of such variables are shown below.

In [None]:
# For every feature whose maximum exceeds 10, draw two side-by-side boxplots per
# class: one with the rows whose value is <= 10 and one with the rows whose value is > 10.
for column in balancedata.columns:
    if column=="Bankrupt?":
        continue
    m=balancedata[column].max()
    if not m>10:
        continue
    panels=((balancedata[balancedata[column]<=10],"Lower order values"),
            (balancedata[balancedata[column]>10],"Upper order values"))
    fig,ax_=plt.subplots(nrows=1,ncols=2,figsize=(10,5))
    for axis,(subset,title) in zip(ax_,panels):
        sns.boxplot(data=subset,
                    y=column,
                    x="Bankrupt?",ax=axis)
        axis.set_title(title)
    plt.show()

We want to "squeeze" the values while keeping the details of both ranges. To do so, we apply the following transformation:

\begin{equation}
x'=\ln{\left(x^{\frac{1}{k}}+1 \right)}
\end{equation}

where

\begin{equation}
k=\frac{\ln{(x_{\text{max}})}}{\ln{(e-1)}}
\end{equation}

so when x=0, x'=0 and when x=x_max, x'=1. Also the greater the value, the greater the squeezing effect.

Now let's show an example. We have picked a variable with this issue. First we apply the logarithmic transformation and then the standard scaling, to show what the machine learning algorithms will get as input.

In [None]:
# Worked example on one wide-range feature: original values, their logarithmic
# transformation, and the transformed values after standard scaling.
x=balancedata[" Operating Expense Rate"]
exponent=np.log(np.e-1)/np.log(x.max())
f=np.log(x**exponent+1)

d=pd.DataFrame({"Original":x,"LogTrans":f})
stdscaler=StandardScaler()
d["LogTrans + StdScaling"]=stdscaler.fit_transform(f.values.reshape(-1, 1))
d.tail(50)

In [None]:
# 2x2 grid of boxplots for one feature. Top row: raw values, split into the rows
# with value <= 10 and the rows with value > 10. Bottom row: the same two row
# subsets, plotted with the logarithmically transformed values.
feature=" Operating Expense Rate"
m=balancedata[feature].max()
lowrows=balancedata[balancedata[feature]<=10]
highrows=balancedata[balancedata[feature]>10]
# Transformed values for all rows; each plot pairs them with the class labels of its own subset.
logvalues=balancedata[feature].apply(lambda x: np.log(x**(np.log(np.e-1)/np.log(m))+1))

fig,ax_=plt.subplots(nrows=2,ncols=2,figsize=(10,10))
panels=((ax_[0,0],lowrows,feature,"Lower order values"),
        (ax_[0,1],highrows,feature,"Upper order values"),
        (ax_[1,0],lowrows,logvalues,"Lower order values (LogTransformed)"),
        (ax_[1,1],highrows,logvalues,"Upper order values (LogTransformed)"))
for axis,subset,yvalues,title in panels:
    sns.boxplot(data=subset,
                y=yvalues,
                x="Bankrupt?",
                ax=axis)
    axis.set_title(title)

plt.show()

It represents quite well what we wanted to achieve. The price to pay is distorting the general value distribution, so the algorithms that assume linear feature-output dependencies will be affected, but the ones that depend mainly on the relative differences between the positive and negative distributions will perform even better, since they will be more sensitive to the details of the lower-order value distribution.

Now the value of k for each affected variable will be shown, along with the maximum value, the mean and the median of each one.

In [None]:
# Collect, for every column, its median, mean and maximum together with
# k = ln(max) / ln(e - 1), the divisor used in the exponent of the transformation.
log_e_minus_1=np.log(np.e-1)
dinfo={}
for col in balancedata.columns:
    series=balancedata[col]
    info={"median":series.median(),
          "mean":series.mean(),
          "max":series.max()}
    info["k"]=np.log(info["max"])/log_e_minus_1
    dinfo[col]=info
datainfo=pd.DataFrame.from_dict(dinfo, orient="index")
# Display only the columns whose maximum is above 1.
datainfo.loc[datainfo["max"]>1]

## 3. Data analysis

Now the distributions of the regular variables and the transformed ones will be shown.

In [None]:
# Per-class distribution of every feature. Features whose maximum exceeds 10 are
# plotted after the logarithmic transformation and titled "LogScaled".
for column in balancedata.columns:
    if column=="Bankrupt?":
        continue
    peak=balancedata[column].max()
    logscaled=peak>10
    if logscaled:
        m=peak
        values=balancedata[column].apply(lambda x: np.log(x**(np.log(np.e-1)/np.log(m))+1))
    else:
        values=balancedata[column]
    sns.displot(data=balancedata,
                x=values,
                hue="Bankrupt?", element="step", stat="probability",common_norm=False)
    if logscaled:
        plt.title("LogScaled")

Due to the nature of the variables, removing the outliers will reduce dramatically the data size. Let's remove the outliers using the quantile criteria:

\begin{equation}
Q_1 - q \cdot Q_{13} \leq x \leq Q_3 + q \cdot Q_{13}
\end{equation}

where

\begin{equation}
Q_{13}=Q_3-Q_1
\end{equation}

for q=3 (removing just the extreme outliers).

In [None]:
def outlier_remover(columnseries,q):
    """Return the values of ``columnseries`` that lie inside its quartile fences.

    The fences are ``Q1 - q*(Q3 - Q1)`` and ``Q3 + q*(Q3 - Q1)``, and both ends
    are inclusive. Values outside the fences, and missing values, are left
    out; the surviving values keep their original index labels.

    Parameters
    ----------
    columnseries : pandas.Series
        Numeric values to filter.
    q : float
        Multiple of the interquartile range tolerated beyond each quartile.

    Returns
    -------
    pandas.Series
        The subset of ``columnseries`` within the fences.
    """
    # Both quartiles come from one quantile call rather than two full
    # describe() summaries that also compute count, mean, std, min and max.
    Q1,Q3=columnseries.quantile([0.25,0.75])
    Q13=Q3-Q1
    lowerbound=Q1-q*Q13
    upperbound=Q3+q*Q13
    return columnseries[columnseries.between(lowerbound,upperbound)]

def outlier_clean(dataframe, exception_col=None,quo=1.5):
    """Drop the rows of ``dataframe`` that hold an outlier in any filtered column.

    Each column not listed in ``exception_col`` is passed through
    ``outlier_remover`` with ``q=quo``. A row is kept only if none of its
    values was removed; rows that already contain missing values are dropped
    as well, because the final step is ``dropna``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to clean.
    exception_col : collection of column names, optional
        Columns copied as they are, without outlier filtering. Defaults to
        no exceptions.
    quo : float or "inf", default 1.5
        Interquartile-range multiple handed to ``outlier_remover``. The string
        ``"inf"`` or a numeric infinity disables the filtering.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows. When the filtering is disabled, the input object
        itself is returned, not a copy.
    """
    if exception_col is None:
        exception_col=()

    # A numeric infinity is treated like the "inf" sentinel: computing fences
    # with it would give NaN bounds whenever the interquartile range is 0 and
    # every row would be dropped.
    if isinstance(quo,str):
        skip_filter=quo=="inf"
    else:
        skip_filter=bool(np.isinf(quo))
    if skip_filter:
        return dataframe

    newdataframe=pd.DataFrame()
    for columnname in dataframe.columns:
        if columnname in exception_col:
            newdataframe[columnname]=dataframe[columnname]
        else:
            # Removed values come back as NaN after index alignment.
            newdataframe[columnname]=outlier_remover(dataframe[columnname],q=quo)
    return newdataframe.dropna()
    
# Remove the extreme outliers (q=3) of every feature; the target column is
# exempt from the outlier filter.
cleandata=outlier_clean(balancedata, exception_col=["Bankrupt?"] ,quo=3)
cleandata.describe()

The data size has been reduced from 550 samples to just 90, which is too few to train on, so we have to use the full balanced dataset for the learning step.

## 4. Data learning

### Methodology

Using first the balanced data with the logarithmic transformation:
1) Split the data into k+1 sets. Then use the first k sets to apply a k-fold cross validation evaluation with F1-Score as the evaluation score with several algorithms with different hyperparameters. In this case, we will use k=5.

2) Get also the Accuracy, the Matthews Correlation Coefficient and the F2-Score of the best algorithms.

3) Use the remaining set to test the best algorithms found with the KFCV and also get the scores listed in 2).

4) Finally use the discarded negatives to do another test to the algorithms and get the accuracy, which is the only useful score since all the real outputs are negative so there will be just true negatives and false positives.

Then we repeat the same steps for the data set without the logarithmic transformation and finally we compare the results.

In [None]:
# Dataset used for learning (the balanced data, without outlier removal).
learndata=balancedata
# Summary rows (one dict per algorithm and LogTrans flag) appended by the SCORES cells.
finalresults=[]

### Data with logarithmic transformation

In [None]:
# Separate the name of the target column from the names of the feature columns.
data_col=list(learndata.columns)
data_col.remove("Bankrupt?")
y_col="Bankrupt?"
X_col=data_col

# Copy the features, then overwrite every column whose maximum in the learning
# data exceeds 10 with its logarithmic transformation. The discarded rows are
# transformed with the maximum of the learning data as well.
X_data=learndata[X_col].copy()
X_discardata=discardata[X_col].copy()
for col in X_col:
    maxv=learndata[col].max()
    if maxv>10:
        logtrans=lambda x: np.log(x**(np.log(np.e-1)/np.log(maxv))+1)
        X_data[col]=learndata[col].apply(logtrans)
        X_discardata[col]=discardata[col].apply(logtrans)
y_data=learndata[y_col]
y_discardata=discardata[y_col]

# Number of folds of the K-Fold CV and seed of the train/test split.

k=5
r=29

# Split into CV training data and testing data. The test size equals the validation size of the K-Fold CV:
# the data is cut into K+1 pieces, K of them feed the K-Fold CV and the remaining one is kept for the final test.

X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=1/(k+1), random_state=r)
X_discarded=X_discardata.values
y_discarded=y_discardata.values

# Standard scaling of the input data X, fitted on the training part only.

Xscaler=StandardScaler().fit(X_train)
X_train,X_test,X_discarded=(Xscaler.transform(part) for part in (X_train,X_test,X_discarded))

# Containers that store the evaluation results of each algorithm.

modelScore_train,modelScore_test,modelScore_discarded={},{},{}
modelScore_KFCV,model_KFCV={},{}

modelConfussionMatrix_train,modelConfussionMatrix_test,modelConfussionMatrix_discarded={},{},{}

# One frame per data split: the true labels first, one prediction column per model added later.
bankruptevents_train=pd.DataFrame({"Bankrupt?":y_train})
bankruptevents_test=pd.DataFrame({"Bankrupt?":y_test})
bankruptevents_discarded=pd.DataFrame({"Bankrupt?":y_discarded})

# Scorers setup; F1 is the score used to pick the best hyperparameters.
scorersCV={
    "Acc": make_scorer(accuracy_score),
    "MCC": make_scorer(matthews_corrcoef),
    "F1": make_scorer(fbeta_score, beta=1),
    "F2": make_scorer(fbeta_score, beta=2),
}
scorerCVkey="F1"

# Definition of the algorithm evaluation function.

def tuned_model(model, param_grid, modelname, results):
    """Grid-search ``model`` over ``param_grid`` and record the best candidate.

    The search uses the module-level ``X_train``/``y_train``, ``k`` folds and
    the scorers in ``scorersCV``; the candidate with the best
    ``scorerCVkey`` score is refitted. The refitted estimator is stored in
    ``model_KFCV[modelname]`` and its mean cross-validation scores, rounded to
    three decimals, in ``modelScore_KFCV[modelname]``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator.
    param_grid : dict
        Maps parameter names to the lists of values to try.
    modelname : str
        Key under which the results are stored.
    results : bool
        When true, print the table of all candidates sorted by rank.

    Returns
    -------
    estimator
        The refitted best estimator.
    """
    tuner=GridSearchCV(model,param_grid=param_grid,scoring=scorersCV,cv=k,refit=scorerCVkey)
    tuner.fit(X_train,y_train)
    best_model=tuner.best_estimator_
    model_KFCV[modelname]=best_model
    print(f"Best model: {best_model}")

    # Mean CV scores of the winner, read through best_index_ rather than by
    # comparing every row's params dict against best_params_.
    best_index=tuner.best_index_
    modelScore_KFCV[modelname]={
        scorerkey: round(tuner.cv_results_["mean_test_"+scorerkey][best_index],3)
        for scorerkey in scorersCV
    }

    if results:
        # Shorten the column names and print: rank, parameters, per-split
        # scores of the selection scorer, then the mean of every scorer.
        resultcols=['rank_test_'+scorerCVkey]
        renames={}
        for paramkey in param_grid:
            renames["param_"+paramkey]=paramkey
            resultcols.append(paramkey)
        for i in range(k):
            new_split_score_name="s%s_%s" % (i,scorerCVkey)
            renames["split%s_test_%s" % (i,scorerCVkey)]=new_split_score_name
            resultcols.append(new_split_score_name)
        for scorerkey in scorersCV:
            renames["mean_test_"+scorerkey]="mean_"+scorerkey
            resultcols.append("mean_"+scorerkey)
        resultsdata=pd.DataFrame(tuner.cv_results_).rename(columns=renames)
        print(resultsdata[resultcols].sort_values(by=resultcols[0]).round(3))

    return best_model

def plot_confusion_matrix(confusion_matrix,model_name):
    """Show ``confusion_matrix`` as an annotated heatmap titled ``model_name``."""
    axis=plt.axes()
    sns.heatmap(data=confusion_matrix,annot=True,ax=axis)
    axis.set_title(model_name)
    plt.show()

def execute_model(model,modelname):
    """Fit ``model`` on the training data and score it on every data split.

    For the train, test and discarded splits (module-level globals) this stores,
    under ``modelname``: the predictions as a new column of the matching
    ``bankruptevents_*`` frame, every ``scorersCV`` metric rounded to three
    decimals in the matching ``modelScore_*`` dict, and the row-normalized
    confusion matrix in the matching ``modelConfussionMatrix_*`` dict.
    """
    model.fit(X_train,y_train)

    partitions=(
        (X_train,y_train,bankruptevents_train,modelScore_train,modelConfussionMatrix_train),
        (X_test,y_test,bankruptevents_test,modelScore_test,modelConfussionMatrix_test),
        (X_discarded,y_discarded,bankruptevents_discarded,modelScore_discarded,modelConfussionMatrix_discarded),
    )
    for X_part,y_true,events,scores,matrices in partitions:
        y_pred=model.predict(X_part)
        events[modelname]=y_pred
        # The metric function and its keyword arguments are read from the
        # scorer objects' private attributes.
        scores[modelname]={
            scorerkey: round(scorermaker._score_func(y_true,y_pred,**scorermaker._kwargs) , 3)
            for scorerkey,scorermaker in scorersCV.items()
        }
        matrices[modelname]=confusion_matrix(y_true,y_pred,labels=[0, 1],normalize="true")
    
def eval_model(model_func,model_pgrid, model_name, results_info=False):
    """Tune ``model_func`` over ``model_pgrid``, then score the best estimator.

    ``results_info`` is forwarded to ``tuned_model`` as its ``results`` flag.
    """
    best_estimator=tuned_model(model=model_func,
                               param_grid=model_pgrid,
                               modelname=model_name,
                               results=results_info)
    execute_model(best_estimator,model_name)

#### Logistic Regression

In [None]:
# Hyperparameter grid for the logistic regression: C values crossed with two solvers.
LogitRegr_params=dict(C=[1,0.5,0.1,0.01],
                      solver=["lbfgs", "liblinear"])
eval_model(LogisticRegression(), LogitRegr_params, model_name="Logistic Regr.", results_info=True)

#### Stochastic Gradient Descent Classifier

In [None]:
# Hyperparameter grid for the SGD classifier: alpha values crossed with three loss functions.
# NOTE(review): scikit-learn 1.1 renamed the "log" loss to "log_loss" and 1.3 removed the
# old spelling — confirm which name the installed version expects before rerunning.
SGDClf_params={"alpha":[0.1,0.05,0.01,0.005],
               "loss":["hinge","log","modified_huber"]}
eval_model(SGDClassifier(),SGDClf_params,"Stochastic Gradient Descend Clf.",True)

#### Decision Tree Classifier

In [None]:
# Hyperparameter grid for the decision tree.
# "auto" is left out of max_features: for DecisionTreeClassifier it was only an
# alias of "sqrt" (so it duplicated a grid point) and recent scikit-learn
# releases no longer accept it.
DTClf_params={"ccp_alpha":[0,0.05,0.001],
              "criterion":["gini", "entropy"],
              "max_features":[None, "sqrt", "log2"]}
eval_model(DecisionTreeClassifier(), DTClf_params, "Decision Tree Clf.", True)

#### Multi Layer Perceptron Classifier

In [None]:
# Hyperparameter grid for the multi-layer perceptron.
MLPClf_params=dict(alpha=[0.05,0.01,0.001,0.0001],
                   activation=["tanh","relu"],
                   hidden_layer_sizes=[20,50],
                   learning_rate=["constant","adaptive"])
eval_model(MLPClassifier(), MLPClf_params, model_name="Multi-Layer Perceptron Clf.", results_info=True)

#### Random Forest Classifier

In [None]:
# Hyperparameter grid for the random forest.
RFClf_params=dict(ccp_alpha=[0.01,0.03,0.05,0.005],
                  criterion=["gini", "entropy"],
                  n_estimators=[1,2,5,10])
eval_model(RandomForestClassifier(), RFClf_params, model_name="Random Forest Clf.", results_info=True)

#### SCORES

In [None]:
# Print, per algorithm, the mean cross-validation scores, the test scores and
# the accuracy on the discarded data.
print("Score (%d-fold CV Train data)" % k)
for key,cvscores in modelScore_KFCV.items():
    print("%s: %s" % (key, cvscores))
print("\n")
print("Score (Test data)")
for key,testscores in modelScore_test.items():
    print("{}: {}".format(key,testscores))
print("\n")
print("Accuracy (Discarded data)")
for key,discardedscores in modelScore_discarded.items():
    print("{}: {}".format(key,discardedscores["Acc"]))

# Add one summary row per algorithm to finalresults; a row with the same
# algorithm and LogTrans flag is overwritten, so rerunning the cell adds no duplicates.
for key,cvscores in modelScore_KFCV.items():
    row={"Algorithm":key,
         "LogTrans":1,
         "F1-Score":cvscores["F1"],
         "Accuracy":cvscores["Acc"],
         "MCC":cvscores["MCC"],
         "F2-Score":cvscores["F2"],
         "Discarded Accuracy":modelScore_discarded[key]["Acc"]}
    for position,row_ in enumerate(finalresults):
        if (row_["Algorithm"]==row["Algorithm"]) and (row_["LogTrans"]==row["LogTrans"]):
            finalresults[position]=row
            break
    else:
        finalresults.append(row)

#### TEST SET PREDICTIONS OVERVIEW (SOME PREDICTIONS + NORMALIZED CONFUSION MATRICES)

In [None]:
# First 50 test rows: the true label next to the prediction of every model.
bankruptevents_test.head(50)

In [None]:
# Draw the normalized test-set confusion matrix of every evaluated model.
model_columns=[name for name in bankruptevents_test.columns if name!="Bankrupt?"]
for modelname in model_columns:
    plot_confusion_matrix(modelConfussionMatrix_test[modelname],modelname)

### Data without logarithmic transformation

In [None]:
# Separate the name of the target column from the names of the feature columns.
data_col=list(learndata.columns)
data_col.remove("Bankrupt?")
y_col="Bankrupt?"
X_col=data_col

# The features are used as they are, without the logarithmic transformation.
X_data=learndata[X_col].copy()
X_discardata=discardata[X_col].copy()
y_data=learndata[y_col]
y_discardata=discardata[y_col]

# Number of folds of the K-Fold CV and seed of the train/test split.

k=5
r=29

# Split into CV training data and testing data. The test size equals the validation size of the K-Fold CV:
# the data is cut into K+1 pieces, K of them feed the K-Fold CV and the remaining one is kept for the final test.

X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=1/(k+1), random_state=r)
X_discarded=X_discardata.values
y_discarded=y_discardata.values

# Standard scaling of the input data X, fitted on the training part only.

Xscaler=StandardScaler().fit(X_train)
X_train,X_test,X_discarded=(Xscaler.transform(part) for part in (X_train,X_test,X_discarded))

# Fresh containers for the evaluation results of each algorithm.

modelScore_train,modelScore_test,modelScore_discarded={},{},{}
modelScore_KFCV,model_KFCV={},{}

modelConfussionMatrix_train,modelConfussionMatrix_test,modelConfussionMatrix_discarded={},{},{}

# One frame per data split: the true labels first, one prediction column per model added later.
bankruptevents_train=pd.DataFrame({"Bankrupt?":y_train})
bankruptevents_test=pd.DataFrame({"Bankrupt?":y_test})
bankruptevents_discarded=pd.DataFrame({"Bankrupt?":y_discarded})


#### Logistic Regression

In [None]:
# Same grid as before, now on the untransformed features.
eval_model(LogisticRegression(), LogitRegr_params, model_name="Logistic Regr.", results_info=True)

#### Stochastic Gradient Descent Classifier

In [None]:
# Same grid as before, now on the untransformed features.
eval_model(SGDClassifier(), SGDClf_params, model_name="Stochastic Gradient Descend Clf.", results_info=True)

#### Decision Tree Classifier

In [None]:
# Same grid as before, now on the untransformed features.
eval_model(DecisionTreeClassifier(), DTClf_params, model_name="Decision Tree Clf.", results_info=True)

#### Multi Layer Perceptron Classifier

In [None]:
# Same grid as before, now on the untransformed features.
eval_model(MLPClassifier(), MLPClf_params, model_name="Multi-Layer Perceptron Clf.", results_info=True)

#### Random Forest Classifier

In [None]:
# Same grid as before, now on the untransformed features.
eval_model(RandomForestClassifier(), RFClf_params, model_name="Random Forest Clf.", results_info=True)

#### SCORES

In [None]:
# Print, per algorithm, the mean cross-validation scores, the test scores and
# the accuracy on the discarded data.
print("Score (%d-fold CV Train data)" % k)
for key,cvscores in modelScore_KFCV.items():
    print("%s: %s" % (key, cvscores))
print("\n")
print("Score (Test data)")
for key,testscores in modelScore_test.items():
    print("{}: {}".format(key,testscores))
print("\n")
print("Accuracy (Discarded data)")
for key,discardedscores in modelScore_discarded.items():
    print("{}: {}".format(key,discardedscores["Acc"]))

# Add one summary row per algorithm to finalresults; a row with the same
# algorithm and LogTrans flag is overwritten, so rerunning the cell adds no duplicates.
for key,cvscores in modelScore_KFCV.items():
    row={"Algorithm":key,
         "LogTrans":0,
         "F1-Score":cvscores["F1"],
         "Accuracy":cvscores["Acc"],
         "MCC":cvscores["MCC"],
         "F2-Score":cvscores["F2"],
         "Discarded Accuracy":modelScore_discarded[key]["Acc"]}
    for position,row_ in enumerate(finalresults):
        if (row_["Algorithm"]==row["Algorithm"]) and (row_["LogTrans"]==row["LogTrans"]):
            finalresults[position]=row
            break
    else:
        finalresults.append(row)

#### TEST SET PREDICTIONS OVERVIEW (SOME PREDICTIONS + NORMALIZED CONFUSION MATRICES)

In [None]:
# First 50 test rows: the true label next to the prediction of every model.
bankruptevents_test.head(50)

In [None]:
# Draw the normalized test-set confusion matrix of every evaluated model.
model_columns=[name for name in bankruptevents_test.columns if name!="Bankrupt?"]
for modelname in model_columns:
    plot_confusion_matrix(modelConfussionMatrix_test[modelname],modelname)

### FINAL RESULTS

Now the scores of the algorithms applied to both the log-transformed and the non-transformed datasets will be shown, ordered from the best to the worst F1-Score.

In [None]:
# One row per (algorithm, LogTrans) pair, best F1-Score first.
pd.DataFrame(finalresults).sort_values(by="F1-Score",ascending=False)   

The logarithmic transformation has improved the performance of the algorithms slightly.

In general, the best algorithms on the dataset with the logarithmic transformation show an F1-Score well above 0.8, which is not ideal, but it's not bad at all.