In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A. Pre-processing 

# Workflow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pd.options.display.max_columns=100
pd.options.display.max_rows=100
pd.set_option('display.float_format','{:.2f}'.format)

### 1.1 Read - head  - shape
- We have total 6819 records from 96 columns in the dataset,of which:
    - 1 Target (Bankrupt?)
    - 95 features

In [None]:
data = pd.read_csv("/kaggle/input/company-bankruptcy-prediction/data.csv")
print(data.shape)
data.head(3)

### 1.2 Info , describe
- All features are numeric
- one features has 0 variance that is constant throughout
- major features are in the range of 0-1
- there are outlier infected features

In [None]:
data.info()

In [None]:
data.describe()

### 2. Sanity Check
#### Column Names:
- Remove leading whitespaces from Column names
- Replace " " with "_" in columnnames
- Rename Target column

#### Check constant Columns:
- ***Net Income Flag*** is constant, hence drop
- Rename ***Bankrupt?  -> Bankrupt***

#### Check Duplicates
 - No duplicated values

#### Check Missing Values
- No missing values

In [None]:
print("Column names before renaming","\n", data.columns[:5],"\n")
data.columns = data.columns.str.strip()
data.columns = data.columns.str.replace(" " ,"_")
data.rename(columns = {'Bankrupt?' :'Bankrupt' },inplace=True)
print("Column names after renaming","\n",data.columns[:5])

In [None]:
data.drop(['Net_Income_Flag'],axis=1,inplace=True) ## drop constant columns

In [None]:
data.duplicated(keep=False).sum()  ## no duplicates

In [None]:
data.isnull().values.sum() # check missing values

### 3.1 Target Class distribution
- Target is heavily imballanced 
- Bankruptcy Rate is around 3.2%

In [None]:
df1 = pd.DataFrame(data.Bankrupt.value_counts())
df2 = pd.DataFrame(100*data.Bankrupt.value_counts(normalize=True).astype(float))
tab = df1.merge(df2,left_index=True,right_index=True).rename(columns = {"Bankrupt_x" : "Count" , "Bankrupt_y" : "Percentage"})
print(tab)

In [None]:
plt.pie(tab['Count'], labels= [0,1])

### 3.2  Outliers Handling
- First separate all **94** features into two groups
    - _fraction-only_ features (i.e. features having values in [0,1])
    - _other than fraction-only_ features

- **70** features are _fraction-only_ features where as **24** are _other than fraction-only_ features.

- Outliers are mainly **present** in these 24 _"other than fraction-only"_ features

- To explore the outliers nature, distribution of these 24 features, are obtained using :
    - Histogram
    - Boxplot

In [None]:
## fn to separate only-fractional & other columns
def get_fraction_valued_columns(df):
    my_columns  = []
    for col in df.columns:
        if (data[col].max()<=1) & (data[col].min() >= 0):
            my_columns.append(col)
    return(my_columns)

fractional_columns = get_fraction_valued_columns(df=data.drop(['Bankrupt'],axis=1))
non_fraction_columns = data.drop(['Bankrupt'],axis=1).columns.difference(fractional_columns)
print("# Fraction-only Columns",len(fractional_columns),"\t","# Other than Fraction-only Columns", len(non_fraction_columns))

### Other than fration-only features : Histogram

In [None]:
data[non_fraction_columns].hist(figsize= (20,20),sharex=True,layout= (6,4))
plt.show()

 ### Other than fration-only features :  Boxplot

In [None]:
data[non_fraction_columns].boxplot(vert=False,figsize= (15,10))
plt.subplots_adjust(left=0.25)
plt.show()

### __Observations__
- Major values are concentrated around starting ranges yet there are very high valued records.
- Some features show outliers in top 1% values only.Few of such features are:
    - Total_debt/Total_net_worth 
    - Revenue_per_person
    - Net_Value_Growth_Rate
    - Revenue_Per_Share etc

2. There are some features that have significant number of higher values, like:
    - Current_Asset_Turnover_Rate
    - Cash_Turnover_Rate

- **These features may potentially seem to be outliers but not be practically so.Hence outliers here need to be handled more cautiously,simply eliminating records lying above some cutoff cant be applied for all throughout**. 

- We Need to have a rule: 

#### **Rule** :
- Only __other than fraction-only__ features are considered for outlier inspection
- Among them those meeting following conditions I decide to term them as outliers infected:
   - 100th percentile value is atleast 100 times greater than 99th percentile.
   - There are 10 or less records for 100th percentile.
- For the features satisfying above outlier condition we replace them  as x-> log(1+x)
- __N.B: This is just a rule of thumb I made based on the experience while playing with the data.__ 


In [None]:
## Outlier handling techniques
log_transformed_cols = []
for col in data[non_fraction_columns].columns:
    if (data[col].quantile(1) >= 100* data[col].quantile(0.99)) |  (sum(data[col] > data[col].quantile(0.99)) <= 10):
        data[col] = np.log1p(data[col])
        log_transformed_cols.append(col)
        
## Change names of log transformed column
log_names = "log_" + data[log_transformed_cols].columns
data.rename(columns={data[log_transformed_cols].columns[i]: log_names[i] for i in range(len(log_names))}, inplace = True)
#data.columns

In [None]:
print("The following features are log transformed after they fulfill outlier detection condition.","\n\n",log_transformed_cols)
data[log_names].boxplot(vert=False,figsize= (15,10))
plt.subplots_adjust(left=0.25)
plt.title("Boxplot of Outlier infected features after log transformation")
plt.show()

# B. Modelling

In [None]:
X = data.drop(["Bankrupt"],axis=1)
y = data.Bankrupt

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2 , mutual_info_classif 
from imblearn.over_sampling import SMOTE 

## 1.1 Feature Scaling
- Scale all features in order to have zero mean and unit variance

In [None]:
X_scaled = pd.DataFrame(StandardScaler(copy=False).fit_transform(X))
X_scaled.columns = X.columns

##  1.2 Feature Selection
- In order to select features to be fed into the predictive model mutual information is useed.
- Features with positive mutual information are retained for final model.

In [None]:
mutual_info = mutual_info_classif(X= X_scaled,y= y)  # get mutual info all predictors
pruned_features = X.columns[np.where(mutual_info>0)]  # retain features only with mi >0
X_scaled_pruned = X_scaled[pruned_features]

## 1.3 Target class Imballance : SMOTE
- only 3.2% of the companies has Bankrupted in the dataset,making it imballanced target class probelm.
- Hence positive target class( Bankrupt=1) is under-represented.This could be challenging as lack of positive class in the train data may lead machine learning model to have poor performance in terms of detecting positive class in the unseen data.
- SMOTE(Synthetic Minority Oversampling Techniwque) proposed by Chawla et al 2002, is a well applied technique to handle such scenerio.
- SMOTE actually creates as many synthetic examples for minority class as are requirred so that finally two target class are well represented. It does so by synthesising samples that are close to the feature space ,for the minority target class.
More about [SMOTE](http://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

In [None]:
sm = SMOTE(random_state=123)
X_sm , y_sm = sm.fit_resample(X_scaled_pruned,y)

print(f'''Shape of X before SMOTE:{X_scaled_pruned.shape}
Shape of X after SMOTE:{X_sm.shape}''',"\n\n")

print(f'''Target Class distributuion before SMOTE:\n{y.value_counts(normalize=True)}
Target Class distributuion after SMOTE :\n{y_sm.value_counts(normalize=True)}''')

## 2.1 Logistic Regression
- Will be using logistic regression as our benchmark model.
- Combination of L1 & L2 regularization(Elasticnet)is applied.
- With and Without Using SMOTE

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn import metrics

In [None]:
x_train , x_test , y_train ,y_test = train_test_split(X_sm,y_sm,test_size= 0.33)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_fit = LogisticRegression(penalty='elasticnet',solver='saga',l1_ratio=0.5,max_iter=10000).fit(x_train,y_train)

###  Performance metrics : (Cutoff independent )

In [None]:
lr_pred = lr_fit.predict_proba(x_test)  ## predicted probabilities
lr_pred = lr_pred[:,1]  #prob(Bankrupt=0)

lr_fpr, lr_tpr, _ = metrics.roc_curve(y_test,  lr_pred)  #fpr, tpr for AUC
lr_auc = metrics.roc_auc_score(y_test,lr_pred)   #AUC

## in order to collate FPR, TPR and AUC create a dataframe  
result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])
result_table.set_index('classifiers', inplace=True)
result_table = result_table.append({'classifiers':"Logistic Regression",
                                        'fpr':lr_fpr, 
                                        'tpr':lr_tpr, 
                                        'auc':lr_auc}, ignore_index=True)

print('Logistic Regression AUC=%.3f' % (lr_auc))

In [None]:
plt.plot([0,1], [0,1], color='black', linestyle='--')
plt.plot(lr_fpr,lr_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[0,'classifiers'], result_table.loc[0]['auc']))
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')
plt.show()

### Performance Metrics (Cutoff based)

In [None]:
yhat = lr_fit.predict(x_test,)
label = ['Fin.Stable', 'Fin.Unstable']
report = metrics.classification_report(y_test, yhat,target_names=label)

print(report)
print("Confusion Matrix :", "\n" ,metrics.confusion_matrix(y_test,yhat))

## 2.2 Support Vector Classifier

In [None]:
from sklearn import svm

In [None]:
svc_fit = svm.SVC(C=1,kernel= 'rbf',degree=3,probability=True).fit(x_train,y_train)

###  Performance metrics : (Cutoff independent )

In [None]:
svc_pred = svc_fit.predict_proba(x_test)
svc_pred = svc_pred[:,1]

svc_fpr, svc_tpr, _ = metrics.roc_curve(y_test,  svc_pred)
svc_auc = metrics.roc_auc_score(y_test,svc_pred)

result_table = result_table.append({'classifiers':"Support Vector Classifier",
                                        'fpr':svc_fpr, 
                                        'tpr':svc_tpr, 
                                        'auc':svc_auc}, ignore_index=True)

print('Support Vector Classifier AUC =%.3f' % (svc_auc))

In [None]:
plt.plot([0,1], [0,1], color='black', linestyle='--')
plt.plot(lr_fpr,lr_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[0,'classifiers'], result_table.loc[0]['auc']))
plt.plot(svc_fpr,svc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[1,'classifiers'], result_table.loc[1]['auc']))
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')
plt.show()

###  Performance metrics : (Cutoff based )

In [None]:
yhat = svc_fit.predict(x_test)
label = ['Fin.Stable', 'Fin.Unstable']
report = metrics.classification_report(y_test, yhat,target_names=label)

print(report)
print("Confusion Matrix :", "\n" ,metrics.confusion_matrix(y_test,yhat))

## Tree Based Ensamble Models
- Gradient Boosted Classifier
- Ada Boost Classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier

## 2.3.1 Gradient Boosted Classifiers

In [None]:
gbc = GradientBoostingClassifier(learning_rate=0.1,max_depth=2,random_state=123,subsample=0.8,n_estimators=600)
gbc_fit = gbc.fit(x_train,y_train)

###  Performance metrics : (Cutoff Independent )

In [None]:
gbc_pred = gbc_fit.predict_proba(x_test)
gbc_pred = gbc_pred[:,1]

gbc_fpr, gbc_tpr, _ = metrics.roc_curve(y_test,  gbc_pred)
gbc_auc = metrics.roc_auc_score(y_test,gbc_pred)

result_table = result_table.append({'classifiers':"Gradient Boosted Classifier",
                                    'fpr':gbc_fpr, 
                                    'tpr':gbc_tpr, 
                                    'auc':gbc_auc}, ignore_index=True)


print('Gradient Boosted Classifier AUC=%.3f' % (gbc_auc))

In [None]:
plt.plot([0,1], [0,1], color='black', linestyle='--')
plt.plot(lr_fpr,lr_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[0,'classifiers'], result_table.loc[0]['auc']))
plt.plot(svc_fpr,svc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[1,'classifiers'], result_table.loc[1]['auc']))
plt.plot(gbc_fpr,gbc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[2,'classifiers'], result_table.loc[2]['auc']))
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')
plt.show()

###  Performance metrics : (Cutoff based )

In [None]:
yhat = gbc_fit.predict(x_test)
label = ['Fin.Stable', 'Fin.Unstable']
report = metrics.classification_report(y_test, yhat,target_names=label)

print(report)
print("Confusion Matrix :", "\n" ,metrics.confusion_matrix(y_test,yhat))

##  2.3.2 AdaBoost Classifier

In [None]:
abc = AdaBoostClassifier(n_estimators=500,random_state=123,learning_rate=0.3)
abc_fit = abc.fit(x_train,y_train)

###  Performance metrics : (Cutoff Independent )

In [None]:
abc_pred = abc_fit.predict_proba(x_test)
abc_pred = abc_pred[:,1]

abc_fpr, abc_tpr, _ = metrics.roc_curve(y_test,  abc_pred)
abc_auc = metrics.roc_auc_score(y_test,abc_pred)

result_table = result_table.append({'classifiers':"Ada Boosted Classifier",
                                    'fpr':abc_fpr, 
                                    'tpr':abc_tpr, 
                                    'auc':abc_auc}, ignore_index=True)

print('Ada Boosted Classifier AUC=%.3f' % (abc_auc))

In [None]:
plt.plot([0,1], [0,1], color='black', linestyle='--')
plt.plot(lr_fpr,lr_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[0,'classifiers'], result_table.loc[0]['auc']))
plt.plot(svc_fpr,svc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[1,'classifiers'], result_table.loc[1]['auc']))
plt.plot(gbc_fpr,gbc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[2,'classifiers'], result_table.loc[2]['auc']))
plt.plot(abc_fpr,abc_tpr ,label="{}, AUC={:.3f}".format(result_table.loc[3,'classifiers'], result_table.loc[3]['auc']))
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':8}, loc='lower right')
plt.show()

###  Performance metrics : (Cutoff Based )

In [None]:
yhat = abc_fit.predict(x_test)
label = ['Fin.Stable', 'Fin.Unstable']
report = metrics.classification_report(y_test, yhat,target_names=label)

print(report)
print("Confusion Matrix :", "\n" ,metrics.confusion_matrix(y_test,yhat))

## Final Comment : 
- Best Model for the data: Gradient Boosted Tree
- GBC predicts with following score on test dataset
  - 96% accuracy 
  - 99.5% AUC
  - 97% f1 score  

### Feature Importance

In [None]:
var_imp = pd.concat([pd.DataFrame(X_scaled_pruned.columns),pd.DataFrame(np.transpose(gbc_fit.feature_importances_))],axis=1)
var_imp.columns = ['Features', 'Importance']
var_imp = var_imp.sort_values('Importance',ascending = False)
plt.figure(figsize=(20,20))
sns.barplot(y= 'Features',x= 'Importance',data= var_imp,orient='h')