In [None]:
# Import necessary libraries
import numpy as np
from numpy import median
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from scipy import stats
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix, classification_report, recall_score, precision_score, plot_precision_recall_curve
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import RFECV, SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Introduction

In this notebook, we're going to predict which companies go bankrupt. I'm new here on Kaggle, so if you liked this work I'd really appreciate an upvote; it will encourage me to keep doing projects like this. If you have any suggestions or remarks, please let me know in the comment section. Thanks!

Dataset description provided: "The data were collected from the Taiwan Economic Journal for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange."

In [None]:
# Suppress every warning category for the remainder of the session
warnings.filterwarnings(action="ignore")
# Load the bankruptcy dataset from the Kaggle input directory
data = pd.read_csv(filepath_or_buffer='../input/company-bankruptcy-prediction/data.csv')

In [None]:
# Inspect data: preview the first rows of the raw frame
data.head()

In [None]:
# Work on a copy so the raw `data` frame stays untouched
df = data.copy(deep=True)
# Missing-value check: True if any column holds at least one null
df.isna().sum().any()

The data contains no missing values. 

In [None]:
# Report the dimensions of df (rows on the first line, columns on the second)
print(f"Number of rows : {df.shape[0]} \nNumber of cols : {df.shape[1]}")

## A) Recursive Feature Elimination with Cross-validation (RFECV):




This data has a large number of features. In order to make the EDA easier and improve the accuracy of machine learning models, we need to reduce the dimensionality of the data. For this purpose, I'm going to use a well-known feature selection method named Recursive Feature Elimination. Recursive feature elimination is an example of backward feature elimination: we first fit our model using all the features in a given set, then progressively remove the least significant features one by one, re-fitting each time, until we are left with the desired number of features. With RFECV, that number is chosen by cross-validation: the subset size with the best cross-validated score is kept.

In [None]:
# Features are every column after the first; the first column is the target
X, y = df.iloc[:, 1:].to_numpy(), df.iloc[:, 0].to_numpy()
# Hold-out split with scikit-learn's default proportions, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Estimator whose feature importances drive the elimination
# (the variable is called `cart`, but it is a random forest)
cart = RandomForestClassifier(random_state=42)
# Smallest feature subset RFECV is allowed to consider
min_features_to_select = 1
# Stratified 5-fold CV, a single repeat, shuffled with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
# Drop one feature per iteration (step=1) and score every subset size by CV.
# NOTE(review): subsets are ranked by accuracy although the classes are heavily
# imbalanced; confirm this is the intended selection criterion.
rfecv = RFECV(estimator=cart, step=1, cv=cv,
              scoring='accuracy',
              min_features_to_select=min_features_to_select, n_jobs=1)
# Fit the model
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Mean CV score per subset size. `grid_scores_` was deprecated in scikit-learn 1.0
# and removed in 1.2; `cv_results_` is its replacement, so prefer it when present
# and only fall back to the old attribute on older releases.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(min_features_to_select,
               len(mean_cv_scores) + min_features_to_select),
         mean_cv_scores)
plt.show()

In [None]:
# Names of the columns RFECV kept: apply its boolean support mask to the feature columns
feature_names = df.columns[1:]
most_relevent_cols = feature_names[rfecv.support_]
print("Most relevant features are: ", most_relevent_cols, sep="\n")

In [None]:
# Append target
# The list is rebuilt without any existing "Bankrupt?" entry before appending, so
# re-running this cell cannot add the target a second time (a duplicated column
# would make rfecv_df['Bankrupt?'] return a DataFrame instead of a Series).
most_relevent_cols = [col for col in most_relevent_cols if col != "Bankrupt?"]
most_relevent_cols.append("Bankrupt?")

In [None]:
# Keep only the RFECV-selected features plus the target, then preview the result
rfecv_df = df.loc[:, most_relevent_cols]
rfecv_df.head()

## B) EDA

In this part, I'm going to do some Exploratory Data Analysis on the features previously selected by RFECV. Understanding our features is key for building more accurate machine learning models.

### a) Descriptive statistics

In [None]:
# Descriptive stats: summary statistics for every selected column (target included)
rfecv_df.describe()

Most of the values in the data range between 0 and 1. But we can see that there are some extreme values (outliers). 

In [None]:
# Analyse target var: bar chart of the class counts.
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, so the original positional call no longer plots the counts.
sns.countplot(x=rfecv_df['Bankrupt?'])
# Class balance as proportions, then as absolute counts
print("% of Data")
print(df['Bankrupt?'].value_counts(normalize=True))
print("Count")
print(df['Bankrupt?'].value_counts())

We can observe that our dataset is very imbalanced. The minority class, which is the one we're most interested in predicting, represents about 3% of total observations. This can pose a real challenge to machine learning models.

In [None]:
# Looking at the histograms of numerical data:
# one 50-bin histogram per column of rfecv_df, drawn on a single large figure
rfecv_df.hist(figsize = (35,30), bins = 50)
plt.show()

We can see that most features don't follow a normal distribution and many of them have a very skewed distribution. This might be solved by applying a log transformation but, as you'll see later in the notebook, I'll choose to simply replace extreme positive values with the median of the target group.

In [None]:
# Correlations: compute the Spearman matrix once and reuse it below
mat = rfecv_df.corr('spearman')
# Rank features by their correlation with the target. This is printed explicitly:
# a bare expression in the middle of a cell is evaluated and silently discarded.
print(mat["Bankrupt?"].sort_values())
# Correlation Heatmap (Spearman)
f, ax = plt.subplots(figsize=(30, 25))
# Mask the upper triangle (diagonal included) so each pair is drawn only once
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

### b) Analysis of variance and hypothesis testing

In [None]:
# Separate the selected-feature frame into one frame per target class
target_column = rfecv_df['Bankrupt?']
bankrupt_df = rfecv_df[target_column == True]
not_bankrupt_df = rfecv_df[target_column == False]

# Every RFECV-selected feature, i.e. all columns except the target
cols = rfecv_df.columns.drop("Bankrupt?")

for feature in cols:
    bankrupt_values = bankrupt_df[feature]
    # Down-sample the non-bankrupt group to the size of the bankrupt group
    # (fixed seed) so both samples passed to the t-test have the same length
    matched_sample = not_bankrupt_df[feature].sample(n=len(bankrupt_values),
                                                     random_state=42)
    # Two-sample t-test between the groups; only the p-value is reported
    p_value = stats.ttest_ind(bankrupt_values, matched_sample).pvalue
    # One figure per feature, overlaying the full (unsampled) class distributions.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    plt.figure()
    for group_df, group_label in ((bankrupt_df, "Bankrupt"),
                                  (not_bankrupt_df, "Not Bankrupt")):
        sns.distplot(group_df[feature], kde=True, label=group_label)
    plt.title("{} / p-value of t-test = :{}".format(feature, p_value))
    plt.legend()

### **Conclusions:** 
The following are some features that significantly differentiate the two groups of our target. The definitions of these indicators were all taken from Investopedia which is a widely known finance website.

- **ROA (C) + ROA(A)** : Companies that encounter financial difficulties  tend to have lower ROA. ROA is an indicator of how profitable a company is relative to its total assets. ROA gives a manager, investor, or analyst an idea as to how efficient a company's management is at using its assets to generate earnings. ROA is displayed as a percentage; the higher the ROA the better.

- **Net Value per Share** : This is the ratio of equity available to common shareholders divided by the number of outstanding shares. This figure represents the minimum value of a company's equity and measures the book value of a firm on a per-share basis. 

- **Persistent EPS & Per Share Net Profit**  : Earnings per share (EPS) is calculated as a company's profit divided by the outstanding shares of its common stock. The resulting number serves as an indicator of a company's profitability. 

- **Net worth/Total Assets** : The higher the equity-to-asset ratio, the less leveraged the company is, meaning that a larger percentage of its assets are owned by the company and its investors.

- **Cash/Total Assets** : This figure is used to measure a firm's liquidity or its ability to pay its short-term obligations. The higher the better.


## C) Outlier Analysis

In [None]:
# Visualize outliers: one horizontal boxplot per column of rfecv_df
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(data=rfecv_df, orient="h", ax=ax)
ax.set_title('Features_selected Boxplots', fontsize=18)
# Log-scaled value axis
ax.set_xscale("log")
plt.show()

This dataset contains too many outliers. 

## D) Predicting Bankruptcy

Since this is a binary classification problem, we are most interested in predicting the positive class, namely the Bankrupt category = 1. The positive class is our minority class, and this poses a real challenge for machine learning models. The best way to measure the performance of our models in this case is the f1 score, which is the harmonic mean of recall and precision: $F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$. Recall, also called sensitivity, is the true positive rate: the proportion of correctly classified positives out of all actual positives. Precision is the proportion of correctly classified positives out of all predicted positives. Optimizing recall minimizes false negatives, while optimizing precision minimizes false positives.

### a) Testing models

I'll first test the models on the features selected by RFECV. Later on, I'll use SelectKBest feature selection.

In [None]:
# Evaluation function
def evaluation(model):
    """Fit `model`, report hold-out metrics and plot its F1 learning curve.

    Relies on the notebook-level X_train, y_train, X_test and y_test.
    The model is fitted in place on the training split; the confusion matrix
    and classification report for the test split are printed. The mean train
    and validation F1 scores (5-fold CV on the training split) are then plotted
    against the training-set size. Nothing is returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    print(confusion_matrix(y_test, test_predictions))
    print(classification_report(y_test, test_predictions))

    # Ten training-set fractions, from 10% to 100% of the training split
    size_fractions = np.linspace(0.1, 1, 10)
    sizes, train_scores, val_scores = learning_curve(
        model, X_train, y_train,
        cv=5, scoring='f1',
        train_sizes=size_fractions)

    # Average the per-fold scores at each training size before plotting
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(sizes, train_scores.mean(axis=1), label='train score')
    ax.plot(sizes, val_scores.mean(axis=1), label='validation score')
    ax.legend()

In [None]:
# Features are every column but the last; the last column of rfecv_df is the target
X, y = rfecv_df.iloc[:, :-1], rfecv_df.iloc[:, -1]
# Hold-out split with scikit-learn's default proportions, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Baseline candidates with default hyper-parameters (seeded where applicable)
RandomForest = RandomForestClassifier(random_state=0)
AdaBoost = AdaBoostClassifier(random_state=0)
# KNN is wrapped in a pipeline that standardises the features first
KNN = make_pipeline(StandardScaler(), KNeighborsClassifier())
# Display name -> estimator, in evaluation order
dict_of_models = dict(RandomForest=RandomForest, AdaBoost=AdaBoost, KNN=KNN)

In [None]:
# Run the shared evaluation routine on each candidate, labelled by its name
for name in dict_of_models:
    model = dict_of_models[name]
    print(name)
    evaluation(model)

RFC and AdaBoost seem to yield highest f1 score. We'll continue using these models. But since the data is very imbalanced, the model will obviously be biased towards the majority class. We'll use SMOTE in order to rebalance the data. SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

### b) SMOTE & RandomForestClassifier

SMOTE rebalances the data by oversampling the minority class with synthetic examples; it does not undersample. Other rebalancing strategies are possible, for example combining SMOTE oversampling with random undersampling of the majority class. In our approach we'll test both the oversampling-only and the combined over/undersampling strategy. Since machine learning models are biased towards the majority class when trained on imbalanced data, using SMOTE will hopefully make the training process of the model more reliable.

In [None]:
# USING OVERSAMPLING ONLY
# Repeated stratified 5-fold CV (number of repeats left at the scikit-learn default).
# Seeded so the reported means are reproducible from run to run.
kf = RepeatedStratifiedKFold(n_splits=5, random_state=42)
# Index folds positionally through NumPy arrays: X / y may be a pandas
# DataFrame / Series here, for which `X[train_index, :]` is not valid indexing.
X_values, y_values = np.asarray(X), np.asarray(y)
# Set empty lists to store key metrics (one entry per fold)
accuracy = []
f1 = []
recall = []
precision = []
# Loop over folds. The splitter `kf` is no longer overwritten by a loop counter.
# NOTE: X_train / X_test / y_train / y_test are reassigned on every fold, which
# replaces the earlier hold-out split of the same names.
for train_index, test_index in kf.split(X_values, y_values):
    X_train, y_train = X_values[train_index], y_values[train_index]
    X_test, y_test = X_values[test_index], y_values[test_index]
    # SMOTE is a step of the imblearn pipeline, so resampling happens during
    # fit only: the test fold is never oversampled.
    model = make_pipeline(SMOTE(random_state=42), RandomForestClassifier(random_state=0))
    # Fit model
    model.fit(X_train, y_train)
    # Predict on the original (still imbalanced) test fold
    y_pred = model.predict(X_test)
    # Compute key metrics for target "Bankrupt = 1"
    accuracy.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
# Print key metrics averaged over all folds
print("Mean accuracy:", np.mean(accuracy))
print("Mean f1", np.mean(f1))
print("Mean recall", np.mean(recall))
print("Mean precision", np.mean(precision))

We can see that by applying oversampling, the f1 score slightly improved. We significantly improved our recall, but our precision went down.

In [None]:
# SMOTE strategy with both over- and undersampling:
# SMOTE first raises the minority class to 10% of the majority class, then random
# undersampling shrinks the majority class until the minority/majority ratio is 0.5.
# Both samplers are seeded so repeated runs resample identically.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

In [None]:
# USING BOTH OVER AND UNDERSAMPLING
# Repeated stratified 5-fold CV (number of repeats left at the scikit-learn default).
# Seeded so the reported means are reproducible from run to run.
kf = RepeatedStratifiedKFold(n_splits=5, random_state=42)
# Index folds positionally through NumPy arrays: X / y may be a pandas
# DataFrame / Series here, for which `X[train_index, :]` is not valid indexing.
X_values, y_values = np.asarray(X), np.asarray(y)
# Set empty lists to store key metrics (one entry per fold)
accuracy = []
f1 = []
recall = []
precision = []
# Loop over folds. The splitter `kf` is no longer overwritten by a loop counter.
# NOTE: X_train / X_test / y_train / y_test are reassigned on every fold, which
# replaces the earlier hold-out split of the same names.
for train_index, test_index in kf.split(X_values, y_values):
    X_train, y_train = X_values[train_index], y_values[train_index]
    X_test, y_test = X_values[test_index], y_values[test_index]
    # `over` then `under` (the samplers defined earlier in the notebook) run as
    # imblearn pipeline steps, so resampling happens during fit only: the test
    # fold is never resampled.
    model = make_pipeline(over, under, RandomForestClassifier(random_state=0))
    # Fit model
    model.fit(X_train, y_train)
    # Predict on the original (still imbalanced) test fold
    y_pred = model.predict(X_test)
    # Compute key metrics for target "Bankrupt = 1"
    accuracy.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
# Print key metrics averaged over all folds
print("Mean accuracy:", np.mean(accuracy))
print("Mean f1", np.mean(f1))
print("Mean recall", np.mean(recall))
print("Mean precision", np.mean(precision))

Using both over- and undersampling yielded a lower f1 score, but the recall increased sharply.

### c) SMOTE & AdaBoost

In [None]:
# USING OVERSAMPLING ONLY
# Repeated stratified 5-fold CV (number of repeats left at the scikit-learn default).
# Seeded so the reported means are reproducible from run to run.
kf = RepeatedStratifiedKFold(n_splits=5, random_state=42)
# Index folds positionally through NumPy arrays: X / y may be a pandas
# DataFrame / Series here, for which `X[train_index, :]` is not valid indexing.
X_values, y_values = np.asarray(X), np.asarray(y)
# Set empty lists to store key metrics (one entry per fold)
accuracy = []
f1 = []
recall = []
precision = []
# Loop over folds. The splitter `kf` is no longer overwritten by a loop counter.
# NOTE: X_train / X_test / y_train / y_test are reassigned on every fold, which
# replaces the earlier hold-out split of the same names.
for train_index, test_index in kf.split(X_values, y_values):
    X_train, y_train = X_values[train_index], y_values[train_index]
    X_test, y_test = X_values[test_index], y_values[test_index]
    # SMOTE is a step of the imblearn pipeline, so resampling happens during
    # fit only: the test fold is never oversampled.
    model = make_pipeline(SMOTE(random_state=42), AdaBoostClassifier(random_state=0))
    # Fit model
    model.fit(X_train, y_train)
    # Predict on the original (still imbalanced) test fold
    y_pred = model.predict(X_test)
    # Compute key metrics for target "Bankrupt = 1"
    accuracy.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
# Print key metrics averaged over all folds
print("Mean accuracy:", np.mean(accuracy))
print("Mean f1", np.mean(f1))
print("Mean recall", np.mean(recall))
print("Mean precision", np.mean(precision))

AdaBoost yielded a lower f1 score, although the recall drastically improved. This model has quite a high sensitivity, meaning it correctly detects true positives (companies that will go bankrupt), but it will also give a high proportion of false positives (around 80%).

In [None]:
# USING BOTH OVER AND UNDERSAMPLING
# Repeated stratified 5-fold CV (number of repeats left at the scikit-learn default).
# Seeded so the reported means are reproducible from run to run.
kf = RepeatedStratifiedKFold(n_splits=5, random_state=42)
# Index folds positionally through NumPy arrays: X / y may be a pandas
# DataFrame / Series here, for which `X[train_index, :]` is not valid indexing.
X_values, y_values = np.asarray(X), np.asarray(y)
# Set empty lists to store key metrics (one entry per fold)
accuracy = []
f1 = []
recall = []
precision = []
# Loop over folds. The splitter `kf` is no longer overwritten by a loop counter.
# NOTE: X_train / X_test / y_train / y_test are reassigned on every fold, which
# replaces the earlier hold-out split of the same names.
for train_index, test_index in kf.split(X_values, y_values):
    X_train, y_train = X_values[train_index], y_values[train_index]
    X_test, y_test = X_values[test_index], y_values[test_index]
    # `over` then `under` (the samplers defined earlier in the notebook) run as
    # imblearn pipeline steps, so resampling happens during fit only: the test
    # fold is never resampled.
    model = make_pipeline(over, under, AdaBoostClassifier(random_state=0))
    # Fit model
    model.fit(X_train, y_train)
    # Predict on the original (still imbalanced) test fold
    y_pred = model.predict(X_test)
    # Compute key metrics for target "Bankrupt = 1"
    accuracy.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
# Print key metrics averaged over all folds
print("Mean accuracy:", np.mean(accuracy))
print("Mean f1", np.mean(f1))
print("Mean recall", np.mean(recall))
print("Mean precision", np.mean(precision))

AdaBoost seems to yield a slightly better f1 score with this SMOTE strategy.

### Conclusion:

RandomForestClassifier + Oversampling gave us the highest f1 score of about 43% with 58% Recall and 34% Precision.

  ## Final Conclusion