# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Company Bankruptcy

The data were collected from the Taiwan Economic Journal for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange.


## <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Challenges in the data set</span>

* <span style="font-family: Arial;font-size:1.1em;color:#333333">Too many independent variables for an ML model or ANN to learn from</span>
* <span style="font-family: Arial;font-size:1.1em;color:#333333">Class imbalance - only about 3.2% of the companies are bankrupt; about 96.8% are not</span>
* <span style="font-family: Arial;font-size:1.1em;color:#333333">Highly correlated variables</span>
* <span style="font-family: Arial;font-size:1.1em;color:#333333">Presence of outliers</span>

## <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Index

* <a href="#Packages">Packages</a>
* <a href="#EDA">EDA</a>
* <a href="#Class-imbalance">Class imbalance</a>
* <a href="#Scaling">Scaling</a>
* <a href="#Oversampling">Oversampling</a>
* <a href="#PCA">PCA</a>
* <a href="#Outlier">Outlier</a>
* <a href="#ANN">ANN</a>



# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Packages

In [None]:
# packages

import numpy as np 
import pandas as pd 

# plot
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# scaling
from sklearn.preprocessing import MinMaxScaler
# PCA
from sklearn.decomposition import PCA
# data split
from sklearn.model_selection import train_test_split


# Data imputation
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# ANN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping

# model evaluation
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import plot_confusion_matrix
# MISC
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Seed every random number generator in use (TensorFlow, Python's `random`,
# NumPy) so that a fresh run of the notebook reproduces the same results.
import random as python_random

tf.random.set_seed(1234)
python_random.seed(123)
np.random.seed(123)

In [None]:
# load the bankruptcy dataset into a DataFrame
DATA_PATH = '../input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# first five rows
df.head(n=5)

# <span style="font-family: Arial;font-size:1.1em;color:#3366ff">EDA

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# heat map of the pairwise correlations, showing only coefficients above 0.50
# (everything else is blanked out as NaN)

correlation_five = df.corr()
correlation_five = correlation_five.where(correlation_five > 0.50)

plt.figure(figsize=(10, 15))
sns.set_style('white')
sns.set_context(context='notebook', font_scale=1.2)
sns.heatmap(correlation_five, cmap="Blues")
plt.title('correlation');

In [None]:
# Filter out the highly correlated variables: keep only the columns whose
# absolute correlation with every *other* column is at most 0.5.
# NOTE: despite its name, `high_corr` is True for the columns that are kept.

corr_df = df.corr()
# blank out the diagonal so a column's correlation with itself is ignored
off_diagonal = corr_df.mask(np.eye(len(corr_df), dtype=bool))
high_corr = ~(off_diagonal.abs() > 0.5).any()

corr_df = corr_df.loc[high_corr, high_corr]
print(corr_df.columns)

# <span style="font-family: Arial;font-size:1.1em;color:#3366ff">Missing values

In [None]:
# percentage of missing values per column, largest first
missing_value = (100 * df.isnull().sum() / len(df)).reset_index()
missing_value.columns = ['variables', 'missing values in percentage']
missing_value = missing_value.sort_values(
    by='missing values in percentage',
    ascending=False,
)

# number of columns that contain at least one missing value
has_missing = missing_value['missing values in percentage'] > 0
print(has_missing.sum())
missing_value.head(4)

# <span style="font-family: Arial;font-size:1.1em;color:#3366ff">Class imbalance

In [None]:
# share of each class in the target column
plot_target = df['Bankrupt?'].value_counts(normalize=True).reset_index()
plot_target.columns = ['Bankruptcy?', 'percentage']

# bar chart of the class shares
sns.set_style('white')
sns.set_context(context='notebook', font_scale=1.2)
sns.barplot(data=plot_target, x='Bankruptcy?', y='percentage')
plt.title('Target variable');
# the same shares, printed as percentages
print(df['Bankrupt?'].value_counts(normalize=True) * 100)

In [None]:
# number of rows that are exact duplicates of an earlier row

dup = df.duplicated(subset=None, keep='first')
dup.sum()

In [None]:
# Separate the features from the target variable for training.
# `df` itself is left untouched (the original used `df.pop`, which removed the
# target column in place and made this cell fail with a KeyError when re-run).

X = df.drop(columns=['Bankrupt?'])
y = df['Bankrupt?'].copy()

In [None]:
# data split
# stratify=y keeps the class ratio the same in both splits.
# An explicit random_state makes the split identical every time this cell runs,
# instead of depending on how much of the global NumPy RNG was consumed before.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=123)

In [None]:
# class shares in the training target
y_train.value_counts(normalize=True)

# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Oversampling

In [None]:
# smote oversampling
# Only the training split is resampled; the test split keeps its original class ratio.
# random_state pins the synthetic samples so that re-running the cell is reproducible.

SMOTE_oversample = SMOTE(random_state=123)
X_train, y_train = SMOTE_oversample.fit_resample(X_train, y_train)

In [None]:
# class counts in the training target: the imbalance has been treated
sns.set_style('white')
sns.set_context(context='notebook', font_scale=1.2)
sns.countplot(x=y_train)
plt.title('Target variable balanced');


# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Outlier

* In statistics, an outlier is a data point that differs significantly from other observations

In [None]:
# box plots for a subset of the columns only: positions 10 to 39 of df (30 columns)
df.iloc[:,10:40].plot(kind='box',figsize=(16,6));
# rotate the column names on the x axis so they stay readable
plt.xticks(rotation=70);

In [None]:
def remove_outlier(col):
    """Return the IQR-based outlier fences for a numeric column.

    Despite the name, nothing is removed here: the function only computes the
    lower and upper fences, and the caller decides what to do with values
    outside them.

    Parameters
    ----------
    col : array-like of numbers
        The values to analyse. NaN entries are ignored when the quartiles are
        computed.

    Returns
    -------
    tuple of (float, float)
        ``(lower_range, upper_range)`` where ``lower_range = Q1 - 1.5 * IQR``
        and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # nanpercentile: a single missing value must not turn both fences into NaN
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:

# Cap the outliers in the data the model is actually trained and evaluated on.
# (The original loop only capped `df`, but X_train / X_test had already been
# copied out of `df` and split, so the treatment never reached the model.)
# The fences are estimated on the training split only and reused for the test
# split, so nothing about the test rows leaks into the preprocessing.
fences = {column: remove_outlier(X_train[column]) for column in X_train.columns}
lower_fences = pd.Series({column: lr for column, (lr, ur) in fences.items()})
upper_fences = pd.Series({column: ur for column, (lr, ur) in fences.items()})

# axis=1 aligns the per-column fences with the DataFrame columns
X_train = X_train.clip(lower=lower_fences, upper=upper_fences, axis=1)
X_test = X_test.clip(lower=lower_fences, upper=upper_fences, axis=1)

# Also cap the full frame `df` (with its own fences, as before) so that the
# box plot drawn from `df` still shows the effect of the treatment.
for column in X_train.columns:
    lr, ur = remove_outlier(df[column])
    df[column] = df[column].clip(lower=lr, upper=ur)

In [None]:
# box plots of the columns at positions 10 to 39 of df, to inspect the capped values
capped_subset = df.iloc[:, 10:40]
capped_subset.plot.box(figsize=(16, 6))
plt.xticks(rotation=70);

# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">Scaling
* Scaling is important for neural networks: features on very different scales make training harder.

In [None]:
# Min-max scaling: the scaler learns each feature's range from the training
# data only and the same transformation is then applied to both splits.

std = MinMaxScaler()
std.fit(X_train)

X_train, X_test = std.transform(X_train), std.transform(X_test)

In [None]:
# shapes of the two splits: this many columns is too much to learn for an ML model or ANN
(X_train.shape, X_test.shape)

# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">PCA

* PCA (principal component analysis) is a dimensionality-reduction method; here it is used to compress the scaled features into 16 components.

In [None]:
# PCA: learn 16 principal components from the training data,
# then project both splits onto them
N_COMPONENTS = 16
pca = PCA(n_components=N_COMPONENTS)

X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [None]:
# shapes of the two splits after the dimensionality reduction
(X_train.shape, X_test.shape)

# <span style="font-family: Arial;font-size:1.2em;color:#3366ff">ANN

In [None]:
# early stopping
# Stop once the validation AUC has not improved for 27 epochs and restore the
# weights of the best epoch. The monitored quantity 'val_auc' is only logged
# when the model is compiled with a metric named 'auc' (see compile below).
early_stop = EarlyStopping(monitor='val_auc', mode='max', verbose=1,
                           patience=27, restore_best_weights=True)

# ANN: two small hidden layers and a sigmoid output for the binary target
model = Sequential()

model.add(Dense(units=8, activation='relu'))
model.add(Dropout(0.10))

model.add(Dense(units=4, activation='relu'))

model.add(Dense(units=1, activation='sigmoid'))

# compile ANN
# The AUC metric is named explicitly so that Keras logs it as 'auc' / 'val_auc'
# on every run. Without it the callback above has nothing to monitor, and
# neither early stopping nor best-weight restoration takes place.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

In [None]:
# Train ANN
# NOTE(review): the test split is also passed as validation data, so the early
# stopping callback sees the same rows that are used for the final evaluation.
fit_options = dict(
    epochs=120,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
    verbose=1,
)
model.fit(x=X_train, y=y_train, **fit_options)

In [None]:
# model history to df: one row per epoch, one column per logged quantity
loss_plot = pd.DataFrame(model.history.history)
accuracy_plot = loss_plot  # same frame; the name is kept for the accuracy panel

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8' and
# later removed the old names, so use whichever name this installation has.
# The style is selected before the figure is created so that it applies to it.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

#  accuracy and loss plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))

# left panel: loss per epoch
ax1.plot(loss_plot['loss'], label='Training loss')
ax1.plot(loss_plot['val_loss'], label='Validation loss')
ax1.set_title('Training and Validation loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('Loss')
ax1.legend(loc="best")

# right panel: accuracy per epoch
ax2.plot(accuracy_plot['accuracy'], label='Training_accuracy')
ax2.plot(accuracy_plot['val_accuracy'], label='Validation_accuracy')
ax2.set_title('Training_and_Validation_accuracy')
ax2.set_xlabel('epochs')
ax2.set_ylabel('accuracy')
ax2.legend(loc="best");

In [None]:
# predicted probabilities turned into class labels with a 0.5 cut-off
y_pred = model.predict(X_test) > 0.5

In [None]:
# per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix normalised over the true labels (each row sums to 1):
# rows are the true classes, columns the predicted classes.
normalised_cm = confusion_matrix(y_test, y_pred, normalize='true')
ax = sns.heatmap(normalised_cm, annot=True)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (normalised by true class)');

#### <span style="font-family: Arial;font-size:1.5em;color:#3366ff">Feel free to post any suggestions!</span>

### <span style="font-family: Arial;font-size:1.5em;color:#3366ff">Thanks!