In [None]:
import os
# List every file found under the Kaggle input directory, one path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
#Importing Libararies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the bankruptcy dataset CSV into a pandas DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/company-bankruptcy-prediction/data.csv"
)

In [None]:
# Show up to 100 rows/columns and render floats with two decimal places.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Print the column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so that each feature occupies one row.
data.describe().T

In [None]:
# Check for missing values: plot the boolean null mask of the whole frame.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(data.isnull(), cmap='magma', ax=ax)

There are no missing values in the data

In [None]:
#Check if there are any duplicate rows
# keep=False flags every member of a duplicated group, so a result of 0 means
# no row has an identical twin anywhere in the frame.
data.duplicated(keep=False).sum()

There are no duplicate rows

**Now let's remove spaces from the beginning and end of the column names**

In [None]:
# Strip leading/trailing whitespace from every column label.
df_col = [str(label).strip() for label in data.columns]

data.columns = df_col

In [None]:
# Count how often each distinct value of 'Net Income Flag' occurs.
data['Net Income Flag'].value_counts()

We will remove the column `Net Income Flag` since it has only a single value.

In [None]:
# Remove the 'Net Income Flag' column from the frame.
data = data.drop(columns=['Net Income Flag'])

In [None]:
#Check if the data is balanced by counting the rows in each target class
data['Bankrupt?'].value_counts()

**The data is highly imbalanced. We will use SMOTE to balance the classes.**

# **Multicollinearity-Check**

Check whether any pair of columns is almost perfectly correlated. If the correlation is 0.95 or higher, or -0.95 or lower, we remove one column of the pair to avoid multicollinearity in the dataset.

In [None]:
# Pairwise correlation matrix of all columns in the frame.
corr_mat = data.corr()

In [None]:
# Exclude the target from the multicollinearity check. Dropping it by label
# (instead of slicing off the first row/column by position) does not depend on
# the column order, and errors='ignore' keeps the cell safe to re-run: the
# positional slice would strip one more feature on every extra execution.
corr_mat = corr_mat.drop(index='Bankrupt?', columns='Bankrupt?', errors='ignore')

In [None]:
# Collect one column from every pair whose absolute correlation is >= 0.95.
# Only the lower triangle is scanned; for each flagged pair the column with
# the smaller position is the one marked for removal (once).
drop_list = []
n_cols = len(corr_mat.columns)
for row in range(n_cols):
    for col in range(row):
        candidate = corr_mat.columns[col]
        if abs(corr_mat.iloc[row, col]) >= 0.95 and candidate not in drop_list:
            drop_list.append(candidate)

In [None]:
# Number of columns flagged for removal.
len(drop_list)

In [None]:
# Remove the highly correlated columns collected in drop_list.
data = data.drop(columns=drop_list)

In [None]:
# Number of columns left after the drop.
data.shape[1]

In [None]:
# Data preparation: separate the predictors (X) from the target column (y).
X = data.drop(columns=['Bankrupt?'])
y = data['Bankrupt?']

In [None]:
#Importing Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# > Balancing Data - SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class with SMOTE. random_state is fixed so the
# synthetic samples, and every score computed from them, are reproducible
# across kernel restarts.
smote = SMOTE(sampling_strategy='minority', random_state=101)
X_sm, y_sm = smote.fit_resample(X, y)
# Class counts after resampling.
y_sm.value_counts()

In [None]:
# Split the ORIGINAL data first so the test set contains only real companies,
# then oversample the training portion alone. Running SMOTE before the split
# lets synthetic points interpolated from test rows leak into training and
# inflates every metric reported on the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y
)
X_train, y_train = SMOTE(sampling_strategy='minority', random_state=101).fit_resample(X_train, y_train)

In [None]:
# Data normalization: fit the scaler on the training split only, then apply
# the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# > **Hyper Parameter Tuning - Grid Search CV**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Each entry maps a display name to the unfitted 'model' and its 'params' grid.
model_params = {
    'svm': {
        'model': SVC(gamma='auto',probability=True),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear','sigmoid']
        }
    },
    'random_forest': {
        # Seeded so the grid-search scores are reproducible between runs.
        'model': RandomForestClassifier(random_state=101),
        'params' : {
            'n_estimators': [1,5,10,20,30,50],
            'criterion':['gini','entropy']
        }
    },
    'logistic_regression' : {
        # multi_class is deprecated in scikit-learn; 'auto' was already the
        # default, so omitting it keeps the same behaviour without the warning.
        'model': LogisticRegression(),
        'params': {
            'C': [1,5,10],
            'solver':['lbfgs','liblinear']
        }
    },
    'KNN': {
        'model':KNeighborsClassifier(),
        'params':{
            'n_neighbors' : [1,3,5,7],
            'algorithm':['auto','kd_tree']
        }
    }
}

# Data Normalization

In [None]:
# Standardise the SMOTE-resampled feature matrix used by the grid search.
# NOTE(review): this rebinds `scaler`, replacing the instance fitted on X_train.
scaler = StandardScaler()
X_scaled = scaler.fit(X_sm).transform(X_sm)

**Implementing Grid Search CV for 4 models(SVM,Random Forest, Logistic Regression & K Nearest Neighbours)**

In [None]:
# Run a 3-fold, recall-scored grid search for every candidate model and record
# the best cross-validated score and parameter set of each.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        scoring='recall',
        cv=3,
        n_jobs=-1,
        verbose=3,
        return_train_score=False,
    )
    clf.fit(X_scaled, y_sm)
    scores.append(
        dict(model=model_name, best_score=clf.best_score_, best_params=clf.best_params_)
    )

# One row per model; shown as the cell output.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

The above table provides the tuned parameters for each of the models. Now let's implement each of the models and explore their accuracy and recall.

# > **SVM**

In [None]:
# Fit the RBF-kernel SVM (C=20) on the training split and predict the test labels.
model_svm = SVC(C=20, kernel='rbf', gamma='auto', probability=True).fit(X_train, y_train)
svm_predictions = model_svm.predict(X_test)

In [None]:
# Confusion matrix of the SVM predictions, wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=svm_predictions))


In [None]:
# Per-class precision, recall and F1 for the SVM predictions.
print(classification_report(y_true=y_test, y_pred=svm_predictions))

# **SVM has a high accuracy of 98%. Specifically, the model is able to identify the companies that may go bankrupt with almost 100% accuracy.**

# > Random Forest Classifier

In [None]:
# Fit the random forest (50 entropy-criterion trees) and predict the test labels.
# random_state is fixed: an unseeded forest yields different trees, and hence
# different metrics and ROC curves, on every run.
rfc = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=101)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions, wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=rfc_pred))


In [None]:
# Per-class precision, recall and F1 for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=rfc_pred))

# Random Forest has an accuracy of 98% and is able to identify accounts that may go bankrupt with almost 100% accuracy.

# > Logistic Regression

In [None]:
# Fit logistic regression (liblinear solver, C=5) and predict the test labels.
# multi_class is deprecated in scikit-learn and 'auto' was already the default,
# so it is omitted; the fitted model is unchanged.
logmodel = LogisticRegression(solver='liblinear', C=5)
logmodel.fit(X_train, y_train)
log_predictions = logmodel.predict(X_test)

In [None]:
# Confusion matrix of the logistic-regression predictions, wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=log_predictions))

In [None]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=log_predictions))


In [None]:
# Fit the 3-nearest-neighbour classifier and predict the test labels.
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=3).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

In [None]:
# Confusion matrix of the KNN predictions, wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=knn_pred))

In [None]:
# Per-class precision, recall and F1 for the KNN predictions.
print(classification_report(y_true=y_test, y_pred=knn_pred))

> # ROC Curve and Area Under the Curve

In [None]:
from sklearn.metrics import roc_curve, auc

**ROC - Logistic**

In [None]:
# Logistic regression: second-column probabilities -> ROC points -> area under the curve.
y_pred_logistic = logmodel.predict_proba(X_test)[:, 1]
(logistic_fpr, logistic_tpr, threshold) = roc_curve(y_true=y_test, y_score=y_pred_logistic)
auc_logistic = auc(x=logistic_fpr, y=logistic_tpr)

**ROC - Random Forest**

In [None]:
# Random forest: second-column probabilities -> ROC points -> area under the curve.
y_pred_rfc = rfc.predict_proba(X_test)[:, 1]
(rfc_fpr, rfc_tpr, threshold) = roc_curve(y_true=y_test, y_score=y_pred_rfc)
auc_rfc = auc(x=rfc_fpr, y=rfc_tpr)

**ROC - SVM**

In [None]:
# SVM: second-column probabilities -> ROC points -> area under the curve.
y_pred_svm = model_svm.predict_proba(X_test)[:, 1]
(svm_fpr, svm_tpr, threshold) = roc_curve(y_true=y_test, y_score=y_pred_svm)
auc_svm = auc(x=svm_fpr, y=svm_tpr)

**ROC - KNN**

In [None]:
# KNN: second-column probabilities -> ROC points -> area under the curve.
y_pred_knn = knn.predict_proba(X_test)[:, 1]
(knn_fpr, knn_tpr, threshold) = roc_curve(y_true=y_test, y_score=y_pred_knn)
auc_knn = auc(x=knn_fpr, y=knn_tpr)

**ROC - Plotting Graph**

In [None]:
# Overlay the ROC curves of the four classical models on a single set of axes.
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line

# (fpr, tpr, line styling, legend label) in plotting order.
roc_curves = [
    (rfc_fpr, rfc_tpr, {'linestyle': '-'}, 'RFC (auc = %0.3f)' % auc_rfc),
    (logistic_fpr, logistic_tpr, {'marker': '.'}, 'Logistic (auc = %0.3f)' % auc_logistic),
    (svm_fpr, svm_tpr, {'marker': '+'}, 'SVM (auc = %0.3f)' % auc_svm),
    (knn_fpr, knn_tpr, {'linestyle': '-'}, 'KNN (auc = %0.3f)' % auc_knn),
]
for curve_fpr, curve_tpr, curve_style, curve_label in roc_curves:
    ax.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')

ax.legend(loc='best')

# > **Conclusion**

The performance of the Random Forest Classifier and SVM is better than that of the other models. Hence we can rely on the outcome of these two models for making decisions.

> # Deep Learning

In [None]:
#Importing Libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
#Converting to numpy array
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell is harmless; calling .values a second time would raise AttributeError
# because the names would already be bound to ndarrays.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:

# Feed-forward network: two hidden ReLU layers of 77 units, each followed by
# Dropout(0.3), feeding a single sigmoid output unit.
model = Sequential([
    Dense(units=77, activation='relu'),
    Dropout(0.3),
    Dense(units=77, activation='relu'),
    Dropout(0.3),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy loss optimised with Adam.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Stop once val_loss has not improved for 50 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights from the last epoch, i.e. 50 epochs past the
# best validation loss.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50,
                           restore_best_weights=True)

In [None]:
# Hold out part of the TRAINING data for early stopping. Monitoring val_loss on
# the test set would tune the stopping epoch on the very data the final metrics
# are reported on.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=101, stratify=y_train
)
model.fit(x=X_fit,
          y=y_fit,
          epochs=600,
          validation_data=(X_val, y_val), verbose=1,
          callbacks=[early_stop]
          )

In [None]:
# Per-epoch values recorded during training, one column per tracked quantity.
model_loss = pd.DataFrame.from_dict(model.history.history)

In [None]:
# Plot every column of the training history against the epoch index.
model_loss.plot()

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6).
# Threshold the sigmoid output at 0.5 instead to obtain the 0/1 class labels.
ann_predictions = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
# Confusion matrix of the neural-network predictions, wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=ann_predictions))

In [None]:
# Per-class precision, recall and F1 for the neural-network predictions.
print(classification_report(y_true=y_test, y_pred=ann_predictions))

**The accuracy of the deep learning model is 99%, while the recall is almost 100%.**

# > **ROC(With Deep Learning Results)**

In [None]:
# Sequential.predict_proba was removed from Keras (TensorFlow 2.6). With a
# sigmoid output unit, model.predict already returns the probabilities; take
# the single output column as a flat array.
model.predict(X_test)[:,0]

In [None]:
# Sequential.predict_proba was removed from Keras (TensorFlow 2.6); predict()
# returns the sigmoid probabilities directly. ravel() flattens the (n, 1)
# output into the 1-D score vector that roc_curve works with.
y_pred_dl = model.predict(X_test).ravel()
dl_fpr, dl_tpr, threshold = roc_curve(y_test, y_pred_dl)
auc_dl = auc(dl_fpr, dl_tpr)

In [None]:
# Overlay the ROC curves of all five models on a single set of axes.
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line

# (fpr, tpr, line styling, legend label) in plotting order.
roc_curves = [
    (rfc_fpr, rfc_tpr, {'linestyle': '-'}, 'RFC (auc = %0.3f)' % auc_rfc),
    (logistic_fpr, logistic_tpr, {'marker': '.'}, 'Logistic (auc = %0.3f)' % auc_logistic),
    (svm_fpr, svm_tpr, {'marker': '+'}, 'SVM (auc = %0.3f)' % auc_svm),
    (knn_fpr, knn_tpr, {'linestyle': '-'}, 'KNN (auc = %0.3f)' % auc_knn),
    (dl_fpr, dl_tpr, {'linestyle': '-'}, 'Deep Learning (auc = %0.3f)' % auc_dl),
]
for curve_fpr, curve_tpr, curve_style, curve_label in roc_curves:
    ax.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')

ax.legend(loc='best')

# > **Conclusion**

The performance of the deep learning model is on par with Random Forest. We may employ the deep learning model, Random Forest, or SVM to make decisions.