In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier



from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



from skopt import BayesSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.filterwarnings("ignore")

from sqlalchemy import Table, Column, Float, Integer, BigInteger
import missingno as msno

from sklearn.feature_selection import SelectFromModel


In [None]:
# Read the bankruptcy dataset from the Kaggle input directory and preview it.
df = pd.read_csv(
    '../input/company-bankruptcy-prediction/data.csv',
)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Null value check
#### missingno.matrix: the bar on the right side of this diagram is a line plot (sparkline) showing the data completeness of each row.


In [None]:
# missingno opens its own figure when no `ax` is passed, so a preceding
# plt.figure() call only leaves an empty extra figure in the cell output.
# The nullity matrix itself is drawn exactly as before (missingno's default size).
msno.matrix(df)
plt.show()

#### missingno.heatmap visualizes the nullity correlation between columns, i.e. how strongly the presence or absence of values in one column is related to that in another.

In [None]:
# missingno opens its own figure when no `ax` is passed, so a preceding
# plt.figure() call only leaves an empty extra figure in the cell output.
# The nullity heatmap itself is drawn exactly as before (missingno's default size).
msno.heatmap(df)
plt.show()

From the above graphs we can clearly see that no null values are present in the data.

## Data types of the features present in the dataset

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

From the above output we can clearly see that the dataset contains only int and float columns.


## Check whether the data is balanced or imbalanced

In [None]:
# Bar chart of how many rows fall into each target class.
plt.figure(figsize=(12, 6))
# Name the column through `x=`/`data=`: passing the Series positionally is
# deprecated in older seaborn releases and interpreted as `data` in newer ones,
# which does not produce the intended per-class count.
sns.countplot(x='Bankrupt?', data=df)
plt.show()

From the above plot we can clearly see that the data is imbalanced. To balance the data we have to apply an upsampling or downsampling technique.

### Correlation Matrix

In [None]:
# Pairwise correlation of all columns, drawn as an un-annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(17, 17))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
plt.show()

We can clearly see that some features are highly correlated with each other. We will eliminate them with a feature elimination technique.

### Separating dependent and independent features

In [None]:
# Independent features: every column except the target.
X = df.drop(columns='Bankrupt?')
# Dependent variable: the bankruptcy flag.
y = df['Bankrupt?']

### The data is imbalanced, so we balance it with an oversampling technique. Here we are using SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority samples (and therefore every
# downstream score) are reproducible across runs; 97 matches the split seed.
over = SMOTE(random_state=97)
# NOTE(review): X and y are oversampled here, before the train/test split in
# the next cell, so synthetic points interpolated from the same originals can
# land on both sides of the split and inflate the test metrics. Consider
# splitting first and resampling only the training portion.
X, y = over.fit_resample(X, y)

### Splitting the data into train and test sets

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=97,
)

In [None]:
# One line per partition: feature-matrix shape followed by label shape.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

### Standardization

In [None]:
# Fit the scaler on the training features only and keep the result as a
# DataFrame with the original row index and column names.
sc = StandardScaler()
X_train_sc = pd.DataFrame(
    sc.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

## Feature elimination - using L1 Regularization

In [None]:
# L1-penalised logistic regression acts as the scoring model for the selector.
l1_logit = LogisticRegression(penalty='l1', solver='liblinear', C=1)
sel_ = SelectFromModel(l1_logit)
sel_.fit(X_train_sc, y_train)

In [None]:
# Boolean mask over the input features: True where the selector keeps the feature.
sel_.get_support()

In [None]:
# Coefficients of the fitted L1 logistic regression wrapped by the selector.
sel_.estimator_.coef_

In [None]:
# Summarise the outcome of the L1-based selection.
support_mask = sel_.get_support()
selected_feat = X_train.columns[support_mask]
zeroed_count = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(zeroed_count))

In [None]:
# Number of coefficients the L1 penalty drove exactly to zero.
(sel_.estimator_.coef_ == 0).sum()

In [None]:
# Take the rejected features from the selector's own mask rather than from an
# exact `coef_ == 0` comparison: SelectFromModel keeps a feature only when its
# importance clears the selector threshold, so a tiny non-zero coefficient
# would otherwise be counted as "not selected" yet never dropped.
removed_feats = X_train.columns[~sel_.get_support()].to_list()

In [None]:
# Drop the rejected features from the scaled training frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError for
# columns that are already gone.
X_train_sc = X_train_sc.drop(columns=removed_feats, errors='ignore')

In [None]:
# Shapes of the reduced training features and the training labels.
X_train_sc.shape, y_train.shape

In [None]:
# Apply the training-fitted scaler to the test features, restore index and
# column labels, then remove the same features that were dropped from train.
X_test_sc = pd.DataFrame(
    sc.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
).drop(columns=removed_feats)

In [None]:
# Shapes of the reduced test features and the test labels.
X_test_sc.shape, y_test.shape

## PCA Visualization

In [None]:
# Fit a PCA that keeps as many components as there are remaining features,
# so the variance carried by each component can be inspected.
pc = PCA(n_components=X_train_sc.shape[1])
X_train_pc = pc.fit_transform(X_train_sc)
component_labels = ['PC_' + str(k) for k in range(1, pc.n_components_ + 1)]
PC_df_train = pd.DataFrame(X_train_pc, columns=component_labels)

In [None]:
# Preview only the first rows instead of rendering the whole projected frame.
PC_df_train.head()

## Scree Plot - PCA Analysis
In multivariate statistics, a scree plot is a line plot of the eigenvalues of factors or principal components in an analysis. The scree plot is used to determine the number of factors to retain in an exploratory factor analysis (FA) or principal components to keep in a principal component analysis (PCA)

To select the number of principal components, the elbow method is used.
If a proper elbow is not clearly formed in the graph below, we can keep all the components.

In [None]:
# Scree-style plot: standard deviation of each principal-component column.
component_std = PC_df_train.std()
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(component_std)
ax.set_title('Scree Plot - PCA components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Standard deviation')
# Component names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

We can see that it forms an elbow at PC_15, so we can take 15 principal components for further analysis.

In [None]:
# Refit the PCA keeping 15 components and project the training data.
pc = PCA(n_components=15)
X_train_pc = pc.fit_transform(X_train_sc)
component_labels = ['PC_' + str(k) for k in range(1, pc.n_components_ + 1)]
PC_df_train = pd.DataFrame(X_train_pc, columns=component_labels)

In [None]:
# Project the test data with the PCA fitted on the training data.
X_test_pc = pc.transform(X_test_sc)
component_labels = ['PC_' + str(k) for k in range(1, pc.n_components_ + 1)]
PC_df_test = pd.DataFrame(X_test_pc, columns=component_labels)

## Model Building

In [None]:
# Sanity check before modelling: the projected training frame and the training
# labels should have the same number of rows.
print(PC_df_train.shape)
y_train.shape

## Logistic Regression

In [None]:
# Baseline: logistic regression on the principal components.
classifier = LogisticRegression()
classifier.fit(PC_df_train, y_train)
# Predict on the DataFrame version of the test projection so the input carries
# the same PC_* column names the model was fitted with; predicting on the bare
# ndarray triggers scikit-learn's feature-name mismatch warning.
y_lr = classifier.predict(PC_df_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_lr))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_lr))
print()
print('Classification Report \n', classification_report(y_test, y_lr))

## SVC Classifier

In [None]:
classifier = SVC()
# Train on the existing training projection. Calling pc.fit_transform() again
# here would refit `pc` after X_test_pc was already produced, so the training
# projection would no longer be guaranteed to match the one used for the test set.
classifier.fit(X_train_pc, y_train)
y_svc = classifier.predict(X_test_pc)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_svc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_svc))
print()
print('Classification Report \n', classification_report(y_test, y_svc))

## Random forest Classifier

In [None]:
# Seeded so the bootstrap samples and feature sub-sampling, and therefore
# the reported scores, are reproducible across runs (97 matches the split seed).
classifier = RandomForestClassifier(random_state=97)
classifier.fit(X_train_pc, y_train)
y_rfc = classifier.predict(X_test_pc)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_rfc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_rfc))
print()
print('Classification Report \n', classification_report(y_test, y_rfc))

## Gradient Boosting Classifier

In [None]:
# Seeded so repeated runs of the notebook give the same model (97 matches the
# split seed).
classifier = GradientBoostingClassifier(random_state=97)
classifier.fit(X_train_pc, y_train)
y_gbc = classifier.predict(X_test_pc)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_gbc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_gbc))
print()
print('Classification Report \n', classification_report(y_test, y_gbc))

In [None]:
# Gaussian Naive Bayes on the principal components.
classifier = GaussianNB()
classifier.fit(X_train_pc, y_train)
y_gb = classifier.predict(X_test_pc)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_gb))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_gb))
print()
print('Classification Report \n', classification_report(y_test, y_gb))

## XGB Classifier

In [None]:
# XGBoost classifier on the principal components.
classifier = XGBClassifier()
classifier.fit(X_train_pc, y_train)
y_xg = classifier.predict(X_test_pc)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged (accuracy is unaffected).
print('Confusion Matrix \n', confusion_matrix(y_test, y_xg))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_xg))
print()
print('Classification Report \n', classification_report(y_test, y_xg))

In [None]:
def _score_column(label, y_pred):
    """Build a one-column DataFrame of test-set metrics for one model.

    Parameters
    ----------
    label : str
        Column name under which the model appears in the comparison table.
    y_pred : array-like
        Class predictions for the test set, compared against ``y_test``.

    Returns
    -------
    pandas.DataFrame
        Rows F1, Accuracy, Recall, Precision and ROC AUC Score; one column.
    """
    scores = [
        f1_score(y_test, y_pred),
        accuracy_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        # NOTE(review): this AUC is computed from hard class labels rather than
        # predicted probabilities / decision scores.
        roc_auc_score(y_test, y_pred),
    ]
    return pd.DataFrame(
        data=scores,
        columns=[label],
        index=["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"],
    )


lr_df = _score_column('Logistic Regression', y_lr)
rf_df = _score_column('Random Forest Score', y_rfc)
nb_df = _score_column('Naive Bayes', y_gb)
xg_df = _score_column('XG Boost', y_xg)
gbc_df = _score_column('Gradient Boosting', y_gbc)
# The SVC column was previously labelled 'Gradient Boosting' as well, which
# produced two identically named columns in the comparison table.
svc_df = _score_column('SVC', y_svc)

df_models = pd.concat([lr_df, rf_df, nb_df, gbc_df, xg_df, svc_df], axis=1).round(3)
colors = ["bisque", "ivory", "sandybrown", "steelblue", "lightsalmon"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "white"

fig = plt.figure(figsize=(18, 26))  # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])  # the table spans the whole first grid row

# One row per model, one column per metric, values shown as percentages.
sns.heatmap(df_models.T, cmap=colormap, annot=True, fmt=".1%", vmin=0, vmax=0.95,
            linewidths=2.5, cbar=False, ax=ax0, annot_kws={"fontsize": 16})
fig.patch.set_facecolor(background_color)  # figure background color
ax0.set_facecolor(background_color)

ax0.text(0, -0.5, 'Model Comparison', fontsize=20, fontweight='bold', fontfamily='serif')
plt.show()


# Conclusion
### From the above model analysis we can clearly see that Random Forest and XG Boost are giving an accuracy of 96%