In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from scipy.io import arff #for reading the arff file

In [None]:
# Data source: UCI "Polish companies bankruptcy data"
# https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data
# https://archive.ics.uci.edu/ml/machine-learning-databases/00365/
#
# 5year.arff ("5thYear") contains financial rates from the 5th year of the forecasting
# period and a class label that indicates bankruptcy status after 1 year.
# The relative path means the file must sit in the notebook's working directory.
data = arff.loadarff('5year.arff')
# The loaded object has length 2: the first item holds the records, the second
# merely states whether each variable is numeric or nominal.
df = pd.DataFrame(data[0])

In [None]:
# Check what loadarff returned before relying on its layout.
type(data)

In [None]:
# Number of items in the loaded object (records + attribute description).
len(data)

In [None]:
# Attribute description: whether each variable is numeric or nominal.
data[1]

In [None]:
# First two records: year-5 financial ratios plus the bankruptcy status one year later.
# The class variable is nominal and is loaded as a (byte) string.
df.head(2)

## Variable Definitions

In [None]:
"""
X1 net profit / total assets
X2 total liabilities / total assets
X3 working capital / total assets
X4 current assets / short-term liabilities
X5 [(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365
X6 retained earnings / total assets
X7 EBIT / total assets
X8 book value of equity / total liabilities
X9 sales / total assets
X10 equity / total assets
X11 (gross profit + extraordinary items + financial expenses) / total assets
X12 gross profit / short-term liabilities
X13 (gross profit + depreciation) / sales
X14 (gross profit + interest) / total assets
X15 (total liabilities * 365) / (gross profit + depreciation)
X16 (gross profit + depreciation) / total liabilities
X17 total assets / total liabilities
X18 gross profit / total assets
X19 gross profit / sales
X20 (inventory * 365) / sales
X21 sales (n) / sales (n-1)
X22 profit on operating activities / total assets
X23 net profit / sales
X24 gross profit (in 3 years) / total assets
X25 (equity - share capital) / total assets
X26 (net profit + depreciation) / total liabilities
X27 profit on operating activities / financial expenses
X28 working capital / fixed assets
X29 logarithm of total assets
X30 (total liabilities - cash) / sales
X31 (gross profit + interest) / sales
X32 (current liabilities * 365) / cost of products sold
X33 operating expenses / short-term liabilities
X34 operating expenses / total liabilities
X35 profit on sales / total assets
X36 total sales / total assets
X37 (current assets - inventories) / long-term liabilities
X38 constant capital / total assets
X39 profit on sales / sales
X40 (current assets - inventory - receivables) / short-term liabilities
X41 total liabilities / ((profit on operating activities + depreciation) * (12/365))
X42 profit on operating activities / sales
X43 rotation receivables + inventory turnover in days
X44 (receivables * 365) / sales
X45 net profit / inventory
X46 (current assets - inventory) / short-term liabilities
X47 (inventory * 365) / cost of products sold
X48 EBITDA (profit on operating activities - depreciation) / total assets
X49 EBITDA (profit on operating activities - depreciation) / sales
X50 current assets / total liabilities
X51 short-term liabilities / total assets
X52 (short-term liabilities * 365) / cost of products sold
X53 equity / fixed assets
X54 constant capital / fixed assets
X55 working capital
X56 (sales - cost of products sold) / sales
X57 (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)
X58 total costs / total sales
X59 long-term liabilities / equity
X60 sales / inventory
X61 sales / receivables
X62 (short-term liabilities *365) / sales
X63 sales / short-term liabilities
X64 sales / fixed assets
"""

In [None]:
# Column dtypes and non-null counts.
df.info() # the class label is stored as object at this point

In [None]:
# How many rows carry each raw class label.
df['class'].value_counts()

In [None]:
# !pip install --upgrade numpy

In [None]:
df['class'] = np.where(df['class']==b'0',0,1) #if

In [None]:
# df['class']=df['class'].map(lambda x: 0 if x==b'0' else 1)

In [None]:
df['class'] = df['class'].astype('category')

In [None]:
df.info()

In [None]:
print(df.shape)
print(df.dropna().shape)

In [None]:
df.isnull().sum().sort_values(ascending=False)
# Attribute 37 has 2548 missing values. 

In [None]:
# First drop Attr37 and then drop the missing values. 
df1 = df.drop('Attr37',axis=1).dropna(axis=0)
df1.shape

In [None]:
df.shape
# original data has 5910 rows. We lose about 1000 rows of data

In [None]:
df2 = df.drop(['Attr37','Attr27','Attr45'],axis=1).dropna(axis=0)
df2.shape

In [None]:
len(df2.columns)-1


In [None]:
df2.columns

In [None]:
feature_columns = df2.columns[:df2.shape[1]-1]

In [None]:
feature_columns

In [None]:
X = df2[feature_columns]
y=df2['class']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on y so both parts keep
# the same bankrupt / non-bankrupt ratio; an unstratified split of an imbalanced target
# can leave the test set with an unrepresentative share of the minority class.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.75,stratify=y,random_state=13)

In [None]:
# !pip install feature_engine

In [None]:
from feature_engine.selection import DropCorrelatedFeatures

In [None]:
# Correlation filter with a 0.8 threshold.
# NOTE(review): presumably drops one feature out of each group whose pairwise correlation
# exceeds the threshold — confirm against the feature_engine documentation.
tr = DropCorrelatedFeatures(threshold=0.8)

In [None]:
# Fit the filter on the training split only and apply it to that split.
X_train1 = tr.fit_transform(X_train)

In [None]:
# Shape checks: test features, training features before and after the filter, training target.
X_test.shape

In [None]:
X_train.shape

In [None]:
X_train1.shape

In [None]:
y_train.shape

In [None]:
# Columns that survived the correlation filter.
final_features = X_train1.columns

In [None]:
final_features

In [None]:
# Keep exactly the same columns in the test split as in the filtered training split.
X_test1 = X_test[final_features]

In [None]:
# Class balance of the training target.
y_train.value_counts()

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Resample the filtered training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split is left untouched.
smote= SMOTE(random_state=13)
X_resampled, y_resampled = smote.fit_resample(X_train1, y_train)

In [None]:
# Class counts after resampling.
y_resampled.value_counts()

In [None]:
# Shape of the resampled feature matrix.
X_resampled.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with a fixed seed and an iteration cap of 1000.
# NOTE(review): the features are not scaled anywhere above, which may be why the
# iteration cap was raised — confirm.
lr = LogisticRegression(random_state=13,max_iter=1000)

In [None]:
# Fit on the resampled training data.
lr.fit(X_resampled,y_resampled)

In [None]:
# Fitted coefficients, one per column of X_resampled.
lr.coef_

In [None]:
# Fitted intercept.
lr.intercept_

In [None]:
# Accuracy of the fitted model on the resampled training data, then on the held-out test data.
train_accuracy = lr.score(X_resampled, y_resampled)
print(f'The accuracy score on the training dataset is {train_accuracy:.2f}')
test_accuracy = lr.score(X_test1, y_test)
print(f'The accuracy score on the test dataset is {test_accuracy:.2f}')

In [None]:
# Predicted class labels for the test split.
y_pred = lr.predict(X_test1)

In [None]:
# Predicted probability of class 1 (second column of predict_proba) for the test split.
y_prob = lr.predict_proba(X_test1)[:,1]

In [None]:
# Show the third test row as a one-row frame.
X_test1.iloc[2].to_frame().T

In [None]:
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred),display_labels=["Non Bankrupt","Bankrupt"]).plot()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

In [None]:
print(f'True Positive: {tp}')
print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')


In [None]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [None]:
print(f'Precision Score is: {precision_score(y_test,y_pred):.2f}')
print(f'Recall Score is: {recall_score(y_test,y_pred):.3f}')

In [None]:
lr.predict_proba(X_test1)

In [None]:
# ROC AUC of the logistic regression, scored from the predicted probability of class 1.
# The message is written on one line: a backslash continuation inside the string literal
# embeds the next line's indentation in the printed text.
roc_auc = roc_auc_score(y_test, lr.predict_proba(X_test1)[:, 1])
print(f'Area under the ROC curve is: {roc_auc:.2f}')

In [None]:
from sklearn.metrics import plot_roc_curve

In [None]:
plot_roc_curve(lr,X_test1,y_test);

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest neighbours with k=5, fitted on the resampled training data.
# NOTE(review): KNN is distance based and no feature scaling is visible above — consider standardising.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_resampled, y_resampled)
y_pred = knn.predict(X_test1)
y_prob = knn.predict_proba(X_test1)
# Test-set ROC AUC from the predicted probability of class 1.
roc_auc_score(y_test, y_prob[:, 1])

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes fitted on the resampled training data.
nb = GaussianNB().fit(X_resampled, y_resampled)
y_pred = nb.predict(X_test1)
y_prob = nb.predict_proba(X_test1)
# Test-set ROC AUC from the predicted probability of class 1.
roc_auc_score(y_test, y_prob[:, 1])

In [None]:
from sklearn.svm import NuSVC

In [None]:
# Nu-support-vector classifier with an RBF kernel, fitted on the resampled training data.
# NOTE(review): no feature scaling is visible above, and RBF kernels are scale sensitive — confirm this is intended.
svm = NuSVC(kernel='rbf', random_state=13).fit(X_resampled, y_resampled)
y_pred = svm.predict(X_test1)
# This model is scored by test-set recall rather than ROC AUC.
recall_score(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(random_state=13,max_depth=5)
dt.fit(X_resampled,y_resampled)
y_pred = dt.predict(X_test1)
y_prob = dt.predict_proba(X_test1)
roc_auc_score(y_test,y_prob[:,1])

In [None]:
pd.Series(dt.feature_importances_,index=final_features).sort_values(ascending=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 500 depth-limited (max_depth=5) trees with a fixed seed.
rf = RandomForestClassifier(n_estimators=500,random_state=13,max_depth=5)

In [None]:
# Fit on the resampled training data.
rf.fit(X_resampled,y_resampled)

In [None]:
# Feature importances of the forest, labelled by feature name, largest first.
pd.Series(rf.feature_importances_,index=final_features).sort_values(ascending=False)

In [None]:
# Test-set predictions and ROC AUC from the predicted probability of class 1.
y_pred = rf.predict(X_test1)
y_prob = rf.predict_proba(X_test1)
roc_auc_score(y_test,y_prob[:,1])