# 1. LOAD DATA

# 1.1 Loading important libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# 1.2 Importing the Dataset

In [None]:
# Load the semicolon-delimited bankruptcy dataset into a DataFrame.
# NOTE: later cells address most columns with a leading space (e.g. ' class',
# ' management_risk'), so the header names are used exactly as they appear in the file.
Bankrupt=pd.read_csv("bankruptcy-prevention.csv",sep=";")

In [None]:
Bankrupt

In [None]:
Bankrupt.head()

In [None]:
Bankrupt.tail()

# 2. EXPLORATORY DATA ANALYSIS

### 2.1 Understanding dataset

In [None]:
Bankrupt.info()

In [None]:
Bankrupt.shape

In [None]:
Bankrupt.isnull().sum()

Observation: There are no null values in the data.

In [None]:
Bankrupt.count()

In [None]:
Bankrupt.describe()

In [None]:
#Count of duplicated rows
Bankrupt[Bankrupt.duplicated()].shape

Observation: There are 147 duplicate rows in the dataset.

In [None]:
# Show the duplicated rows themselves (the previous cell reports only their count via .shape)
Bankrupt[Bankrupt.duplicated()]

In [None]:
# De-duplicated copy of the data.
# NOTE(review): bankrupt1 is not referenced by any later cell shown here; the EDA and
# modelling below keep using Bankrupt, i.e. the data including duplicates. Confirm this is intended.
bankrupt1=Bankrupt.drop_duplicates()

In [None]:
Bankrupt[' class'].unique()

In [None]:
Bankrupt[' class'].value_counts()

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x=Bankrupt[' class'], palette='turbo', linewidth=1)
plt.show()

In [None]:
Bankrupt['industrial_risk'].value_counts()

In [None]:
Bankrupt[' management_risk'].value_counts()

In [None]:
Bankrupt[' financial_flexibility'].value_counts()

In [None]:
Bankrupt[' credibility'].value_counts()

In [None]:
Bankrupt[' competitiveness'].value_counts()

In [None]:
Bankrupt[' operating_risk'].value_counts()

In [None]:
# One count plot per predictor column, split by class, laid out on a 3x2 grid
# (the grid size assumes at most six predictor columns).
predictors = Bankrupt.columns.drop(' class')
plt.figure(figsize=(15, 12))
for position, predictor in enumerate(predictors, start=1):
    # countplot draws on the current axes, so the subplot handle itself is not needed.
    plt.subplot(3, 2, position)
    sns.countplot(data=Bankrupt, x=predictor, hue=' class')
# Keep titles/legends of neighbouring panels from overlapping, then render like the other cells.
plt.tight_layout()
plt.show()

# Transformations

#### Here we label-encode the target: bankruptcy = 0 and non-bankruptcy = 1

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the class names in sorted order, which is what yields
# bankruptcy = 0 and non-bankruptcy = 1.
encode = LabelEncoder()
encode.fit(Bankrupt[' class'])
Bankrupt[' class'] = encode.transform(Bankrupt[' class'])
Bankrupt

# 3. EDA AND VISUALIZATION

### 3.1 Correlation

In [None]:
correlations = Bankrupt.corr()
correlations

In [None]:
sns.heatmap(correlations,xticklabels=correlations.columns, yticklabels=correlations.columns,annot=True)

### 3.2 Histogram

In [None]:
Bankrupt.hist(figsize=(12, 8),bins=20)
plt.show()

###  3.2 Scatterplot Matrix

In [None]:
sns.pairplot(Bankrupt)

###  3.3 Pie chart

In [None]:
# Class balance as a pie chart. After label encoding, 0 = bankruptcy and 1 = non-bankruptcy.
class_counts = Bankrupt[' class'].value_counts()  # computed once, reused for both slices
a = class_counts[0]
b = class_counts[1]

fig1, ax1 = plt.subplots(figsize=(8, 6))
label = ['bankruptcy', 'non-bankruptcy']
count = [a, b]
colors = ['red', 'yellowgreen']
explode = (0, 0.1)  # pull the second slice (non-bankruptcy) out of the pie
# Draw on the axes created above rather than on the implicit current axes.
ax1.pie(count, labels=label, autopct='%0.2f%%', explode=explode, colors=colors, shadow=True, startangle=90)
plt.show()

### 3.4 Boxplot

In [None]:
Bankrupt.plot(kind='box', subplots=True, layout=(4,4), figsize=(12,10))
plt.show()

###  3.5  Density Plot

In [None]:
from pandas import read_csv

Bankrupt.plot(kind='density', subplots=True, layout=(4,4), sharex=False, figsize=(12,10))
plt.show()

# 4. Splitting dataset into X and y

In [None]:
# Inputs are every column except the last one; the target is the last column
# (presumably the encoded ' class' label -- this relies on it being the final CSV column).
x, y = Bankrupt.iloc[:, :-1], Bankrupt.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split # trian and test
from sklearn import metrics
from sklearn import preprocessing 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

# 5. MODEL BUILDING

### Table of Contents

In [None]:
def Visualize_confusion_matrix(y_test, y_pred):
    """Plot the confusion matrix as a heatmap and print the classification report.

    Args:
        y_test: True class labels, encoded as 0 = bankruptcy and
            1 = non-bankruptcy (the encoding applied earlier in this notebook).
        y_pred: Predicted class labels with the same encoding and length.

    Returns:
        None. The figure is shown and the report is printed.
    """
    # confusion_matrix orders rows and columns by sorted label value, so index 0
    # is class 0 (bankruptcy) and index 1 is class 1 (non-bankruptcy). The tick
    # labels must follow that order.
    class_names = ['Bankruptcy', 'No Bankruptcy']
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Oranges',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Accuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')
    plt.show()

    print("\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

### 5.1 Logistic Regression

#### Training the Logistic Regression on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the training split (fixed seed for reproducibility).
LR_model = LogisticRegression(random_state = 0)
LR_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc1 keeps this model's
# test-set accuracy for the comparison chart in the results section.
y_pred = LR_model.predict(x_test)
acc1 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = LR_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the Logistic Regression on the Testing set

In [None]:
# Test-set metrics for logistic regression (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_LR = f1_score(y_test, y_pred)
precision_LR = precision_score(y_test, y_pred)
recall_LR = recall_score(y_test, y_pred)
Accuracy_LR=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_LR)
print('Precision score: %0.4f'% precision_LR)
print('F1-Score: %0.4f'% f1_LR)
print('Accuracy score: %0.4f'% Accuracy_LR)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.2 Naive Bayes

#### Training the Naive Bayes on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes with default settings on the training split.
NB_model = GaussianNB()
NB_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc2 keeps this model's test-set accuracy.
y_pred = NB_model.predict(x_test)
acc2 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = NB_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the Naive Bayes on the Testing set

In [None]:
# Test-set metrics for Naive Bayes (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_NB = f1_score(y_test, y_pred)
precision_NB = precision_score(y_test, y_pred)
recall_NB = recall_score(y_test, y_pred)
Accuracy_NB=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_NB)
print('Precision score: %0.4f'% precision_NB)
print('F1-Score: %0.4f'% f1_NB)
print('Accuracy score: %0.4f'% Accuracy_NB)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.3 KNN

#### Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier; Minkowski distance with p = 2 is the Euclidean distance.
KNN_model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNN_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc3 keeps this model's test-set accuracy.
y_pred = KNN_model.predict(x_test)
acc3 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = KNN_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the KNN on the Testing set

In [None]:
# Test-set metrics for KNN (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_KNN = f1_score(y_test, y_pred)
precision_KNN = precision_score(y_test, y_pred)
recall_KNN = recall_score(y_test, y_pred)
Accuracy_KNN=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_KNN)
print('Precision score: %0.4f'% precision_KNN)
print('F1-Score: %0.4f'% f1_KNN)
print('Accuracy score: %0.4f'% Accuracy_KNN)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.4 Decision Tree

#### Training Decision Tree Classification on Train set

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree that chooses splits by entropy; fixed seed for reproducibility.
DTR_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DTR_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc4 keeps this model's test-set accuracy.
y_pred = DTR_model.predict(x_test)
acc4 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = DTR_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing Decision Tree Classification on the Testing set

In [None]:
# Test-set metrics for the decision tree (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_DT = f1_score(y_test, y_pred)
precision_DT = precision_score(y_test, y_pred)
recall_DT = recall_score(y_test, y_pred)
Accuracy_DT=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_DT)
print('Precision score: %0.4f'% precision_DT)
print('F1-Score: %0.4f'% f1_DT)
print('Accuracy score: %0.4f'% Accuracy_DT)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.5 Random Forest

#### Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 entropy-based trees; fixed seed for reproducibility.
RF_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RF_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc5 keeps this model's test-set accuracy.
y_pred = RF_model.predict(x_test)
acc5 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = RF_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the Random Forest Classification model on the Testing set

In [None]:
# Test-set metrics for the random forest (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_RF = f1_score(y_test, y_pred)
precision_RF = precision_score(y_test, y_pred)
recall_RF = recall_score(y_test, y_pred)
Accuracy_RF=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_RF)
print('Precision score: %0.4f'% precision_RF)
print('F1-Score: %0.4f'% f1_RF)
print('Accuracy score: %0.4f'% Accuracy_RF)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.6 SVM

#### Training the SVM on the Training set

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM. probability=True is required because the ROC section
# below calls SVM_model.predict_proba.
SVM_model = SVC(kernel = 'linear', random_state = 0, probability=True)
SVM_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc6 keeps this model's test-set accuracy.
y_pred = SVM_model.predict(x_test)
acc6 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = SVM_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the SVM model on the Testing set

In [None]:
# Test-set metrics for the SVM (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_SVM = f1_score(y_test, y_pred)
precision_SVM = precision_score(y_test, y_pred)
recall_SVM = recall_score(y_test, y_pred)
Accuracy_SVM=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_SVM)
print('Precision score: %0.4f'% precision_SVM)
print('F1-Score: %0.4f'% f1_SVM)
print('Accuracy score: %0.4f'% Accuracy_SVM)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

In [None]:
pip install XGBoost

### 5.7 XGBoost

#### Training the XGBoost model on the Training set

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with the library's default hyperparameters.
XGB_model = XGBClassifier()
XGB_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc7 keeps this model's test-set accuracy.
y_pred = XGB_model.predict(x_test)
acc7 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = XGB_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the XGBoost model on the Testing set

In [None]:
# Test-set metrics for XGBoost (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_XGB = f1_score(y_test, y_pred)
precision_XGB = precision_score(y_test, y_pred)
recall_XGB = recall_score(y_test, y_pred)
Accuracy_XGB=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_XGB)
print('Precision score: %0.4f'% precision_XGB)
print('F1-Score: %0.4f'% f1_XGB)
print('Accuracy score: %0.4f'% Accuracy_XGB)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

In [None]:
pip install catboost

### 5.8 CatBoost

#### Training CatBoost on the Training set

In [None]:
from catboost import CatBoostClassifier
# CatBoost classifier with the library's default hyperparameters.
catboost_model = CatBoostClassifier()
catboost_model.fit(x_train, y_train)
# y_pred is reassigned by every model section; acc8 keeps this model's test-set accuracy.
y_pred = catboost_model.predict(x_test)
acc8 = accuracy_score(y_test, y_pred)

In [None]:
# Train Score
train_score = catboost_model.score(x_train, y_train)
print('Training Score: %0.4f'% train_score)

#### Testing the CatBoost model on the Testing set

In [None]:
# Test-set metrics for CatBoost (y_pred holds this model's test predictions).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_Cat = f1_score(y_test, y_pred)
precision_Cat = precision_score(y_test, y_pred)
recall_Cat = recall_score(y_test, y_pred)
Accuracy_Cat=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_Cat)
print('Precision score: %0.4f'% precision_Cat)
print('F1-Score: %0.4f'% f1_Cat)
print('Accuracy score: %0.4f'% Accuracy_Cat)

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_pred)

### 5.9 ANN

In [None]:
from tensorflow import keras
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (64 and 32 units)
# followed by a single sigmoid unit that outputs the probability of class 1.
ANN_model = keras.Sequential([
    # Take the input width from the training data instead of hard-coding the
    # number of features, so the model follows the dataset automatically.
    keras.layers.Dense(units=64, activation='relu', input_dim=x_train.shape[1]),
    keras.layers.Dense(units=32, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
ANN_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss='binary_crossentropy',
)

In [None]:
ANN_model.fit(x_train, y_train, epochs=50)

## Predictions

In [None]:
def predict(model, x):
    """Return hard 0/1 class predictions from a probability-producing model.

    Args:
        model: Any object whose ``predict(x)`` returns an array of scores
            (e.g. the sigmoid outputs of the Keras network).
        x: Input samples, passed to ``model.predict`` unchanged.

    Returns:
        The array returned by ``model.predict(x)``, thresholded in place:
        scores >= 0.5 become 1 and scores < 0.5 become 0.
    """
    # Use the model that was passed in; the previous version ignored the
    # argument and always called the global ANN_model.
    pred = model.predict(x)
    pred[pred >= 0.5] = 1
    pred[pred < 0.5] = 0
    return pred

def predict_graph(y_true,y_pred,title):
    """Plot a titled confusion-matrix heatmap and print the classification report.

    Args:
        y_true: True class labels, encoded as 0 = bankruptcy and
            1 = non-bankruptcy (the encoding applied earlier in this notebook).
        y_pred: Predicted class labels with the same encoding and length.
        title: Title shown above the heatmap.

    Returns:
        None. The figure is shown and the report is printed.
    """
    # confusion_matrix orders rows and columns by sorted label value, so index 0
    # is class 0 (bankruptcy) and index 1 is class 1 (non-bankruptcy). The tick
    # labels must follow that order.
    class_names = ['Bankruptcy', 'No Bankruptcy']
    cm = confusion_matrix(y_true,y_pred)
    plt.figure(figsize=(7,5))
    sns.heatmap(cm,annot=True,fmt='g',cmap='Oranges',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(title)
    plt.show()

    print("\n")
    print("Classification Report:")
    print(classification_report(y_true,y_pred))

### Train Predictions

In [None]:
y_train_pred = predict(ANN_model, x_train)
predict_graph(y_train, y_train_pred, 'Train Data Predictions')

### Test Predictions

In [None]:
y_pred = predict(ANN_model, x_test)
acc9 = accuracy_score(y_test, y_pred)

In [None]:
# Test-set metrics for the ANN (y_pred holds the thresholded 0/1 output of predict()).
# precision/recall/F1 use scikit-learn's default pos_label=1, so under the encoding
# bankruptcy = 0 / non-bankruptcy = 1 they describe the non-bankruptcy class.
f1_ANN = f1_score(y_test, y_pred)
precision_ANN = precision_score(y_test, y_pred)
recall_ANN = recall_score(y_test, y_pred)
Accuracy_ANN=accuracy_score(y_test, y_pred)
print('Recall score: %0.4f'% recall_ANN)
print('Precision score: %0.4f'% precision_ANN)
print('F1-Score: %0.4f'% f1_ANN)
print('Accuracy score: %0.4f'% Accuracy_ANN)

In [None]:
predict_graph(y_test, y_pred, 'Test Data Predictions')

# 6. RESULT

### Compare the accuracy of the models on the test set

In [None]:
# Test-set accuracy of every model: acc1..acc9 were each computed with
# accuracy_score(y_test, y_pred) in the corresponding model section.
mylist2 = ["Logistic Regression", "Naive Bayes", "KNN", "Decision Tree", "Random Forest",
           "SVM", "XG Boost", "Cat Boost", "ANN"]
mylist = [acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9]

plt.figure(figsize=(22, 10))
sns.set_style("darkgrid")
ax = sns.barplot(x = mylist2, y = mylist, palette = "magma", saturation =1.5)
plt.xlabel("Classification Models", fontsize = 20 )
plt.ylabel("Accuracy", fontsize = 20)
plt.title("Accuracy of different Classification Models", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 0)
plt.yticks(fontsize = 13)

# Write each bar's accuracy as a percentage just above the bar.
for p in ax.patches:
    # Bar-specific names: the previous loop unpacked into x and y, which
    # overwrote the notebook-level feature matrix x and target y.
    bar_x, bar_y = p.get_xy()
    bar_width, bar_height = p.get_width(), p.get_height()
    ax.annotate(f'{bar_height:.2%}', (bar_x + bar_width/2, bar_y + bar_height*1.02),
                ha='center', fontsize = 'x-large')

# Add the text watermark once; it used to sit inside the loop and was drawn
# again for every bar, stacking semi-transparent copies on top of each other.
plt.text(7.5, -0.1, 'Make by Lucas', fontsize = 15,
         color ='#034235', ha ='left', va ='bottom',
         alpha = 0.7)

plt.show()

## ROC Curve and Area Under the Curve

In [None]:
from sklearn.metrics import roc_curve, auc

### ROC - Logistic Regression

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
# `threshold` is reassigned by every ROC cell; only the fpr/tpr/auc names are model-specific.
y_pred_logistic = LR_model.predict_proba(x_test)[:,1]
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic)
auc_logistic = auc(logistic_fpr, logistic_tpr)

### ROC - Naive Bayes

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_nb = NB_model.predict_proba(x_test)[:,1]
nb_fpr, nb_tpr, threshold = roc_curve(y_test, y_pred_nb)
auc_nb = auc(nb_fpr, nb_tpr)

### ROC - KNN

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_knn = KNN_model.predict_proba(x_test)[:,1]
knn_fpr, knn_tpr, threshold = roc_curve(y_test, y_pred_knn)
auc_knn = auc(knn_fpr, knn_tpr)

### ROC - Decision Tree

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_dtr = DTR_model.predict_proba(x_test)[:,1]
dtr_fpr, dtr_tpr, threshold = roc_curve(y_test, y_pred_dtr)
auc_dtr = auc(dtr_fpr, dtr_tpr)

### ROC - Random Forest

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_rfc = RF_model.predict_proba(x_test)[:,1]
rfc_fpr, rfc_tpr, threshold = roc_curve(y_test, y_pred_rfc)
auc_rfc = auc(rfc_fpr, rfc_tpr)

### ROC - SVM

In [None]:
# predict_proba is available because the SVC was created with probability=True.
# Column 1 is the predicted probability of class 1 (non-bankruptcy under the label encoding).
y_pred_svm = SVM_model.predict_proba(x_test)[:,1]
svm_fpr, svm_tpr, threshold = roc_curve(y_test, y_pred_svm)
auc_svm = auc(svm_fpr, svm_tpr)

### ROC - XG Boost

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_xgb = XGB_model.predict_proba(x_test)[:,1]
xgb_fpr, xgb_tpr, threshold = roc_curve(y_test, y_pred_xgb)
auc_xgb = auc(xgb_fpr, xgb_tpr)

### ROC - CatBoost

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1
# (non-bankruptcy under the label encoding); it is used as the ROC score.
y_pred_catboost = catboost_model.predict_proba(x_test)[:,1]
catboost_fpr, catboost_tpr, threshold = roc_curve(y_test, y_pred_catboost)
auc_catboost = auc(catboost_fpr, catboost_tpr)

### ROC - ANN

In [None]:
# The network ends in a single sigmoid unit, so ANN_model.predict already returns the
# class-1 score directly; no [:, 1] column selection is needed as with predict_proba.
# NOTE(review): the result is presumably an (n_samples, 1) column -- confirm roc_curve accepts it as-is.
y_pred_ann = ANN_model.predict(x_test)
ann_fpr, ann_tpr, threshold = roc_curve(y_test, y_pred_ann)
auc_ann = auc(ann_fpr, ann_tpr)

### ROC - Plotting Graph

In [None]:
# Overlay the ROC curves of all nine models on one figure; each legend entry carries the
# model's AUC. The dashed diagonal is the reference line from (0, 0) to (1, 1).
plt.figure(figsize=(8, 8), dpi=100)
plt.plot([0, 1], [0, 1], 'k--')
# Logistic Regression
plt.plot(logistic_fpr, logistic_tpr, label='Logistic (auc = %0.3f)' % auc_logistic)
# Naive Bayes
plt.plot(nb_fpr, nb_tpr, label='Naive Bayes (auc = %0.3f)' % auc_nb)
# KNN
plt.plot(knn_fpr, knn_tpr, label='KNN (auc = %0.3f)' % auc_knn)
# Decision Tree
plt.plot(dtr_fpr, dtr_tpr, label='Decision Tree (auc = %0.3f)' % auc_dtr)
# Random Forest
plt.plot(rfc_fpr, rfc_tpr, label='Random Forest (auc = %0.3f)' % auc_rfc)
# SVM
plt.plot(svm_fpr, svm_tpr, label='SVM (auc = %0.3f)' % auc_svm)
# XGBoost
plt.plot(xgb_fpr, xgb_tpr, label='XG Boost (auc = %0.3f)' % auc_xgb)
# CatBoost
plt.plot(catboost_fpr, catboost_tpr, linestyle='-', label='CatBoost (auc = %0.3f)' % auc_catboost)
# ANN
plt.plot(ann_fpr, ann_tpr, label='ANN (auc = %0.3f)' % auc_ann)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend(loc='best')
plt.show();

In [None]:
# Storing results in a data frame

# One row per model with its test-set metrics; the table is displayed with
# the most accurate model first.
d1 = {
    'Models': [
        'Logistic Regression', 'Naive Bayes', 'KNN', 'Decision Tree', 'Random Forest',
        'SVM', 'XG Boost', 'CatBoost', 'ANN',
    ],
    'Accuracy': [
        Accuracy_LR, Accuracy_NB, Accuracy_KNN, Accuracy_DT, Accuracy_RF,
        Accuracy_SVM, Accuracy_XGB, Accuracy_Cat, Accuracy_ANN,
    ],
    'F1_Score': [
        f1_LR, f1_NB, f1_KNN, f1_DT, f1_RF,
        f1_SVM, f1_XGB, f1_Cat, f1_ANN,
    ],
    'Precision': [
        precision_LR, precision_NB, precision_KNN, precision_DT, precision_RF,
        precision_SVM, precision_XGB, precision_Cat, precision_ANN,
    ],
    'Recall': [
        recall_LR, recall_NB, recall_KNN, recall_DT, recall_RF,
        recall_SVM, recall_XGB, recall_Cat, recall_ANN,
    ],
}
results_df = pd.DataFrame(data=d1)
results_df.sort_values(by='Accuracy', ascending=False)

In [None]:
import pickle
# Persist the trained model to disk. The with-block closes the file even if
# pickle.dump raises.
# NOTE(review): the file is named "LR_model.pkl" but the object written is SVM_model.
# Both are left unchanged here because code loading this file may depend on either;
# confirm which model is meant and align the file name with it.
with open("LR_model.pkl","wb") as pickle_out:
    pickle.dump(SVM_model, pickle_out)