In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

import warnings

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200 
plt.rcParams["figure.figsize"] = (5,5)
%matplotlib inline

In [None]:
# Show the installed imbalanced-learn release; this notebook needs version 0.8.0.
import imblearn

imblearn_version = imblearn.__version__
print(imblearn_version)

In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os

for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
# Read the bankruptcy dataset into `df` and preview its first rows.
csv_path = '/kaggle/input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print pandas' structural summary of the loaded frame.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Pie chart of the target's value counts, labelled with two-decimal percentages.
target_counts = df['Bankrupt?'].value_counts()
target_counts.plot(kind='pie', autopct="%.2f")

In [None]:
# Bar chart of the target's class counts on a 10x6 inch figure, with the raw
# counts printed alongside.
sns.set(rc={'figure.figsize': (10, 6)})
print(df['Bankrupt?'].value_counts().to_dict())
# The column is passed by keyword. A positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards, which
# no longer produces the intended per-class count plot.
sns.countplot(data=df, x='Bankrupt?')

In [None]:
# Scatter two of the features against each other, coloured by the target.
# The leading spaces in the column names are part of the names themselves.
sns.scatterplot(
    x=' Total income/Total expense',
    y=' Net Value Per Share (A)',
    hue='Bankrupt?',
    data=df,
)

In [None]:
# Heatmap of the pairwise column correlations on a 15x15 inch darkgrid canvas.
sns.set(style="darkgrid", rc={'figure.figsize': (15, 15)})
corr = df.corr()
sns.heatmap(data=corr, cmap="YlOrBr")

# Random Forest Classifier

In [None]:
# Separate the features from the 'Bankrupt?' target and hold out 33% of the
# rows for testing.
X = df.drop(columns=['Bankrupt?'])
y = df['Bankrupt?']
# stratify=y keeps the class ratio the same in both folds; with a heavily
# imbalanced target an unstratified split can shift the minority share of the
# test set and distort the per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# Plain random forest (no class weighting), fitted on the training fold and
# used to predict the held-out fold.
clf = RandomForestClassifier(random_state=42, max_depth=75).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and per-class precision/recall/F1 for the current predictions.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report: \n', report)

In [None]:
# Raw confusion-matrix counts for the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="YlOrBr")

# Weighted Random Forest Classifier for Imbalanced Classes

In [None]:
# Ten-tree random forest with class_weight='balanced', which places a penalty
# on misclassifying the minority class.
rf_params = {
    'max_depth': 75,
    'random_state': 42,
    'n_estimators': 10,
    'class_weight': 'balanced',
}
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and per-class precision/recall/F1 for the current predictions.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report: \n', report)

In [None]:
# Raw confusion-matrix counts for the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="Greens")

# Random Forest Classifier for Imbalanced Classes with Bootstrap Class Weighting

In [None]:
# Ten-tree random forest using the 'balanced_subsample' class-weight mode.
rf_params = {
    'max_depth': 75,
    'random_state': 42,
    'n_estimators': 10,
    'class_weight': 'balanced_subsample',
}
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and per-class precision/recall/F1 for the current predictions.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report: \n', report)

In [None]:
# Raw confusion-matrix counts for the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="Purples")

# Balanced Random Forest Classifier

In [None]:
# BalancedRandomForestClassifier is another ensemble method, in which each
# tree of the forest is provided a balanced bootstrap sample [CLB+04].
# It provides all the functionality of RandomForestClassifier.
# random_state is pinned (42, as for the other forests in this notebook) so
# that the balanced bootstrap draws, and therefore the reported metrics, are
# reproducible across runs.
clf = BalancedRandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Macro-averaged MAE: the mean absolute error is computed for each class and
# the per-class values are averaged with equal weight.
# Requires imbalanced-learn version 0.8.0.
from imblearn.metrics import macro_averaged_mean_absolute_error

macro_averaged_mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy, followed by the imbalanced-learn report: precision, recall,
# specificity, geometric mean, and index balanced accuracy of the geometric mean.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report:')
print(imbalanced_report)

In [None]:
# Raw confusion-matrix counts for the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="YlOrRd")

# Easy Ensemble Classifier

In [None]:
# "EasyEnsemble" is a specific method that uses AdaBoostClassifier learners
# inside a bagging classifier. EasyEnsembleClassifier bags AdaBoost learners
# that are trained on balanced bootstrap samples [LWZ08]; its construction is
# similar to the BalancedBaggingClassifier API.
# random_state is pinned (42, as for the forests in this notebook) so that the
# balanced resampling, and therefore the reported metrics, are reproducible.
clf = EasyEnsembleClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Macro-averaged MAE: the mean absolute error is computed for each class and
# the per-class values are averaged with equal weight.
# Requires imbalanced-learn version 0.8.0.
from imblearn.metrics import macro_averaged_mean_absolute_error

macro_averaged_mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy, followed by the imbalanced-learn report: precision, recall,
# specificity, geometric mean, and index balanced accuracy of the geometric mean.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report:')
print(imbalanced_report)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="Wistia")

# Up-Sampling

In [None]:
# Feature column names: every column except the 'Bankrupt?' target.
# The target is excluded by name rather than by position, so the list stays
# correct even if the target is not the first column of the frame.
col_list = [col for col in df.columns if col != 'Bankrupt?']

In [None]:
# Oversample the minority class with SMOTE over the whole dataset and rebuild
# a frame holding the resampled features plus the resampled target.
# NOTE(review): the resampling here happens before any train/test split, so
# synthetic rows are interpolated from the full dataset; metrics computed on a
# split of `df_oversampled` will be optimistic.
smote = SMOTE(random_state=42)
features = df.drop(columns=['Bankrupt?'])
target = df['Bankrupt?']
X, y = smote.fit_resample(features, target)
df_oversampled = pd.DataFrame(X, columns=col_list).assign(**{'Bankrupt?': y})

In [None]:
# Class counts of the oversampled frame, printed and drawn as a bar chart.
print(df_oversampled['Bankrupt?'].value_counts().to_dict())
# The column is passed by keyword. A positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards, which
# no longer produces the intended per-class count plot.
sns.countplot(data=df_oversampled, x='Bankrupt?')

In [None]:
# Same feature-vs-feature scatter as for the raw data, drawn on the
# oversampled frame and coloured by the target.
sns.scatterplot(
    x=' Total income/Total expense',
    y=' Net Value Per Share (A)',
    hue='Bankrupt?',
    data=df_oversampled,
)

In [None]:
# Build the train/test folds for the up-sampling experiment.
#
# The split is taken from the ORIGINAL data and SMOTE is applied to the
# training fold only. Splitting an already-oversampled frame leaks
# information: synthetic minority rows interpolated from rows that end up in
# the test fold land in the training fold (and vice versa), and the test fold
# itself contains synthetic rows, so the reported scores are inflated.
# `df_oversampled` is therefore not used for modelling.
X = df[col_list]
y = df['Bankrupt?']
# stratify=y keeps the original class ratio in both folds, so the test fold
# reflects the real imbalance the model will face.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
# Balance the classes of the training fold only; the test fold stays untouched.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Random forest fitted on the current training fold and used to predict the
# current test fold.
clf = RandomForestClassifier(random_state=42, max_depth=75).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Accuracy and per-class precision/recall/F1 for the current predictions.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', acc)
print('Classification Report: \n', report)

In [None]:
# Plot the confusion matrix of the current classifier on a 5x5 inch figure,
# with rows/columns ordered by the classifier's own class labels.
plt.rcParams.update({"figure.figsize": (5, 5)})
class_labels = clf.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="plasma")