In [None]:
! pip install lazypredict

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import inspect # Debugging

In [None]:
# Loading data
# The CSV path is relative to the notebook's working directory.
csv_path = "../input/company-bankruptcy-prediction/data.csv"
data     = pd.read_csv(csv_path)
columns  = data.columns.values  # column labels as an array, reused by the plotting cells

# Author's note: the dataset has no missing values. Uncomment to re-check:
# data.isna().any()

# Class counts of the target (author's note: strongly unbalanced)
class_counts = data["Bankrupt?"].value_counts()
class_counts

In [None]:
# Correlation
#
# Pearson correlation of every column with the target, computed by hand:
#   corr_j = sum_i (x_ij - mean_j) * (t_i - mean_t) / (m * std_j * std_t)
# The numerator sums over all m rows, so the matching standard deviation is
# the population one (ddof=0). pandas defaults to ddof=1, which would scale
# every coefficient by (m - 1) / m (the target's self-correlation would then
# be slightly below 1).
# A constant column has std 0 and therefore yields NaN (0 / 0).

m, n       = data.shape
target_pos = data.columns.get_loc("Bankrupt?")  # find the target by name, not by assuming column 0
mean = data.mean().values                       # Mean value for each column
std  = data.std(ddof=0).values * np.sqrt(m)     # Population std for each column (* sqrt(num samples))

centered_target = (data.values[:, target_pos] - mean[target_pos]).reshape(m, 1)
corr = ((data.values - mean) * centered_target).sum(axis=0) / (std * std[target_pos])

# One bar per column, in the same order as `columns` (the target itself included).
fig, corr_ax = plt.subplots(figsize=(24, 8))
corr_ax.bar(x=np.arange(len(columns)), height=corr)
corr_ax.set_title("Correlation with target ('Bankrupt')")

plt.show()

In [None]:
# Splitting
#
# The split is computed first and the scaler is fitted on the training rows
# only. Fitting it on the full dataset would let test-set statistics leak into
# preprocessing and make the test scores optimistic.

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing   import StandardScaler

features = data.drop(columns=["Bankrupt?"]).values  # raw (unscaled) feature matrix
target   = data["Bankrupt?"].values

# Single stratified 70/30 split; the fixed random_state makes it reproducible.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(features, target))

# Learn mean/std from the training rows, then apply that transform to every row.
scaler = StandardScaler()
scaler.fit(features[train_index, :])
train  = scaler.transform(features)  # all rows, scaled with training-set statistics

x_train, y_train = train[train_index, :], target[train_index]
x_test, y_test   = train[test_index, :], target[test_index]

In [None]:
# Quick models testing
# Run LazyClassifier on the train/test split from the splitting cell, with
# recall passed as the extra custom metric, and print the resulting table.

from lazypredict.Supervised import LazyClassifier

from sklearn.metrics import recall_score, accuracy_score

lazy_clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=recall_score,
)

# fit() takes both splits and returns two objects, unpacked here as
# `models` and `predictions`.
models, predictions = lazy_clf.fit(x_train, x_test, y_train, y_test)

print(models)

In [None]:
# Exploring deeper the models suggested above
# Each candidate estimator is tuned with a recall-scored grid search on the
# training split, then evaluated once on the held-out test split.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics         import make_scorer, recall_score, accuracy_score

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors   import NearestCentroid

# One spec per estimator: the estimator instance and its hyper-parameter grid.
bernoulli_spec = {
    "func"  : BernoulliNB(),
    "params": {
        "alpha"    : [0.1, 0.5, 1, 1.5, 2],
        "fit_prior": [True, False],
    },
}
centroid_spec = {
    "func"  : NearestCentroid(),
    "params": {
        "metric"          : ["euclidean", "manhattan"],
        "shrink_threshold": [None, 0.1, 0.5, 1, 2],
    },
}
estimators = {
    "BernoulliNB"    : bernoulli_spec,
    "NearestCentroid": centroid_spec,
}

report_template = "{}: \n ACCURACY: {:.2f}, \n RECALL: {:.2f}, \n BEST PARAMS: {} \n"

models_to_test = estimators.keys()
for estimator_name in models_to_test:
    spec  = estimators[estimator_name]
    model = GridSearchCV(
        estimator=spec["func"],
        param_grid=spec["params"],
        scoring=make_scorer(recall_score),
    )
    model.fit(x_train, y_train)

    # Score the refitted best configuration on the test split.
    preds    = model.predict(x_test)
    recall   = recall_score(y_test, preds)
    accuracy = accuracy_score(y_test, preds)
    print(report_template.format(estimator_name, accuracy, recall, model.best_params_))

# After the loop, `model` is the search fitted last ("NearestCentroid").

In [None]:
# Confusion matrix
#
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2. Building the matrix explicitly and handing it to ConfusionMatrixDisplay
# works on both old and current releases.
# `model` is the last grid search fitted in the previous cell.

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

test_confusion = confusion_matrix(y_test, model.predict(x_test), labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=test_confusion,
                       display_labels=model.classes_).plot()

plt.show()

In [None]:
# EDA

from sklearn.preprocessing import MinMaxScaler

# Rescale every column (target included) to [0, 1] so the bars are comparable.
normalized_data = pd.DataFrame(MinMaxScaler().fit_transform(data.values),
                               columns=data.columns)

# Comparison of each feature's mean value for bankrupt and non-bankrupt
# companies. Author's observation: there is no significant difference between
# the two cases, both in mean and in dispersion.
#
# The target column is excluded from the comparison: by construction its mean
# is 1 in one group and 0 in the other, which would add a spurious difference
# of 1 to the summary statistic below.
feature_data = normalized_data.drop(columns=["Bankrupt?"])
positives    = feature_data.loc[data["Bankrupt?"] == 1]
negatives    = feature_data.loc[data["Bankrupt?"] == 0]

pos_mean = positives.mean()
pos_std  = positives.std()
neg_mean = negatives.mean()
neg_std  = negatives.std()

# Root-mean-square of the per-feature differences between the class means.
avg_diff = np.sqrt(np.square(pos_mean - neg_mean).mean())

# Mirrored bars: bankrupt companies point up, non-bankrupt point down.
bar_positions = np.arange(len(feature_data.columns))

fig, mean_ax = plt.subplots(figsize=(24, 8))
mean_ax.bar(x=bar_positions, height=pos_mean, yerr=pos_std)
mean_ax.bar(x=bar_positions, height=-neg_mean, yerr=neg_std)
mean_ax.set_xticks([])
mean_ax.set_title("MinMax scaled features, mean & std comparison (Avg. mean difference {:.3f})".format(avg_diff))

plt.show()
