In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

%matplotlib inline
# Notebook-wide matplotlib defaults.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),  # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})
# Colours of the active property cycle, kept for later plots.
default_cycle = plt.rcParams['axes.prop_cycle']
colors = default_cycle.by_key()['color']

In [None]:
# Location of the bankruptcy dataset inside the read-only Kaggle input directory.
DATA_PATH = "/kaggle/input/company-bankruptcy-prediction/data.csv"

# pandas is already imported as `pd` in the first cell, so it is not imported again here.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Rows are samples; every column of `df` (the "Bankrupt?" target included) is counted as a feature.
num_samples = df.shape[0]
num_features = df.shape[1]
print(f"Number of samples: {num_samples}")
print(f"Number of features: {num_features}")

# Analyze Dataframe

In [None]:
# Summary statistics of the dataframe's columns, shown as the cell output.
df.describe()

In [None]:
# Check how balanced the target class is. If it is not balanced, countermeasures are needed later on.
positive_mask = df["Bankrupt?"] == 1
negative_mask = df["Bankrupt?"] == 0
count_target_1 = len(df[positive_mask])
count_target_0 = len(df[negative_mask])
print(f"Number of positive samples: {count_target_1} ({100 * count_target_1 / num_samples}% of total data).")
print(f"Number of negative samples: {count_target_0} ({100 * count_target_0 / num_samples}% of total data).")

As one can see, the dataset is very imbalanced: only a small fraction of the samples is positive, while the vast majority is negative. Accuracy is therefore not a good performance indicator here; the F1-score is better suited.

In [None]:
# Split the frame into a feature matrix X (every column except the target) and a label vector Y.
target_column = "Bankrupt?"
X = df.drop(columns=target_column).to_numpy()
Y = df[target_column].to_numpy()

In [None]:
# Rank the features with an ensemble of extremely randomized trees (ExtraTreesClassifier);
# the fitted model's `feature_importances_` decide which features are analyzed further.
# NOTE(review): the ranking is computed on the full dataset, i.e. before the train/test
# split further down, so the feature selection has already seen the later test samples.
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed so that the importance ranking (and everything derived from it) is reproducible.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)

In [None]:
# Plot the 50 most important features as a horizontal bar chart.
# The index is built from the same column selection that produced X, so feature names and
# importances stay aligned even if the target is not the first column of `df`.
feature_names = df.drop("Bankrupt?", axis=1).columns
feat_importances = pd.Series(model.feature_importances_, index=feature_names)

fig, ax = plt.subplots(figsize=(12, 14))
feat_importances.nlargest(50).plot(kind='barh', ax=ax)
ax.invert_yaxis()  # put the most important feature at the top of the chart
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title("50 most important features (ExtraTreesClassifier)")
plt.show()

In [None]:
# Feature names ordered by decreasing importance: the top 50 and the complete ranking.
top_50_by_importance = feat_importances.nlargest(50)
all_by_importance = feat_importances.nlargest(num_features)
feature_importances_50_largest = top_50_by_importance.index.tolist()
feature_importances_sorted = all_by_importance.index.tolist()

# Visualizations

Let's now start with visualizing some of the relationships of the features to the target data in order to get a better understanding of the data.

In [None]:
import seaborn as sns

In [None]:
# Pairwise relationships of the three most important features, coloured by the target.
pairplot_columns = feature_importances_50_largest[0:3] + ["Bankrupt?"]
sns.pairplot(data=df[pairplot_columns], hue="Bankrupt?")

In [None]:
# Pairwise relationships of the features ranked 4th to 6th, coloured by the target.
pairplot_columns = feature_importances_50_largest[3:6] + ["Bankrupt?"]
sns.pairplot(data=df[pairplot_columns], hue="Bankrupt?")

# Feature Correlation

Check the correlation between the features. This is to ensure that the model is not trained on features that are too highly correlated with each other.

In [None]:
# Absolute pairwise correlations of all columns of `df`, rounded to two decimals.
pairwise_corr = df.corr()
corr_matrix = pairwise_corr.abs().round(2)

display(corr_matrix)

Get the upper triangle of the correlation matrix in order to plot the distribution of the correlation values. Only the upper triangle is required because the correlation matrix is symmetric.

In [None]:
# Collect every column pair exactly once: k=1 selects the strict upper triangle, which
# leaves out the diagonal (each column's correlation with itself).
indices_upper_tri = np.triu_indices(n=corr_matrix.shape[0], k=1)
corr_vals = corr_matrix.to_numpy()[indices_upper_tri]
# Drop undefined correlations (NaN, e.g. produced by a constant column) before plotting.
corr_vals = corr_vals[~np.isnan(corr_vals)]

# With the diagonal excluded, the full [0, 1] range can be shown, so that (almost)
# perfectly correlated pairs are no longer cut off from the histogram.
plt.hist(x=corr_vals, bins=20, range=(0.0, 1.0))
plt.xlabel("Correlation value")
plt.ylabel("Frequency")
plt.title("Histogram of correlation value distribution")
plt.show()

Most of the features are only weakly correlated with each other, and only a small portion of the feature pairs is strongly correlated. Therefore, a correlation threshold of about 0.6 or 0.7 is a reasonable choice.

# Create Dataset by removing too strongly correlated features

The goal is to create a dataset that contains only the most important features while removing features that are too strongly correlated with them, because such features don't add much information and can decrease the prediction performance of the final model.

In [None]:
def get_list_of_important_features(df, sorted_feature_importance, corr_threshold, target_col="Bankrupt?"):
    """Greedily select important features that are not strongly correlated with each other.

    The features are visited in the given order (most important first). A visited feature
    is kept unless an already kept feature is correlated with it above `corr_threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    sorted_feature_importance : iterable of str
        Feature (column) names ordered by decreasing importance.
    corr_threshold : float
        A feature is discarded when its absolute correlation (rounded to two decimals)
        with an already kept feature is strictly greater than this value.
    target_col : str, default "Bankrupt?"
        Name of the target column; it is excluded from the correlation matrix.

    Returns
    -------
    list of str
        The kept feature names, in the order of `sorted_feature_importance`.
    """
    # absolute pairwise correlations between the features only (target column excluded)
    corr_matrix = df.drop(target_col, axis=1).corr().abs().round(2)

    final_feature_list = []
    # a set gives O(1) membership tests and cannot accumulate duplicate entries
    blacklist_features = set()
    for feature in sorted_feature_importance:
        if feature in blacklist_features:  # too strongly correlated with a kept feature
            continue
        final_feature_list.append(feature)

        is_above_th = (corr_matrix[feature] > corr_threshold).to_numpy()
        # blacklist every strongly correlated feature, except the current feature itself
        # (which always has a correlation of 1 with itself)
        blacklist_features.update(
            name for name in corr_matrix.index[is_above_th] if name != feature
        )
    return final_feature_list

In [None]:
# Keep the most important features; features correlated above 0.6 with a kept one are dropped.
final_feature_list = get_list_of_important_features(
    df=df,
    sorted_feature_importance=feature_importances_sorted,
    corr_threshold=0.6,
)
print(final_feature_list)

In [None]:
# Number of features that passed the correlation filter.
number_features_final = len(final_feature_list)
print(f"New number of features: {number_features_final}")

Now only 58 features are left (the exact number depends on the feature-importance ranking computed above). They should be sufficient, because every removed feature was too strongly correlated with a feature in the current set and would therefore not improve the performance further.

In [None]:
# Reduced design matrix (selected features only) and the matching target column.
X_new = df.loc[:, final_feature_list]
Y_new = df.loc[:, "Bankrupt?"]

# Create Train and Test sets

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the target so that the rare positive class keeps the same share in both
# splits, and fix the seed so that the split (and every score reported below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_new.to_numpy(),
    Y_new.to_numpy(),
    test_size=0.3,
    stratify=Y_new.to_numpy(),
    random_state=42,
)

num_samples_train = X_train.shape[0]
num_samples_test = X_test.shape[0]
print(f"Number of samples of training data: {num_samples_train}")
print(f"Number of samples of test data: {num_samples_test}")

Check imbalance on training data.

In [None]:
# Class balance of the training split. The positive count is the sum of the labels,
# which assumes the labels are encoded as 0/1.
count_target_1 = y_train.sum()
count_target_0 = (y_train == 0).sum()
print(f"Number of positive samples: {count_target_1} ({100 * count_target_1 / num_samples_train}% of total data).")
print(f"Number of negative samples: {count_target_0} ({100 * count_target_0 / num_samples_train}% of total data).")

In [None]:
# Same class-balance check for the final test split. The positive count is the sum of
# the labels, which assumes the labels are encoded as 0/1.
count_target_1 = y_test.sum()
count_target_0 = (y_test == 0).sum()
print(f"Number of positive samples: {count_target_1} ({100 * count_target_1 / num_samples_test}% of total data).")
print(f"Number of negative samples: {count_target_0} ({100 * count_target_0 / num_samples_test}% of total data).")

# Scale Data 
<br>
Scale the data using the StandardScaler of scikit-learn. Important: fit the scaler on the training data only, so that no information from the test data leaks into the training.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the very same transformation
# to the training and the test split.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression Baseline Model

It's always good to start with a simple baseline model. In general, the simpler a model is, the better it can be understood, so the simplest model that does the job is preferable. The baseline can then be used to benchmark other approaches in order to find the best performing one.

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression without any class re-weighting serves as the baseline model.
baseline_estimator = LogisticRegression(max_iter=1000)
model = baseline_estimator.fit(X_train, y_train)

In [None]:
# Predict once on the test split and reuse the predictions for all three metrics.
y_pred_baseline = model.predict(X_test)
print(f"Accuracy on test set: {model.score(X_test, y_test)}")
print(f"Recall: {recall_score(y_test, y_pred_baseline)}")
print(f"Precision: {precision_score(y_test, y_pred_baseline)}")
print(f"F1-Score: {f1_score(y_test, y_pred_baseline)}")

The accuracy is quite high, but be careful: the accuracy can be misleading because the dataset is very imbalanced. A model that always predicts False would still reach an accuracy of about 0.9687!
Let's take a closer look into the Confusion Matrix.

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2. Where the
# import fails, provide the replacement API under the old name, so that every later cell
# calling `plot_confusion_matrix(...)` keeps working on old and new versions alike.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of `estimator` on (X, y_true) and return the display object."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

plot_confusion_matrix(model, X_test, y_test)

As one can see, the model has a lot of false negatives which is very bad! <br>
<br>
Let's include class weights in order to boost the learning towards the minority class.

In [None]:
# Same logistic regression as the baseline, but with class_weight="balanced" to counter the class imbalance.
balanced_estimator = LogisticRegression(class_weight="balanced", max_iter=1000)
model_balanced = balanced_estimator.fit(X_train, y_train)

# Predict once on the test split and reuse the predictions for all three metrics.
y_pred_balanced = model_balanced.predict(X_test)
print(f"Accuracy: {model_balanced.score(X_test, y_test)}")
print(f"Recall: {recall_score(y_test, y_pred_balanced)}")
print(f"Precision: {precision_score(y_test, y_pred_balanced)}")
print(f"F1-Score: {f1_score(y_test, y_pred_balanced)}")

In [None]:
# Confusion matrix of the class-weighted logistic regression on the test split.
plot_confusion_matrix(model_balanced, X_test, y_test)

As one can see, the accuracy is now lower than before, but the model predicts more positive labels that are indeed true. In practice, this model would be preferred over the first one, because it's far more important to correctly predict that a company goes bankrupt than to correctly predict that a company does not go bankrupt.

# Train different Classifiers to Improve Performance

Let's now train more classifiers to further improve the performance. <br>
The following classifiers shall be evaluated: 
<ol>
    <li> Decision Tree </li>
    <li> Random Forest </li>
    <li> Gradient Boosting Classifier </li>
    <li> XGBoost </li>
</ol>

In [None]:
# first: balance the training set using random over-sampling to set the focus on recall
# (only the training split is resampled; X_test / y_test are left untouched)
from imblearn.over_sampling import RandomOverSampler

# fixed seed so that the same samples are duplicated on every run
over_sample = RandomOverSampler(random_state=42)
X_train, y_train = over_sample.fit_resample(X_train, y_train)

# check the balance of the target class again to confirm that the resampling worked
num_samples_train = X_train.shape[0]
count_target_1 = np.sum(y_train)
count_target_0 = np.sum(y_train==0)
print(f"Number of positive samples: {count_target_1} ({100 * count_target_1 / num_samples_train}% of total data).")
print(f"Number of negative samples: {count_target_0} ({100 * count_target_0 / num_samples_train}% of total data).")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Candidate classifiers, keyed by a display name.
# Every candidate gets a fixed seed so that the comparison below is reproducible.
classifiers_dict = {
    "decision tree": DecisionTreeClassifier(random_state=42),
    "random forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "gradient boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "xgboost": xgb.XGBClassifier(eval_metric="logloss", max_depth=5, learning_rate=0.01, n_estimators=100, random_state=42)
}

In [None]:
# Fit every candidate on the training data and report its metrics on the test split.
for name, estimator in classifiers_dict.items():
    print(f"Start evaluating {name}...")
    estimator.fit(X_train, y_train)

    print(f"Accuracy: {estimator.score(X_test, y_test)}")

    # predict once and reuse the predictions for all three metrics
    test_predictions = estimator.predict(X_test)
    f1_val = f1_score(y_test, test_predictions)
    precision_val = precision_score(y_test, test_predictions)
    recall_val = recall_score(y_test, test_predictions)
    print(f"Precision: {precision_val}")
    print(f"Recall: {recall_val}")
    print(f"F1-Score: {f1_val}")
    print("\n\n")

In [None]:
# Confusion matrix of the fitted decision tree on the test split.
plot_confusion_matrix(classifiers_dict["decision tree"], X_test, y_test)
    

In [None]:
# Confusion matrix of the fitted random forest on the test split.
plot_confusion_matrix(classifiers_dict["random forest"], X_test, y_test)
    

In [None]:
# Confusion matrix of the fitted gradient boosting classifier on the test split.
plot_confusion_matrix(classifiers_dict["gradient boosting"], X_test, y_test)
    

In [None]:
# Confusion matrix of the fitted XGBoost classifier on the test split.
plot_confusion_matrix(classifiers_dict["xgboost"], X_test, y_test)
    

# Hyperparameter Optimization Gradient Boosting Classifier

Let's start with Bayesian hyperparameter optimization of the gradient boosting classifier, because it had the best recall while still having a feasible precision measured on the test set.

In [None]:
import re

import sklearn
from skopt import BayesSearchCV

# Several GradientBoostingClassifier option names were renamed or removed across
# scikit-learn releases; pick the spelling that the installed version understands:
#   loss "deviance"  -> "log_loss"       (renamed in 1.1, old name removed in 1.3)
#   criterion "mse"  -> "squared_error"  (renamed in 1.0, old name removed in 1.2)
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
_loss_options = ["log_loss" if _sklearn_version >= (1, 1) else "deviance", "exponential"]
_criterion_options = ["friedman_mse", "squared_error" if _sklearn_version >= (1, 0) else "mse"]

# NOTE(review): X_train has been randomly over-sampled before this search, so copies of the
# same minority sample can end up in both the training and the validation part of a CV
# fold; the cross-validated recall is therefore presumably optimistic.
opt = BayesSearchCV(
    GradientBoostingClassifier(n_iter_no_change=10, random_state=42),
    {
        "loss": _loss_options,
        "learning_rate": (1e-4, 1e-1, "log-uniform"),
        "n_estimators": (10, 100),
        "subsample": (0.5, 1.0),
        # "mae" is left out: deprecated since scikit-learn 0.24 and removed in 1.1
        "criterion": _criterion_options,
        "max_depth": (2, 5),
        # "auto" is left out: for this classifier it was an alias of "sqrt" (removed in 1.3)
        "max_features": ["sqrt", "log2", None],
    },
    n_iter=32,
    cv=3,
    verbose=1,
    scoring="recall",
    random_state=42,  # reproducible search trajectory
)

In [None]:
# Run the hyperparameter search on the training data.
opt.fit(X_train, y_train)

In [None]:
# Best cross-validation score, score of the search object on the test split, and the best parameters.
best_validation_score = opt.best_score_
test_score = opt.score(X_test, y_test)
print(f"val. score: {best_validation_score!s}")
print(f"test score: {test_score!s}")
print(f"best params: {opt.best_params_!s}")

In [None]:
# Predict once on the test split and reuse the predictions for all three metrics.
y_pred_tuned = opt.predict(X_test)
print(f"Recall: {recall_score(y_test, y_pred_tuned)}")
print(f"Precision: {precision_score(y_test, y_pred_tuned)}")
print(f"F1-Score: {f1_score(y_test, y_pred_tuned)}")