In [None]:
import statistics

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, f1_score
from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold, cross_val_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the bankruptcy dataset into `df` and preview the first rows.
data_csv = '../input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(data_csv)
df.head()

In [None]:
# Make column names easier to reference: ' ' -> '_', '%' -> 'percent', '?' -> 'ed'.
# Presumably the last rule turns the target 'Bankrupt?' into 'Bankrupted',
# the name the cells below rely on -- TODO confirm against the raw CSV header.
new_name_map = {
    old_name: old_name.replace(' ','_').replace('%','percent').replace('?','ed')
    for old_name in df.columns
}

# The frame is renamed in place; every later cell uses the cleaned names.
df.rename(columns=new_name_map, inplace=True)

# quick dataset analysis on target

In [None]:
# The dataset is highly unbalanced: plot the number of samples per target class.
# Target encoding used throughout this notebook: 0 = Solid, 1 = Bankrupted.
# Count the target column directly instead of going through an arbitrary
# feature column, and look the counts up by class label so that each bar
# gets the count that matches its legend entry.
class_counts = df['Bankrupted'].value_counts()
num_solid = class_counts.get(0, 0)        # class 0 -> solid companies
num_bankrupted = class_counts.get(1, 0)   # class 1 -> bankrupted companies
x = np.arange(1)
width = 0.5

fig, ax = plt.subplots()
# One bar on each side of the single x position.
rects1 = ax.bar(x - width, num_bankrupted, width, label='Bankrupted')
rects2 = ax.bar(x + width, num_solid, width, label='Solid')

# Axis label, title, tick position and legend.
ax.set_ylabel('Number')
ax.set_title('Samples by target')
ax.set_xticks(x)
ax.legend()

# Classifying with the original dataset

In [None]:
# Baseline: train a random forest on the data as-is, ignoring the class imbalance.
y = df['Bankrupted']
X = df.drop(columns='Bankrupted')

# Stratified 70/30 hold-out split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=0
)

classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Mean accuracy on the hold-out set (displayed as the cell output).
classifier.score(X_test, y_test)

In [None]:
# The accuracy score is high (around 0.96) but the classifier performs poorly:
# it predicts "Solid" almost all the time, as the confusion matrix shows.

# Display name -> encoded target value.
class_names = { 'Solid' : 0, 'Bankrupted' : 1}

# display_labels expects a sequence of names ordered like the encoded classes,
# so derive that list from the mapping instead of passing the dict itself.
# NOTE(review): plot_confusion_matrix is gone from recent scikit-learn releases
# (ConfusionMatrixDisplay.from_estimator replaces it) -- confirm the pinned version.
plot_confusion_matrix(classifier, X_test, y_test,
                      display_labels=sorted(class_names, key=class_names.get),
                      cmap=plt.cm.Blues)

In [None]:
# F1 score of the baseline classifier on the hold-out set.
y_pred = classifier.predict(X_test)
f1_score(y_test, y_pred)

# Try to balance the dataset

In [None]:
# Combine oversampling of the minority class with undersampling of the
# majority class to obtain a more balanced dataset.

# Always start from the original frame: resampling X, y in place would make
# this cell non-idempotent (re-running it would resample resampled data).
features = df.drop('Bankrupted', axis=1)
target = df['Bankrupted']

# Oversampling: a float sampling_strategy is the desired minority/majority
# ratio after resampling (here 0.1).
over = RandomOverSampler(sampling_strategy=0.1, random_state=0)
X, y = over.fit_resample(features, target)

# Undersampling: reduce the majority class until the ratio reaches 0.8.
under = RandomUnderSampler(sampling_strategy=0.8, random_state=0)
X, y = under.fit_resample(X, y)

# Show the new target counts: the dataset is now more balanced.
y.value_counts()

# Classifying with the re-balanced dataset

In [None]:
# Run the classifier with the new dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=0)
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

In [None]:
# the classifier is working much better now
# also, only 7 bankrupted companies were classified as solid, against 31 solid classified as bankrupted
# making less mistakes in the first case (bankrupted companies classified as solid) could be much more acceptable
# than the opposite

class_names = { 'Solid' : 0, 'Bankrupted' : 1}

plot_confusion_matrix(classifier, X_test, y_test,  display_labels=class_names, cmap=plt.cm.Blues)

In [None]:
# F1 score of the current classifier on the current test split.
y_pred = classifier.predict(X_test)
f1_score(y_test, y_pred)

In [None]:
# Compute the mean cross-validated accuracy.
#
# The samplers and the classifier are wrapped in an imbalanced-learn Pipeline
# and cross-validated on the ORIGINAL data: the pipeline resamples only the
# training part of each fold, so no resampled copy of a row can end up in the
# fold it is scored on. cross_val_score clones the estimator, so the objects
# defined in the earlier cells are left untouched.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
balanced_model = Pipeline(steps=[
    ('over', over),
    ('under', under),
    ('model', classifier),
])
scores = cross_val_score(
    balanced_model,
    df.drop('Bankrupted', axis=1),
    df['Bankrupted'],
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
# NOTE(review): accuracy is dominated by the majority class on this data;
# consider reporting F1 here as well.
print('Mean Accuracy: %.3f' % scores.mean())

# Possible improvements

1) play with the undersampler/oversampler in order to get better classification results

2) a deeper data analysis and some feature engineering could help to remove outliers and unneeded features

3) tune the classifier parameters