In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Table Description**

In [None]:
# Location of the bankruptcy dataset inside the Kaggle input directory.
DATA_CSV = "/kaggle/input/company-bankruptcy-prediction/data.csv"
df = pd.read_csv(DATA_CSV)

# Last expression of the cell: summary statistics rendered as the cell output.
df.describe()

In [None]:
# df.shape is (rows, columns); one column is left out of the feature count.
n_records, n_columns = df.shape
print(f'Total number of records is {n_records}')
print(f'Total number of features is {n_columns - 1}')

# Convert the pandas DataFrame to NumPy arrays

In [None]:
# Convert the frame to a NumPy array once and slice it: column 0 becomes the
# target vector, every remaining column goes into the feature matrix.
values = df.to_numpy()
target = values[:, 0]
data = values[:, 1:]

print('Shape of target {}'.format(target.shape))
print('Shape of data {}'.format(data.shape))

# Report the feature count from the array itself rather than a hard-coded
# number, so the message stays correct if the input file changes.
print('We have data spanning {} features'.format(data.shape[1]))
print('We have target response as Yes or No, 0 or 1')

# Linear Support Vector Classifier

## Standardisation

SVC requires data to be standardized for better predictions.

In [None]:
from sklearn import preprocessing

# Fit the scaler on the feature matrix, then transform that same matrix.
# NOTE(review): the scaler is fitted on every row of `data`, so rows that are
# later held out for testing also contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(data)
X = scaler.transform(data)

## Principal Component Analysis

First we need to check whether there is any visible distinction between bankrupt and financially stable companies. We can do this by reducing the data to two dimensions and drawing a scatter plot.

In [None]:
from sklearn.decomposition import PCA

# Two-component PCA fitted on the standardised features; the same matrix is
# then projected onto those components.
pca = PCA(n_components=2)
X_pca = pca.fit(X).transform(X)

# Last expression of the cell: shape of the projected data.
X_pca.shape

In [None]:
import matplotlib.pyplot as plt
import matplotlib

# 'normal' is not a font family name matplotlib can resolve (it only produces
# "font family not found" warnings), so use the generic 'sans-serif' family.
font = {'family' : 'sans-serif',
        'size'   : 22}

matplotlib.rc('font', **font)

# Caption drawn at the bottom of the figure. It names the minority class, which
# has to agree with the corrected legend below.
# NOTE(review): "bankrupt" as the smaller class is an assumption about the
# dataset -- confirm against the class counts.
txt = "We can clearly see distinction in two classes, but obviously data for bankrupt companies are relatively small"

plt.figure(figsize=(15,15)).text(.5, .05, txt, ha='center')
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=target)
# legend_elements() returns one handle per distinct colour value in ascending
# order, i.e. class 0 first and class 1 second. The target is the 'Bankrupt?'
# column, so 0 is a stable company and 1 is a bankrupt one.
plt.legend(handles=scatter.legend_elements()[0], labels=['Stable', 'Bankrupt'])
plt.xlabel("First principal component")
plt.ylabel("Second principal component")


## Splitting data for training and testing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) features first. Stratifying on the target keeps the
# proportion of each class the same in the training and test sets.
raw_train, raw_test, y_train, y_test = train_test_split(
    data, target, stratify=target, random_state=42
)

# Fit the scaler on the training rows only and apply it to both splits, so no
# statistics of the held-out test rows leak into the training features.
train_scaler = StandardScaler().fit(raw_train)
X_train = train_scaler.transform(raw_train)
X_test = train_scaler.transform(raw_test)

print('Number of Training data {}'.format(y_train.shape[0]))
print('Number of Testing data {}'.format(y_test.shape[0]))

## Training Linear SVC

I have increased regularization (in `LinearSVC`, a smaller `C` means stronger regularization) so that the model does not overfit. I do not want the model to reach 100% accuracy on the training data.

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


# Linear SVC configured with dual=False and C=0.02, fitted on the training split.
lsvc_c_002 = LinearSVC(dual=False, C=0.02)
lsvc_c_002.fit(X_train, y_train)

# Predictions for the held-out split and for the training split itself.
y_pred_c_002 = lsvc_c_002.predict(X_test)
y_pred_train_c_002 = lsvc_c_002.predict(X_train)

# Accuracy on each split.
acc_for_test = accuracy_score(y_test, y_pred_c_002)
acc_for_train = accuracy_score(y_train, y_pred_train_c_002)

print(f'Accuracy for test data is {acc_for_test}')
print(f'Accuracy for training data is {acc_for_train}')

## Training using Gradient boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Gradient boosting classifier with a fixed seed and learning rate 0.06,
# fitted on the training split.
gbrt = GradientBoostingClassifier(
    learning_rate=0.06,
    random_state=0,
).fit(X_train, y_train)


In [None]:
# Predictions of the fitted gradient boosting model on both splits.
y_pred_test_gbrt = gbrt.predict(X_test)
y_pred_train_gbrt = gbrt.predict(X_train)

# Accuracy on the held-out split and on the training split.
acc_for_test_gbrt = accuracy_score(y_test, y_pred_test_gbrt)
acc_for_train_gbrt = accuracy_score(y_train, y_pred_train_gbrt)

print(f'Accuracy for test data is {acc_for_test_gbrt}')
print(f'Accuracy for training data is {acc_for_train_gbrt}')

print('I have included regularization by decreasing learning rate to 0.06 for optimal accuracy in the test data and reduction in training data.')

### Feature importance for GB Classifier

In [None]:


# Names of the predictor columns: every column except the 'Bankrupt?' target.
# NOTE: `df_copy` is an alias of `df`, not an independent copy; drop() returns
# a new frame, so `df` itself is left untouched.
df_copy = df
feature_cols = df_copy.drop(columns=['Bankrupt?']).columns

# Importances of the fitted gradient boosting model, ordered from least to
# most important so the largest bar ends up at the top of the chart.
feature_importance = gbrt.feature_importances_
sorted_idx = np.argsort(feature_importance)
# Bar centres: 0.5, 1.5, 2.5, ... (one per feature).
pos = np.arange(sorted_idx.shape[0]) + .5

# Caption drawn at the bottom of the figure (spelling and grammar corrected).
txt = "Note that some of the attributes don't play any part in a company's financial stability, and very few have a major role, like Total Assets and Net Income."
plt.figure(figsize=(30,50)).text(.5, .05, txt, ha='center')
plt.barh(pos, feature_importance[sorted_idx], 0.8, align='center')
plt.yticks(pos, np.array(feature_cols)[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature Importance')