In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the company bankruptcy dataset and preview its first rows.
data_path = '/kaggle/input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the raw frame.
df.info()

In [None]:
# Summary statistics for every numeric column.
df.describe()

# EDA

In [None]:
# Seaborn style and palette only apply to axes created afterwards, so they
# must be configured before the bar chart is drawn.
sns.set_style('whitegrid')
sns.set_palette('bwr')

# Class balance of the target: number of surviving (0) vs bankrupt (1) companies.
ax = df['Bankrupt?'].value_counts().plot(kind='bar')
ax.set_xlabel('Bankrupt?')
ax.set_ylabel('Number of companies')
plt.title('Survive(0) vs Bankrupt(1)')
plt.show()

In [None]:
# Horizontal box plot of every feature (target excluded) to compare value ranges.
sns.set_style('whitegrid')
sns.set_palette('bwr')
feature_frame = df.drop(columns=['Bankrupt?'])
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(data=feature_frame, orient='h', color='blue', ax=ax)
ax.set_title('Data range')
plt.show()

In [None]:
# Separate the target column from the feature columns.
target_column = 'Bankrupt?'
X = df.drop(columns=[target_column])
y = df[target_column]

# 1.SVM

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [None]:
## Split into train and test sets (30% held out), stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=15,
)

In [None]:
# Compare candidate weights for the bankrupt class (1) by mean 5-fold F1 on the training set.
candidate_weights = [1, 3, 5, 6, 10, 50, 100]
for bankrupt_weight in candidate_weights:
    # Scaling lives inside the pipeline so it is refitted on each CV training fold.
    weighted_svc = SVC(class_weight={0: 1, 1: bankrupt_weight})
    pipe_svc = Pipeline(steps=[('scale', StandardScaler()), ('SVC', weighted_svc)])
    fold_scores = cross_val_score(pipe_svc, X_train, y_train, scoring='f1', cv=5, n_jobs=4)
    score = fold_scores.mean()
    print('Mean F1 cross-val-score for model with weight %i is %.2f' % (bankrupt_weight, score))

From the results above, we pick the class weights 0:1 - 1:6 for the SVM.

In [None]:
# Refit the chosen configuration (class weights 0:1 / 1:6) on the full training
# set and score it on the held-out test set.
final_svc = SVC(class_weight={0: 1, 1: 6}, random_state=10)
pipe_svc = Pipeline(steps=[('scale', StandardScaler()), ('SVC', final_svc)])
y_pred = pipe_svc.fit(X_train, y_train).predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print('Test f1_score is', test_f1)

In [None]:
# Recall and accuracy of the final SVC on the held-out test set.
test_recall = recall_score(y_true=y_test, y_pred=y_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Test recall score is ', test_recall)
print('Test accuarcy score is', test_accuracy)

In [None]:
# Show the test-set predictions as a single-column heatmap (one row per company).
plt.figure(figsize=(12,8))
# reshape(-1, 1) infers the row count, so the cell keeps working when the
# test-set size changes (previously hard-coded to 2046 rows).
sns.heatmap(y_pred.reshape(-1, 1), cmap='bwr')
plt.title('Visualize predicted bankrupt company')
plt.show()

In [None]:
## Keep the support vectors of the fitted SVC and their row positions.
## `support_` is used here as positional indices into the rows passed to fit (X_train).
svc_step = pipe_svc.named_steps['SVC']
s_vector = svc_step.support_vectors_
vector_index = svc_step.support_

## Flag each training row: 1 if it is a support vector, 0 otherwise.
## Built with array indexing instead of a Python loop doing `i in vector_index`.
is_support = np.zeros(len(X_train), dtype=int)
is_support[vector_index] = 1
S_vector = is_support.tolist()

## assign() returns a new frame, so the cell is safe to re-run and does not
## write into a frame derived from the train/test split.
## NOTE: 'Is Vector' is a label derived from the model, not an input feature -
## exclude it from anything that is fitted on X_train afterwards.
X_train = X_train.assign(**{'Is Vector': S_vector})

# 2. PCA

In [None]:
from sklearn.decomposition import PCA

**Scale X_train**

In [None]:
# Standardise the original features only. 'Is Vector' is a flag derived from the
# fitted SVM (added to X_train in the SVM section), not a company feature, so it
# must not enter the PCA input. errors='ignore' keeps the cell working when the
# column is absent.
Scaled_X_train = StandardScaler().fit_transform(
    X_train.drop(columns=['Is Vector'], errors='ignore')
)

**Choosing the number of components**

In [None]:
##Choosing the best number of components
## The variance explained by the first n components is the cumulative sum of the
## per-component ratios, so a single exact fit with the largest n yields the whole
## curve. This replaces one PCA refit per n (47 fits) and pins the solver to the
## exact, deterministic 'full' SVD instead of whatever the unseeded default picks.
n_components = np.arange(3,50)
pca = PCA(n_components = int(n_components.max()), svd_solver = 'full')
pca.fit(Scaled_X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Entry n-1 of the cumulative sum is the total for the first n components.
TEV = cumulative_variance[n_components - 1].tolist()

##Plotting total explained variance to the number of components
sns.set_style('whitegrid')
sns.set_palette('bwr')
plt.figure(figsize = (10,8))
plt.plot(n_components, TEV)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
plt.title('Total explained variance per components')
plt.show()

We can see from above that we need roughly 50 components to explain 95% of the data variance. However, the first 3 components already explain nearly 25% of the variances. 

Off the back of this notebook, I've tried visualising the dataset with 8 components and with 3 components. I noticed that with only 3 components I can already interpret the data quite well using my business knowledge, though the total explained variance is not as high as with 8. The 3 extracted components do not share any dominant features, while each of them also has strong internal consistency and is closely related to existing business/finance concepts.

I'll demonstrate how the data can be visualised with 3 components and write a brief business interpretation of each component below.

In [None]:
# Configure a 3-component PCA (seeded); it is fitted in the next cell.
pca = PCA(n_components = 3, random_state = 10)

In [None]:
# Fit the PCA on the scaled training data and project it onto the components.
t_X_train = pca.fit_transform(Scaled_X_train)

In [None]:
# Sanity check: (number of training rows, number of components).
t_X_train.shape

In [None]:
# Peek at the weights of the first 20 input columns in the first component.
pca.components_[0][:20]

In [None]:
# Component weights as a table: one row per input column, one column per component.
PCs = ['PC1', 'PC2', 'PC3']
components_df = pd.DataFrame(pca.components_.T, columns=PCs)
components_df.head(20)

In [None]:
# Top panel: heatmap of feature weights per component.
# Bottom panel: explained variance ratio of each component.
fig, ax = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(14, 8),
    gridspec_kw={'height_ratios': [4, 1]},
)
weights_ax, variance_ax = ax

sns.heatmap(components_df, cmap='bwr', ax=weights_ax)
weights_ax.set_title('Feature weights heatmap of PCA')
weights_ax.set_xlabel('Components')
weights_ax.set_ylabel('Feature weights')

variance_ax.bar(PCs, pca.explained_variance_ratio_)
variance_ax.set_xlabel('Components')
variance_ax.set_ylabel('Explained variance ratio of each components')
plt.show()

In [None]:
# Share of total variance captured by the fitted components.
total_val = np.sum(pca.explained_variance_ratio_)
print('Total variance explained by these components is %.2f' % total_val)

**Analysing the components' structure**

From the heat map, we can see that each component is constructed from 4 - 5 dominant features (those with the lightest or darkest color). It's reasonable to consider them the more meaningful metrics to look at when monitoring a company's likelihood of going bankrupt.

I'll map the features index with their name to have a closer look at what they are. 

In [None]:
##Get names and signed weights of the 5 most dominant features (largest absolute
##weight) of each component, i.e. 15 entries in total:
# Names are read from X_train.columns, the frame the PCA input was built from.
# The global `X` is reassigned further down the notebook and has one column
# fewer than X_train once 'Is Vector' has been added.
pca_input_columns = X_train.columns
PC_feature_names = []
PC_feature_weights = []
for PC in PCs:
    dominant_index = components_df[PC].abs().sort_values(ascending = False).index[:5]
    PC_feature_weights.extend(components_df.loc[dominant_index, PC])
    PC_feature_names.extend(pca_input_columns[dominant_index])

In [None]:
## Build one small frame per component holding its 5 dominant feature names and weights.
## The weight column of each frame is labelled 'PC1_df', 'PC2_df', 'PC3_df'.
weight_column_labels = ['PC1_df', 'PC2_df', 'PC3_df']
PC_df = []
for position, label in enumerate(weight_column_labels):
    start = position * 5
    stop = start + 5
    name_weight_pairs = list(zip(PC_feature_names[start:stop], PC_feature_weights[start:stop]))
    PC_df.append(pd.DataFrame(name_weight_pairs, columns=['Feature names', label]))

## Components may share a dominant feature, so outer-join the 3 frames on the
## feature name; a feature missing from a component gets weight 0.
PC_DF = PC_df[0]
for other_component in PC_df[1:]:
    PC_DF = PC_DF.merge(other_component, on='Feature names', how='outer')
PC_DF = PC_DF.fillna(0).set_index('Feature names')

In [None]:
## Heatmap of the dominant feature weights that make up each component
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(PC_DF, cmap='bwr', ax=ax)
ax.set_title('Visualize Components Structure')
plt.show()

I like how the components do not share any features. There is also some internal correlation between the features of each component. In detail:

PC1: Profitability: All features making up this component are related to return, earnings and profit. The main differences between the features are how tax, interest, depreciation, capital and seasonality are taken into account.

PC2: Liability: 4/5 features are related to how much debt (long/short term) the company bears.

PC3: Liquidity: 2/5 features are clear indicator of the company liquidity (Current Assets/ Total Assets, Quick Assets/ Total Assets), while the other 3 indicate the demand for liquidifiable assets to finance company operations (Total Asset Turnover) or debt repayment (Current Liability to Assets).

# **Visualise data using PCA components**

In [None]:
# Training rows projected on the 3 components, with the target as first column.
component_names = ['Profitability', 'Liability', 'Liquidity']
component_scores = pd.DataFrame(t_X_train, columns=component_names)
# Drop the original row labels so target and scores align by position.
target_by_position = y_train.reset_index(drop=True)
PC_train_df = pd.concat([target_by_position, component_scores], axis=1)
PC_train_df.head()

In [None]:
## 3D scatter of the training rows in component space, coloured by the target
fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111, projection='3d')
pc1_scores = PC_train_df['Profitability']
pc2_scores = PC_train_df['Liability']
pc3_scores = PC_train_df['Liquidity']
ax.scatter(pc1_scores, pc2_scores, pc3_scores, c=PC_train_df['Bankrupt?'], cmap='viridis')
ax.set_title('Bankruptcy visualisation with PCA')
ax.set_xlabel('PC1 - Profitability')
ax.set_ylabel('PC2 - Liability')
ax.set_zlabel('PC3 - Liquidity')
plt.show()

In [None]:
##Visualise on 2D relational plot:
# relplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(...) only produced an empty extra figure and its figsize was never
# applied. The size is set through height/aspect instead: two 8x6 facets,
# matching the 16x6 that was requested.
sns.relplot(
    data=PC_train_df, x='Liability', y="Profitability",
    col="Bankrupt?", hue="Liquidity",kind="scatter",
    height=6, aspect=8/6)
plt.show()

The visualising effort does not seem to be as rewarding as I thought. However, it's quite clear that PC1 - Profitability does have some predictive power as most bankrupt cases have PC1 value larger than 0.

# **3. Taking a step further: Using liability to predict profitability**

I noted there are opposite linear trends in the chart above. 
- For surviving companies, their profitability and liability components follow a negative linear relationship. 
- For bankrupt companies, their profitability and liability components follow a positive linear relationship. 

I'll take some statistic tests to see if this relationship is meaningful.

In [None]:
## Plot the regression line
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(...) only produced an empty extra figure and its figsize was never
# applied. The size is set through height/aspect instead (two 8x8 facets).
sns.lmplot(x="Liability", y="Profitability", col="Bankrupt?",
               data=PC_train_df, scatter=True, fit_reg=True,
               height=8, aspect=1)
plt.show()

In [None]:
## Residuals of Profitability ~ Liability: surviving (0) on the left, bankrupt (1) on the right
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
for panel, bankrupt_flag in zip(ax, (0, 1)):
    class_rows = PC_train_df[PC_train_df['Bankrupt?'] == bankrupt_flag]
    sns.residplot(data=class_rows, x='Liability', y="Profitability", ax=panel)
plt.show()

In [None]:
import statsmodels.api as sm

In [None]:
## Check the regression Profitability ~ Liability separately for surviving
## (Bankrupt? == 0) and bankrupt (Bankrupt? == 1) companies.
## - One loop replaces two copy-pasted cells (the second was mislabelled "surviving").
## - Local names are used instead of overwriting the notebook-level X and y,
##   which earlier cells (train/test split, feature-name lookup) depend on.
## - sm.OLS does not add an intercept on its own; without add_constant the line
##   is forced through the origin, unlike the regression lines drawn by lmplot.
##   The printed R-squared values therefore differ from the intercept-free fit.
for bankrupt_flag, group_label in [(0, 'surviving'), (1, 'bankrupt')]:
    class_rows = PC_train_df[PC_train_df['Bankrupt?'] == bankrupt_flag]
    liability_with_const = sm.add_constant(class_rows['Liability'])
    profitability = class_rows['Profitability']

    results = sm.OLS(profitability, liability_with_const).fit()
    print('OLS for %s companies (Bankrupt? == %i)' % (group_label, bankrupt_flag))
    print(results.summary())

We can see that the relationships in both models are statistically significant (Prob (F-statistic) < 0.005). It looks like, for bankrupt companies, liability predicts profitability better than it does for surviving companies (R-squared is 35.1% versus 6.9%).

This raises several interesting hypotheses regarding the nature of bankrupt companies. My first thought is how a company's interest-bearing debt may play a role in its survival (i.e. bankrupt companies may have liabilities with higher interest than surviving companies, which hurts their profitability). Another hypothesis concerns where companies spend their revenue (i.e. bankrupt companies may spend all their revenue paying back debt instead of investing to increase profitability). These hypotheses need to be validated using business knowledge, not just statistical models, so I'll end the notebook here and leave the question open for anyone interested.

# 4. Descriptive analysis of the support vectors

In [None]:
# Select the training rows at the positions listed in vector_index (the support vectors).
X_train_svector = X_train.iloc[vector_index]

In [None]:
# Shape of the training frame (includes the added 'Is Vector' column).
X_train.shape

In [None]:
# How many training rows are support vectors (1) vs not (0).
X_train['Is Vector'].value_counts()

In [None]:
# NOTE(review): repeats the shape check made two cells above.
X_train.shape

In [None]:
## Per-feature relative difference between the support vectors and the whole
## training set, for the mean and for the standard deviation.
# Features are selected by name (everything except the 'Is Vector' flag) instead
# of a hard-coded range(0, 95), and computed column-wise rather than in a loop.
feature_cols = X_train.columns.drop('Is Vector', errors='ignore')
train_mean = X_train[feature_cols].mean()
train_std = X_train[feature_cols].std()
sv_mean = X_train_svector[feature_cols].mean()
sv_std = X_train_svector[feature_cols].std()

# Lists keep the original feature order; a feature with zero mean or zero std
# yields inf/NaN here, exactly as the per-column division did.
Mean_diff = ((sv_mean - train_mean).abs() / train_mean).tolist()
Std_diff = ((sv_std - train_std).abs() / train_std).tolist()

In [None]:
# Shape of the support-vector subset: (number of support vectors, number of columns).
X_train_svector.shape

In [None]:
# Table of the differences: one row per feature, with the mean difference, the
# std difference and their sum.
# Column labels are the first len(Mean_diff) columns of the support-vector frame,
# so the count follows the lists instead of a hard-coded 95.
compared_features = X_train_svector.columns[:len(Mean_diff)]
Diff_df = pd.DataFrame([Mean_diff, Std_diff], columns = compared_features)
Diff_df_t = Diff_df.T
Diff_df_t.columns = ['Mean_diff', 'Std_diff']
Diff_df_t['Total_diff'] = Diff_df_t['Mean_diff'] + Diff_df_t['Std_diff']

In [None]:
## Heatmap of the total (mean + std) relative difference per feature
fig, ax = plt.subplots(figsize=(12, 12))
total_diff_column = Diff_df_t[['Total_diff']]
sns.heatmap(total_diff_column, cmap='bwr', ax=ax)
ax.set_title('Visualize Difference between Support Vectors and Training data')
plt.show()

In [None]:
# The 15 features with the largest total difference.
Diff_df_t['Total_diff'].sort_values(ascending=False).head(15)

In [None]:
# Names of the 15 features with the largest total difference.
SVC_features = list(Diff_df_t['Total_diff'].sort_values(ascending=False).head(15).index)

In [None]:
# Names of the dominant features of the PCA components (index of PC_DF).
PCA_features = list(PC_DF.index) 

In [None]:
# Put the two feature lists side by side. zip pairs them by position and stops at
# the shorter list, so rows are not matched by feature name.
Features_df = pd.DataFrame(zip(PCA_features, SVC_features), columns=['PCA_features', 'SVC_features'])

In [None]:
# Display the side-by-side feature lists.
Features_df

In [None]:
# Write the table to Features_df.csv (relative path, so the current working directory).
Features_df.to_csv('Features_df.csv')