In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import norm
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Global seaborn theme for all figures in this notebook (could also use 'whitegrid')
sns.set_theme(style='darkgrid', context='talk')

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds the input files (a relative local path, despite the variable name).
url='../input/company-bankruptcy-prediction/'

# Read the raw table and preview its first rows.
df = pd.read_csv(f'{url}data.csv')
df.head()




In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column labels as read from the CSV. The feature names referenced later in this
# notebook start with a leading space (e.g. ' Debt ratio %').
df.columns

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Class balance of the target: absolute counts first, then each class as a
# percentage of all rows.
class_counts = df['Bankrupt?'].value_counts()
n_rows = len(df)

print(class_counts)
print('\n')
print('Financially Stable: ', round(class_counts[0]/n_rows*100, 2), '% of the dataset')
print('Financially Unstable: ', round(class_counts[1]/n_rows*100, 2), '% of the dataset')

In [None]:
# Per-column dtype and non-null count for the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

# Exploratory Data Analysis

In [None]:
# Bar chart of how many rows fall into each target class (0 = financially stable,
# 1 = financially unstable, matching the labels printed earlier in the notebook).
# The column is passed by keyword: newer seaborn releases treat the first positional
# argument as `data`, so a bare Series no longer yields the per-class counts.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='Bankrupt?', data=df, ax=ax)
plt.show()

The two classes are heavily imbalanced: bankrupt companies make up only a small fraction of the rows. This imbalance needs to be addressed.

In [None]:
# Pairwise correlation matrix of all columns; `corr` is reused by the heatmap cell.
corr=df.corr()
# Display the matrix as a table whose cell backgrounds are shaded by value.
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Heatmap of the correlation matrix `corr`, drawn on a single large square figure.
fig, ax=plt.subplots(figsize=(15, 15))
sns.heatmap(corr, ax=ax, cmap='viridis', linewidth=0.1)

In [None]:
# One histogram per column (50 bins each), laid out on a single very large figure.
df.hist(figsize=(60, 50), bins=50)
plt.show()

In [None]:
# Box plots of four selected ratios, split by bankruptcy status.
# Each entry is (column to plot on the y axis, panel title).
boxplot_specs = [
    (' Net Income to Total Assets', 'Bankrupt Vs Net Income to Total Assets'),
    (' Total debt/Total net worth', 'Bankrupt Vs Total debt/Total net worth Correlation'),
    (' Debt ratio %', 'Bankrupt Vs Debt ratio % Correlation'),
    (' Net worth/Assets', 'Bankrupt Vs Net worth/Assets Correlation'),
]

f, axes = plt.subplots(ncols=len(boxplot_specs), figsize=(24, 6))

for panel, (column, panel_title) in zip(axes, boxplot_specs):
    sns.boxplot(x='Bankrupt?', y=column, data=df, ax=panel)
    panel.set_title(panel_title)

plt.show()
        

In [None]:
# Distribution of the same four ratios restricted to the rows with Bankrupt? == 1,
# each drawn with a fitted normal curve (fit=norm).
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(24, 6))

is_bankrupt = df['Bankrupt?'] == 1

# Each entry is (axes, source column, panel title, colour).
dist_specs = [
    (ax1, ' Net Income to Total Assets', ' Net Income to Total Assets \n (Unstable companies)', '#FB8861'),
    (ax2, ' Total debt/Total net worth', 'total debt/tot net worth \n (Unstable companies)', '#56F9BB'),
    (ax3, ' Debt ratio %', 'debt_ratio \n (Unstable companies)', '#C5B3F9'),
    (ax4, ' Net worth/Assets', 'net worth/assets \n (Unstable companies)', '#C5B3F9'),
]

for panel, column, panel_title, colour in dist_specs:
    values = df[column].loc[is_bankrupt].values
    sns.distplot(values, ax=panel, fit=norm, color=colour)
    panel.set_title(panel_title, fontsize=14)

plt.show()

# Removing the outliers

In [None]:
def remove_outliers(feature, feature_name, df):
    """Return ``df`` without the rows whose ``feature_name`` value is an IQR outlier.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where the quartiles are
    computed from ``feature``. A short report (quartiles, IQR, cut-off, bounds and
    outlier count) is printed as a side effect.

    Parameters
    ----------
    feature : array-like
        Values used to compute the quartiles and to count the outliers.
    feature_name : str
        Column of ``df`` that is compared against the bounds.
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` that lie within the bounds.
    """
    q25, q75 = np.percentile(feature, [25, 75])  # 25th and 75th percentiles
    print('Quartle 25: {} | Quartile 75: {}'.format(q25, q75))
    feat_iqr = q75 - q25
    print('iqr: {}'.format(feat_iqr))
    feat_cut_off = feat_iqr * 1.5
    feat_lower, feat_upper = q25 - feat_cut_off, q75 + feat_cut_off
    print('Cut Off: {}'.format(feat_cut_off))
    print(feature_name + ' Lower: {}'.format(feat_lower))
    print(feature_name + ' Upper: {}'.format(feat_upper))

    # Count the outliers in the values that the bounds were derived from.
    values = np.asarray(feature)
    n_outliers = int(np.count_nonzero((values < feat_lower) | (values > feat_upper)))
    print(feature_name + ' outliers for close to bankruptcy cases: {}'.format(n_outliers))

    # Drop by index label, as before, so the surviving rows keep their original index.
    outside = (df[feature_name] > feat_upper) | (df[feature_name] < feat_lower)
    dataset = df.drop(df.index[outside])
    print('-' * 65)

    # Return the filtered frame; returning `df` here would undo the filtering above.
    return dataset

# Apply the IQR filter column by column, carrying the filtered frame forward so the
# removals accumulate. Before, every iteration started again from the full `df`, so
# only the call for the last column determined `new_df`.
# The bounds are still computed from the original df[col], which keeps the result
# independent of column order.
# The target 'Bankrupt?' is skipped because it is the label rather than a feature.
# NOTE(review): if the positive class is rare enough, IQR bounds on the target
# would flag every bankrupt row as an outlier — confirm skipping it is intended.
new_df = df
for col in df.columns.drop('Bankrupt?'):
    new_df = remove_outliers(df[col], str(col), new_df)


In [None]:
# Preview the first rows of the frame returned by the outlier-removal loop.
new_df.head()

In [None]:
# Compare the (rows, columns) of the outlier-filtered frame with the original frame.
print('After remover outliers: ', new_df.shape)
print('Original dataset: ', df.shape)

In [None]:
# Frequency of each value in ' Net Income Flag'; this column is dropped from the
# feature matrix when X is built below.
df[' Net Income Flag'].value_counts()

In [None]:
# The data seems clean for now. So let's split it into train and test datasets

from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier


# Target is 'Bankrupt?'; features are every other column except ' Net Income Flag'.
y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?', ' Net Income Flag'])

# Shuffled 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# Report the shape of each partition produced by the train/test split.
split_parts = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print(part_name + ': ', part.shape)

In [None]:
# Let's train the model with SGDClassifier, first.
# NOTE(review): the features are passed in unscaled; SGD-based estimators are usually
# sensitive to feature scale, so standardising first may help — TODO confirm here.
sgd_clf=SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
# Predictions for the held-out test set; the evaluation cells below read `y_pred`.
y_pred=sgd_clf.predict(X_test)

# Performance Evaluation

## Cross Validation

One way to evaluate the model's performance is k-fold cross-validation. The following code splits the training data into 
10 distinct subsets (folds), then trains and evaluates the model 10 times, holding out a different fold for 
evaluation each time and training on the other 9 folds. 

In [None]:

from sklearn.model_selection import cross_val_score
# Accuracy of sgd_clf on each of 10 cross-validation folds of the training set
# (the cell displays one score per fold).
cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring='accuracy')

The model's accuracy ranges from 77% to 97% across the 10 folds. 

## Confusion Matrix

Another way to evaluate a model's performance is to look at the confusion matrix, because it counts the number of times instances of class A are classified as class B. 


In [None]:
from sklearn.metrics import confusion_matrix
print('Confusion Matrix: ')
# Left as the last expression of the cell so the notebook displays the matrix.
confusion_matrix(y_test, y_pred)

Each row in this confusion matrix represents an actual class, and each column represents a predicted class. In the first row (financially stable companies), 1966 are correctly classified (true negatives), while the remaining 2 are wrongly classified as bankrupt (false positives). In the second row (bankrupt companies), 78 are wrongly classified as stable (false negatives), and there are 0 true positives.

In [None]:
# To be more accurate, let's look at the quality of the positive predictions by
# computing precision and recall alongside accuracy.
from sklearn.metrics import precision_score, recall_score, f1_score
print('Accuracy score: ', metrics.accuracy_score(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1_score: ', f1_score(y_test, y_pred))
print('Classification Report: ')
print(metrics.classification_report(y_test, y_pred))
# Number of test rows predicted per class. `y_pred` is deliberately not rebound to a
# DataFrame, so the name keeps a single type and the cell can be re-run safely.
print(pd.Series(y_pred).value_counts())

# Logistic Regression

In [None]:

# Logistic regression with default settings, fitted on the training split and
# evaluated on the held-out test split.
model=LogisticRegression()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print('Accuracy score: ', metrics.accuracy_score(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1_score: ', f1_score(y_test, y_pred))
print('Confusion Matrix: ')
# Printed explicitly: a bare expression is only displayed when it is the last line
# of a cell, so the matrix was previously never shown under its heading.
print(confusion_matrix(y_test, y_pred))
print('Classification Report: ')
print(metrics.classification_report(y_test, y_pred))
# Number of test rows predicted per class; `y_pred` itself is left as returned by predict().
print(pd.Series(y_pred).value_counts())

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Random forest of 1000 trees (fixed seed, n_jobs=-1), fitted on the training split
# and evaluated on the held-out test split.
forest_clf=RandomForestClassifier(random_state=42, n_estimators=1000, n_jobs=-1)
forest_clf.fit(X_train, y_train)
y_pred=forest_clf.predict(X_test)

print('Accuracy score: ', metrics.accuracy_score(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1_score: ', f1_score(y_test, y_pred))
print('Confusion Matrix: ')
# Printed explicitly: a bare expression is only displayed when it is the last line
# of a cell, so the matrix was previously never shown under its heading.
print(confusion_matrix(y_test, y_pred))
print('Classification Report: ')
print(metrics.classification_report(y_test, y_pred))
# Number of test rows predicted per class; `y_pred` itself is left as returned by predict().
print(pd.Series(y_pred).value_counts())


# Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings, fitted on the training split and
# evaluated on the held-out test split.
svm_clf=SVC()
svm_clf.fit(X_train, y_train)
y_pred=svm_clf.predict(X_test)

print('Accuracy score: ', metrics.accuracy_score(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1_score: ', f1_score(y_test, y_pred))
print('Confusion Matrix: ')
# Printed explicitly: a bare expression is only displayed when it is the last line
# of a cell, so the matrix was previously never shown under its heading.
print(confusion_matrix(y_test, y_pred))
print('Classification Report: ')
print(metrics.classification_report(y_test, y_pred))

# Number of test rows predicted per class; `y_pred` itself is left as returned by predict().
print(pd.Series(y_pred).value_counts())