In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import GridSearchCV

## Load Data

****

In [None]:
# Load the company-bankruptcy CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv')
# Preview the first rows (rendered as the cell's output).
df.head()

## Basic analysis of the dataset: number of rows, columns, data types, and basic statistics

In [None]:
# Number of non-null values in each column (a quick check for missing data).
df.count()

In [None]:
# Show the column labels of the dataset.
df.columns

In [None]:
# Print a concise summary of the DataFrame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the columns.
df.describe()

## Check the distribution of the Classes in the dataset

In [None]:
# Configure plotting/display defaults BEFORE drawing, so that the figure in this
# cell is styled on a fresh "Restart & Run All" as well (previously the theme and
# palette were set after the plot and only affected later cells).
colours = {
    'Cyan':'#00AEEF'
    ,'Burgundy':'#9D063B'
}
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# fall back to the legacy name on older installations.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
sns.set(font_scale=1.5)
sns.set_palette(list(colours.values()))
pd.set_option('display.max_columns', None)

# Bar chart of how many companies fall into each class of the target.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Bankrupt?', data=df)
plt.title('Count of Companies Bankrupt', fontsize=16, x=0.50);

### We can clearly see that the label is highly skewed towards class 0: the number of bankrupt companies is much lower than the number of companies that are not bankrupt.

## Split the features into categorical and numerical, based on their values

In [None]:
# The two binary flag columns are treated as categorical features.
# (The leading space is part of the column names as they are used in this notebook.)
categorical = [' Liability-Assets Flag',' Net Income Flag']

# Every remaining column is treated as a numerical feature. The target column
# 'Bankrupt?' is excluded explicitly: it is the label, not a feature, and would
# otherwise be counted and plotted against itself in the cells below.
non_numerical = set(categorical) | {'Bankrupt?'}
numerical = [column for column in df.columns if column not in non_numerical]

In [None]:
# Number of columns classified as numerical.
n = len(numerical)
print(n)

## Plot the numerical features against the label

### Since the features are numerical, a categorical plot of each feature against the label is the best option to clearly understand how the feature values are distributed for each class of the label

In [None]:
# One categorical (strip) plot per numerical column, split by the target class.
# `sns.catplot` is a figure-level function that creates its own figure on every
# call, so no figure/axes grid is pre-allocated here (the former 2x4 grid was
# never drawn on and only produced an empty figure).
for column in numerical:
    if column == 'Bankrupt?':
        # Plotting the target against itself carries no information.
        continue
    sns.catplot(x='Bankrupt?', y=column, data=df)
    # Render and release each figure immediately; otherwise dozens of figures
    # stay open at once and Matplotlib warns about memory consumption.
    plt.show()
    plt.close()
    

## Plot of Categorical Variables. 

### A count plot is the best option for categorical features: it shows the count of each value for each class of the label

In [None]:
# Horizontal count plot of the ' Liability-Assets Flag' values, with bars split by the 'Bankrupt?' class.
sns.countplot( y = ' Liability-Assets Flag', data = df, hue = 'Bankrupt?')

In [None]:
# Horizontal count plot of the ' Net Income Flag' values, with bars split by the 'Bankrupt?' class.
sns.countplot( y = ' Net Income Flag', data = df, hue = 'Bankrupt?')

## Plot the correlation matrix to understand how the features are correlated to each other

In [None]:
# Pairwise correlation of all columns; computed once and reused for both the
# mask and the heatmap.
c = df.corr()

# Boolean mask that hides the upper triangle (including the diagonal), since the
# matrix is symmetric. The builtin `bool` is used as dtype because the `np.bool`
# alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(c, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set the style before the axes are created so that it applies to this figure.
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Pearson Correlation Matrix', fontsize=30)

sns.heatmap(c, linewidths=0.3, square=True, annot=False, annot_kws={"size":10}, mask=mask, ax=ax);

In [None]:
# Name of the target column; every other column is used as a model feature.
label = 'Bankrupt?'
# Compare with `!=` rather than `not in`: `label` is a string, so `col not in label`
# is a substring test and would also drop any column whose name happens to be a
# substring of 'Bankrupt?'.
features = [column for column in df.columns if column != label]
print(f'features are: {features}')

### Split data and train models

In [None]:
# Step 1: keep 67% of the rows for training and set 33% aside as a hold-out pool.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df[features], df[label], test_size=0.33, random_state=15
)
# Step 2: cut the hold-out pool in half to obtain the validation and test sets.
# NOTE(review): neither split is stratified although the target is described as
# highly imbalanced -- consider `stratify=` if class ratios should be preserved.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=49
)

In [None]:
# Sanity check: print the shape of every split (features first, then labels).
for split_part in (X_train, X_test, X_valid, y_train, y_test, y_valid):
    print(split_part.shape)

In [None]:
# Train a logistic-regression baseline and evaluate it on the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')


In [None]:
# Train a random forest and evaluate it on the test split.
# A fixed random_state makes the result reproducible across "Run All" executions.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')


In [None]:
# Train a support-vector classifier and evaluate it on the test split.
model = SVC()
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')

    

In [None]:
# Train a gradient-boosting classifier and evaluate it on the test split.
# A fixed random_state makes the result reproducible across "Run All" executions.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')



In [None]:
# Train a k-nearest-neighbours classifier and evaluate it on the test split.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')



In [None]:
# Train an XGBoost classifier and evaluate it on the test split.
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
print('Confussion Matrix: ')

predictions = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed the matrix is transposed, which silently swaps false positives with
# false negatives (and therefore precision with recall).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

# For labels [0, 1] the matrix layout is [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
# Library scorers instead of hand-rolled ratios; zero_division=0 yields 0 rather
# than nan (plus a warning) when the model predicts no positive at all.
accuracy  = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall    = recall_score(y_test, predictions, zero_division=0)
f1score   = f1_score(y_test, predictions, zero_division=0)
print(f'Accuracy : {accuracy}')
print(f'Precision: {precision}')
print(f'Recall   : {recall}')
print(f'F1 score : {f1score}')
print(f'Classification Report: {classification_report(y_test, predictions, zero_division=0)}')



## Conclusion

#### Though the accuracy and other metrics differ for every model, the KNN classifier is the one with the fewest false negatives. In cases such as bankruptcy prediction, we can't afford a high number of false negatives, as that puts our model at risk of missing potential bankruptcies. Hence, based on the analysis so far, the model we can go for is the KNN classifier.