In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics



In [None]:
# Read the bankruptcy dataset from the Kaggle input mount and preview its first rows.
DATA_PATH = '/kaggle/input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

In [None]:
# Trim surrounding whitespace from every column header, then give the target
# column a name without the trailing question mark.
target_alias = {'Bankrupt?': 'Bankrupt'}
df.columns = [target_alias.get(col, col) for col in df.columns.str.strip()]

In [None]:
# Bar chart of class frequencies; each bar is annotated with the class's
# share of all rows, in percent.
bankrupt_counts = df['Bankrupt'].value_counts()
bankrupt_share = bankrupt_counts / len(df['Bankrupt']) * 100

fig = px.bar(
    x=bankrupt_counts.index,
    y=bankrupt_counts,
    text=bankrupt_share,
    height=500,
    width=600,
    title='Bankrupcy',
)
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.4s}%',
    marker=dict(color='snow', line=dict(color='black', width=3)),
)
fig.show()

In [None]:
# Histogram of 'Total Asset Growth Rate', coloured by bankruptcy class.
# The y axis is log-scaled so that small bin counts stay visible next to large ones.
# The title names the column that is actually plotted (it previously said "Income").
fig = px.histogram(
    x=df['Total Asset Growth Rate'],
    color=df['Bankrupt'],
    log_y=True,
    template='ggplot2',
    title='Total Asset Growth Rate VS Bankruptcy',
    width=700,
)
fig.show()

In [None]:
# Histogram of the 'Cash/Total Assets' ratio, coloured by bankruptcy class,
# with log-scaled bin counts.
fig = px.histogram(
    x=df['Cash/Total Assets'],
    color=df['Bankrupt'],
    color_discrete_sequence=['lightcyan', 'teal'],
    log_y=True,
    title='Total Cash VS Bankrupcy',
    width=700,
)
fig.show()

In [None]:
# Separate the feature matrix from the target column.
# `columns=` is used instead of a positional axis argument: pandas 2.0 made
# every argument of DataFrame.drop except `labels` keyword-only, so
# `df.drop('Bankrupt', 1)` raises a TypeError there.
x = df.drop(columns='Bankrupt')
y = df['Bankrupt']

print(x.shape)
print(y.shape)

In [None]:
# Standardise every feature with StandardScaler.
# NOTE(review): the scaler is fitted on the full data set, before any
# train/test split, so test-set statistics influence the scaling. Consider
# fitting on the training split only.
sc = StandardScaler()
x = sc.fit_transform(x)

# Keep the target's own 0/1 coding instead of re-encoding it with pd.factorize.
# factorize assigns codes in order of first appearance, so if the first row
# belongs to class 1 the labels are silently swapped (1 -> 0, 0 -> 1) and
# recall_score, whose positive label defaults to 1, would then measure the
# other class. Assumes the column is already integer-coded 0/1 - TODO confirm.
# np.asarray also keeps this cell re-runnable once `y` is already an array.
y = np.asarray(y, dtype=int)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [None]:
# Baseline: logistic regression trained on the original (un-resampled) training split.
lgr = LogisticRegression(max_iter=100000).fit(x_train, y_train)
y_pred = lgr.predict(x_test)

print('Accuracy of Logistic Regression is: ', metrics.accuracy_score(y_test, y_pred))
print('Recall Score of Logistic Regression is: ', metrics.recall_score(y_test, y_pred))

In [None]:
# Baseline: LightGBM with default hyper-parameters on the original training split.
lgbm = LGBMClassifier().fit(x_train, y_train)
y_pred = lgbm.predict(x_test)

print('Accuracy of LGBM Classifier is: ', metrics.accuracy_score(y_test, y_pred))
print('Recall Score of LGBM Classifier is: ', metrics.recall_score(y_test, y_pred))

In [None]:
# Baseline: XGBoost with default hyper-parameters on the original training split.
xgb = XGBClassifier().fit(x_train, y_train)
y_pred = xgb.predict(x_test)

print('Accuracy of XGB Classifier is: ', metrics.accuracy_score(y_test, y_pred))
print('Recall Score of XGB Classifier is: ', metrics.recall_score(y_test, y_pred))


# Handling Imbalanced Data
# SMOTE (Oversampling)

In [None]:
# Same 80/20 split and seed as the baseline, kept under separate names so the
# training part can be resampled with SMOTE.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [None]:
# Class balance of the training split BEFORE applying SMOTE.
# The printed messages now say "before"; they previously claimed "after",
# which made this output indistinguishable from the post-SMOTE cell.
# Anything that is not label 1 is counted as zero, as in the original loop.
one_count = int(np.count_nonzero(np.asarray(y_train1) == 1))
zero_count = len(y_train1) - one_count

print('Number of one count before applying SMOTE is: ', one_count)
print('Number of zero count before applying SMOTE is: ', zero_count)

In [None]:
# Oversample the training split only; the test split is left untouched.
# A fixed random_state makes the synthetic samples, and therefore the scores
# reported below, repeatable across runs. 101 matches the seed used for the split.
smote = SMOTE(random_state=101)
x_train1, y_train1 = smote.fit_resample(x_train1, y_train1)

In [None]:
# Class balance of the training split after SMOTE resampling.
# Every label that is not 1 is counted as zero.
one_count = sum(1 for label in y_train1 if label == 1)
zero_count = len(y_train1) - one_count

print('Number of one count after applying SMOTE is: ', one_count)
print('Number of zero count after applying SMOTE is: ', zero_count)

In [None]:
# Logistic regression trained on the SMOTE-resampled training split,
# scored on the untouched test split.
lgr = LogisticRegression(max_iter=100000).fit(x_train1, y_train1)
y_pred = lgr.predict(x_test1)

print('Accuracy of Logistic Regression is: ', metrics.accuracy_score(y_test1, y_pred))
print('Recall Score of Logistic Regression is: ', metrics.recall_score(y_test1, y_pred))

In [None]:
# LightGBM trained on the SMOTE-resampled training split, scored on the
# untouched test split.
lgbm = LGBMClassifier()
lgbm.fit(x_train1, y_train1)

y_pred = lgbm.predict(x_test1)

# The labels now name the model that was evaluated; the copied cell still
# printed "Logistic Regression".
print('Accuracy of LGBM Classifier is: ', accuracy_score(y_test1, y_pred))
print('Recall Score of LGBM Classifier is: ', metrics.recall_score(y_test1, y_pred))

In [None]:
# XGBoost trained on the SMOTE-resampled training split, scored on the
# untouched test split.
xgb = XGBClassifier().fit(x_train1, y_train1)
y_pred = xgb.predict(x_test1)

print('Accuracy of XGB Classifier is: ', metrics.accuracy_score(y_test1, y_pred))
print('Recall Score of XGB Classifier is: ', metrics.recall_score(y_test1, y_pred))

# SMOTEENN (Oversampling & Undersampling Combined)

In [None]:
# Same 80/20 split and seed again, kept under separate names so the training
# part can be resampled with SMOTEENN.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [None]:
# Class balance of the training split BEFORE applying SMOTEENN.
# The messages now say "before ... SMOTEENN"; they previously said
# "after applying SMOTE", copied from the SMOTE section.
# Anything that is not label 1 is counted as zero, as in the original loop.
one_count = int(np.count_nonzero(np.asarray(y_train2) == 1))
zero_count = len(y_train2) - one_count

print('Number of one count before applying SMOTEENN is: ', one_count)
print('Number of zero count before applying SMOTEENN is: ', zero_count)

In [None]:
# Resample the training split only: SMOTE oversampling followed by Edited
# Nearest Neighbours cleaning. The test split is left untouched.
# A fixed random_state makes the resampled set, and therefore the scores
# reported below, repeatable across runs. 101 matches the seed used for the split.
smoteenn = SMOTEENN(random_state=101)
x_train2, y_train2 = smoteenn.fit_resample(x_train2, y_train2)

In [None]:
# Class balance of the training split AFTER applying SMOTEENN.
# Counts y_train2, the SMOTEENN-resampled labels. The copied cell iterated
# y_train1 and therefore re-reported the SMOTE section's numbers.
# Anything that is not label 1 is counted as zero, as in the original loop.
one_count = int(np.count_nonzero(np.asarray(y_train2) == 1))
zero_count = len(y_train2) - one_count

print('Number of one count after applying SMOTEENN is: ', one_count)
print('Number of zero count after applying SMOTEENN is: ', zero_count)

In [None]:
# Logistic regression trained on the SMOTEENN-resampled training split,
# scored on the untouched test split.
lgr = LogisticRegression(max_iter=100000).fit(x_train2, y_train2)
y_pred = lgr.predict(x_test2)

print('Accuracy of Logistic Regression is: ', metrics.accuracy_score(y_test2, y_pred))
print('Recall Score of Logistic Regression is: ', metrics.recall_score(y_test2, y_pred))

In [None]:
# LightGBM trained on the SMOTEENN-resampled training split, scored on the
# untouched test split.
lgbm = LGBMClassifier()
lgbm.fit(x_train2, y_train2)

y_pred = lgbm.predict(x_test2)

# The labels now name the model that was evaluated; the copied cell still
# printed "Logistic Regression".
print('Accuracy of LGBM Classifier is: ', accuracy_score(y_test2, y_pred))
print('Recall Score of LGBM Classifier is: ', metrics.recall_score(y_test2, y_pred))

In [None]:
# XGBoost trained on the SMOTEENN-resampled training split, scored on the
# untouched test split.
xgb = XGBClassifier().fit(x_train2, y_train2)
y_pred = xgb.predict(x_test2)

print('Accuracy of XGB Classifier is: ', metrics.accuracy_score(y_test2, y_pred))
print('Recall Score of XGB Classifier is: ', metrics.recall_score(y_test2, y_pred))