In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif,chi2,SelectKBest
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Default (width, height) for matplotlib figures created from here on.
plt.rcParams['figure.figsize']=(12,8)

In [None]:
# Read the bankruptcy CSV from the Kaggle input directory and preview its first rows.
df=pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv')
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics, transposed so each described column is shown as a row.
df.describe().T

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask (row labels hidden) as a visual null check.
sns.heatmap(df.isnull(),yticklabels=False)

**The dataset contains no null values.**

In [None]:
# Number of rows per value of the target column.
df['Bankrupt?'].value_counts()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Share of each class in the target; the raw counts are already shown in an earlier cell,
# so this cell reports proportions instead of repeating them.
df['Bankrupt?'].value_counts(normalize=True)

In [None]:
# Pairwise correlation matrix of the frame's columns, shown as the cell output.
corr_matrix=df.corr()
corr_matrix

In [None]:
#function to find features which are highly correlated with an earlier column
def correlation_check(df,val):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are inspected.
    val : float
        Threshold. A pair of columns counts as highly correlated when the
        absolute value of its correlation coefficient is strictly greater
        than ``val``.

    Returns
    -------
    set
        For every highly correlated pair, the name of the column that comes
        later in the correlation matrix's column order. Dropping these names
        keeps the first column of each correlated group.
    """
    # A strong negative correlation is just as redundant as a strong positive one,
    # so compare magnitudes rather than signed values.
    strength=df.corr().abs()
    # The strict lower triangle visits every pair once and skips the diagonal.
    # NaN coefficients (e.g. from constant columns) compare False and are never flagged.
    flagged=np.tril(strength.to_numpy()>val,k=-1).any(axis=1)
    return set(strength.columns[flagged])

In [None]:
# Column names flagged by correlation_check on the whole frame with a threshold of 0.8.
features=correlation_check(df,0.8)

In [None]:
# Remove the flagged columns. Rebinding the name (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run after the columns are already gone.
df=df.drop(columns=list(features),errors='ignore')

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Bar chart of how many rows carry each value of the target column.
sns.countplot(x='Bankrupt?',data=df)

In [None]:
# Separate the label from the predictors by position:
# the first column is the target, every remaining column is a feature.
y=df.iloc[:,0]
X=df.iloc[:,1:]

In [None]:
#selecting important features
# Fit a SelectKBest selector (chi2 score function, k=20) on all of X and y.
# NOTE(review): sklearn's chi2 expects non-negative feature values — confirm X has none below zero.
skb=SelectKBest(chi2,k=20)
# The result of fit() is bound to `test`; its per-feature scores are the cell output.
test=skb.fit(X,y)
test.scores_

In [None]:
# Per-feature chi2 scores as a Series labelled by feature name. The scores are read from
# `skb` (the fitted selector) rather than from `test`, because `test` is rebound to a
# Series here and would have no `scores_` attribute if this cell were run a second time.
test=pd.Series(skb.scores_,index=X.columns)
test

In [None]:
# The 20 largest scores (with their feature labels), ordered from highest to lowest.
test_20=test.nlargest(20)

In [None]:
sns.set_style('darkgrid')
fig,axis=plt.subplots(5,4,figsize=(18,18))
# One panel per selected feature. `axis.flat` walks the 5x4 grid row by row, so feature 0
# lands in the top-left panel and feature 19 in the bottom-right. zip() stops at the
# shorter sequence, so fewer than 20 features leaves trailing panels empty instead of failing.
# NOTE(review): distplot is deprecated in recent seaborn releases; it is kept here because
# it caps the number of histogram bins, which matters for columns with extreme values.
for ax,column in zip(axis.flat,test_20.index):
    sns.distplot(df[column],ax=ax)
plt.show()

In [None]:
#train_test_split
# Split the raw features BEFORE scaling so the held-out rows never contribute to the
# scaler's mean/variance. stratify=y keeps the class proportions equal in both partitions.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0,stratify=y)

#Normalization
# Learn the scaling statistics from the training rows only, then apply them to both partitions.
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)
# Whole feature matrix scaled with the training statistics; kept under its original
# name because later cells read X_norm.
X_norm=ss.transform(X)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

In [None]:
# Class counts in the training labels.
y_train.value_counts()

In [None]:
# Class counts in the test labels.
y_test.value_counts()

# Model Creation

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with class_weight='balanced', fitted on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
lr=LogisticRegression(class_weight='balanced').fit(X_train,y_train)
# Predicted labels for the test split.
yhat=lr.predict(X_test)

In [None]:
# Fraction of test labels that the current predictions match.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of test labels vs. current predictions, drawn as an annotated heatmap.
cm=confusion_matrix(y_test,yhat)
plt.figure(figsize=(6,4))
sns.heatmap(cm,cmap='Blues',annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
report=classification_report(y_test,yhat)
print(report)

In [None]:
# F1 score of the current predictions, using f1_score's default settings.
f1_score(y_true=y_test,y_pred=yhat)

**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes and the mean cross-validated accuracy obtained for each.
neighbor=list(range(1,11))
accuracy=[]

for i in neighbor:
    knn=KNeighborsClassifier(n_neighbors=i)
    # Score each k with 5-fold cross-validation on the training split only. Choosing k
    # by its score on X_test would tune the model on the data later used to evaluate it.
    accuracy.append(cross_val_score(knn,X_train,y_train,cv=5,scoring='accuracy').mean())

plt.plot(neighbor,accuracy)
plt.xlabel('Neighbors')
plt.ylabel('Accuracy')
plt.title('Neighbors vs Accuracy')

In [None]:
# Report the best score in `accuracy` and the neighbour count at the same position
# (the first one, if several counts tie for the best score).
best_accuracy=max(accuracy)
best_neighbor=neighbor[accuracy.index(best_accuracy)]
print('Highest Accuracy {} can be obtained at the neighbor value {} '.format(best_accuracy,best_neighbor))

In [None]:
# Use the neighbour count that scored best in the sweep instead of a hard-coded value,
# so this cell stays in step with the sweep if the data or the split changes.
best_k=neighbor[accuracy.index(max(accuracy))]
knn=KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train,y_train)
# Predicted labels for the test split.
yhat=knn.predict(X_test)

In [None]:
# Fraction of test labels that the current predictions match.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of test labels vs. current predictions, drawn as an annotated heatmap.
cm=confusion_matrix(y_test,yhat)
plt.figure(figsize=(6,4))
sns.heatmap(cm,cmap='Blues',annot=True,fmt='.0f')

In [None]:
# F1 score of the current predictions, using f1_score's default settings.
f1_score(y_true=y_test,y_pred=yhat)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion decision tree. random_state is fixed so that repeated runs of the
# notebook build the same tree and report the same scores.
dst=DecisionTreeClassifier(criterion='entropy',random_state=0)
dst.fit(X_train,y_train)
# Predicted labels for the test split.
yhat=dst.predict(X_test)

In [None]:
# Fraction of test labels that the current predictions match.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of test labels vs. current predictions, drawn as an annotated heatmap.
cm=confusion_matrix(y_test,yhat)
plt.figure(figsize=(8,6))
sns.heatmap(cm,cmap='Blues',annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
report=classification_report(y_test,yhat)
print(report)

In [None]:
# F1 score of the current predictions, using f1_score's default settings.
f1_score(y_true=y_test,y_pred=yhat)

# Random Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Hold out the test rows BEFORE oversampling. Resampling first would place copies of the
# same minority rows on both sides of the split, so the test score would partly be
# measured on rows the model was trained on.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)

# Scale with statistics learned from the training rows only.
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

# Resample the minority class in the training partition only; the fixed seed makes the
# random draw repeatable. The test partition keeps its original class distribution.
over=RandomOverSampler(sampling_strategy='minority',random_state=0)
X_over,y_over=over.fit_resample(X_train,y_train)

# Class counts of the training labels before and after resampling.
print(Counter(y_train))
print(Counter(y_over))

# The models in the following cells train on the resampled training data.
X_train,y_train=X_over,y_over

In [None]:
# Class counts in the test labels.
Counter(y_test)

In [None]:
# Class counts in the training labels.
Counter(y_train)

In [None]:
# Entropy-criterion decision tree on the current training data. random_state is fixed so
# that repeated runs of the notebook build the same tree and report the same scores.
dst=DecisionTreeClassifier(criterion='entropy',random_state=0)
dst.fit(X_train,y_train)
# Predicted labels for the test split.
yhat=dst.predict(X_test)

In [None]:
# Fraction of test labels that the current predictions match.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of test labels vs. current predictions, drawn as an annotated heatmap.
cm=confusion_matrix(y_test,yhat)
plt.figure(figsize=(8,6))
sns.heatmap(cm,cmap='Greens',annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
report=classification_report(y_test,yhat)
print(report)

In [None]:
# F1 score of the current predictions, using f1_score's default settings.
f1_score(y_true=y_test,y_pred=yhat)

In [None]:
# Logistic regression with default settings, fitted on the current training data;
# fit() returns the estimator itself, so construction and fitting are chained.
lr=LogisticRegression().fit(X_train,y_train)
# Predicted labels for the test split.
yhat=lr.predict(X_test)

In [None]:
# Fraction of test labels that the current predictions match.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of test labels vs. current predictions, drawn as an annotated heatmap.
cm=confusion_matrix(y_test,yhat)
plt.figure(figsize=(8,6))
sns.heatmap(cm,cmap='Greens',annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
report=classification_report(y_test,yhat)
print(report)

In [None]:
# F1 score of the current predictions, using f1_score's default settings.
f1_score(y_true=y_test,y_pred=yhat)