In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the bankruptcy dataset CSV into a DataFrame.
DATA_PATH = '../input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# Show at most 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column summary statistics; the min/max rows show which features are already in [0, 1].
df.describe()

#### Looking at the summary statistics, most of the columns already appear to be scaled. Let us check whether every column is feature-scaled or not.

In [None]:
# Collect every column whose maximum exceeds 1, i.e. columns that are not
# already confined to the [0, 1] range.
#
# DataFrame.max() is used instead of the builtin max() on each Series: it is
# vectorised and skips NaN, whereas builtin max() compares element by element
# and can return NaN (hiding a large value) when a NaN comes first.
col_max = df.max(numeric_only=True)
not_scaled = col_max[col_max > 1].index.tolist()

for col in not_scaled:
    print("not scaled : ", col)

# Kept for backward compatibility with the original cell's counter.
count = len(not_scaled)

In [None]:
# Number of columns flagged as not scaled.
len(not_scaled)

### The columns listed above are not on the same scale as the rest, so we will rescale them with MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale only the columns flagged in `not_scaled`; the remaining columns are
# left untouched.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# learned min/max. Consider fitting on the training fold only.
scaler = MinMaxScaler()
scaler.fit(df[not_scaled])
df[not_scaled] = scaler.transform(df[not_scaled])

In [None]:
# Re-inspect the frame after rescaling the `not_scaled` columns.
df.head()

### Checking for missing values

In [None]:
# Heatmap of the boolean null mask: any highlighted cell is a missing value.
null_mask = df.isnull()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(null_mask, cmap='Blues', yticklabels=False, ax=ax)

In [None]:
# Second view of missingness using missingno's matrix plot.
msno.matrix(df)

### correlation

In [None]:
# Pairwise correlation of all columns, drawn as an un-annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(17, 17))
sns.heatmap(corr_matrix, annot=False, cmap='Blues', ax=ax)
plt.show()

### Count plot for the `Bankrupt?` column

In [None]:
# Class distribution of the target column.
# The Series is passed as `x=` explicitly: in seaborn >= 0.12 the first
# positional parameter of countplot is `data`, so a bare positional Series is
# no longer interpreted as the variable to count (0.11 only warned about it).
sns.countplot(x=df['Bankrupt?'])

- We can see that the data is highly imbalanced

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE

In [None]:
# Separate the target label from the predictor columns.
target_col = 'Bankrupt?'
y = df[target_col]
X = df.drop(columns=[target_col])

#### The code below balances the classes by oversampling the minority class with SMOTE

In [None]:
# Oversample the minority class with SMOTE.
# A fixed random_state makes the generated synthetic samples (and therefore
# every downstream score) reproducible across kernel restarts.
# NOTE(review): X_sm / y_sm are resampled from the FULL dataset. A model
# evaluated on a split of them would see synthetic neighbours of its test
# rows during training, so treat them as suitable for inspecting class
# balance, not for train/test evaluation.
sm = SMOTE(random_state=0)
X_sm, y_sm = sm.fit_resample(X, y)

In [None]:
# Class counts in the original target.
y.value_counts()

In [None]:
# Class counts after SMOTE resampling.
y_sm.value_counts()

### splitting the data

In [None]:
# Split the ORIGINAL data first, then oversample only the training fold.
# Splitting already-resampled data leaks information: SMOTE interpolates
# between neighbouring minority rows, so synthetic points derived from test
# rows would land in the training set, and the test set itself would contain
# synthetic rows. Both inflate the reported scores.
# `stratify=y` keeps the rare positive class represented in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Balance the training fold only; the test fold keeps the real class ratio.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

# SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with probability estimates enabled.
svc = SVC(kernel='rbf', C=20, gamma='auto', probability=True)
svc.fit(X_train, y_train)

# Score on the training fold and on the held-out fold.
svc_pred = svc.predict(X_test)
svc_train_score = svc.score(X_train, y_train)
svc_test_score = accuracy_score(y_test, svc_pred)

print("Train score: {}" .format(svc_train_score))
print("Accuracy score: {}" .format(svc_test_score))
print(classification_report(y_test, svc_pred))

# Confusion matrix with integer cell annotations.
svc_cm = confusion_matrix(y_test, svc_pred)
sns.heatmap(svc_cm, annot=True, cmap='Blues', fmt='d')

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# random_state is fixed because bootstrap sampling and feature sub-sampling
# are stochastic; without it the scores below change on every run.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Score on the training fold and on the held-out fold.
rfc_pred = rfc.predict(X_test)
print("Train score: {}" .format(rfc.score(X_train, y_train)))
print("Accuracy score: {}" .format(accuracy_score(y_test, rfc_pred)))
print(classification_report(y_test, rfc_pred))

# Confusion matrix with integer cell annotations.
sns.heatmap(confusion_matrix(y_test, rfc_pred), annot=True, cmap='Blues', fmt='d')

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline.
# max_iter is raised from the default of 100: the solver may otherwise stop
# before converging, and the resulting ConvergenceWarning would go unnoticed
# because warnings are globally silenced at the top of the notebook.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Score on the training fold and on the held-out fold.
lr_pred = lr.predict(X_test)
print("Train score: {}" .format(lr.score(X_train, y_train)))
print("Accuracy score: {}" .format(accuracy_score(y_test, lr_pred)))
print(classification_report(y_test, lr_pred))

# Confusion matrix with integer cell annotations.
sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, cmap='Blues', fmt='d')

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 4.
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=4)
knn.fit(X_train, y_train)

# Score on the training fold and on the held-out fold.
knn_pred = knn.predict(X_test)
knn_train_score = knn.score(X_train, y_train)
knn_test_score = accuracy_score(y_test, knn_pred)

print("Train score: {}" .format(knn_train_score))
print("Accuracy score: {}" .format(knn_test_score))
print(classification_report(y_test, knn_pred))

# Confusion matrix with integer cell annotations.
knn_cm = confusion_matrix(y_test, knn_pred)
sns.heatmap(knn_cm, annot=True, cmap='Blues', fmt='d')