In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#open the data file
# `sep` and `delimiter` are aliases in pandas.read_csv: passing both raises
# "ValueError: Specified a sep and a delimiter; you can only specify one."
# on current pandas (older versions silently let `delimiter` win).
# Only the comma separator is kept; skipinitialspace=True skips the
# whitespace that follows each delimiter.
df = pd.read_csv(
    '../input/company-bankruptcy-prediction/data.csv',
    sep=',',
    encoding="utf-8",
    skipinitialspace=True,
)

In [None]:
# Summary statistics for the columns of the freshly loaded frame.
df.describe()

In [None]:
# Let's look at the target values:
# count how many rows fall into each class of the 'Bankrupt?' column.
df['Bankrupt?'].value_counts()
# Author's observation from the output: the classes are imbalanced.

In [None]:
# Remove the features that are (almost) uncorrelated with the target.
# Note: the correlations are computed on the whole frame, before any
# train/test split is made further down.

# absolute correlation of every column with the target
corr = df.corr()['Bankrupt?'].abs()

# keep the columns whose absolute correlation reaches the 0.01 threshold
# (the target itself has correlation 1 with itself, so it is kept too)
corr_feat = corr[corr.ge(0.01)]

# restrict the frame to the columns at or above the threshold
corr_feat_col = corr_feat.index
df = df.loc[:, corr_feat_col]

In [None]:
# Preview the first rows of the frame as it stands at this point.
df.head()

In [None]:
#find highly correlated features
def correlation(dataset, threshold):
    """Collect the names of columns that are redundant with an earlier column.

    Columns are visited in the order of ``dataset.corr()``. A column is
    flagged when its absolute correlation with at least one earlier column,
    which has not itself been flagged, is greater than or equal to
    ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are inspected.
    threshold : float
        Minimum absolute correlation for a column to count as redundant.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    strength = corr_matrix.abs().to_numpy()
    flagged = set()
    for row, candidate in enumerate(names):
        # Only the lower triangle is scanned: every earlier column is a
        # potential "keeper" unless it has already been flagged itself.
        redundant = any(
            strength[row, col] >= threshold and names[col] not in flagged
            for col in range(row)
        )
        if redundant:
            flagged.add(candidate)
    return flagged

In [None]:
# our threshold will be 0.6
# Only the features are compared with each other. The target column is left
# out so that a feature is never flagged merely for being strongly correlated
# with 'Bankrupt?', and so that the target itself can never end up in the set
# of columns to drop.
corr_features = correlation(df.drop(columns='Bankrupt?'), 0.6)
# now let's see the number of features flagged as redundant
# (corr_features is already a set, so no conversion is needed)
len(corr_features)

In [None]:
# Names of the features returned by correlation() above.
set(corr_features)

In [None]:
# Remove the highly correlated columns collected in corr_features.
# Re-running this cell without re-running the earlier ones fails, because the
# columns are already gone from df.
df = df.drop(columns=list(corr_features))

In [None]:
# Summary statistics of the columns that remain in df at this point.
df.describe()

In [None]:
# Separate the label column from the predictor columns.
y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?'])

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Split train and test data FIRST, then oversample the training part only.
# Applying SMOTE to the full dataset before splitting leaks information:
# synthetic minority rows interpolated from points that later land in the test
# set are used for training, and the test set itself contains synthetic rows,
# so the reported scores are over-optimistic.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# stratify=y keeps the class ratio identical in both parts;
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Since we are dealing with imbalanced data, apply the synthetic minority
# oversampling technique to the training rows. X, y and the test part are
# left untouched, so this cell can be re-run safely.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
#Classifier 1 - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest, and therefore the reported metrics,
# reproducible from one run of the notebook to the next.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
# Predictions on the held-out rows (used by the report cell below) ...
test_pred = rfc.predict(X_test)
# ... and on the training rows, kept available for comparing train vs. test fit.
train_pred = rfc.predict(X_train)

In [None]:
# Classification report for the most recently computed `test_pred`.
# `test_pred` is reassigned by each classifier cell, so run this right after
# the RandomForestClassifier cell above.
print(classification_report(y_test,test_pred))

In [None]:
#Classifier 2 - ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier
# A fixed random_state makes the randomised trees, and therefore the reported
# metrics, reproducible from one run of the notebook to the next.
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_train, y_train)
# Overwrites the previous classifier's `test_pred`.
test_pred = etc.predict(X_test)

In [None]:
# Classification report for the most recently computed `test_pred`.
# `test_pred` is reassigned by each classifier cell, so run this right after
# the ExtraTreesClassifier cell above.
print(classification_report(y_test,test_pred))

In [None]:
# Classifier 3 - StackingClassifier
# All estimator classes used here are imported in this cell, so it does not
# depend on the import statements of the earlier classifier cells.
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression

# Base learners; fixed seeds keep the stacked model reproducible across runs.
estimators = [
    ('etc', ExtraTreesClassifier(random_state=42)),
    ('rfc', RandomForestClassifier(n_estimators=100, max_depth=12, random_state=42)),
]
# A logistic regression combines the base learners' outputs.
stc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stc.fit(X_train, y_train)
# Overwrites the previous classifier's `test_pred`.
test_pred = stc.predict(X_test)

In [None]:
# Classification report for the most recently computed `test_pred`.
# `test_pred` is reassigned by each classifier cell, so run this right after
# the StackingClassifier cell above.
print(classification_report(y_test,test_pred))

In [None]:
# Estimate model performance with cross validation
# (cross validation measures performance; it does not change the model).
def cross_val(model):
    """Return the mean of the 10-fold cross-validation scores of ``model``.

    The scores are computed by ``cross_val_score`` on the notebook-level
    ``X`` and ``y`` with ``cv=10`` and no explicit ``scoring`` argument.
    """
    return cross_val_score(model, X, y, cv=10).mean()

In [None]:
# Mean 10-fold cross-validation score for the random forest (`rfc`).
cross_val(rfc)

In [None]:
# Mean 10-fold cross-validation score for the extra-trees model (`etc`).
cross_val(etc)

In [None]:
# Mean 10-fold cross-validation score for the stacking model (`stc`).
cross_val(stc)