# Company Bankruptcy Prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report,accuracy_score
from matplotlib import pyplot
from xgboost import plot_importance
import warnings
warnings.filterwarnings(action="ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the bankruptcy dataset from the Kaggle input directory into a DataFrame
# and preview its first rows.
df = pd.read_csv("/kaggle/input/company-bankruptcy-prediction/data.csv")
df.head()

# **Data Profiling**

In [None]:
# Shape of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Statistical summary of the data (count, mean, std, min, quartiles, max per numeric column)
df.describe()

In [None]:
# Feature information: column names, non-null counts, dtypes and memory usage
df.info()

Observation:
* All the features are numeric, so no encoding techniques are needed.

In [None]:
# Checking for missing values: count nulls per column in one vectorised pass,
# then report only the columns that actually contain any.
missing_per_column = df.isnull().sum()
columns_with_missing = missing_per_column[missing_per_column > 0]

for column_name, missing_count in columns_with_missing.items():
    print(column_name, missing_count)

# Nothing was reported above, so say so explicitly.
if columns_with_missing.empty:
    print("No missing values found in the dataset")

# Feature Selection

In [None]:
# Separate the target column from the predictors, then fit a variance filter
# (threshold 3.0) on the predictors. The fitted selector is the cell's output.
y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?'])
var_thres = VarianceThreshold(threshold=3.0)
var_thres.fit(X)

In [None]:
# Names of the predictors kept by the variance filter, in their original column order.
# The boolean support mask is computed once and applied to the column Index directly,
# instead of re-evaluating it (and a membership test) for every column.
required_features = X.columns[var_thres.get_support()].tolist()
print(required_features)

In [None]:
# Number of features that passed the variance filter
len(required_features)

Observation:
* 24 features are selected. Features with a variance of less than 3.0 have been dropped.

In [None]:
# Restrict the DataFrame to the selected features (the target column is not included)
# and preview the first rows.
df_v1 = df[required_features]
df_v1.head()

In [None]:
# Checking for multicollinearity: pairwise correlation matrix of the selected features,
# rendered as a colour-graded table (coolwarm colormap).
df_v1_corr = df_v1.corr()
df_v1_corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Distribution of features: one 25-bin histogram per selected feature on a 30x30-inch figure
df_v1.hist(bins=25,figsize = (30,30))

# SMOTE technique

In [None]:
# Number of rows per target class (value of 'Bankrupt?'), to check class balance
df.groupby('Bankrupt?')['Bankrupt?'].count()

Observation:
* The dataset contains imbalanced classes. This issue can be addressed by using the SMOTE technique.

In [None]:
# Build the modelling inputs: the variance-selected predictors and the target label.
X_new = df[required_features]  # Taking the required independent variables
y_new = df['Bankrupt?']        # Dependent variable

# Initializing SMOTE; the fixed seed makes the synthetic samples reproducible.
sm = SMOTE(random_state=42)

# `fit_resample` is the supported API. The old `fit_sample` alias was deprecated and
# later removed from imbalanced-learn, so calling it fails with AttributeError on
# current releases.
# NOTE(review): resampling is applied here to the full dataset, before the train/test
# split performed later in this notebook, so held-out rows can influence the synthetic
# samples. Consider splitting first and resampling only the training portion.
X_smote, y_smote = sm.fit_resample(X_new, y_new)

In [None]:
# Class counts after SMOTE resampling
count_class = Counter(y_smote)
print(count_class)

# Model selection and building

In [None]:
# Splitting training and testing data: 70% train / 30% test, with a fixed seed for a reproducible split.
# The inputs are the SMOTE-resampled data, so the test split may also contain synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(X_smote,y_smote,test_size=0.3, random_state=42)

In [None]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid for GridSearchCV}.
# Estimators with internal randomness get a fixed random_state so that the model
# comparison gives the same numbers on every Restart & Run All.
models = {
    'SVM': {
        'model': svm.SVC(gamma='auto', C=5, kernel='rbf'),
        'params': {'C': [1, 5, 10]},
    },
    'xgboost': {
        'model': xgb.XGBClassifier(),
        'params': {'max_depth': [4, 6, 8]},
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [1, 5]},
    },
    'logistic_regression': {
        # `multi_class='auto'` is omitted: it matches the default behaviour and the
        # parameter is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear', random_state=42),
        'params': {'C': [1, 5]},
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},  # no hyper-parameters searched; evaluated with defaults
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'criterion': ['gini', 'entropy']},
    },
}

In [None]:
# Grid-search every candidate with 2-fold cross-validation and record, per model,
# the best mean cross-validated score and the parameter combination that achieved it.
# The search is fitted on the training split only, so the rows in X_test / y_test
# stay unseen during model selection and the later hold-out evaluation is not
# contaminated by it.
scores = []

for model_name, mp in models.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=2, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per model for side-by-side comparison.
df_model = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_model

**Observation:**
* The XGBoost classifier performs better than the other models.

In [None]:
# XGBoost classifier: train with default hyper-parameters on the training split,
# predict the test split, and print the per-class precision / recall / F1 report.
xgb_model = xgb.XGBClassifier().fit(X_train, y_train)
ypred_xgb = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, ypred_xgb)
print(xgb_report)

In [None]:
# Overall accuracy of the XGBoost predictions on the test split
print(accuracy_score(y_test,ypred_xgb))

In [None]:
# Plotting feature importance of the fitted XGBoost model
plot_importance(xgb_model)
pyplot.show()