In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import chi2, SelectKBest
import xgboost
import sklearn

In [None]:
# Location of the bankruptcy dataset, relative to the notebook's working directory.
data_path = '../input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Draw the pairwise correlation matrix of every column on one large canvas.
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(df.corr(), ax=ax)

### There are so many features that the correlation matrix is hard to interpret here.

In [None]:
# Count how many rows fall into each value of the target column.
df['Bankrupt?'].value_counts()

In [None]:
# Target is the 'Bankrupt?' column; every remaining column is a feature.
y = df['Bankrupt?']
X = df.drop(columns='Bankrupt?')

In [None]:
# 70/30 hold-out split over all features, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Model AUC with all the features included (baseline)
## We use XGBoost, as it generally performs well on this type of data.

In [None]:
# Baseline: train on every feature and score the held-out rows by ROC AUC.
model = xgboost.XGBRegressor().fit(X_train, y_train)
pred = model.predict(X_test)
sklearn.metrics.roc_auc_score(y_true=y_test, y_score=pred)

## Using different transformers to see which one gives the highest AUC

In [None]:
# One instance of each candidate preprocessing transformer.
standard_scaler, robust_scaler, minmax_scaler, normalizer_scaler = (
    sklearn.preprocessing.StandardScaler(),
    sklearn.preprocessing.RobustScaler(),
    sklearn.preprocessing.MinMaxScaler(),
    sklearn.preprocessing.Normalizer(),
)

## 1. Feature selection using sklearn's SelectKBest

In [None]:
# Chi-square based univariate selector; the cells below read its per-feature scores.
bestfeatures = SelectKBest(chi2, k=30)

In [None]:
# Score every feature against the target and collect (feature name, score) pairs
# into a two-column table.
# NOTE(review): the scores are computed on the full X, y (train and test rows
# together) — confirm that is intended.
fit = bestfeatures.fit(X, y)
dfcolumn = pd.DataFrame(X.columns)
dfscore = pd.DataFrame(fit.scores_)
featureScore = pd.concat([dfcolumn, dfscore], axis=1).set_axis(['Features', 'Score'], axis=1)

In [None]:
# Rank features from highest to lowest score and renumber the rows 0..n-1.
featureScore = featureScore.sort_values('Score', ascending=False).reset_index(drop=True)

In [None]:
# Show the ranked feature/score table.
featureScore

### 1.1 Selecting one feature from each group of features with similar scores

### 1.1.1 Using every 3rd column

In [None]:
# Keep every 3rd feature of the ranked list (positions 0, 3, 6, ...).
# The ranked names are materialised once and sliced with a stride instead of
# rebuilding the whole list on every loop iteration.
new_feature = featureScore.Features.to_list()[::3]

In [None]:
# Same 70/30 seeded split as the baseline, restricted to the every-3rd feature subset.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    df[new_feature],
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Re-fitting the scaler on the test rows would
# standardise the two splits differently, so the model would be evaluated on
# inputs scaled inconsistently with the data it was trained on.
X_train1 = standard_scaler.fit_transform(X_train1)
X_test1 = standard_scaler.transform(X_test1)

In [None]:
# Fresh model on the every-3rd feature subset, scored by ROC AUC on the held-out rows.
model = xgboost.XGBRegressor().fit(X_train1, y_train1)
pred1 = model.predict(X_test1)
sklearn.metrics.roc_auc_score(y_true=y_test1, y_score=pred1)

### 1.1.2 Using every 2nd column

In [None]:
# Keep every 2nd feature of the ranked list (positions 0, 2, 4, ...).
# The ranked names are materialised once and sliced with a stride instead of
# rebuilding the whole list on every loop iteration.
new_feature1 = featureScore.Features.to_list()[::2]

In [None]:
# Display the frame restricted to the every-2nd feature subset.
df[new_feature1]

In [None]:
# Same 70/30 seeded split as the baseline, restricted to the every-2nd feature subset.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df[new_feature1],
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Re-fitting the scaler on the test rows would
# standardise the two splits differently, so the model would be evaluated on
# inputs scaled inconsistently with the data it was trained on.
X_train2 = standard_scaler.fit_transform(X_train2)
X_test2 = standard_scaler.transform(X_test2)

In [None]:
# Fresh model on the every-2nd feature subset, scored by ROC AUC on the held-out rows.
model = xgboost.XGBRegressor().fit(X_train2, y_train2)
pred2 = model.predict(X_test2)
sklearn.metrics.roc_auc_score(y_true=y_test2, y_score=pred2)

### 1.1.3 Using every 4th column

In [None]:
# Keep every 4th feature of the ranked list (positions 0, 4, 8, ...).
# The ranked names are materialised once and sliced with a stride instead of
# rebuilding the whole list on every loop iteration.
new_feature2 = featureScore.Features.to_list()[::4]

In [None]:
# Display the frame restricted to the every-4th feature subset.
df[new_feature2]

In [None]:
# Same 70/30 seeded split as the baseline, restricted to the every-4th feature subset.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    df[new_feature2],
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Re-fitting the scaler on the test rows would
# standardise the two splits differently, so the model would be evaluated on
# inputs scaled inconsistently with the data it was trained on.
X_train3 = standard_scaler.fit_transform(X_train3)
X_test3 = standard_scaler.transform(X_test3)

In [None]:
# Fresh model on the every-4th feature subset, scored by ROC AUC on the held-out rows.
model = xgboost.XGBRegressor().fit(X_train3, y_train3)
pred3 = model.predict(X_test3)
sklearn.metrics.roc_auc_score(y_true=y_test3, y_score=pred3)

### 1.1.4 Using the top-k ranked features (k from 50 to 79)

In [None]:
# Sweep k = 50..79: train on the k highest-scoring features and print the AUC
# obtained on the held-out split for each k.
# The ranked name list is loop-invariant, so it is built once up front.
ranked_features = featureScore.Features.to_list()
for i in range(50, 80):
    top_features = ranked_features[:i]
    X_trains, X_tests, y_trains, y_tests = train_test_split(df[top_features], y, test_size=0.3, random_state=42)
    # Fit the scaler on the training rows only and reuse those statistics for the
    # test rows; re-fitting on the test rows would scale the two splits differently.
    X_trains = standard_scaler.fit_transform(X_trains)
    X_tests = standard_scaler.transform(X_tests)
    # Re-uses the `model` object created in an earlier cell, re-fitting it for each k.
    model.fit(X_trains, y_trains)
    preds = model.predict(X_tests)
    results = sklearn.metrics.roc_auc_score(y_tests, preds)
    print(i, results)

In [None]:
# Names of the 76 highest-scoring features in the ranked table.
top_76_features = featureScore['Features'].to_list()[:76]

In [None]:
# Same 70/30 seeded split as the baseline, restricted to the top 76 ranked features.
X_train4, X_test4, y_train4, y_test4 = train_test_split(df[top_76_features], y, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Re-fitting the scaler on the test rows would
# standardise the two splits differently, so the model would be evaluated on
# inputs scaled inconsistently with the data it was trained on.
X_train4 = standard_scaler.fit_transform(X_train4)
X_test4 = standard_scaler.transform(X_test4)

In [None]:
# Re-fit the existing `model` object on the top-76 subset and report the held-out ROC AUC.
pred4 = model.fit(X_train4, y_train4).predict(X_test4)
sklearn.metrics.roc_auc_score(y_true=y_test4, y_score=pred4)

## **So using every 2nd column gives us the best AUC score**
====================================================================================================================================

# Feature Selection Using Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Estimate the mutual information between each feature and the target.
# The estimator is randomised (it exposes a `random_state` parameter), so the
# seed is pinned to keep the ranking — and the top-33 selection derived from
# it — reproducible across runs. 42 matches the seed used for the splits.
mutual_info = mutual_info_classif(X, y, random_state=42)

In [None]:
# Label every score with the column of X it was computed from, then rank the
# features from most to least informative.
# Passing `index=` to the constructor keeps this cell safe to re-run: when
# `mutual_info` is already a labelled Series it is realigned by label, whereas
# assigning `.index` afterwards would paste the original column order on top of
# the already-sorted values and mislabel every score.
mutual_info = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)

In [None]:
# Names of the 33 features with the highest mutual-information score.
top_33_features = list(mutual_info.index[:33])

In [None]:
# Same 70/30 seeded split as the baseline, restricted to the top 33 mutual-information features.
X_train5, X_test5, y_train5, y_test5 = train_test_split(
    df[top_33_features],
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Re-fitting the scaler on the test rows would
# standardise the two splits differently, so the model would be evaluated on
# inputs scaled inconsistently with the data it was trained on.
X_train5 = standard_scaler.fit_transform(X_train5)
X_test5 = standard_scaler.transform(X_test5)

In [None]:
# Re-fit the existing `model` object on the top-33 subset and report the held-out ROC AUC.
pred5 = model.fit(X_train5, y_train5).predict(X_test5)
sklearn.metrics.roc_auc_score(y_true=y_test5, y_score=pred5)

#  **Conclusion**


### A machine learning algorithm doesn't understand what a feature means or how it affects decision making; what it understands is numbers and the patterns within them. So, after trying different combinations and a few selection techniques, the highest Area Under Curve (AUC) score achieved was 92.86%, using the top 76 features from sklearn's SelectKBest, ranked by their score in the chi-square test. All of these features are listed below.

In [None]:
# Display the names of the 76 selected features.
top_76_features