In [17]:
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [36]:
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))
    
def calc_mertic_info(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    global mertic_info
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)
            
    r2_train = metrics.r2_score(y_train, y_train_predicted)
    r2_test = metrics.r2_score(y_test, y_test_predicted)
    
    rms_train = np.sqrt(metrics.mean_squared_error(y_train, y_train_predicted))
    rms_test = np.sqrt(metrics.mean_squared_error(y_test, y_test_predicted))
        
    mae_train = metrics.mean_absolute_error(y_train, y_train_predicted)
    mae_test = metrics.mean_absolute_error(y_test, y_test_predicted)
        
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100
        
    rmse_dict[modelname] = rms_test
        
    df_local = pd.DataFrame({'Model':[modelname],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test],
                            'mape_train':[mape_train],
                            'mape_test':[mape_test]})
        
    mertic_info = pd.concat([mertic_info, df_local])
    return mertic_info    

data = pd.read_csv("boruta_features.csv")


In [37]:
data.head()

Unnamed: 0,1 net profit / total assets,2 total liabilities / total assets,6 retained earnings / total assets,5 [(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365,8 book value of equity / total liabilities,9 sales / total assets,10 equity / total assets,12 gross profit / short-term liabilities,13 (gross profit + depreciation) / sales,16 (gross profit + depreciation) / total liabilities,...,26 (net profit + depreciation) / total liabilities,27 profit on operating activities / financial expenses,29 logarithm of total assets,34 operating expenses / total liabilities,38 constant capital / total assets,46 (current assets - inventory) / short-term liabilities,51 short-term liabilities / total assets,55 working capital,64 sales / fixed assets,65 Bankrupt
0,0.20055,0.37951,0.38825,32.351,1.3305,1.1389,0.50494,0.6598,0.1666,0.73378,...,0.60411,1.4582,5.9443,0.56393,0.50591,1.5225,0.37854,348690.0,7.4277,0
1,0.18732,0.61323,0.18732,-7.3128,0.6307,1.1559,0.38677,0.33147,0.12182,0.32211,...,0.32211,1.4138,4.1424,0.3234,0.43489,0.95787,0.56511,3186.6,7.898,0
2,0.00902,0.63202,0.0,-37.842,0.58223,1.3332,0.36798,0.033921,0.038938,0.082138,...,0.073572,1.0714,5.9479,1.7697,0.49344,0.81192,0.42554,1.1263,2.5603,0
3,0.26669,0.34994,0.55983,43.087,1.8577,1.1268,0.65006,1.0993,0.12047,0.99444,...,0.80759,1.1885,3.9412,0.87075,0.69793,2.0239,0.30207,5340.0,33.413,0
4,0.067731,0.19885,0.21265,90.606,4.029,1.257,0.80115,1.8736,0.31036,0.39415,...,0.3422,2.6744,5.2684,0.27021,0.95834,2.2195,0.041664,15132.0,0.28803,0


In [38]:
X = data.drop(['65 Bankrupt'], axis =1)
y = data['65 Bankrupt']

# Random Forest Regression

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
# splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

In [41]:
# build normal model
pipeline = make_pipeline(preprocessing.MinMaxScaler(), 
                         RandomForestClassifier())
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), RandomForestClassifier())
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

# classification report
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

# print information about both models
print()
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_sample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print()
print('normal Pipeline Score {}'.format(pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

print()
print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()

calc_mertic_info('Regression', gs_lr, X_train, y_train, X_test, y_test)

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       946
          1       0.00      0.00      0.00        20

avg / total       0.96      0.98      0.97       966

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.98      0.20      0.98      0.44      0.21       946
          1       0.17      0.20      0.98      0.18      0.44      0.18        20

avg / total       0.97      0.96      0.22      0.96      0.44      0.21       966


normal data distribution: Counter({0: 3787, 1: 76})
SMOTE data distribution: Counter({0: 3787, 1: 3787})

normal Pipeline Score 0.9782608695652174
SMOTE Pipeline Score 0.9627329192546584

normal classification
accuracy: 0.9782608695652174
precision: 0.0
recall: 0.0
f1: 0.0

SMOTE classification
accuracy: 0.9627329192546584
precision: 0.16666666666666666
recall: 0.2
f1: 0.1818181818181818



# Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

In [None]:
# lrpipeline = make_pipeline(preprocessing.MinMaxScaler(), 
#                          LogisticRegression())

# model = lrpipeline.fit(X_train, y_train)
# prediction = model.predict(X_test)

# lr.best_estimator_

# lrscore = lr.score(X_test,y_test)

In [44]:
# build normal model
LR_pipeline = make_pipeline(preprocessing.MinMaxScaler(), 
                          LogisticRegression())
model = LR_pipeline.fit(X_train, y_train)
LR_prediction = model.predict(X_test)

# build model with SMOTE imblearn
LR_smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),  LogisticRegression())
LR_smote_model = LR_smote_pipeline.fit(X_train, y_train)
LR_smote_prediction = LR_smote_model.predict(X_test)

# classification report
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, LR_smote_prediction))

# print information about both models
print()
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_sample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print()
print('normal Pipeline Score {}'.format(LR_pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(LR_smote_pipeline.score(X_test, y_test)))

print()
print_results("normal classification", y_test, LR_prediction)
print()
print_results("SMOTE classification", y_test, LR_smote_prediction)
print()

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       946
          1       0.00      0.00      0.00        20

avg / total       0.96      0.98      0.97       966

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.77      0.90      0.87      0.83      0.68       946
          1       0.08      0.90      0.77      0.14      0.83      0.70        20

avg / total       0.98      0.77      0.90      0.85      0.83      0.68       966


normal data distribution: Counter({0: 3787, 1: 76})
SMOTE data distribution: Counter({0: 3787, 1: 3787})

normal Pipeline Score 0.979296066252588
SMOTE Pipeline Score 0.772256728778468

normal classification
accuracy: 0.979296066252588
precision: 0.0
recall: 0.0
f1: 0.0

SMOTE classification
accuracy: 0.772256728778468
precision: 0.07627118644067797
recall: 0.9
f1: 0.140625



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Neural Nets

In [45]:
from sklearn.neural_network import MLPClassifier

In [46]:
# build normal model
NN_pipeline = make_pipeline(preprocessing.MinMaxScaler(), 
                          MLPClassifier())
model = NN_pipeline.fit(X_train, y_train)
NN_prediction = model.predict(X_test)

# build model with SMOTE imblearn
NN_smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),  LogisticRegression())
NN_smote_model = NN_smote_pipeline.fit(X_train, y_train)
NN_smote_prediction = NN_smote_model.predict(X_test)

# classification report
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, NN_smote_prediction))

# print information about both models
print()
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_sample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print()
print('normal Pipeline Score {}'.format(NN_pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(NN_smote_pipeline.score(X_test, y_test)))

print()
print_results("normal classification", y_test, NN_prediction)
print()
print_results("SMOTE classification", y_test, NN_smote_prediction)
print()

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       946
          1       0.00      0.00      0.00        20

avg / total       0.96      0.98      0.97       966

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.77      0.90      0.87      0.83      0.68       946
          1       0.08      0.90      0.77      0.14      0.83      0.70        20

avg / total       0.98      0.77      0.90      0.85      0.83      0.68       966


normal data distribution: Counter({0: 3787, 1: 76})
SMOTE data distribution: Counter({0: 3787, 1: 3787})

normal Pipeline Score 0.979296066252588
SMOTE Pipeline Score 0.772256728778468

normal classification
accuracy: 0.979296066252588
precision: 0.0
recall: 0.0
f1: 0.0

SMOTE classification
accuracy: 0.772256728778468
precision: 0.07627118644067797
recall: 0.9
f1: 0.140625



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# SVC

In [47]:
from sklearn.svm import SVC

In [48]:
SVC_pipeline = make_pipeline(preprocessing.MinMaxScaler(), 
                          SVC())
model = SVC_pipeline.fit(X_train, y_train)
SVC_prediction = model.predict(X_test)

# build model with SMOTE imblearn
SVC_smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),  LogisticRegression())
SVC_smote_model = SVC_smote_pipeline.fit(X_train, y_train)
SVC_smote_prediction = SVC_smote_model.predict(X_test)

# classification report
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, SVC_smote_prediction))

# print information about both models
print()
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_sample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print()
print('normal Pipeline Score {}'.format(SVC_pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(SVC_smote_pipeline.score(X_test, y_test)))

print()
print_results("normal classification", y_test, SVC_prediction)
print()
print_results("SMOTE classification", y_test, SVC_smote_prediction)
print()

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       946
          1       0.00      0.00      0.00        20

avg / total       0.96      0.98      0.97       966

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.77      0.85      0.87      0.81      0.65       946
          1       0.07      0.85      0.77      0.14      0.81      0.66        20

avg / total       0.98      0.78      0.85      0.86      0.81      0.65       966


normal data distribution: Counter({0: 3787, 1: 76})
SMOTE data distribution: Counter({0: 3787, 1: 3787})

normal Pipeline Score 0.979296066252588
SMOTE Pipeline Score 0.7753623188405797

normal classification
accuracy: 0.979296066252588
precision: 0.0
recall: 0.0
f1: 0.0

SMOTE classification
accuracy: 0.7753623188405797
precision: 0.0735930735930736
recall: 0.85
f1: 0.1354581673306773



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
