In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [36]:
data = pd.read_csv("Datasets/Bankcruptdata.csv")

In [37]:
X = data.drop("Bankrupt?",axis=1)
y = data['Bankrupt?']
print(y.value_counts())
print(y.value_counts(normalize=True)*100)

Bankrupt?
0    6599
1     220
Name: count, dtype: int64
Bankrupt?
0    96.77372
1     3.22628
Name: proportion, dtype: float64


In [38]:
# underSampling
from imblearn.under_sampling import RandomUnderSampler
under_sampler = RandomUnderSampler(random_state=23)
X_us,y_us = under_sampler.fit_resample(X,y)
print(y_us.value_counts())
print(y_us.value_counts(normalize=True)*100)

Bankrupt?
0    220
1    220
Name: count, dtype: int64
Bankrupt?
0    50.0
1    50.0
Name: proportion, dtype: float64


In [39]:
#OverSampling
from imblearn.over_sampling import RandomOverSampler
over_sampler = RandomOverSampler(random_state=23)
X_os,y_os = over_sampler.fit_resample(X,y)
print(y_os.value_counts())
print(y_os.value_counts(normalize=True)*100)

Bankrupt?
1    6599
0    6599
Name: count, dtype: int64
Bankrupt?
1    50.0
0    50.0
Name: proportion, dtype: float64


In [40]:
#synthetic minority oversampling technique for estimation
from imblearn.over_sampling import SMOTE
smote_sampler = SMOTE(random_state=23)
X_smote,y_smote = smote_sampler.fit_resample(X,y)
print(y_smote.value_counts())
print(y_smote.value_counts(normalize=True)*100)

Bankrupt?
1    6599
0    6599
Name: count, dtype: int64
Bankrupt?
1    50.0
0    50.0
Name: proportion, dtype: float64


In [41]:
#SVM-SMOTE
from imblearn.over_sampling import SVMSMOTE
svm_smote_sampler = SVMSMOTE(random_state=23)
X_svm_smote,y_svm_smote = svm_smote_sampler.fit_resample(X,y)
print(y_svm_smote.value_counts())
print(y_svm_smote.value_counts(normalize=True)*100)

Bankrupt?
0    6599
1    3726
Name: count, dtype: int64
Bankrupt?
0    63.912833
1    36.087167
Name: proportion, dtype: float64


In [42]:
#Adasyn
from imblearn.over_sampling import ADASYN
adasyn_sampler =ADASYN(random_state=23)
X_adasyn,y_adasyn = adasyn_sampler.fit_resample(X,y)
print(y_adasyn.value_counts())
print(y_adasyn.value_counts(normalize=True)*100)

Bankrupt?
0    6599
1    6523
Name: count, dtype: int64
Bankrupt?
0    50.28959
1    49.71041
Name: proportion, dtype: float64


In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
#undersampling
rfc = RandomForestClassifier(random_state=23)
print("Cross val score using undersampling:",cross_val_score(rfc,X_us,y_us).mean())
print("Cross val score using oversampling:",cross_val_score(rfc,X_os,y_os).mean())
print("Cross val score using SMOTE:",cross_val_score(rfc,X_smote,y_smote).mean())
print("Cross val score using SVM_SMOTE:",cross_val_score(rfc,X_svm_smote,y_svm_smote).mean())
print("Cross val score using ADASYN:",cross_val_score(rfc,X_adasyn,y_adasyn).mean())


Cross val score using undersampling: 0.875
Cross val score using oversampling: 0.9919690367104159
Cross val score using SMOTE: 0.9748463605360158
Cross val score using SVM_SMOTE: 0.9716222760290558
Cross val score using ADASYN: 0.9616693089430894
