In [1]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
pd.pandas.set_option('display.max_columns',None)

## Borderline smote (Oversampling)

In [2]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')

X_test = pd.read_csv('Data/X_test_enc.csv')
y_test = pd.read_csv('Data/y_test_enc.csv')

In [3]:
print("\n\nValue counts: \n",y['Attrition'].value_counts())



Value counts: 
 0    863
1    166
Name: Attrition, dtype: int64


In [27]:
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [28]:
import imblearn
print(imblearn.__version__)

0.7.0


In [30]:
from imblearn.over_sampling import BorderlineSMOTE
X, y = BorderlineSMOTE().fit_resample(X, y)
print("Shape: ",X.shape,y.shape)
print("\n\nValue counts: \n",y['Attrition'].value_counts())

Shape:  (1726, 43) (1726, 1)


Value counts: 
 1    863
0    863
Name: Attrition, dtype: int64


In [31]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

In [32]:
rfc = RandomForestClassifier()
ada = AdaBoostClassifier()
gbc = GradientBoostingClassifier()
lr = LogisticRegression()
svc = SVC()
gnb = GaussianNB()
xgb = XGBClassifier()
dt = DecisionTreeClassifier()

In [33]:
estimators = [
    ('ada',ada),
    
    ('gbc',gbc),
    ('lr',lr),
    ('svc',svc),
    
    ('gnb',gnb),
    
    ('xgb',xgb),
    
    ('dt',dt)
]

In [34]:
stk_classifier = StackingClassifier(estimators=estimators, 
                                    final_estimator=RandomForestClassifier(n_estimators=100),
                                    passthrough=True,
                                    cv=5,
                                    n_jobs=-1,
                                    verbose=3)

In [35]:
stk_classifier.fit(X,y['Attrition'])

StackingClassifier(cv=5,
                   estimators=[('ada', AdaBoostClassifier()),
                               ('gbc', GradientBoostingClassifier()),
                               ('lr', LogisticRegression()), ('svc', SVC()),
                               ('gnb', GaussianNB()),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraint...
                                              min_child_weight=None,
                                              missing=nan,
                                              m

In [36]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,fbeta_score

In [37]:
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.8707482993197279
[[360  10]
 [ 47  24]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.93       370
           1       0.71      0.34      0.46        71

    accuracy                           0.87       441
   macro avg       0.80      0.66      0.69       441
weighted avg       0.86      0.87      0.85       441



## Smote (Oversampling)

In [38]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')

In [39]:
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [40]:
from imblearn.over_sampling import SMOTE
X, y = SMOTE().fit_resample(X, y)
print("Shape: ",X.shape,y.shape)
print("\n\nValue counts: \n",y['Attrition'].value_counts())

Shape:  (1726, 43) (1726, 1)


Value counts: 
 1    863
0    863
Name: Attrition, dtype: int64


In [41]:
stk_classifier.fit(X,y['Attrition'])

StackingClassifier(cv=5,
                   estimators=[('ada', AdaBoostClassifier()),
                               ('gbc', GradientBoostingClassifier()),
                               ('lr', LogisticRegression()), ('svc', SVC()),
                               ('gnb', GaussianNB()),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraint...
                                              min_child_weight=None,
                                              missing=nan,
                                              m

In [42]:
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.891156462585034
[[363   7]
 [ 41  30]]
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       370
           1       0.81      0.42      0.56        71

    accuracy                           0.89       441
   macro avg       0.85      0.70      0.75       441
weighted avg       0.88      0.89      0.88       441



## ADASYN

In [43]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')

In [44]:
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [45]:
from imblearn.over_sampling import ADASYN
X, y = ADASYN().fit_resample(X, y)
print("Shape: ",X.shape,y.shape)
print("\n\nValue counts: \n",y['Attrition'].value_counts())

Shape:  (1685, 43) (1685, 1)


Value counts: 
 0    863
1    822
Name: Attrition, dtype: int64


In [46]:
stk_classifier.fit(X,y['Attrition'])

StackingClassifier(cv=5,
                   estimators=[('ada', AdaBoostClassifier()),
                               ('gbc', GradientBoostingClassifier()),
                               ('lr', LogisticRegression()), ('svc', SVC()),
                               ('gnb', GaussianNB()),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraint...
                                              min_child_weight=None,
                                              missing=nan,
                                              m

In [47]:
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.8888888888888888
[[360  10]
 [ 39  32]]
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       370
           1       0.76      0.45      0.57        71

    accuracy                           0.89       441
   macro avg       0.83      0.71      0.75       441
weighted avg       0.88      0.89      0.88       441



## No sampling

In [49]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [50]:
stk_classifier.fit(X,y['Attrition'])
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.8820861678004536
[[357  13]
 [ 39  32]]
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       370
           1       0.71      0.45      0.55        71

    accuracy                           0.88       441
   macro avg       0.81      0.71      0.74       441
weighted avg       0.87      0.88      0.87       441



## Smotetomek (Combination of over and under sampling)

In [51]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [52]:
from imblearn.combine import SMOTETomek
X, y = SMOTETomek().fit_resample(X, y)
print("Shape: ",X.shape,y.shape)
print("\n\nValue counts: \n",y['Attrition'].value_counts())

Shape:  (1616, 43) (1616, 1)


Value counts: 
 1    808
0    808
Name: Attrition, dtype: int64


In [53]:
stk_classifier.fit(X,y['Attrition'])
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.8798185941043084
[[361   9]
 [ 44  27]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       370
           1       0.75      0.38      0.50        71

    accuracy                           0.88       441
   macro avg       0.82      0.68      0.72       441
weighted avg       0.87      0.88      0.86       441



## Smoteenn (Combination of over and under sampling)

In [54]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [55]:
from imblearn.combine import SMOTEENN
X, y = SMOTEENN().fit_resample(X, y)
print("Shape: ",X.shape,y.shape)
print("\n\nValue counts: \n",y['Attrition'].value_counts())

Shape:  (1015, 43) (1015, 1)


Value counts: 
 1    593
0    422
Name: Attrition, dtype: int64


In [56]:
stk_classifier.fit(X,y['Attrition'])
y_pred = stk_classifier.predict(X_test)
print('Accuracy:',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.8253968253968254
[[326  44]
 [ 33  38]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       370
           1       0.46      0.54      0.50        71

    accuracy                           0.83       441
   macro avg       0.69      0.71      0.70       441
weighted avg       0.84      0.83      0.83       441

