In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV

from sklearn.metrics import recall_score


In [None]:
# Load the IBM HR attrition CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
att = pd.read_csv(DATA_PATH)
att.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
att.info()

# Preprocessing

In [None]:
# Summarise every column: its name, how many distinct values it holds, and
# what those distinct values are.
attFeatures = [
    [col, att[col].nunique(), att[col].drop_duplicates().values]
    for col in att.columns
]
pd.DataFrame(attFeatures, columns = ['Features', 'Unique Number', 'Values'])

**Preprocessing Scheme**
* OneHotEncoding: BusinessTravel, Department, EducationField, Gender, JobRole, MaritalStatus, OverTime
* The rest will be passed through unchanged.

In [None]:
# Encode the target as 1 = 'Yes' (resign) and 0 = anything else (stay).
# The guard makes the cell idempotent: without it, re-running the cell would
# compare the already-encoded integers against 'Yes' and silently turn every
# label into 0.
if not pd.api.types.is_numeric_dtype(att['Attrition']):
    att['Attrition'] = np.where(att['Attrition'] == 'Yes', 1, 0)

*I encode **1 as Yes (the employee resigns)** and **0 as No (the employee stays)**.*

In [None]:
# Remove the constant columns. A new frame is assigned instead of mutating in
# place, and errors='ignore' keeps the cell safe to re-run once the columns are
# already gone (an in-place drop raises KeyError on the second run).
att = att.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'], errors='ignore')

*I drop these columns because each of them has only one value for all rows.*

In [None]:
# Categorical columns are one-hot encoded (first level dropped); every other
# column is passed through untouched.
onehot_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender',
                  'JobRole', 'MaritalStatus', 'OverTime']
onehot_step = ('one hot', OneHotEncoder(drop = 'first'), onehot_columns)
transformer = ColumnTransformer([onehot_step], remainder = 'passthrough')

In [None]:
# Share of each target class, in percent of all rows.
attrition_counts = att['Attrition'].value_counts()
attrition_counts / len(att) * 100

*This indicates imbalanced data.*

* *0 = Stay*
* *1 = Resign*

        - TN: Predicted: Stay and Actual: Stay
        - TP: Predicted: Resign and Actual: Resign
        - FP: Predicted: Resign and Actual: Stay
        - FN: Predicted: Stay and Actual: Resign

*From this matrix, I choose to minimize FN, i.e. to maximize the recall score, so that employees who are going to resign are not missed because of a wrong prediction.*

**Splitting Data**

In [None]:
# Separate the target column from the predictors.
target_column = 'Attrition'
y = att[target_column]
X = att.drop(columns = target_column)

X.shape

In [None]:
# Hold out 30% of the rows for testing. stratify = y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# repeatable.
split_options = dict(stratify = y, test_size = 0.3, random_state = 3131)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

*I use 0.3 for test_size and a fixed random_state (taken from X.shape) so the split is reproducible; stratify = y keeps the class proportions equal in the train and test sets.*

# Modeling

**Define Model**
- I test with 4 models to find the best model:

    * Logistic Regression
    * Decision Tree Classifier
    * K-Nearest Neighbor
    * Random Forest Classifier

In [None]:
# Candidate classifiers with default hyper-parameters. The two tree-based
# models are seeded so that their results are repeatable.
seed = 3131
logreg = LogisticRegression()
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier(random_state = seed)
rf = RandomForestClassifier(random_state = seed)

# Cross Validation

In [None]:
# One pipeline per classifier. They all reuse the same ColumnTransformer
# instance; only KNN gets an additional MinMaxScaler step before the model.
preprocess_step = ('transformer', transformer)
logreg_pipe = Pipeline([preprocess_step, ('logreg', logreg)])
tree_pipe = Pipeline([preprocess_step, ('tree', tree)])
knn_pipe = Pipeline([preprocess_step, ('scale', MinMaxScaler()), ('knn', knn)])
rf_pipe = Pipeline([preprocess_step, ('rf', rf)])

def model_evaluation(model, metric, X = None, y = None, n_splits = 5):
    """Cross-validate `model` with stratified folds and return the per-fold scores.

    Parameters
    ----------
    model : estimator or pipeline accepted by cross_val_score.
    metric : value forwarded to cross_val_score's `scoring` argument
        (for example 'recall').
    X, y : data to cross-validate on. When omitted, the notebook-level
        X_train / y_train are used, which is the original behaviour.
    n_splits : number of stratified folds (default 5, as before).

    Returns
    -------
    The array returned by cross_val_score, one score per fold.
    """
    # Resolve the defaults at call time so the function keeps working with the
    # two-argument calls used throughout the notebook.
    X = X_train if X is None else X
    y = y_train if y is None else y
    skfold = StratifiedKFold(n_splits = n_splits)
    return cross_val_score(model, X, y, cv = skfold, scoring = metric)

# Per-fold recall from stratified cross-validation on the training split,
# computed for the four baseline pipelines in a fixed order.
base_pipes = [logreg_pipe, tree_pipe, knn_pipe, rf_pipe]
logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv = (
    model_evaluation(pipe, 'recall') for pipe in base_pipes
)
base_cv_scores = [logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv]

# Refit every pipeline on the full training split before scoring the test split.
for model in base_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in base_cv_scores]
score_std = [scores.std() for scores in base_cv_scores]
score_recall_score = [recall_score(y_test, pipe.predict(X_test)) for pipe in base_pipes]
method_name = ['Logistic Regression', 'Decision Tree Classifier', 'KNN Classifier', 'Random Forest Classifier']

# One row per model: CV mean/std of recall plus recall on the held-out test split.
cv_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
cv_summary

*From this method, I will choose the Decision Tree Classifier because it has the highest recall score. But overall, the score is still not good enough to make predictions. While processing the data, I already saw that it is imbalanced, so I decide to handle it using Under Sampling and Over Sampling.*

# Handling Imbalance

### UnderSampling

**RandomUnderSampler Model**

In [None]:
# Random under-sampling is a step inside each pipeline, so the resampling is
# performed whenever the pipeline itself is fitted.
# The stand-alone rus.fit_resample(X_train, y_train) call was removed because
# its X_under / y_under results were never used. The identical re-definition
# of model_evaluation was removed as well; the function defined in the Cross
# Validation cell is reused.
rus = RandomUnderSampler(random_state = 3131)

logreg_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('logreg', logreg)])
tree_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('tree', tree)])
# KNN is the only model that is scaled, and scaling happens before the sampler.
knn_pipe_under = Pipeline([('transformer', transformer), ('scale', MinMaxScaler()), ('rus', rus), ('knn', knn)])
rf_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('rf', rf)])

# Per-fold recall from stratified cross-validation for the four
# under-sampling pipelines, in a fixed order.
under_pipes = [logreg_pipe_under, tree_pipe_under, knn_pipe_under, rf_pipe_under]
logreg_under_cv, tree_under_cv, knn_under_cv, rf_under_cv = (
    model_evaluation(pipe, 'recall') for pipe in under_pipes
)
under_cv_scores = [logreg_under_cv, tree_under_cv, knn_under_cv, rf_under_cv]

# Refit every pipeline on the full training split before scoring the test split.
for model in under_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in under_cv_scores]
score_std = [scores.std() for scores in under_cv_scores]
score_recall_score = [recall_score(y_test, pipe.predict(X_test)) for pipe in under_pipes]
method_name = ['Logistic Regression UnderSampling', 'Decision Tree Classifier UnderSampling',
               'KNN Classifier UnderSampling', 'Random Forest Classifier UnderSampling']

# One row per model: CV mean/std of recall plus recall on the held-out test split.
under_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
under_summary

*The scores look better than before. From this Under Sampling, I will choose the Decision Tree Classifier because it has the highest recall score.*

### OverSampling

**RandomOverSampler Model**

In [None]:
# Random over-sampling is a step inside each pipeline, so the resampling is
# performed whenever the pipeline itself is fitted.
# The stand-alone ros.fit_resample(X_train, y_train) call was removed because
# its X_over / y_over results were never used. The identical re-definition of
# model_evaluation was removed as well; the function defined in the Cross
# Validation cell is reused.
ros = RandomOverSampler(random_state = 3131)

logreg_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('logreg', logreg)])
tree_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('tree', tree)])
# KNN is the only model that is scaled, and scaling happens before the sampler.
knn_pipe_over = Pipeline([('transformer', transformer), ('scale', MinMaxScaler()), ('ros', ros), ('knn', knn)])
rf_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('rf', rf)])

# Per-fold recall from stratified cross-validation for the four
# over-sampling pipelines, in a fixed order.
over_pipes = [logreg_pipe_over, tree_pipe_over, knn_pipe_over, rf_pipe_over]
logreg_over_cv, tree_over_cv, knn_over_cv, rf_over_cv = (
    model_evaluation(pipe, 'recall') for pipe in over_pipes
)
over_cv_scores = [logreg_over_cv, tree_over_cv, knn_over_cv, rf_over_cv]

# Refit every pipeline on the full training split before scoring the test split.
for model in over_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in over_cv_scores]
score_std = [scores.std() for scores in over_cv_scores]
score_recall_score = [recall_score(y_test, pipe.predict(X_test)) for pipe in over_pipes]
method_name = ['Logistic Regression OverSampling', 'Decision Tree Classifier OverSampling',
               'KNN Classifier OverSampling', 'Random Forest Classifier OverSampling']

# One row per model: CV mean/std of recall plus recall on the held-out test split.
over_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
over_summary

*Now the scores get worse again. The only model that stands out from the others is Logistic Regression with the Over Sampling method: its recall score is the highest among the models. The remaining models indicate underfitting.*

*Based on the 3 methods (Cross Validation, Under Sampling, and Over Sampling), I pick Logistic Regression with the Over Sampling method as the most stable model and continue to the HyperParameter Tuning process.*

# HyperParam Tuning

In [None]:
# Pipeline to tune: encode -> random over-sampling -> logistic regression.
estimator = Pipeline([
    ('transformer', transformer),
    ('ros', ros),
    ('model', logreg)
])

# Search space. Keys follow the '<step name>__<parameter>' convention, so they
# all target the 'model' step of the pipeline above.
hyperparam_space = {
    'model__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'model__solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'model__max_iter': [50, 100, 150, 200],
    'model__random_state': [3131]
}

# random_state fixes which 10 candidates are drawn from the search space, so
# the reported best score and parameters are the same on every run. Without
# it, each run sampled a different subset of candidates.
random = RandomizedSearchCV(
                estimator,
                param_distributions = hyperparam_space,
                cv = StratifiedKFold(n_splits = 5),
                scoring = 'recall',
                n_iter = 10,
                n_jobs = -1,
                random_state = 3131)

random.fit(X_train, y_train)

print('best score', random.best_score_)
print('best param', random.best_params_)

After the hyperparameter tuning process, the score is higher, which means that tuning can improve the model.

# Comparing Results

In [None]:
# Baseline: the untuned pipeline fitted on the training split.
estimator.fit(X_train, y_train)
y_pred_estimator = estimator.predict(X_test)
recall_estimator = recall_score(y_test, y_pred_estimator)

# Tuned: the search was fitted on X_train / y_train with the default
# refit=True, so best_estimator_ is already fitted on the whole training split
# with the best parameters. The extra .fit() call was redundant and has been
# dropped.
y_pred_random = random.best_estimator_.predict(X_test)
recall_best_estimator = recall_score(y_test, y_pred_random)

# Test-split recall before and after tuning, side by side.
score_list = [recall_estimator, recall_best_estimator]
method_name = ['Logistic Regression OverSampling Before Tuning', 'Logistic Regression OverSampling After Tuning']
best_summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
best_summary

So far, this is the best model that I got for predicting attrition in this case.