In [2]:
#Import common libraries 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import time 
%matplotlib notebook
import pprint

In [None]:
# Import libraries for analysis

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [173]:
#Import data

#url = "https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset/download"
#df = pd.read_csv(url, compression='zip')

df = pd.read_csv('IBM-HR-Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [17]:
df.drop(['EmployeeCount', 'StandardHours', 'Over18','EmployeeNumber'], axis=1, inplace = True)
cat_features = [x for x in df.select_dtypes(include =['object']).columns if x not in ['Attrition']]
num_features = [x for x in df.select_dtypes(exclude =['object']).columns if x not in ['Attrition']]

df.Attrition = df.Attrition.map(dict(Yes = 1, No = 0))

print('Categorical:',cat_features)
print('Numeric:',num_features)

df.head()

Categorical: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
Numeric: ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


# Model building

In [18]:
# Encoding categorical variables 
dum_df = pd.get_dummies(df, columns=cat_features, prefix = cat_features)
dum_df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,1,1102,1,2,2,94,3,2,4,...,0,0,0,1,0,0,0,1,0,1
1,49,0,279,8,1,3,61,2,2,2,...,0,0,1,0,0,0,1,0,1,0
2,37,1,1373,2,2,4,92,2,1,3,...,0,0,0,0,0,0,0,1,0,1
3,33,0,1392,3,4,4,56,3,1,3,...,0,0,1,0,0,0,1,0,0,1
4,27,0,591,2,1,1,40,3,1,2,...,0,0,0,0,0,0,1,0,1,0


In [19]:
#from sklearn.model_selection import train_test_split

SEED = 80
y = dum_df.Attrition
X = dum_df.drop(['Attrition'], axis = 1)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= 0.3, random_state = SEED)
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

(1029, 51) (1029,)
(441, 51) (441,)


In [46]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 1092 to 1199
Data columns (total 50 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Age                                1029 non-null   int64
 1   DailyRate                          1029 non-null   int64
 2   DistanceFromHome                   1029 non-null   int64
 3   Education                          1029 non-null   int64
 4   EnvironmentSatisfaction            1029 non-null   int64
 5   HourlyRate                         1029 non-null   int64
 6   JobInvolvement                     1029 non-null   int64
 7   JobSatisfaction                    1029 non-null   int64
 8   MonthlyIncome                      1029 non-null   int64
 9   MonthlyRate                        1029 non-null   int64
 10  NumCompaniesWorked                 1029 non-null   int64
 11  PercentSalaryHike                  1029 non-null   int64
 12  PerformanceRating

In [21]:
# Normalizing 
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
Xtrain_scaled = minmax_scaler.fit_transform(Xtrain)
Xtest_scaled = minmax_scaler.transform(Xtest)

In [46]:
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

LR = LogisticRegression()
LR.fit(Xtrain_scaled, ytrain)

preds_LR = LR.predict(Xtest_scaled)
print("Logistic Regression Performance Before Undersampling: \n \n",
      classification_report(ytest, preds_LR),
      "\n ROC AUC score: ", roc_auc_score(ytest, preds_LR))

Logistic Regression Performance Before Undersampling: 
 
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       363
           1       0.85      0.36      0.50        78

    accuracy                           0.88       441
   macro avg       0.86      0.67      0.72       441
weighted avg       0.87      0.88      0.85       441
 
 ROC AUC score:  0.6726001271455817


In [73]:
#from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy='majority', random_state=SEED)
#undersample the majority class to have same number of observations as minority class 

In [74]:
Xtrain_under, ytrain_under = undersample.fit_resample(Xtrain, ytrain)
print("Before undersampling: \n", ytrain.value_counts())
print("After undersampling: \n", ytrain_under.value_counts())

Before undersampling: 
 0    870
1    159
Name: Attrition, dtype: int64
After undersampling: 
 1    159
0    159
Name: Attrition, dtype: int64


In [47]:
Xtrain_under_scaled = minmax_scaler.transform(Xtrain_under)
LR_under = LogisticRegression().fit(Xtrain_under_scaled, ytrain_under)

preds_LR_under = LR_under.predict(Xtest_scaled)
print("Logistic Regression Performance After Undersampling: \n \n",
      classification_report(ytest, preds_LR_under),
     "\n ROC AUC score: ", roc_auc_score(ytest, preds_LR_under))

Logistic Regression Performance After Undersampling: 
 
               precision    recall  f1-score   support

           0       0.92      0.70      0.80       363
           1       0.35      0.73      0.47        78

    accuracy                           0.71       441
   macro avg       0.63      0.72      0.63       441
weighted avg       0.82      0.71      0.74       441
 
 ROC AUC score:  0.716624284806103


In [77]:
#from imblearn.over_sampling import SMOTE

# define oversampling strategy
oversampl = SMOTE(sampling_strategy=0.6, random_state=SEED)

# fit and apply the transform
Xtrain_SMOTE, ytrain_SMOTE = oversampl.fit_resample(Xtrain, ytrain)

print("Before SMOTE: \n", ytrain.value_counts())
print("After SMOTE: \n", ytrain_SMOTE.value_counts())

Before SMOTE: 
 0    870
1    159
Name: Attrition, dtype: int64
After SMOTE: 
 0    870
1    522
Name: Attrition, dtype: int64


In [50]:
Xtrain_SMOTE_scaled = minmax_scaler.transform(Xtrain_SMOTE)
LR_SMOTE = LogisticRegression().fit(Xtrain_SMOTE_scaled, ytrain_SMOTE)

preds_LR_SMOTE = LR_SMOTE.predict(Xtest_scaled)
print("Logistic Regression Performance After SMOTE: \n \n",
      classification_report(ytest, preds_LR_SMOTE),
     "\n ROC AUC score: ", roc_auc_score(ytest, preds_LR_SMOTE))

Logistic Regression Performance After SMOTE: 
 
               precision    recall  f1-score   support

           0       0.88      0.98      0.93       363
           1       0.77      0.38      0.51        78

    accuracy                           0.87       441
   macro avg       0.82      0.68      0.72       441
weighted avg       0.86      0.87      0.85       441
 
 ROC AUC score:  0.6799109980928163


In [124]:
#from imblearn.pipeline import Pipeline
#from imblearn.over_sampling import SMOTE
#from sklearn.model_selection import cross_val_score, KFold

In [143]:
#Combination of over and under sampling

over_values = [0.3,0.4,0.5]  # minority class obs = 0.3* majority class obs 
under_values = [0.7,0.6,0.5,0.4] # majority class obs = minority class obs/ 0.5 
for o in over_values:
  for u in under_values:
    # define pipeline
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=o)
    under = RandomUnderSampler(sampling_strategy=u)
    steps = [('over', over), ('under', under), ('scaler', MinMaxScaler()), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = KFold(shuffle=True, random_state=SEED)
    scores = cross_val_score(pipeline, Xtrain, ytrain, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    
    print('SMOTE oversampling rate:%.1f, Random undersampling rate:%.1f , Mean ROC AUC: %.3f' % (o, u, score))

SMOTE oversampling rate:0.3, Random undersampling rate:0.7 , Mean ROC AUC: 0.800
SMOTE oversampling rate:0.3, Random undersampling rate:0.6 , Mean ROC AUC: 0.830
SMOTE oversampling rate:0.3, Random undersampling rate:0.5 , Mean ROC AUC: 0.820
SMOTE oversampling rate:0.3, Random undersampling rate:0.4 , Mean ROC AUC: 0.821
SMOTE oversampling rate:0.4, Random undersampling rate:0.7 , Mean ROC AUC: 0.824
SMOTE oversampling rate:0.4, Random undersampling rate:0.6 , Mean ROC AUC: 0.826
SMOTE oversampling rate:0.4, Random undersampling rate:0.5 , Mean ROC AUC: 0.828
SMOTE oversampling rate:0.4, Random undersampling rate:0.4 , Mean ROC AUC: 0.815
SMOTE oversampling rate:0.5, Random undersampling rate:0.7 , Mean ROC AUC: 0.828
SMOTE oversampling rate:0.5, Random undersampling rate:0.6 , Mean ROC AUC: 0.830
SMOTE oversampling rate:0.5, Random undersampling rate:0.5 , Mean ROC AUC: 0.826
SMOTE oversampling rate:0.5, Random undersampling rate:0.4 , Mean ROC AUC: nan


In [147]:
pipeline.set_params(over__sampling_strategy=0.5, under__sampling_strategy=0.7, model = LogisticRegression(),
                    over__random_state = SEED, under__random_state = SEED)
pipeline.fit(Xtrain, ytrain)
print(pipeline)

pipeline_preds = pipeline.predict(Xtest)
print("Logistic Regression Performance for combination of under and over sampling: \n \n",
      classification_report(ytest, pipeline_preds),
     "\n ROC AUC score: %.3f " % roc_auc_score(ytest, pipeline_preds))

Pipeline(steps=[('over', SMOTE(random_state=80, sampling_strategy=0.5)),
                ('under',
                 RandomUnderSampler(random_state=80, sampling_strategy=0.7)),
                ('scaler', MinMaxScaler()), ('model', LogisticRegression())])
Logistic Regression Performance for combination of under and over sampling: 
 
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       363
           1       0.69      0.44      0.54        78

    accuracy                           0.87       441
   macro avg       0.79      0.70      0.73       441
weighted avg       0.85      0.87      0.85       441
 
 ROC AUC score: 0.697 


In [141]:
#Under-sampling only
under_values = [1,0.9,0.8,0.7,0.6,0.5,0.4,0.3] # majority class obs = minority class obs/ 0.5 

for u in under_values:
    # define pipeline
    model = LogisticRegression()
    under = RandomUnderSampler(sampling_strategy=u)
    stepsu = [('under', under), ('scaler', MinMaxScaler()), ('model', model)]
    pipe_under = Pipeline(steps=stepsu)
    # evaluate pipeline
    cv = KFold(shuffle=True, random_state=SEED)
    scores_under = cross_val_score(pipe_under, Xtrain, ytrain, scoring='roc_auc', cv=cv, n_jobs=-1)
    score_under = np.mean(scores_under)
    
    print('Random undersampling rate:%.1f , Mean ROC AUC: %.3f' % (u, score_under))

Random undersampling rate:1.0 , Mean ROC AUC: 0.773
Random undersampling rate:0.9 , Mean ROC AUC: 0.809
Random undersampling rate:0.8 , Mean ROC AUC: 0.805
Random undersampling rate:0.7 , Mean ROC AUC: 0.802
Random undersampling rate:0.6 , Mean ROC AUC: 0.809
Random undersampling rate:0.5 , Mean ROC AUC: 0.815
Random undersampling rate:0.4 , Mean ROC AUC: 0.826
Random undersampling rate:0.3 , Mean ROC AUC: 0.809


In [47]:
# After rerunning the above for a few different seeds, 
# Under-sampling alone with a sampling ratio of 0.3 gives the best ROC AUC score
# Combination of SMOTE and under-sampling with sampling ratios of 0.5 each gives the best ROC AUC score

In [150]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC

In [157]:
#Under-sampling only

models = {
    'Random Forest': RandomForestClassifier(n_estimators = 500, 
                                            max_features= 'auto',
                                            random_state=SEED,  
                                            n_jobs=-1),
    
    'Logistic Regression': LogisticRegression(random_state=SEED, n_jobs=-1), 
    
    'Support Vector Clf': SVC(random_state=SEED) 
}
under_values = [0.7,0.6,0.5,0.4,0.3] # majority class obs = minority class obs/ 0.5 
for u in under_values:
    print("Undersampling rate: %.1f" % u)
    r = dict()
    for name, mod in models.items():
        pipe_under.set_params(under__sampling_strategy=u, under__random_state = SEED, model = mod)
        # evaluate pipeline
        cv = KFold(shuffle=True, random_state=SEED)
        scores_under = cross_val_score(pipe_under, Xtrain, ytrain, scoring='roc_auc', cv=cv, n_jobs=-1)
        r[name] = scores_under

        print(name, 'Accuracy Mean: %.3f, Std: %.3f' % (scores_under.mean(), scores_under.std()))

Undersampling rate: 0.7
Random Forest Accuracy Mean: 0.799, Std: 0.053
Logistic Regression Accuracy Mean: 0.817, Std: 0.043
Support Vector Clf Accuracy Mean: 0.795, Std: 0.041
Undersampling rate: 0.6
Random Forest Accuracy Mean: 0.797, Std: 0.050
Logistic Regression Accuracy Mean: 0.818, Std: 0.043
Support Vector Clf Accuracy Mean: 0.795, Std: 0.042
Undersampling rate: 0.5
Random Forest Accuracy Mean: 0.804, Std: 0.054
Logistic Regression Accuracy Mean: 0.822, Std: 0.046
Support Vector Clf Accuracy Mean: 0.801, Std: 0.046
Undersampling rate: 0.4
Random Forest Accuracy Mean: 0.809, Std: 0.053
Logistic Regression Accuracy Mean: 0.829, Std: 0.047
Support Vector Clf Accuracy Mean: 0.811, Std: 0.047
Undersampling rate: 0.3
Random Forest Accuracy Mean: 0.814, Std: 0.055
Logistic Regression Accuracy Mean: 0.832, Std: 0.048
Support Vector Clf Accuracy Mean: 0.817, Std: 0.052


In [148]:
pipe_under.set_params(under__sampling_strategy=0.3, under__random_state = SEED, model = LogisticRegression())
pipe_under.fit(Xtrain, ytrain)
print(pipe_under) 

pipe_under_preds = pipe_under.predict(Xtest)
print("Logistic Regression Performance for undersampling: \n \n",
      classification_report(ytest, pipe_under_preds),
     "\n ROC AUC score: %.3f " % roc_auc_score(ytest, pipe_under_preds))

Pipeline(steps=[('under',
                 RandomUnderSampler(random_state=80, sampling_strategy=0.3)),
                ('scaler', MinMaxScaler()), ('model', LogisticRegression())])
Logistic Regression Performance for undersampling: 
 
               precision    recall  f1-score   support

           0       0.89      0.96      0.93       363
           1       0.73      0.47      0.57        78

    accuracy                           0.88       441
   macro avg       0.81      0.72      0.75       441
weighted avg       0.86      0.88      0.86       441
 
 ROC AUC score: 0.718 
