In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import utils
%matplotlib inline
sns.set_style('white')

In [2]:
# Load the IBM HR attrition data and turn it into an all-numeric frame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index that stays in df as a predictor -- confirm that is intended.
df = pd.read_csv("IBM HR attrition.csv")

# Two-valued string columns become 0/1 indicators.
df['Attrition'] = np.where(df['Attrition']=='Yes', 1, 0)
df['OverTime'] = np.where(df['OverTime']=='Yes', 1, 0)
df['Over18'] = np.where(df['Over18']=="Y", 1, 0)
df['Gender'] = np.where(df['Gender']=='Male', 1, 0)

# One-hot encode Department. dtype=int keeps the dummies as 0/1 integers;
# pandas >= 2.0 emits booleans by default, which makes the mixed int/bool frame
# convert to an object array when it is handed to statsmodels later on.
df = pd.concat([df, pd.get_dummies(df['Department'], dtype=int)], axis=1)

# The remaining string-valued columns are dropped rather than encoded.
df = df.drop(columns=['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus'])

# Remove incomplete rows *before* previewing so the displayed frame is the one
# that gets modelled, and reset the index so row labels stay 0..n-1.
df = df.dropna().reset_index(drop=True)

# Now all the data is in numerical format.
display(df.head())

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Human Resources,Research & Development,Sales
0,41,1,1102,1,2,1,1,2,0,94,...,8,0,1,6,4,0,5,0,0,1
1,49,0,279,8,1,1,2,3,1,61,...,10,3,3,10,7,1,7,0,1,0
2,37,1,1373,2,2,1,4,4,1,92,...,7,3,3,0,0,0,0,0,1,0
3,33,0,1392,3,4,1,5,4,0,56,...,8,3,3,8,7,3,0,0,1,0
4,27,0,591,2,1,1,7,1,1,40,...,6,3,3,2,2,2,2,0,1,0


### Vanilla Logistic Regression

In [3]:
# Declare predictors: every column except the target, plus an explicit constant
# column because sm.Logit does not add an intercept on its own.
# The float cast guarantees a purely numeric design matrix even if some columns
# arrive as booleans (e.g. dummy columns produced by newer pandas versions).
X_statsmod = df.drop('Attrition', axis=1).assign(intercept=1).astype(float)

# The model is fit on all rows; no train/test split is needed for the
# coefficient summary below.
# NOTE(review): the frame appears to contain constant columns (EmployeeCount is
# 1 in every previewed row) and a full set of Department dummies next to the
# intercept, so the design matrix is probably rank-deficient -- consistent with
# the RuntimeWarnings printed after the summary. Consider dropping them.
logit = sm.Logit(df['Attrition'], X_statsmod)
result = logit.fit(maxiter=20)

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is not part of this report.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.320065
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:              Attrition   No. Observations:                 1470
Model:                          Logit   Df Residuals:                     1441
Method:                           MLE   Df Model:                           28
Date:                Tue, 30 Oct 2018   Pseudo R-squ.:                  0.2754
Time:                        14:34:44   Log-Likelihood:                -470.50
converged:                       True   LL-Null:                       -649.29
                                        LLR p-value:                 7.397e-59
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Age                         -0.0336      0.013     -2.596      0.009      -0.059

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [4]:
# "Vanilla" logistic regression: C is set very high so the regularisation
# penalty is almost entirely negated.
# The solver is pinned so the fit does not depend on which default a given
# scikit-learn version ships (liblinear was the default before 0.22).
logreg = linear_model.LogisticRegression(C=1e9, solver='liblinear')
X = df.drop('Attrition', axis=1)
Y = df['Attrition']
logreg.fit(X, Y)
y_pred = logreg.predict(X)

# score() on a classifier returns mean accuracy, not R-squared.
print("\nAccuracy: ")
print(logreg.score(X, Y))

# Confusion counts computed on positional arrays: indexing the Series with
# Y[i] is label-based and would misalign (or fail) if the index is not 0..n-1.
actual = np.asarray(Y)
sum_tn = int(np.sum((actual == 0) & (y_pred == 0)))
sum_fp = int(np.sum((actual == 0) & (y_pred == 1)))
sum_fn = int(np.sum((actual == 1) & (y_pred == 0)))
sum_tp = int(np.sum((actual == 1) & (y_pred == 1)))

print(pd.crosstab(y_pred, Y))
# Numeric formatting rounds the rates; slicing the string form truncated them
# and would break on values printed in scientific notation.
print("False Negative rate: {:.2f}% \nFalse Positive rate: {:.2f}% ".format(
    sum_fn/(sum_fn+sum_tp)*100,
    sum_fp/(sum_fp+sum_tn)*100))
#class imbalance present


R-Squared Value: 
0.8666666666666667
Attrition     0    1
row_0               
0          1215  178
1            18   59
False Negative rate: 75.10% 
False Positive rate: 1.45% 


In [5]:
# Hold out 20% of the rows and compare hold-out accuracy with in-sample accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=10
)

logreg.fit(X_train, Y_train)
print('20% sample score: ', logreg.score(X_test, Y_test))

# Refit on every row; logreg is left trained on the full data afterwards.
logreg.fit(X, Y)
print('Full sample score: ', logreg.score(X, Y))

20% sample score:  0.8469387755102041
Full sample score:  0.8666666666666667


In [6]:
# 10-fold cross-validated scores: a relatively good fit, although the class
# imbalance might still cause problems.
cross_val_score(estimator=logreg, X=X, y=Y, cv=10)

array([0.87837838, 0.85135135, 0.86486486, 0.87755102, 0.85034014,
       0.84353741, 0.8707483 , 0.85616438, 0.89041096, 0.87671233])

### L2 Regularization/Ridge Regression

In [7]:
# L2-penalised (ridge) logistic regression with the default strength C=1.0.
# The solver is pinned so the fit does not depend on which default a given
# scikit-learn version ships (liblinear was the default before 0.22).
ridge = linear_model.LogisticRegression(solver='liblinear')
X = df.drop('Attrition', axis=1)
Y = df['Attrition']
ridge.fit(X, Y)
y_pred = ridge.predict(X)

# score() on a classifier returns mean accuracy, not R-squared.
print("\nAccuracy: ")
print(ridge.score(X, Y))

# Confusion counts computed on positional arrays: indexing the Series with
# Y[i] is label-based and would misalign (or fail) if the index is not 0..n-1.
actual = np.asarray(Y)
sum_tn = int(np.sum((actual == 0) & (y_pred == 0)))
sum_fp = int(np.sum((actual == 0) & (y_pred == 1)))
sum_fn = int(np.sum((actual == 1) & (y_pred == 0)))
sum_tp = int(np.sum((actual == 1) & (y_pred == 1)))

print(sum_tn, sum_tp, sum_fn, sum_fp)
# Numeric formatting rounds the rates; slicing the string form truncated them
# and would break on values printed in scientific notation.
print("False Negative rate: {:.2f}% \nFalse Positive rate: {:.2f}% ".format(
    sum_fn/(sum_fn+sum_tp)*100,
    sum_fp/(sum_fp+sum_tn)*100))
#class imbalance present


R-Squared Value: 
0.8659863945578231
1213 60 177 20
False Negative rate: 74.68% 
False Positive rate: 1.62% 


In [8]:
# Hold out 20% of the rows and compare hold-out accuracy with in-sample accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=10
)

ridge.fit(X_train, Y_train)
print('20% sample score: ', ridge.score(X_test, Y_test))

# Refit on every row before reporting the in-sample score.
ridge.fit(X, Y)
print("Full sample score: ", ridge.score(X, Y))

# 10-fold cross-validated scores, shown as the cell's output.
cross_val_score(estimator=ridge, X=X, y=Y, cv=10)

20% sample score:  0.8469387755102041
Full sample score:  0.8659863945578231


array([0.83783784, 0.85810811, 0.86486486, 0.87755102, 0.85714286,
       0.85034014, 0.86394558, 0.85616438, 0.87671233, 0.86986301])

### L1 regularization/Lasso Regression

In [9]:
# L1-penalised (lasso) logistic regression, C=10.
# solver='liblinear' is required: newer scikit-learn defaults to lbfgs, which
# rejects penalty='l1'. random_state seeds liblinear's data shuffling so
# repeated fits give the same coefficients.
lasso = linear_model.LogisticRegression(
    penalty='l1', C=10, solver='liblinear', random_state=10)
print(utils.multiclass.type_of_target(Y))
lasso.fit(X, Y)
y_pred = lasso.predict(X)
conf_matrix = pd.crosstab(y_pred, Y)

# score() on a classifier returns mean accuracy, not R-squared.
print("\nAccuracy: ")
print(lasso.score(X, Y))
print("\nConfusion matrix of results:\n")
print(conf_matrix, "\n")
#class imbalance present

# Confusion counts computed on positional arrays: indexing the Series with
# Y[i] is label-based and would misalign (or fail) if the index is not 0..n-1.
actual = np.asarray(Y)
sum_tn = int(np.sum((actual == 0) & (y_pred == 0)))
sum_fp = int(np.sum((actual == 0) & (y_pred == 1)))
sum_fn = int(np.sum((actual == 1) & (y_pred == 0)))
sum_tp = int(np.sum((actual == 1) & (y_pred == 1)))

# Numeric formatting rounds the rates; slicing the string form truncated them
# and would break on values printed in scientific notation.
print("False Negative rate: {:.2f}% \nFalse Positive rate: {:.2f}% ".format(
    sum_fn/(sum_fn+sum_tp)*100,
    sum_fp/(sum_fp+sum_tn)*100))
# Class imbalance is likely: the large majority of rows have Attrition == 0.

binary

R-Squared Value: 
0.8768707482993198

Confusion matrix of results:

Attrition     0    1
row_0               
0          1202  150
1            31   87 

False Negative rate: 63.29% 
False Positive rate: 2.51% 


In [10]:
# Hold out 20% of the rows and compare hold-out accuracy with in-sample accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=10
)

lasso.fit(X_train, Y_train)
print('20% sample score: ', lasso.score(X_test, Y_test))

# Refit on every row before reporting the in-sample score.
lasso.fit(X, Y)
print("Full sample score: ", lasso.score(X, Y))

# 10-fold cross-validated scores, shown as the cell's output.
cross_val_score(estimator=lasso, X=X, y=Y, cv=10)

20% sample score:  0.8503401360544217
Full sample score:  0.8782312925170068


array([0.88513514, 0.86486486, 0.86486486, 0.8707483 , 0.85714286,
       0.83673469, 0.87755102, 0.87671233, 0.88356164, 0.88356164])

In [12]:
# L1-penalised (lasso) logistic regression with a much weaker penalty, C=1000.
# solver='liblinear' is required: newer scikit-learn defaults to lbfgs, which
# rejects penalty='l1'. random_state seeds liblinear's data shuffling so
# repeated fits give the same coefficients.
lasso = linear_model.LogisticRegression(
    penalty='l1', C=1000, solver='liblinear', random_state=10)
print(utils.multiclass.type_of_target(Y))
lasso.fit(X, Y)
y_pred = lasso.predict(X)
conf_matrix = pd.crosstab(y_pred, Y)

# score() on a classifier returns mean accuracy, not R-squared.
print("\nAccuracy: ")
print(lasso.score(X, Y))
print("\nConfusion matrix of results:\n")
print(conf_matrix, "\n")
#class imbalance present

# Confusion counts computed on positional arrays: indexing the Series with
# Y[i] is label-based and would misalign (or fail) if the index is not 0..n-1.
actual = np.asarray(Y)
sum_tn = int(np.sum((actual == 0) & (y_pred == 0)))
sum_fp = int(np.sum((actual == 0) & (y_pred == 1)))
sum_fn = int(np.sum((actual == 1) & (y_pred == 0)))
sum_tp = int(np.sum((actual == 1) & (y_pred == 1)))

# Numeric formatting rounds the rates; slicing the string form truncated them
# and would break on values printed in scientific notation.
print("False Negative rate: {:.2f}% \nFalse Positive rate: {:.2f}% ".format(
    sum_fn/(sum_fn+sum_tp)*100,
    sum_fp/(sum_fp+sum_tn)*100))

binary

R-Squared Value: 
0.8802721088435375

Confusion matrix of results:

Attrition     0    1
row_0               
0          1202  145
1            31   92 

False Negative rate: 61.18% 
False Positive rate: 2.51% 


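The Lasso (L1) model seems to be the best overall: it has the highest accuracy and a clearly lower false negative rate (type II error, roughly 61–63% versus about 75% for the other models), at the cost of a slightly higher false positive rate (type I error, about 2.5% versus 1.5–1.6%). More or less, the models are very similar, with a slight improvement from regularization, particularly L1.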