### Import libraries

In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

### Data loading & understanding

In [2]:
# read the HR employee-attrition CSV (path is relative to the working directory)
DATA_PATH = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(DATA_PATH)
# display (n_rows, n_columns)
data.shape

(1470, 35)

In [3]:
# preview the first five rows of the raw data
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# count the missing values in every column
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

### Data preparation

In [5]:
# drop 'Over18': per the original analysis it holds only a single value
data = data.drop(columns=['Over18'])

In [6]:
# handle binary variables
# Encode 'Yes' as 1 and anything else as 0. Columns that are already numeric
# are skipped, so re-running this cell does not silently turn every value
# into 0 (an already-encoded 1 is not equal to 'Yes').
for binary_col in ('Attrition', 'OverTime'):
    if not pd.api.types.is_numeric_dtype(data[binary_col]):
        data[binary_col] = (data[binary_col] == 'Yes').astype('int64')

In [7]:
# re-check the first five rows after the binary columns were encoded
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [8]:
# collect the names of the columns that still have object dtype
categorical_vars = data.select_dtypes(include=['object']).columns

In [9]:
# one-hot encode the categorical columns as int64 indicators;
# drop_first=True removes the first level of each column (k-1 dummies for k levels)
data = pd.get_dummies(
    data,
    columns=categorical_vars,
    drop_first=True,
    dtype='int64',
)

In [10]:
# separate the target (y) from the feature matrix (X)
y = data['Attrition']
X = data.drop(columns=['Attrition'])

In [11]:
# 70/30 train/test split, stratified on y and seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=100,
)

### Model building & prediction

In [12]:
# AdaBoost over depth-1 decision trees (stumps), fitted on the training split
weak_learner = DecisionTreeClassifier(max_depth=1)
model = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=600,
    learning_rate=0.5,
    random_state=100,
)
model.fit(X_train, y_train)

In [13]:
# predict class labels for the training rows with the model fitted above;
# used to measure how well the model fits the data it was trained on
y_train_pred = model.predict(X_train)

In [14]:
# predict class labels for the held-out test rows with the model fitted above;
# used to measure performance on data the model did not see during fitting
y_test_pred = model.predict(X_test)

In [15]:
def compute_performance_metrices(actual, predicted):
    """Print binary-classification metrics for a set of predictions.

    Prints, in order: accuracy, the confusion matrix, sensitivity,
    precision, recall and specificity, each rounded to 4 decimals.
    Sensitivity and specificity are derived from the confusion matrix;
    the other scores come from the scikit-learn metric functions.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, aligned with ``actual``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print('Accuracy is: ', round(accuracy_score(actual, predicted), 4))

    mat = confusion_matrix(actual, predicted)
    if mat.shape != (2, 2):
        # Only one class appears in both inputs, so sklearn returned a 1x1
        # matrix and the unpacking below would fail. Rebuild it with the
        # 0/1 label order pinned so it is always 2x2.
        mat = confusion_matrix(actual, predicted, labels=[0, 1])
    print('\nConfusion matrix is: \n', mat)

    # Row = actual class, column = predicted class.
    TN, FP, FN, TP = mat.ravel()

    # Report 0.0 instead of dividing by zero when a class has no samples,
    # which is also what recall_score returns in that situation.
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0

    print('\nSensitivity is: ', round(sensitivity, 4))
    print('\nPrecision is: ', round(precision_score(actual, predicted), 4))
    print('\nRecall is: ', round(recall_score(actual, predicted), 4))
    print('\nSpecificity is: ', round(specificity, 4))

In [16]:
# metrics on the training data
compute_performance_metrices(actual=y_train, predicted=y_train_pred)

Accuracy is:  0.9407

Confusion matrix is: 
 [[852  11]
 [ 50 116]]

Sensitivity is:  0.6988

Precision is:  0.9134

Recall is:  0.6988

Specificity is:  0.9873


In [17]:
# metrics on the held-out test data
compute_performance_metrices(actual=y_test, predicted=y_test_pred)

Accuracy is:  0.8549

Confusion matrix is: 
 [[348  22]
 [ 42  29]]

Sensitivity is:  0.4085

Precision is:  0.5686

Recall is:  0.4085

Specificity is:  0.9405


### Using gradient boosting

In [18]:
# gradient boosting with default hyperparameters; seeded with the same value
# as the train/test split and the AdaBoost model so the fit is repeatable
model = GradientBoostingClassifier(random_state=100).fit(X_train, y_train)

In [19]:
# training-set predictions and metrics for the model fitted above
y_train_pred = model.predict(X_train)
compute_performance_metrices(actual=y_train, predicted=y_train_pred)

Accuracy is:  0.9602

Confusion matrix is: 
 [[863   0]
 [ 41 125]]

Sensitivity is:  0.753

Precision is:  1.0

Recall is:  0.753

Specificity is:  1.0


In [20]:
# test-set predictions and metrics for the model fitted above
y_test_pred = model.predict(X_test)
compute_performance_metrices(actual=y_test, predicted=y_test_pred)

Accuracy is:  0.8662

Confusion matrix is: 
 [[355  15]
 [ 44  27]]

Sensitivity is:  0.3803

Precision is:  0.6429

Recall is:  0.3803

Specificity is:  0.9595


### Using XGBoost

In [24]:
import xgboost as xgb

In [29]:
# XGBoost classifier; the hyperparameters are gathered in one dict so they
# can be read and tuned in a single place
xgb_params = {
    'colsample_bytree': 0.6,
    'gamma': 0.1,
    'max_depth': 5,
    'min_child_weight': 10,
    'n_estimators': 200,
    'n_jobs': -1,
    'subsample': 0.8,
}
model = xgb.XGBClassifier(**xgb_params).fit(X_train, y_train)

In [30]:
# training-set predictions and metrics for the model fitted above
y_train_pred = model.predict(X_train)
compute_performance_metrices(actual=y_train, predicted=y_train_pred)

Accuracy is:  0.9786

Confusion matrix is: 
 [[862   1]
 [ 21 145]]

Sensitivity is:  0.8735

Precision is:  0.9932

Recall is:  0.8735

Specificity is:  0.9988


In [31]:
# test-set predictions and metrics for the model fitted above
y_test_pred = model.predict(X_test)
compute_performance_metrices(actual=y_test, predicted=y_test_pred)

Accuracy is:  0.8639

Confusion matrix is: 
 [[350  20]
 [ 40  31]]

Sensitivity is:  0.4366

Precision is:  0.6078

Recall is:  0.4366

Specificity is:  0.9459
