In [2]:
import numpy as np
import pandas as pd

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,log_loss
from imblearn.over_sampling import SMOTE
import xgboost
from sklearn.model_selection import train_test_split

#Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

## 1. EDA 

In [3]:
attrition = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
#Looking for NaN
attrition.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

### Correlation of Features

In [5]:
attrition.corr()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,0.010661,-0.001686,0.208034,,-0.010145,0.010146,0.024287,0.02982,0.509604,...,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
DailyRate,0.010661,1.0,-0.004985,-0.016806,,-0.05099,0.018355,0.023381,0.046135,0.002966,...,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
DistanceFromHome,-0.001686,-0.004985,1.0,0.021042,,0.032916,-0.016075,0.031131,0.008783,0.005303,...,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.016806,0.021042,1.0,,0.04207,-0.027128,0.016775,0.042438,0.101589,...,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EmployeeCount,,,,,,,,,,,...,,,,,,,,,,
EmployeeNumber,-0.010145,-0.05099,0.032916,0.04207,,1.0,0.017621,0.035179,-0.006888,-0.018519,...,-0.069861,,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
EnvironmentSatisfaction,0.010146,0.018355,-0.016075,-0.027128,,0.017621,1.0,-0.049857,-0.008278,0.001212,...,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,0.023381,0.031131,0.016775,,0.035179,-0.049857,1.0,0.042861,-0.027853,...,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
JobInvolvement,0.02982,0.046135,0.008783,0.042438,,-0.006888,-0.008278,0.042861,1.0,-0.01263,...,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
JobLevel,0.509604,0.002966,0.005303,0.101589,,-0.018519,0.001212,-0.027853,-0.01263,1.0,...,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281


### Feature Engineering and Categorical Encoding

In [6]:
attrition.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [9]:
#Empty list to store columns with categorical data
categorical = []
for col,value in attrition.iteritems():
    if value.dtype=='object':
        categorical.append(col)
        
#Store the numerical columns in a list numerical
numerical = attrition.columns.difference(categorical)

In [10]:
#Store the categorical data in a dataframe called attrition_cat
attrition_cat = attrition[categorical]
attrition_cat = attrition_cat.drop(['Attrition'],axis=1) #Dropping the target column

In [11]:
attrition_cat

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,Y,No
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Y,Yes
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,Y,No


#### Applying the get_dummies method

In [12]:
attrition_cat = pd.get_dummies(attrition_cat,drop_first=True)
attrition_cat.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Research & Development,Department_Sales,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
1,1,0,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,1
3,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,0,1,1,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [13]:
#Store the numerical features to a dataframe attrition_num
attrition_num = attrition[numerical]

In [14]:
#Concat the two dataframes together columnwise
attrition_final = pd.concat([attrition_num,attrition_cat],axis=1)

In [15]:
attrition_final.shape

(1470, 47)

### Target variable

In [16]:
#Definte a dictionary for the target mapping
target_map = {'Yes':1,'No':0}

#applying it on the target variable
target = attrition['Attrition'].apply(lambda x:target_map[x])
target.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

#### Splitting data into Train and Test sets

In [17]:
#Split data into train and test sets as well as for validation and testing
train, test, target_train, target_test = train_test_split(attrition_final,target,train_size=0.75,random_state=0)

### Implementing Machine Learning Models

In [18]:
gb = GradientBoostingClassifier(random_state=100)
gb.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 100,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [19]:
#Fit the model to our train and target
gb.fit(train,target_train)
#Get our predictions
gb_predictions = gb.predict(test)

In [20]:
gb_predictions_prob = gb.predict_proba(test)
gb_predictions_prob

array([[0.93565701, 0.06434299],
       [0.96798638, 0.03201362],
       [0.8747258 , 0.1252742 ],
       [0.95525024, 0.04474976],
       [0.10720351, 0.89279649],
       [0.67487367, 0.32512633],
       [0.59570909, 0.40429091],
       [0.91183264, 0.08816736],
       [0.96662071, 0.03337929],
       [0.90249265, 0.09750735],
       [0.93460885, 0.06539115],
       [0.90269076, 0.09730924],
       [0.9732961 , 0.0267039 ],
       [0.26854533, 0.73145467],
       [0.94012005, 0.05987995],
       [0.98792486, 0.01207514],
       [0.94597233, 0.05402767],
       [0.93071641, 0.06928359],
       [0.93537954, 0.06462046],
       [0.92440318, 0.07559682],
       [0.60995072, 0.39004928],
       [0.95078893, 0.04921107],
       [0.96838558, 0.03161442],
       [0.97980593, 0.02019407],
       [0.41611433, 0.58388567],
       [0.74410279, 0.25589721],
       [0.95968266, 0.04031734],
       [0.97096781, 0.02903219],
       [0.28238165, 0.71761835],
       [0.96050297, 0.03949703],
       [0.

In [21]:
accuracy_score(target_test,gb_predictions)

0.8885869565217391

### Feature Importance Gradient Boosting Model

In [22]:
gb.feature_importances_

array([0.06206648, 0.06423625, 0.0298031 , 0.00256078, 0.        ,
       0.03719974, 0.0335136 , 0.02076188, 0.03522262, 0.03448242,
       0.0246517 , 0.09702742, 0.03175668, 0.0326188 , 0.01755939,
       0.        , 0.01168101, 0.        , 0.04073732, 0.04581904,
       0.01072167, 0.02407653, 0.02877356, 0.00696455, 0.03001595,
       0.06333473, 0.02222988, 0.        , 0.00298006, 0.00151816,
       0.00140648, 0.00643378, 0.00243027, 0.        , 0.00649126,
       0.00327118, 0.00015121, 0.01231478, 0.        , 0.00336605,
       0.        , 0.00423302, 0.01078424, 0.01084116, 0.00198427,
       0.01805004, 0.10592893])

In [23]:
train.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
  

In [28]:
#Checking out the feature importances
pd.DataFrame({'Features':train.columns,
            'Imp':gb.feature_importances_}).sort_values(by = 'Imp',ascending=False).reset_index()

Unnamed: 0,index,Features,Imp
0,46,OverTime_Yes,0.105929
1,11,MonthlyIncome,0.097027
2,1,DailyRate,0.064236
3,25,YearsWithCurrManager,0.063335
4,0,Age,0.062066
5,19,TotalWorkingYears,0.045819
6,18,StockOptionLevel,0.040737
7,5,EmployeeNumber,0.0372
8,8,JobInvolvement,0.035223
9,9,JobLevel,0.034482
