In [1]:
!pip install imblearn



In [2]:
import numpy as np 
import pandas as pd 


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.model_selection import train_test_split


# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

# 1. Exploratory Data Analysis

Let us load in the dataset via the trusty Pandas package into a dataframe object which we call **attrition** and have a quick look at the first few rows

In [3]:
attrition = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# Looking for NaN
attrition.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

In [5]:
# attrition.Age.fillna('')

### Correlation of Features


In [6]:
# attrition.corr()

#  Feature Engineering & Categorical Encoding

Task of Feature engineering and numerically encoding the categorical values in our dataset.

In [7]:
# attrition.shape

In [8]:
attrition.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [9]:
# Empty list to store columns with categorical data
categorical = []
for col, value in attrition.items():
    if value.dtype == 'object':
        categorical.append(col)

# Store the numerical columns in a list numerical
numerical = attrition.columns.difference(categorical)

print("Categorical columns:", categorical)
print("Numerical columns:", numerical)

Categorical columns: ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
Numerical columns: Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')


In [10]:
numerical

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [11]:
categorical

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [12]:
# Store the categorical data in a dataframe called attrition_cat
attrition_cat = attrition[categorical]
attrition_cat = attrition_cat.drop(['Attrition'], axis=1) # Dropping the target column

In [13]:
attrition_cat

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,Y,No
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Y,Yes
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,Y,No


Applying the **get_dummies** method

In [14]:
# How can you convert categorial or string or object data into Numerical Format ?

# Process of converting your cat data into numerical format - Encoding process 

# Encoding (15 More )

# Label Encoding 

# One Hot Encoding ( OHE)

# Cat_A 

# Male
#Female 
#Male
#Female
# Prefer_not_to_say
# Male 

# OHE 

           # Cat_A_Male    #Cat_A_Female   #Cat_A_Prefer_not_to_say
#1# Male      1             0                0 
#2#Female     0             1                0
#3#Male       1             0                0
#4#Female     0             1                0
#5# Prefer_not_to_say 0     0                1
#6# Male 



# Label Encoding 

# Cat_A 

# Male   2       
#Female 1
#Male 2
#Female 1
# Prefer_not_to_say 3
# Male 2

# Target Encoding 
# Mean Encoding 































In [15]:
# Filter your object datatypes 

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
le.transform(["tokyo", "tokyo", "paris","amsterdam"])

# list(le.classes_)


#0 ,1,2

array([2, 2, 1, 0])

In [16]:
attrition_cat = pd.get_dummies(attrition_cat)
attrition_cat.head(3)

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,False,False,True,False,False,True,False,True,False,False,...,False,False,True,False,False,False,True,True,False,True
1,False,True,False,False,True,False,False,True,False,False,...,False,True,False,False,False,True,False,True,True,False
2,False,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,True


In [17]:
# Store the numerical features to a dataframe attrition_num
attrition_num = attrition[numerical]

let's concat numerical and caterogial dfs

In [18]:
# Concat the two dataframes together columnwise
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)

In [19]:
attrition_final.shape

(1470, 55)

**Target variable**

The target in this case is given by the column **Attrition** which contains categorical variables therefore requires numerical encoding. We numerically encode it by creating a dictionary with the mapping given as 1 : Yes and 0 : No

In [20]:
# Define a dictionary for the target mapping
target_map = {'Yes':1, 'No':0}
# Use the pandas apply method to numerically encode our attrition target variable
target = attrition["Attrition"].apply(lambda x: target_map[x])
target.head(3)

0    1
1    0
2    1
Name: Attrition, dtype: int64


**Splitting Data into Train and Test sets**


In [21]:
# Split data into train and test sets as well as for validation and testing
train, test, target_train, target_test = train_test_split(attrition_final, target, train_size= 0.75,random_state=0);

#  Implementing Machine Learning Models


## GBM Classifier



### 1.n_estimators - No of Trees in the Model

### 2.max_features - The number of features to consider while searching for a best split.Thumb Rule to have Square root of no of Columns

### 3.max_depth - Maximum Depth of Tree and can be used to control overfiting 

### 4.min_samples_leaf - Minimum samples (or observations) required in a terminal node or leaf.In general we need to have lower values  for it for Imbalanced problems

### 5.subsample- The fraction of samples to be used for fitting the individual base learners

### 6.learning_rate - Learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators

In [22]:
gb = GradientBoostingClassifier(random_state=100) # default 
gb.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 100,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [23]:
# Fit the model to our train and target
gb.fit(train, target_train)
# Get our predictions
gb_predictions = gb.predict(test)

In [24]:
gb_predictions_prob = gb.predict_proba(test)
gb_predictions_prob

array([[0.95425715, 0.04574285],
       [0.96266164, 0.03733836],
       [0.86753512, 0.13246488],
       [0.93446343, 0.06553657],
       [0.11595517, 0.88404483],
       [0.69684147, 0.30315853],
       [0.64887562, 0.35112438],
       [0.95196738, 0.04803262],
       [0.97038412, 0.02961588],
       [0.86642399, 0.13357601],
       [0.94437031, 0.05562969],
       [0.91453905, 0.08546095],
       [0.97008856, 0.02991144],
       [0.30172989, 0.69827011],
       [0.94679291, 0.05320709],
       [0.98937899, 0.01062101],
       [0.94587557, 0.05412443],
       [0.93925719, 0.06074281],
       [0.94572155, 0.05427845],
       [0.92824768, 0.07175232],
       [0.63031267, 0.36968733],
       [0.95095032, 0.04904968],
       [0.96466225, 0.03533775],
       [0.97249008, 0.02750992],
       [0.58256377, 0.41743623],
       [0.75837581, 0.24162419],
       [0.95692266, 0.04307734],
       [0.97394765, 0.02605235],
       [0.25660257, 0.74339743],
       [0.96627476, 0.03372524],
       [0.

In [25]:
# Gradient Boosting Parameters
# gb_params ={
#     'n_estimators': 500,   # no of Trees 
#     'learning_rate' : 0.2,
#     'max_depth': 11,
#     'min_samples_leaf': 2,
#     'subsample': 1,
#     'max_features' : 'sqrt',
#     'random_state' : 100,
#     'verbose': 0
# }

#gb = GradientBoostingClassifier(**gb_params) # After Doing HPT , we can pass the paramaters

In [26]:
accuracy_score(target_test, gb_predictions)

0.8831521739130435

### Feature Importance Gradient Boosting Model


In [27]:
gb.feature_importances_

array([0.06637792, 0.05983319, 0.03073138, 0.00432084, 0.        ,
       0.03919363, 0.03440223, 0.01631039, 0.03338705, 0.03211924,
       0.02486999, 0.10462228, 0.02408236, 0.0315579 , 0.02005508,
       0.        , 0.01282365, 0.        , 0.04186334, 0.0461674 ,
       0.00786869, 0.02921662, 0.02275688, 0.01094337, 0.02410598,
       0.05802124, 0.00119296, 0.02512782, 0.        , 0.        ,
       0.00334557, 0.00150605, 0.0042737 , 0.0005249 , 0.00822779,
       0.00420823, 0.0016531 , 0.0043053 , 0.00052944, 0.00090874,
       0.00121754, 0.00014525, 0.01109724, 0.        , 0.00334464,
       0.        , 0.00517715, 0.01111343, 0.01076333, 0.00311429,
       0.00155062, 0.01744231, 0.        , 0.0559923 , 0.04760764])

In [None]:
# Scatter plot 
trace = go.Scatter(
    y = gb.feature_importances_,
    x = attrition_final.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1.3,
        size = 12,
        color = gb.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = attrition_final.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'GBM Model Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 5,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')