In [1]:
from pandas import Series, DataFrame
import pandas as pd
%pylab inline
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import time
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz

In [3]:
data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
#data['Age Range'] = pd.cut(data['Age'], bins = [0,35,55,100], labels = ['Millenial','Gen X','Boomer'])

In [5]:
data_y = data['Attrition'] # make y values
data_x = data.drop(['Attrition'],axis = 1) # the rest of the df

# dropping columns that an employer cannot affect
data_x = data_x.drop(['Age','DistanceFromHome','Education','EducationField','Gender','MaritalStatus','NumCompaniesWorked','Over18','PerformanceRating','RelationshipSatisfaction','TotalWorkingYears','YearsAtCompany'],axis = 1)

In [6]:
data_x = pd.get_dummies(data_x) # turns all categorical values into dummies for standardization
feature_names = data_x.columns
scaler = StandardScaler().fit(data_x)
# remember how the data was rescaled; can apply to future data to be tested
data_x = StandardScaler().fit_transform(data_x) # mean = 0, stdev/variance = 1
xtrain, xtest, ytrain, ytest = train_test_split(data_x,data_y, test_size = .125, random_state = None)

print("xtrain =",xtrain.shape, " xtest =", xtest.shape)
print("ytrain =",ytrain.shape, " ytest =", ytest.shape)

('xtrain =', (1286L, 35L), ' xtest =', (184L, 35L))
('ytrain =', (1286L,), ' ytest =', (184L,))


In [7]:
pca = PCA()
pca_data = pca.fit_transform(xtrain)
# get variance of each column
variance = pca.explained_variance_ratio_
# sum total variance of each previous column
cumulative_variance = np.cumsum(variance)
# want all 
n_cols = 1 + np.argmax(cumulative_variance > .95)
print "num cols keep =",n_cols
xtrain_pca = pca_data[:, :n_cols]
xtest_pca = pca.transform(xtest)[:,:n_cols]
# the test data is transformed using the same PCA

num cols keep = 23


In [8]:
# use k-fold cross val 
a_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth = 15)
scores = cross_val_score(a_tree, data_x, data_y, cv=10)

print "Accuracy:", scores.mean()*100

Accuracy: 78.8438963145421


Now use grid search to find the best tree depth, number of columns, and minimum points in each leaf

In [9]:
parameters = {"max_depth": [n for n in range(5,31,5)], "min_samples_leaf": [n for n in range(5,51,5)], "max_features": [n for n in range(5,26,5)]}

grid_search = GridSearchCV(a_tree, parameters, cv=5, scoring='balanced_accuracy')

grid_search.fit(data_x, data_y)

print("Best values:", grid_search.best_params_)
print("Accuracy:", grid_search.best_score_*100)

('Best values:', {'max_features': 25, 'max_depth': 15, 'min_samples_leaf': 15})
('Accuracy:', 64.86792767732939)


Need to pass gridsearch cv into cross_val_score. each cross val will split into train and test, gridsearch will then split train into train and validation. It will pick a best decision tree based on validation score and test against test set

In [10]:
nested_score = cross_val_score(grid_search, data_x, data_y, cv = 5)
print "Accuracy:", nested_score.mean()*100

Accuracy: 60.79105073019462


In [11]:
best_tree = tree.DecisionTreeClassifier(criterion = "entropy",max_depth = grid_search.best_params_['max_depth'], min_samples_leaf = grid_search.best_params_['min_samples_leaf'], max_features = grid_search.best_params_['max_features'])
best_tree.fit(data_x, data_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=15,
            max_features=25, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=15, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
file_name = "newTree.txt"

webgraphviz.com

In [13]:
export_graphviz(best_tree, out_file = file_name, feature_names = feature_names, filled = True, rounded = True)

In [14]:
features = Series(best_tree.feature_importances_, index = feature_names).sort_values(ascending = False)
features[features > 0]

MonthlyIncome                        0.183069
OverTime_Yes                         0.148570
EmployeeNumber                       0.087960
EnvironmentSatisfaction              0.087698
StockOptionLevel                     0.082466
YearsWithCurrManager                 0.075491
DailyRate                            0.046536
YearsSinceLastPromotion              0.041158
Department_Sales                     0.039147
JobSatisfaction                      0.031975
PercentSalaryHike                    0.031806
JobRole_Research Scientist           0.031532
JobInvolvement                       0.029535
MonthlyRate                          0.027246
YearsInCurrentRole                   0.019169
HourlyRate                           0.011389
JobRole_Laboratory Technician        0.010982
TrainingTimesLastYear                0.009826
Department_Research & Development    0.004447
dtype: float64