In [1]:
## importing modules
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
att_data = pd.read_csv('general_data.csv', index_col='EmployeeID')

In [3]:
att_data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'Over18', 'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [4]:
## converting categorical int....

l = preprocessing.LabelEncoder()
att_data['Attrition'] = l.fit_transform(att_data['Attrition'])
att_data['BusinessTravel'] = l.fit_transform(att_data['BusinessTravel'])
att_data['MaritalStatus'] = l.fit_transform(att_data['MaritalStatus'])
att_data['EducationField'] = l.fit_transform(att_data['EducationField'])
att_data['JobRole'] = l.fit_transform(att_data['JobRole'])
att_data['Gender'] = l.fit_transform(att_data['Gender'])
att_data['Department'] = l.fit_transform(att_data['Department'])
att_data.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Gender,JobLevel,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,51,0,2,2,6,2,1,1,0,1,...,1.0,Y,11,8,0,1.0,6,1,0,0
2,31,1,1,1,10,1,1,1,0,1,...,0.0,Y,23,8,1,6.0,3,5,1,4
3,32,0,1,1,17,4,4,1,1,4,...,1.0,Y,15,8,3,5.0,2,5,0,3
4,38,0,0,1,2,5,1,1,1,3,...,3.0,Y,11,8,3,13.0,5,8,7,5
5,32,0,2,1,10,1,3,1,1,1,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [5]:
## filling missing value
att_data['NumCompaniesWorked']= att_data['NumCompaniesWorked'].fillna(att_data['NumCompaniesWorked'].mode()[0])

In [6]:
att_data.drop('Over18', axis= 1, inplace=True)

In [7]:
att_data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [8]:
features = ['Age',  'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
      'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [10]:
## random forest model
rf_model = RandomForestClassifier(n_estimators=10000,oob_score=True,max_features=10)
rf_model.fit(X= att_data[features],y= att_data['Attrition'])


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=10,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp);

Age 0.12243714663131394
BusinessTravel 0.028737173634148544
Department 0.02690517058501814
DistanceFromHome 0.07473782809540894
Education 0.03792172415162553
EducationField 0.04075447691196819
EmployeeCount 0.0
Gender 0.012875728944112578
JobLevel 0.03268953510593828
JobRole 0.05841512681127063
MaritalStatus 0.041996505525538734
MonthlyIncome 0.12733469568979444
NumCompaniesWorked 0.06652320933198619
PercentSalaryHike 0.07236240579544205
StandardHours 0.0
StockOptionLevel 0.03535825138273337
TrainingTimesLastYear 0.04591078184823323
YearsAtCompany 0.0807171902228269
YearsSinceLastPromotion 0.043245660836724785
YearsWithCurrManager 0.0510773884959155


# Observation:
What we can see from above is  that the most relevant independent variables are 'Age','Monthly income'.

In [22]:
## decision tree based on these three features:

ID_VAR = pd.DataFrame([att_data['Age'],att_data['MonthlyIncome']]).T
tree_model = tree.DecisionTreeClassifier(max_depth=6)
tree_model.fit(X = ID_VAR,y= att_data['Attrition'])


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [23]:
## exporting decision tree:
with open ('Dtree.dot','w') as f:
    f= tree.export_graphviz(tree_model,feature_names=['Age','MonthlyIncome'],out_file=f);

In [24]:
tree_model.score(X= ID_VAR,y= att_data['Attrition'])

0.8530612244897959