### Importing the packages

In [2]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing

### Loading the dataset

In [3]:
data = pd.read_csv('general_data.csv')
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


### Checking for null values

In [4]:
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

### Filling the null values

In [10]:
data['NumCompaniesWorked'] = data['NumCompaniesWorked'].fillna(method='pad')
data['TotalWorkingYears'] = data['TotalWorkingYears'].fillna(method='pad')

In [11]:
data.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [12]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


### Converting string to int

In [14]:
l_enc = preprocessing.LabelEncoder()
data['Attrition'] = l_enc.fit_transform(data['Attrition'])
data['BusinessTravel'] = l_enc.fit_transform(data['BusinessTravel'])
data['Department'] = l_enc.fit_transform(data['Department'])
data['EducationField'] = l_enc.fit_transform(data['EducationField'])
data['Gender'] = l_enc.fit_transform(data['Gender'])
data['JobRole'] = l_enc.fit_transform(data['JobRole'])
data['Over18'] = l_enc.fit_transform(data['Over18'])
data['MaritalStatus'] = l_enc.fit_transform(data['MaritalStatus'])

In [18]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [21]:
rf_model.fit(X=data[features],y=data['Attrition'])
rf_model.oob_score_

0.9997732426303855

In [22]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp)

Age 0.09272041329775463
BusinessTravel 0.027448756132801958
Department 0.025204972366570984
DistanceFromHome 0.06743356067209906
Education 0.03947014932041213
EducationField 0.03976790157917529
EmployeeCount 0.0
EmployeeID 0.037519749897234896
Gender 0.017166177010791806
JobLevel 0.036463665175336835
JobRole 0.053999700742545384
MaritalStatus 0.037761759470060906
MonthlyIncome 0.08853154022781225
NumCompaniesWorked 0.054247019223697845
Over18 0.0
PercentSalaryHike 0.06203328672490691
StandardHours 0.0
StockOptionLevel 0.03216742867832253
TotalWorkingYears 0.08346605968248193
TrainingTimesLastYear 0.04351874558854556
YearsAtCompany 0.06721487241152735
YearsSinceLastPromotion 0.041680548590168766
YearsWithCurrManager 0.05218369320775291


#### Inference: This shows that the important features are Age, DistanceFromHome, MonthlyIncome, PercentSalaryHike, TotalWorkingYears,YearsAtCompany

### Decision Tree

In [39]:
tree_model = tree.DecisionTreeClassifier(max_depth=12)
predictors = pd.DataFrame([data['Age'],data['DistanceFromHome'],data['MonthlyIncome'],data['PercentSalaryHike'],data['TotalWorkingYears'],data['YearsAtCompany']]).T
tree_model.fit(X=predictors,y=data['Attrition'])


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [40]:
tree_model.score(X=predictors,y=data['Attrition'])

0.9478458049886621

In [41]:
with open('Dtree2.dot','w') as f:
    f = tree.export_graphviz(tree_model,feature_names=[['Age'],['DistanceFromHome'],['MonthlyIncome'],['PercentSalaryHike'],['TotalWorkingYears'],['YearsAtCompany']],out_file=f)
    