# Decision Tree

In [1]:
import pandas as pd
import numpy as np

In [2]:
hr = pd.read_csv('D:/Downloaded rcodes and datasets/HR Analytics.csv')

In [3]:
# We have to find solution in binomial term
# So we are going to classify the data by logistic regression, decision tree
# Binary classification

hr["Attrition"].value_counts()/hr.shape[0] *100

0    83.877551
1    16.122449
Name: Attrition, dtype: float64

In [15]:
hr["Attrition"].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

# Gini impurity

In [4]:
a = hr["Attrition"].value_counts()[0]/hr.shape[0]
b = hr["Attrition"].value_counts()[1]/hr.shape[0]
#pa = hr["Attrition" == 0].shape[0] / hr.shape[0]
#pb = hr["Attrition" == 1].shape[0] / hr.shape[0]
gini = 1 - np.square(a) - np.square(b) 
gini

0.2704623073719284

# Information gain

- Ig = gini(i) - gini(i | j)

# It is imbalanced class, because one class is very frequent and another is rare

In [5]:
hr.shape

(1470, 25)

# Binary classification

In [6]:
# o/p : Categorical columns 
# i/p : Numerical columns

In [7]:
# Now we are checking numerical columns

hr.dtypes


Age                          int64
Attrition                    int64
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
JobInvolvement               int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked           int64
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
TotalWorkingYears            int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [8]:
# In python, we have inbuilt function ._get_numeric_data to extract numeric variables

hr._get_numeric_data()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,1,2,1,2,3,4,5993,8,11,3,1,8,1,6,4,0,5
1,49,0,8,1,2,3,2,2,5130,1,23,4,4,10,3,10,7,1,7
2,37,1,2,2,4,4,2,3,2090,6,15,3,2,7,3,0,0,0,0
3,33,0,3,4,5,4,3,3,2909,1,11,3,3,8,3,8,7,3,0
4,27,0,2,1,7,1,3,2,3468,9,12,3,4,6,3,2,2,2,2
5,32,0,2,2,8,4,3,4,3068,0,13,3,3,8,2,7,7,3,6
6,59,0,3,3,10,3,4,1,2670,4,20,4,1,12,2,1,0,0,0
7,30,0,24,1,11,4,3,3,2693,1,22,4,2,1,3,1,0,0,0
8,38,0,23,3,12,4,2,3,9526,0,21,4,2,10,3,9,7,1,8
9,36,0,27,3,13,3,3,3,5237,6,13,3,2,17,2,7,7,7,7


In [9]:
hr._get_numeric_data().columns

Index(['Age', 'Attrition', 'DistanceFromHome', 'Education', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction',
       'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'TotalWorkingYears',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

# Data Preprocessing

- missing value treatment 
- Outlier removal 
- Column type conversion 
- Standardization

# Split the data into train & test data

- Build models using train data
- Test/validate model using test data 

In [3]:
from sklearn.model_selection import train_test_split

In [11]:
hr.shape[0] * 0.7

1029.0

In [12]:
hr.shape[0] * 0.3

441.0

In [13]:
# To overcome biasedness, we select the data randomly
np.random.seed(100)
train_rows = np.random.randint(1,hr.shape[0], int(hr.shape[0] * 0.7))

In [14]:
train = hr.iloc[train_rows]
print(train.shape)
train.head()

(1029, 25)


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
793,28,0,Travel_Rarely,Research & Development,15,2,Life Sciences,1102,1,Male,...,1,16,3,4,4,2,4,2,2,2
80,30,0,Travel_Rarely,Research & Development,1,1,Life Sciences,104,4,Male,...,1,12,3,3,10,2,10,8,3,0
351,32,0,Travel_Rarely,Research & Development,2,3,Medical,471,3,Female,...,1,13,3,3,8,3,8,0,0,7
54,26,0,Travel_Rarely,Sales,23,3,Marketing,72,3,Female,...,7,19,3,3,5,2,2,2,0,0
803,34,0,Non-Travel,Research & Development,3,4,Life Sciences,1115,3,Male,...,3,17,3,4,6,3,0,0,0,0


In [7]:
# It is necessary to convert all input columns into numerical

hr_dummies = pd.get_dummies(hr)
hr_dummies

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1,1,2,1,2,3,4,5993,8,...,0,0,0,0,0,1,0,0,0,1
1,49,0,8,1,2,3,2,2,5130,1,...,0,0,0,0,1,0,0,0,1,0
2,37,1,2,2,4,4,2,3,2090,6,...,1,0,0,0,0,0,0,0,0,1
3,33,0,3,4,5,4,3,3,2909,1,...,0,0,0,0,1,0,0,0,1,0
4,27,0,2,1,7,1,3,2,3468,9,...,1,0,0,0,0,0,0,0,1,0
5,32,0,2,2,8,4,3,4,3068,0,...,1,0,0,0,0,0,0,0,0,1
6,59,0,3,3,10,3,4,1,2670,4,...,1,0,0,0,0,0,0,0,1,0
7,30,0,24,1,11,4,3,3,2693,1,...,1,0,0,0,0,0,0,1,0,0
8,38,0,23,3,12,4,2,3,9526,0,...,0,0,1,0,0,0,0,0,0,1
9,36,0,27,3,13,3,3,3,5237,6,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# We have Built-in function in sklearn package

train,test = train_test_split(hr_dummies, test_size = 0.3, random_state = 100)

In [9]:
train_y = train['Attrition']
test_y = test['Attrition']
train_x = train.drop('Attrition', axis = 1)
test_x = test.drop('Attrition', axis = 1)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(1029, 44)
(1029,)
(441, 44)
(441,)


In [18]:
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null int64
BusinessTravel              1470 non-null object
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
JobInvolvement              1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
NumCompaniesWorked          1470 non-null int64
PercentSalaryHike           1470 non-null int64
PerformanceRating           1470 non-null int64
RelationshipSatisfactio

# Building Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
model = DecisionTreeClassifier()
model.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
test_pred = model.predict(test_x)
print(len(test_pred))


441


In [27]:
# Instead of predict, we can use predict_proba to know exact prob. instead of 0 or 1

test_pred_prob = pd.DataFrame(model.predict_proba(test_x))
test_pred_prob

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,1.0,0.0
9,1.0,0.0


In [85]:
# Evaluate model performance

df_pred = pd.DataFrame({"actual":test_y,"predicted":test_pred})


In [94]:
df_pred["pred_status"] = df_pred["actual"] == df_pred["predicted"]

In [95]:
df_pred

Unnamed: 0,actual,predicted,pred_status
880,0,1,False
152,0,1,False
1466,0,0,True
1084,0,0,True
1086,0,0,True
1392,0,0,True
57,0,0,True
956,0,0,True
1400,0,0,True
1175,0,0,True


In [100]:
df_pred["pred_status"].value_counts()

True     339
False    102
Name: pred_status, dtype: int64

# Machine learning process
- Clean /prepare your data(EDA process)
    - Missing value treatment
    - Outlier treatment
    - Standardization
    - Dummy variables conversion 
- split your data(train & test)
- separate i/p & o/p cols (train_x, train_y, test_x, test_y)
- Build model using training data
- preddict output values for test data
- Evaluate your model
    - Accuracy, Sensitivity, Specificity
- Fine tune your model for better performance
    - Hyper parameter tuning, cross validation

In [102]:
# another dataset

In [139]:
url = "https://raw.githubusercontent.com/skathirmani/datasets/master/bank-full.csv"
bank = pd.read_csv(url, sep = ';')

In [140]:
bank.shape

(45211, 17)

In [141]:
bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [142]:
bank_dummies = pd.get_dummies(bank.drop('y', axis=1))
bank_dummies['y'] = bank['y']  # Adding y column to bank dummies

In [143]:
bank["y"].value_counts()/bank.shape[0]*100

no     88.30152
yes    11.69848
Name: y, dtype: float64

In [144]:
from sklearn.model_selection import train_test_split

In [145]:
train, test = train_test_split(bank_dummies, test_size = 0.3, random_state = 100)

In [146]:
train_x = train.drop('y', axis = 1)
test_x = test.drop('y', axis=1)
train_y = train['y']
test_y = test['y']
                

In [147]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
model = DecisionTreeClassifier()
model.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
test_pred = model.predict(test_x)
test_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,

In [150]:
bank_pred = pd.DataFrame({'actual':test_y, 'predicted':test_pred})

In [151]:
bank_pred["model_status"] = bank_pred["actual"] == bank_pred["predicted"]
bank_pred.head()

Unnamed: 0,actual,predicted,model_status
14789,no,no,True
8968,no,no,True
34685,no,no,True
2369,no,no,True
36561,no,yes,False


In [152]:
bank_pred["model_status"].value_counts()/bank_pred.shape[0] * 100

True     87.739605
False    12.260395
Name: model_status, dtype: float64

# Gini impurity

In [168]:
gini = 1 - (train_y[train_y=="no"].shape[0]/train_y.shape[0])**2 - (train_y[train_y=="yes"].shape[0]/train_y.shape[0])**2
gini

0.20697532774991265

NameError: name 'model_testing' is not defined

In [14]:
from sklearn import tree
with open("model.dot","w") as f:
    f = tree.export_graphviz(model,
                            out_file=f,
                            feature_names=train_x.columns)

In [16]:
import numpy as np

In [22]:
a = 150/200
b = 50/200
1 - np.square(a) - np.square(b)

0.375

In [18]:
(0.375 + 0.5)/2

0.4375