In [4]:
import pickle # pickle: used below to load the saved preprocessor object from disk
import pandas as pd # pandas: loading, cleaning, exploring, and manipulating tabular data
import numpy as np # numpy: numerical arrays (not referenced in the cells visible here)
from sklearn.model_selection import train_test_split # splits the data into training and testing sets
# NOTE(review): wildcard import of the project-local `preprocessor` module. Presumably it is
# required so that pickle can resolve the custom classes stored in preprocessor.pkl -- confirm,
# and prefer importing the needed names explicitly.
from preprocessor import *                 # brings every public name of preprocessor.py into scope
from imblearn.over_sampling import SMOTE   # SMOTE oversampler for class balancing (not referenced in the cells visible here)
import warnings
# NOTE: this silences every Python warning for the rest of the session, not just noisy ones.
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,f1_score,classification_report # evaluation metrics used in the validation cells
from sklearn.model_selection import GridSearchCV # exhaustive, cross-validated hyperparameter search

In [6]:
# Read the HR attrition dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='HR.csv')

In [8]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [12]:
# Count the missing values in each column.
missing_per_column = df.isnull().sum()
missing_per_column

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

## Splitting the Data

In [15]:
# Target: the Attrition column encoded as an integer (No -> 0, Yes -> 1).
attrition_codes = {'No': 0, 'Yes': 1}
y = df['Attrition'].map(attrition_codes)

# Features: every column other than the target.
x = df.drop(columns='Attrition')

In [17]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=33,
)

## Loading Preprocessor

In [31]:
# Restore the preprocessor object that was previously saved with pickle.
# NOTE: unpickling can execute arbitrary code -- only load a preprocessor.pkl you trust.
with open('preprocessor.pkl', 'rb') as pkl_file:
    preprocessor = pickle.load(pkl_file)

In [33]:
# Display the loaded preprocessor object (its repr) to inspect what was unpickled.
preprocessor

# Transforming the data

In [36]:
# Fit the preprocessor on the training split only and transform that split in the same call.
# NOTE(review): presumably an sklearn-style transformer/Pipeline, in which case this re-fits
# whatever state was stored in preprocessor.pkl -- confirm against preprocessor.py.
processed_x_train = preprocessor.fit_transform(x_train)

# Applying DecisionTree

### Model Building

In [40]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with default hyperparameters.
# random_state is pinned because tree construction is randomized (scikit-learn permutes
# the features at every split), so without a seed the scores reported for this model
# change from one run to the next.
dt = DecisionTreeClassifier(random_state=3)
dt.fit(processed_x_train, y_train)

# Validating the model

In [45]:
# Apply the already-fitted preprocessor to the test split. Only transform() is called
# (no fit), so the test rows are not used to fit the preprocessing step here.
processed_x_test = preprocessor.transform(x_test)

In [47]:
# Predict labels for the preprocessed test split with the baseline tree `dt`
# (0 = 'No', 1 = 'Yes', per the mapping applied to the Attrition column).
y_pred = dt.predict(processed_x_test)

In [49]:
# Accuracy of the baseline tree: share of test rows whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc

0.7961956521739131

In [53]:
# F1 score of the baseline tree for the positive class (Attrition == 1).
f1_score(y_true=y_test, y_pred=y_pred)

0.4186046511627907

In [55]:
# Per-class precision, recall, F1 and support for the baseline tree.
baseline_report = classification_report(y_true=y_test, y_pred=y_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.88      0.87      0.88       306
           1       0.40      0.44      0.42        62

    accuracy                           0.80       368
   macro avg       0.64      0.65      0.65       368
weighted avg       0.80      0.80      0.80       368



# Hyperparameter Tuning of DecisionTree

## Hyperparameters of DecisionTree
* Hyperparameter tuning is searching the hyperparameter space for a set of values that will optimize your model architecture.

* Criterion: The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.


* Splitter: The strategy used to choose the split at each node. The default value is "best": for each node, the algorithm considers all the candidate features and chooses the best split. If you set the splitter parameter to "random", a random threshold is drawn for each candidate feature and the best of those random splits is chosen. (Restricting the search to a random subset of features is controlled separately by max_features.)



* max_depth: The maximum depth of the tree. If the depth is not limited, the tree keeps splitting until it fits the training data perfectly, which often produces an over-fitted tree that fails to generalize to the testing data. Limiting the depth is one of the ways to regularize the tree, i.e. to restrict how it grows and so prevent over-fitting. The grid search in this notebook tries depths 1 to 9.



* min_samples_split: The minimum number of samples (rows) required to split an internal node; by contrast, min_samples_leaf is the minimum number of samples required to be at a leaf node. A typical range is 2 to 40 (scikit-learn requires an integer value of at least 2).



* min_samples_leaf: The minimum number of samples (rows) required to be at a leaf node. It is similar to min_samples_split, but applies to the leaves, i.e. the base of the tree. A typical range is 1 to 20. It acts as a threshold on how few samples a final decision may be based on.


In [61]:
# Search space for the decision tree: each key is a hyperparameter name and each value
# lists the candidates to try (2 * 2 * 9 * 6 * 9 = 1944 combinations).
params = {
    "criterion": ("gini", "entropy"),          # function used to measure the quality of a split
    "splitter": ("best", "random"),            # strategy used to choose the split at each node
    "max_depth": list(range(1, 10)),           # maximum depth of the tree: 1 to 9
    "min_samples_split": [2, 3, 4, 5, 6, 7],   # minimum number of samples (rows) required to split an internal node
    "min_samples_leaf": list(range(1, 10)),    # minimum number of samples (rows) required at a leaf node: 1 to 9
}

# Base estimator for the search. The seed is fixed (random state 3) so that the search,
# including the candidates that use splitter="random", selects the same parameters on every run.
tree_clf = DecisionTreeClassifier(random_state=3)

# GridSearchCV arguments:
#   tree_clf -> the model to tune
#   params   -> the hyperparameter grid (dictionary created above)
#   scoring  -> performance metric used to rank the candidates (F1 of the positive class)
#   n_jobs   -> number of jobs to run in parallel; -1 means use all processors
#   verbose  -> controls the verbosity: the higher, the more messages
#               >1 : the computation time for each fold and parameter candidate is displayed
#               >2 : the score is also displayed
#               >3 : the fold and candidate parameter indexes are also displayed,
#                    together with the starting time of the computation
#   cv       -> number of cross-validation folds
tree_cv = GridSearchCV(tree_clf, params, scoring='f1', n_jobs=-1, verbose=2, cv=5)

tree_cv.fit(processed_x_train, y_train)      # run the cross-validated search on the training data
best_params = tree_cv.best_params_           # parameter combination with the best mean CV score
print(f"Best parameters: {best_params}")     # stray ")" and the typo removed from the message

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
Best paramters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 3, 'splitter': 'best'})


In [65]:
tree_cv.best_params_    # best hyperparameter combination found by the grid search (the same dict stored in `best_params`)

{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_leaf': 6,
 'min_samples_split': 3,
 'splitter': 'best'}

In [67]:
# Build the tuned tree directly from the grid-search result instead of copying the values
# by hand, so the model cannot drift out of sync with `tree_cv` when the search is re-run.
# random_state is pinned so the fitted tree and its scores are reproducible.
dt1 = DecisionTreeClassifier(random_state=3, **tree_cv.best_params_)

In [69]:
# Train the tuned tree `dt1` on the preprocessed training split.
dt1.fit(processed_x_train,y_train)

In [71]:
# Predict labels for the preprocessed test split with the tuned tree, then display them.
y_hat1 = dt1.predict(processed_x_test)
y_hat1

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [73]:
# Accuracy of the tuned tree on the test split.
accuracy_score(y_true=y_test, y_pred=y_hat1)

0.8070652173913043

In [75]:
# F1 score of the tuned tree for the positive class (Attrition == 1).
f1_score(y_true=y_test, y_pred=y_hat1)

0.297029702970297

In [77]:
# Per-class precision, recall, F1 and support, plus overall accuracy, for the tuned tree.
tuned_report = classification_report(y_true=y_test, y_pred=y_hat1)
print(tuned_report)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       306
           1       0.38      0.24      0.30        62

    accuracy                           0.81       368
   macro avg       0.62      0.58      0.59       368
weighted avg       0.78      0.81      0.79       368

