In [1]:
import pickle   # importing pickle for saving and loading machine learning models
import pandas as pd  # importing pandas for analyzing, cleaning, exploring, and manipulating data
from sklearn.model_selection import train_test_split  # importing train_test_split for spliting the data into training and testing
from preprocessor import *  # importing * for import all functions at once
from imblearn.over_sampling import SMOTE  # importing SMOTE for Balancing the Data
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,f1_score,classification_report #change
from sklearn.model_selection import GridSearchCV #change

In [2]:
df = pd.read_csv("HR.csv")    # Loading dataset

In [5]:
df.head()   # Display first five rows

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


## Spliting The Data

In [8]:
x = df.drop("Attrition",axis=1)     # Extract the features (all columns except Attritions) from the dataset
y = df["Attrition"].map({"No":0,"Yes":1})  # Extract the target variable from the dataset with converting 0 and 1.

In [10]:
# Spliting the data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=33)

## Loading Preprocessor

In [15]:
# Load a preprocessor object from a pickled file
with open("preprocessor.pkl","rb") as f:
    preprocessor=pickle.load(f)

In [17]:
preprocessor

# Transforming the data

In [20]:
# Transform the training data using the preprocessor object or PipeLine
processed_x_train = preprocessor.fit_transform(x_train)

In [22]:
processed_x_train[0]  # Checking first rows of processed_x_train

array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  1.        , -0.3098615 ,  1.3622128 ,  1.16898078,
       -1.33034804,  0.05469197,  2.11675505, -0.30137951,  0.14940946,
       -0.32177888, -0.60819028, -0.68968387, -0.03669649, -0.5936842 ,
        3.        ,  1.        ,  4.        ,  2.        ,  4.        ,
       13.        ,  2.        ,  0.        ,  3.        ,  2.        ,
        3.        ])

# Balancing The Data

In [25]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight={0:1,1:5},y=y_train)  # You need to define this function

# Model Building

In [28]:
from sklearn.ensemble import GradientBoostingClassifier  # Importing GradientBoostingClassifier
gbm=GradientBoostingClassifier() ## object creation
gbm.fit(processed_x_train,y_train,sample_weight=sample_weights) ## fitting the data

# Validating the model

### Processing the test data for validation

In [32]:
processed_x_test = preprocessor.transform(x_test)   # Transform the test data using the preprocessor

In [34]:
# Getting predictions from model
y_gbm=gbm.predict(processed_x_test)     # Getting predictions from model
y_gbm

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Evaluating the model performance

In [37]:
# Printing the classification report comparing the true labels (y_test) and the predicted labels (y_pred)
print(accuracy_score(y_test,y_gbm))  

0.8315217391304348


In [39]:
print(f1_score(y_test,y_gbm))   # Checking F1 Score

0.5694444444444444


In [41]:
# Displaying Classification Report
print(classification_report(y_test,y_gbm))   

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       306
           1       0.50      0.66      0.57        62

    accuracy                           0.83       368
   macro avg       0.71      0.76      0.73       368
weighted avg       0.85      0.83      0.84       368



## Hyper Parameter Tuning Of Gradient Boosting for better performance

In [44]:
# Importing RandomizedSearchCV from sklearn
from sklearn.model_selection import GridSearchCV

# Define Parameters grid for learning_rate, max_depth, n_estimators
param_grid = {
    'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.4, 0.5, 0.6, 0.7],
    'max_depth': [5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 65, 80, 100],
}
XGB=GradientBoostingClassifier()  # Assigning GradientBoostingClassifier model into variables

rcv= GridSearchCV(estimator=XGB, scoring='f1',refit = True,param_grid=param_grid, cv=5, 
                               verbose=2, n_jobs=-1)
                               
#estimator--number of decision tree
#scoring--->performance matrix to check performance
#param_distribution-->hyperparametes(dictionary we created)
#n_iter--->Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.default=10
##cv------> number of flods
#verbose=Controls the verbosity: the higher, the more messages.
#n_jobs---->Number of jobs to run in parallel,-1 means using all processors.
                        
rcv.fit(processed_x_train,y_train,sample_weight=sample_weights)##training data on randomsearch cv
cv_best_params = rcv.best_params_##it will give you best parameters 

Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [46]:
print(f"Best paramters: {cv_best_params})")##printing  best parameters"

Best paramters: {'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 65})


In [48]:
# Building the model using best parameters:
XGB2=GradientBoostingClassifier(n_estimators=50, max_depth=6, learning_rate=0.9)
XGB2.fit(processed_x_train,y_train,sample_weight=sample_weights)#training 

In [50]:
y_predict=XGB2.predict(processed_x_test)#testing

In [52]:
print(accuracy_score(y_test,y_predict))   #Checking Accuracy Score

0.8478260869565217


In [54]:
print(f1_score(y_test,y_predict))   #Checking Accuracy Score

0.44


In [56]:
print(classification_report(y_test,y_predict))   # Displaying classification report

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       306
           1       0.58      0.35      0.44        62

    accuracy                           0.85       368
   macro avg       0.73      0.65      0.68       368
weighted avg       0.83      0.85      0.83       368



In [58]:
# Opening a file named "model.pkl" in write-binary mode
# The 'wb' mode is used for writing binary data to the file
with open("model_GB.pkl","wb") as f:
    pickle.dump(rcv,f)      # Using pickle to serialize and save the grid search object to the file

# Conclusions Of Gradient Boosting

* As we can see that the performance of model has increased significatly after applying Hyperparameter tuning.
* If you want to increase performance of model you can explore more.

# Applying XGBoost Model

In [69]:
#!pip install xgboost  #installing model XGBOOST in Anaconda Prompt

In [None]:
## model creation
from xgboost import XGBClassifier#importing the model library
xgb_r=XGBClassifier() ## object creation
xgb_r.fit(processed_x_train,y_train,sample_weight=sample_weights)# fitting the data
y_hat=xgb_r.predict(processed_x_test)#predicting the price

In [None]:
print(accuracy_score(y_hat,y_test))   # Checking Accuracy Score

In [None]:
print(f1_score(y_test,y_hat))   # Checking F1Score

In [None]:
print(classification_report(y_test,y_hat))  # Checking Classification report

# Hyperparameter Tunning of XGBoost

https://www.youtube.com/watch?v=AvWfL1Us3Kg

For hyperparameter tuning

In [None]:
# Importing RandomizedSearchCV from sklearn
from sklearn.model_selection import GridSearchCV

# Define parameters grid for gamma, learning_Rate, max_depth, n_estimator, reg_alpha, reg_lamdba
param_grid = {'gamma': [0,0.1,0.2,0.4],
              'learning_rate': [0.01, 0.03, 0.06, 0.1],
              'max_depth': [5,6,7,8,9],
              'n_estimators': [50,65,80],
              'reg_alpha': [0,0.1,0.2,0.4],
              'reg_lambda': [0,0.1,0.2]}

XGB=XGBClassifier(random_state=42,verbosity=0,silent=0)  # Assigning XGBClassifier model into variables
rcv= GridSearchCV(estimator=XGB, scoring='f1',refit=True,param_grid=param_grid,  cv=3, 
                               verbose=1, n_jobs=-1)
                               
#estimator--number of decision tree
#scoring--->performance matrix to check performance
#param_distribution-->hyperparametes(dictionary we created)
#n_iter--->Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.default=10
##cv------> number of flods
#verbose=Controls the verbosity: the higher, the more messages.
#n_jobs---->Number of jobs to run in parallel,-1 means using all processors.
                        
rcv.fit(processed_x_train,y_train,sample_weight=sample_weights)##training data on randomsearch cv
cv_best_params = rcv.best_params_##it will give you best parameters 
print(f"Best paramters: {cv_best_params})")##printing  best parameters

In [None]:
XGB2=XGBClassifier(reg_lambda= 0, reg_alpha= 0.2, n_estimators=50, max_depth=5, learning_rate=0.1, gamma=0.4)
XGB2.fit(processed_x_train,y_train,sample_weight=sample_weights)#training 
y_predict=XGB2.predict(processed_x_test)#testing

In [None]:
accuracy=accuracy_score(y_predict,y_test)#checking performance

In [None]:
f1score=f1_score(y_predict,y_test)#checking performance

In [None]:
print(classification_report(y_predict,y_test))   # Checking classification report

## XGBoost
### Pros
1. Less feature engineering required (No need for scaling, normalizing data, can also handle missing values well)
2. Feature importance can be found out(it output importance of each feature, can be used for feature selection)
3. Fast to interpret
4. Outliers have minimal impact.
5. Handles large sized datasets well.
6. Good Execution speed
7. Good model performance (wins most of the Kaggle competitions)
8. Less prone to overfitting

### Cons
1. Difficult interpretation , visualization tough
2. Overfitting possible if parameters not tuned proper
