# People Analytics - Best Model Parameter Tunning

**Importing relevant libraries**

In [1]:
# Main Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression # explicit class import from module
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier, XGBClassifier

# Utils
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from scipy.stats import loguniform

# Warnings
import warnings
warnings.filterwarnings('ignore') # Stopping warnings (deprecation warning, etc.)
pd.set_option('display.max_columns', None) # Showing all columns in the dateset, withoud '...' in between.

**Loading Dataset to Jupyter Notebook**

In [2]:
people = pd.read_csv("../raw_data/people_train.csv")

**Separating Daset into Predictor and Target varibles**

In [24]:
X = people.drop(columns = 'attrition')
y = people['attrition']

In [186]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 0)

## Dataprep

### Balancing Target classes

In [187]:
sm = SMOTE(random_state = 0)
X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)

In [188]:
from collections import Counter
print(f"Before SMOTE: (0: {Counter(y_train)[0]}, 1: {Counter(y_train)[1]})")
print(f"After  SMOTE: (0: {Counter(y_train_smote)[0]}, 1: {Counter(y_train_smote)[1]})")

Before SMOTE: (0: 646, 1: 125)
After  SMOTE: (0: 646, 1: 646)


## Best Model

**Without SMOTE**

In [189]:
#sem smote
rf_model = RandomForestClassifier(n_estimators=100)

In [190]:
sacc = cross_val_score(rf_model, X_train, y_train, scoring='accuracy')
sf1 = cross_val_score(rf_model, X_train, y_train, scoring='f1')
sre = cross_val_score(rf_model, X_train, y_train, scoring='recall')
print('accuracy: ', sacc.mean())
print('f1:       ', sf1.mean())
print('recall:   ', sre.mean())

accuracy:  0.857318810222036
f1:        0.20572853072853067
recall:    0.10400000000000001


**With SMOTE**

In [191]:
# Model with SMOTE
rfsmote_model = RandomForestClassifier(n_estimators=100)

In [192]:
# Accuracy score
score_acc = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='accuracy')

#F1 score
score_f1 = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='f1')

# Recall score
score_rec = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='recall')

print('accuracy: ', score_acc.mean())
print('f1:       ', score_f1.mean())
print('recall:   ', score_rec.mean())

accuracy:  0.9064948669599833
f1:        0.8653464985156727
recall:    0.8573881932021467


In [193]:
# # Accuracy score
# score_acc2 = cross_val_score(rfsmote_model, X_val, y_val, scoring='accuracy')

# #F1 score
# score_f12 = cross_val_score(rfsmote_model, X_val, y_val, scoring='f1')

# # Recall score
# score_rec2 = cross_val_score(rfsmote_model, X_val, y_val, scoring='recall')

# print('accuracy: ', score_acc2.mean())
# print('f1:       ', score_f12.mean())
# print('recall:   ', score_rec2.mean())

In [194]:
rfsmote_model.fit(X_train_smote, y_train_smote)

RandomForestClassifier()

In [195]:
rfsmote_model.score(X_val, y_val) # ACCURACY, not the best metrics for this case

0.8449612403100775

In [196]:
y_pred_rfs = rfsmote_model.predict(X_val)

In [197]:
print(f"F1-score:     ", f1_score(y_val, y_pred_rfs))
print(f"Recall score: ", recall_score(y_val, y_pred_rfs))

F1-score:      0.33333333333333326
Recall score:  0.23809523809523808


In [198]:
print(classification_report(y_val, y_pred_rfs))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91       216
           1       0.56      0.24      0.33        42

    accuracy                           0.84       258
   macro avg       0.71      0.60      0.62       258
weighted avg       0.82      0.84      0.82       258



## Parameter Tunning

1. With RandomSearch
2. With GridSearch

In [199]:
# Params for RandomizedSearchCV

rand_params = {
    'criterion': ['gini', 'entropy'],
#     'min_samples_split': np.random.ran,
#     'min_samples_leaf': np.random.randint(1, 5),
    'min_weight_fraction_leaf': loguniform(0.001, 1),
    'oob_score': [True, False],
#     'ccp_alpha': loguniform(0, 0.5)
}

In [200]:
rand_rfsmote_model = RandomizedSearchCV(rfsmote_model, 
                                        param_distributions = rand_params,
                                        n_iter = 10,
                                        n_jobs = -1,
                                        cv = 10,
                                        scoring = 'f1',
                                        random_state = 0,
                                        verbose = 1)

In [201]:
rand_rfsmote_model.fit(X_train_smote, y_train_smote)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   18.1s finished


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'min_weight_fraction_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x122e4dcd0>,
                                        'oob_score': [True, False]},
                   random_state=0, scoring='f1', verbose=1)

In [202]:
rand_rfsmote_model.score(X_val, y_val)

0.40625000000000006

In [203]:
y_pred = rand_rfsmote_model.predict(X_val)

In [204]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       216
           1       0.59      0.31      0.41        42

    accuracy                           0.85       258
   macro avg       0.73      0.63      0.66       258
weighted avg       0.83      0.85      0.83       258



## XGBoost Classifier

In [205]:
xgb_model = XGBClassifier()

In [206]:
xgb_model.fit(X_train_smote, y_train_smote)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [207]:
xgb_model.score(X_val, y_val)

0.8682170542635659

In [208]:
y_pred_xgb = xgb_model.predict(X_val)

In [209]:
print(f"F1-score:     ", f1_score(y_val, y_pred_xgb))
print(f"Recall score: ", recall_score(y_val, y_pred_xgb))

F1-score:      0.5
Recall score:  0.40476190476190477


In [210]:
print(classification_report(y_val, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       216
           1       0.65      0.40      0.50        42

    accuracy                           0.87       258
   macro avg       0.77      0.68      0.71       258
weighted avg       0.85      0.87      0.86       258



## XGBoostRF Classifier

In [211]:
xgbrf_model = XGBRFClassifier(learning_rate = 0.0001, random_state = 42, n_estimators = 1000)

In [212]:
xgbrf_model.fit(X_train_smote, y_train_smote)

XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', learning_rate=0.0001,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=1000, n_jobs=0,
                num_parallel_tree=1000, objective='binary:logistic',
                random_state=42, reg_alpha=0, scale_pos_weight=1,
                tree_method='exact', validate_parameters=1, verbosity=None)

In [213]:
y_pred_xgbrf = xgbrf_model.predict(X_val)

In [214]:
print(classification_report(y_val, y_pred_xgbrf))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       216
           1       0.39      0.26      0.31        42

    accuracy                           0.81       258
   macro avg       0.63      0.59      0.60       258
weighted avg       0.79      0.81      0.80       258



In [215]:
xgbrf_model_acc = cross_validate(xgbrf_model, X_train_smote, y_train_smote, scoring='accuracy')
xgbrf_model_f1 = cross_validate(xgbrf_model, X_train_smote, y_train_smote, scoring='f1')
xgbrf_model_re = cross_validate(xgbrf_model, X_train_smote, y_train_smote, scoring='recall')

In [216]:
print('accuracy: ', xgbrf_model_acc['test_score'].mean())
print('f1:       ', xgbrf_model_f1['test_score'].mean())
print('recall:   ', xgbrf_model_re['test_score'].mean())

accuracy:  0.8786238065307833
f1:        0.8468923855233295
recall:    0.8310435301132977


### RandomizedSearchCV for XGBoostRF Classifier

In [217]:
params = {
    'learning_rate': loguniform(.001, 1),
    'booster': ['gbtree', 'gblinear', 'dart'],
    'gamma': loguniform(0.001, 1),
    'reg_alpha': loguniform(0.00001, 1),
    'reg_lambda': loguniform(0.00001, 1),
    'subsample': loguniform(0.5, 0.8),
    'colsample_bynode': loguniform(0.5, 0.8)
}

In [218]:
randsearch_xgbrf = RandomizedSearchCV(xgbrf_model, param_distributions = params, n_iter = 10, cv = 5,
                                      n_jobs = -1, scoring = ['f1', 'recall'], refit = 'f1',
                                      verbose = 1)

In [219]:
randsearch_xgbrf.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   45.9s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBRFClassifier(base_score=0.5, booster='gbtree',
                                             colsample_bylevel=1,
                                             colsample_bytree=1, gamma=0,
                                             gpu_id=-1, importance_type='gain',
                                             interaction_constraints='',
                                             learning_rate=0.0001,
                                             max_delta_step=0, max_depth=6,
                                             min_child_weight=1, missing=nan,
                                             monotone_constraints='()',
                                             n_estimators=1000, n_jobs=0,
                                             num_parallel_tree=1000,
                                             obje...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x

In [220]:
y_pred_rs_xgbrf = randsearch_xgbrf.predict(X_val)

In [221]:
print(classification_report(y_val, y_pred_rs_xgbrf))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       216
           1       0.46      0.29      0.35        42

    accuracy                           0.83       258
   macro avg       0.67      0.61      0.63       258
weighted avg       0.80      0.83      0.81       258



In [171]:
xgbrf_model.feature_importances_

array([0.00568387, 0.00580447, 0.00550472, 0.00872019, 0.02526303,
       0.02061308, 0.00892972, 0.00868248, 0.01360461, 0.1400478 ,
       0.02072521, 0.00856427, 0.01268584, 0.01136112, 0.00997499,
       0.00759482, 0.01135745, 0.01550745, 0.03975371, 0.0069609 ,
       0.02202924, 0.01079869, 0.00781583, 0.0132259 , 0.01137589,
       0.01041574, 0.02183324, 0.00815308, 0.01645139, 0.01708712,
       0.02309298, 0.01303383, 0.00609566, 0.01786266, 0.0082855 ,
       0.02097218, 0.01066124, 0.00811926, 0.00765431, 0.01691765,
       0.0522415 , 0.00866196, 0.01441193, 0.0133677 , 0.05654325,
       0.00824266, 0.02311758, 0.0541344 , 0.06491003, 0.04514776],
      dtype=float32)

In [172]:
xgbrf_model.get_xgb_params

<bound method XGBRFClassifier.get_xgb_params of XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', learning_rate=0.0001,
                max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=1000, n_jobs=0,
                num_parallel_tree=1000, objective='binary:logistic',
                random_state=42, reg_alpha=0, scale_pos_weight=1,
                tree_method='exact', validate_parameters=1, verbosity=None)>