# People Analytics - Best Model Parameter Tunning

**Importing relevant libraries**

In [72]:
# Main Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression # explicit class import from module
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier, XGBClassifier

# Utils
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from scipy.stats import loguniform

# Warnings
import warnings
warnings.filterwarnings('ignore') # Stopping warnings (deprecation warning, etc.)
pd.set_option('display.max_columns', None) # Showing all columns in the dateset, withoud '...' in between.

**Loading Dataset to Jupyter Notebook**

In [6]:
people = pd.read_csv("../raw_data/people_train.csv")

**Separating Daset into Predictor and Target varibles**

In [7]:
X = people.drop(columns = 'attrition')
y = people['attrition']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X,y, random_state = 0)

## Dataprep

### Balancing Target classes

In [9]:
sm = SMOTE(random_state=0)
X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)

In [10]:
from collections import Counter
print('before SMOTE :' , Counter(y_train))
print('after SMOTE :' , Counter(y_train_smote))

before SMOTE : Counter({0: 646, 1: 125})
after SMOTE : Counter({0: 646, 1: 646})


## Best Model

**Without SMOTE**

In [11]:
#sem smote
model = RandomForestClassifier(n_estimators=100)
sacc = cross_val_score(model, X_train, y_train, scoring='accuracy')
sf1 = cross_val_score(model, X_train, y_train, scoring='f1')
sre = cross_val_score(model, X_train, y_train, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.856028487641391
f1:  0.17134469467126667
recall:  0.168


**With SMOTE**

In [18]:
# Model with SMOTE
rfsmote_model = RandomForestClassifier(n_estimators=100)

In [19]:
# Accuracy score
score_acc = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='accuracy')

#F1 score
score_f1 = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='f1')

# Recall score
score_rec = cross_val_score(rfsmote_model, X_train_smote, y_train_smote, scoring='recall')

print('accuracy: ', score_acc.mean())
print('f1:       ', score_f1.mean())
print('recall:   ', score_rec.mean())

accuracy:  0.9034120499236777
f1:        0.8591742500474517
recall:    0.8418843172331545


In [53]:
# Accuracy score
score_acc2 = cross_val_score(rfsmote_model, X_val, y_val, scoring='accuracy')

#F1 score
score_f12 = cross_val_score(rfsmote_model, X_val, y_val, scoring='f1')

# Recall score
score_rec2 = cross_val_score(rfsmote_model, X_val, y_val, scoring='recall')

print('accuracy: ', score_acc2.mean())
print('f1:       ', score_f12.mean())
print('recall:   ', score_rec2.mean())

accuracy:  0.83710407239819
f1:        0.12444444444444444
recall:    0.09444444444444444


In [55]:
rfsmote_model.fit(X_train_smote, y_train_smote)

RandomForestClassifier()

In [58]:
rfsmote_model.score(X_val, y_val)

0.8527131782945736

## Parameter Tunning

1. With RandomSearch
2. With GridSearch

In [48]:
# Params for RandomizedSearchCV

rand_params = {
    'criterion': ['gini', 'entropy'],
#     'min_samples_split': np.random.ran,
#     'min_samples_leaf': np.random.randint(1, 5),
    'min_weight_fraction_leaf': loguniform(0.001, 1),
    'oob_score': [True, False],
#     'ccp_alpha': loguniform(0, 0.5)
}

In [65]:
rand_rfsmote_model = RandomizedSearchCV(rfsmote_model, 
                                        param_distributions = rand_params,
                                        n_iter = 10,
                                        n_jobs = -1,
                                        cv = 10,
                                        scoring = 'f1',
                                        random_state = 0,
                                        verbose = 1)

In [66]:
rand_rfsmote_model.fit(X_train_smote, y_train_smote)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   17.9s finished


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'min_weight_fraction_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x125f80e90>,
                                        'oob_score': [True, False]},
                   random_state=0, scoring='f1', verbose=1)

In [67]:
rand_rfsmote_model.score(X_val, y_val)

0.37681159420289856

In [70]:
y_pred = rand_rfsmote_model.predict(X_val)

In [71]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       216
           1       0.48      0.31      0.38        42

    accuracy                           0.83       258
   macro avg       0.68      0.62      0.64       258
weighted avg       0.81      0.83      0.82       258



## XGBoost

In [73]:
xgb_model = XGBClassifier()

In [75]:
xgb_model.fit(X_train_smote, y_train_smote)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [76]:
xgb_model.score(X_val, y_val)

0.8682170542635659

In [78]:
y_pred = xgb_model.predict(X_val)

In [79]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       216
           1       0.65      0.40      0.50        42

    accuracy                           0.87       258
   macro avg       0.77      0.68      0.71       258
weighted avg       0.85      0.87      0.86       258



In [80]:
xgbrf_model = XGBRFClassifier()

In [81]:
xgbrf_model.fit(X_val, y_val)

XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=0, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)

In [86]:
y_val

746    0
513    0
175    0
527    0
622    0
      ..
452    0
546    0
538    1
761    0
62     1
Name: attrition, Length: 258, dtype: int64