In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Note: you may 

In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE, RFECV 
from sklearn.svm import SVR, SVC

import optuna 

from cost_function import cost_function

##reading the data 
train = pd.read_csv('turnover_train.csv')
validation = pd.read_csv('turnover_val.csv')
test = pd.read_csv('turnover_test.csv')


In [6]:
## Changing sales and salary to dummies
train = pd.concat([train.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(train[['sales', 'salary']])], axis = 1)
validation = pd.concat([validation.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(validation[['sales', 'salary']])], axis = 1)
test = pd.concat([test.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(test[['sales', 'salary']])], axis = 1)


In [7]:
##engineering features for the decision tree model
train['interaction_1'] = np.where(((train['satisfaction_level'] >= .115) & 
                                     (train['satisfaction_level'] <= .465) & 
                                     (train['number_project'] > 2.5)), 1, 0)

train['interaction_2'] = np.where(((train['satisfaction_level'] <= .465) & 
                                     (train['number_project'] <= 2.5) & 
                                     (train['last_evaluation'] <= .575)), 1, 0)

train['interaction_3'] = np.where(((train['satisfaction_level'] > .465) & 
                                     (train['time_spend_company'] <= 4.5) & 
                                     (train['average_montly_hours'] <= 290.5)), 1, 0)

##engineering features for the decision tree model
validation['interaction_1'] = np.where(((validation['satisfaction_level'] >= .115) & 
                                     (validation['satisfaction_level'] <= .465) & 
                                     (validation['number_project'] > 2.5)), 1, 0)

validation['interaction_2'] = np.where(((validation['satisfaction_level'] <= .465) & 
                                     (validation['number_project'] <= 2.5) & 
                                     (validation['last_evaluation'] <= .575)), 1, 0)

validation['interaction_3'] = np.where(((validation['satisfaction_level'] > .465) & 
                                     (validation['time_spend_company'] <= 4.5) & 
                                     (validation['average_montly_hours'] <= 290.5)), 1, 0)

##engineering features for the decision tree model
test['interaction_1'] = np.where(((test['satisfaction_level'] >= .115) & 
                                     (test['satisfaction_level'] <= .465) & 
                                     (test['number_project'] > 2.5)), 1, 0)

test['interaction_2'] = np.where(((test['satisfaction_level'] <= .465) & 
                                     (test['number_project'] <= 2.5) & 
                                     (test['last_evaluation'] <= .575)), 1, 0)

test['interaction_3'] = np.where(((test['satisfaction_level'] > .465) & 
                                     (test['time_spend_company'] <= 4.5) & 
                                     (test['average_montly_hours'] <= 290.5)), 1, 0)

## Random Forest with Optuna

In [12]:
X = train[['interaction_3', 'interaction_1','satisfaction_level', 'time_spend_company', 'number_project']]
Y = train['left']

class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                        min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                        max_depth = trial.suggest_int('max_depth', 2, 10)
                        )
    
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            RF_md = RandomForestClassifier(**params).fit(X_train, Y_train)
            pred_valid = RF_md.predict_proba(X_valid)[:, 1]
            score = cost_function(Y_valid, pred_valid)
            scores.append(score[0])
            
        return np.mean(scores)

In [14]:
SEED = 42
N_TRIALS = 20

study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-24 17:48:35,034][0m A new study created in memory with name: no-name-6d3ad1f7-f12a-4b83-9b2e-8b1b26c442e0[0m
[32m[I 2023-03-24 17:48:50,862][0m Trial 0 finished with value: 159166.66666666666 and parameters: {'n_estimators': 1823, 'min_samples_split': 28, 'min_samples_leaf': 17, 'max_depth': 3}. Best is trial 0 with value: 159166.66666666666.[0m
[32m[I 2023-03-24 17:49:01,939][0m Trial 1 finished with value: 259166.66666666666 and parameters: {'n_estimators': 1006, 'min_samples_split': 25, 'min_samples_leaf': 30, 'max_depth': 9}. Best is trial 1 with value: 259166.66666666666.[0m
[32m[I 2023-03-24 17:49:15,508][0m Trial 2 finished with value: 278166.6666666667 and parameters: {'n_estimators': 1342, 'min_samples_split': 11, 'min_samples_leaf': 17, 'max_depth': 6}. Best is trial 2 with value: 278166.6666666667.[0m
[32m[I 2023-03-24 17:49:24,806][0m Trial 3 finished with value: 215166.66666666666 and parameters: {'n_estimators': 1014, 'min_samples_split': 27, 

In [15]:
study.best_trial.params

{'n_estimators': 341,
 'min_samples_split': 18,
 'min_samples_leaf': 5,
 'max_depth': 9}

In [19]:
##building the random forest model
from sklearn.metrics import confusion_matrix

RF_md = RandomForestClassifier(**study.best_trial.params).fit(X,Y)

##predicting on validation and test
X_val = validation[['interaction_3', 'interaction_1','satisfaction_level', 'time_spend_company', 'number_project']]
X_test = test[['interaction_3', 'interaction_1','satisfaction_level', 'time_spend_company', 'number_project']]
Y_val = validation['left']
Y_test = test['left']

RF_val_pred = RF_md.predict_proba(X_val)[:, 1]
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

##identify the optimal cutoff
opt_cutoff = cost_function(Y_val, RF_val_pred)[1]

##changing likelihoods into labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

conf_mat = confusion_matrix(Y_test, RF_label)
print(conf_mat)
print('The cost of the RF is:', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

[[1126   17]
 [  28  329]]
The cost of the RF is: 105500


## Ignore everything below

# GridSearchCV with ADABOOST

In [6]:
## GridSearchCV With adaboost
Ada_param_grid = {'n_estimators': [100, 300],
                    'base_estimator__min_samples_split': [10, 15],
                    'base_estimator__min_samples_leaf': [5, 7],
                    'base_estimator__max_depth': [3, 5, 7],
                    'learning_rate': [0.01]}


## running GridSearchCV
Ada_grid_search = GridSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_grid = Ada_param_grid, 
                               cv = 3,
                               scoring = 'f1',
                               n_jobs = -1).fit(X_train, Y_train)

##extracting the best hyper-parameter combination
Ada_grid_search.best_params_

'''
{'base_estimator__max_depth': 5,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__min_samples_split': 15,
 'learning_rate': 0.01,
 'n_estimators': 300}'''



{'base_estimator__max_depth': 5,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__min_samples_split': 15,
 'learning_rate': 0.01,
 'n_estimators': 300}

In [7]:
## building a random forest with best hyper-parameters
Ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 15, min_samples_leaf = 5, max_depth = 5),
                        n_estimators = 300,
                        learning_rate = .01).fit(X_train, Y_train)

##predicting on test
Ada_pred = Ada.predict_proba(X_test)[:, 1]

## changing liklihoods into labels 
Ada_labels = precision_recall_cutoff(Y_test, Ada_pred)

print(classification_report(Y_test, Ada_labels))



              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2286
           1       0.94      0.92      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.95      3000
weighted avg       0.97      0.97      0.97      3000



# RandomizedSearchCV with AdaBoost

In [11]:
## GridSearchCV With adaboost
Ada_param_grid = {'n_estimators': [100, 300],
                    'base_estimator__min_samples_split': [10, 15],
                    'base_estimator__min_samples_leaf': [5, 7],
                    'base_estimator__max_depth': [3, 5, 7],
                    'learning_rate': [0.01]}


## running GridSearchCV
Ada_grid_search = RandomizedSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_distributions = Ada_param_grid, 
                               cv = 3,
                               scoring = 'f1',
                               n_jobs = -1,
                               n_iter = 10).fit(X_train, Y_train)

##extracting the best hyper-parameter combination
Ada_grid_search.best_params_

'''
{'n_estimators': 100,
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 10,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__max_depth': 7}
 '''



{'n_estimators': 100,
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 10,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__max_depth': 7}

In [12]:
## building a random forest with best hyper-parameters
Ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 5, max_depth = 7),
                        n_estimators = 100,
                        learning_rate = .01).fit(X_train, Y_train)

##predicting on test
Ada_pred = Ada.predict_proba(X_test)[:, 1]

## changing liklihoods into labels 
Ada_labels = precision_recall_cutoff(Y_test, Ada_pred)

print(classification_report(Y_test, Ada_labels))



              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.94      0.92      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.95      3000
weighted avg       0.97      0.97      0.97      3000



In [None]:
## Based on my results, I would go with the model from part 6 to predict left. 