In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-

# Imports

In [17]:
import boto3
import optuna
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import boxcox
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE, RFECV

from cost_function import cost_function
from precision_recall_cutoff import precision_recall_cutoff

# Data

In [7]:
# Load the train / test / validation turnover CSVs (paths are relative to the working directory).
train, test, val = (
    pd.read_csv(csv_name)
    for csv_name in ('turnover_train.csv', 'turnover_test.csv', 'turnover_val.csv')
)

# Feature Engineering

In [8]:
## Feature Engineering ##
def engineer_features(df):
    """Return a new frame with dummy-encoded categoricals and three interaction flags.

    The same transformation is applied to the train, test and validation
    splits, so it lives in one place instead of three copy-pasted blocks.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``satisfaction_level``, ``number_project``,
        ``last_evaluation`` and ``time_spend_company``. The categorical
        columns ``sales`` and ``salary`` are dummy-encoded when present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``sales``/``salary``
        replaced by their dummy columns and the 0/1 columns
        ``interaction_1``, ``interaction_2`` and ``interaction_3`` added.
    """
    ## Create Dummies ##
    # Only encode the categorical columns that are still present, so re-running
    # this cell on an already processed frame does not raise a KeyError.
    categorical = [col for col in ('sales', 'salary') if col in df.columns]
    if categorical:
        df = pd.concat([df.drop(columns = categorical), pd.get_dummies(df[categorical])], axis = 1)
    else:
        df = df.copy()

    satisfaction = df['satisfaction_level']
    projects = df['number_project']

    ## Interaction flags ##
    df['interaction_1'] = np.where((satisfaction >= .115) &
                                   (satisfaction <= .465) &
                                   (projects > 2.5), 1, 0)

    df['interaction_2'] = np.where((satisfaction >= .465) &
                                   (projects <= 2.5) &
                                   (df['last_evaluation'] <= .575), 1, 0)

    # NOTE(review): the 290.5 threshold on number_project is on a very different
    # scale from the 2.5 threshold used above, so this condition is presumably
    # always true and the cut may have been meant for another column. Kept
    # unchanged so the engineered feature stays the same -- TODO confirm.
    df['interaction_3'] = np.where((satisfaction >= .465) &
                                   (df['time_spend_company'] <= 4.5) &
                                   (projects <= 290.5), 1, 0)

    return df


## Train ##
train = engineer_features(train)

## Test ##
test = engineer_features(test)

## Validation ##
val = engineer_features(val)

In [9]:
# Target column and the predictor columns used for hyper-parameter tuning and the final fit.
Y = train.loc[:, 'left']
X = train.loc[:, ['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]

class Objective:
    """Optuna objective: mean 3-fold cross-validated score of a random forest.

    The callable reads the module-level ``X`` (features) and ``Y`` (target),
    so both must be defined before a study is optimized.

    Parameters
    ----------
    seed : int
        Seeds both the stratified K-fold shuffling and every random forest
        fitted inside a trial, so a given set of hyper-parameters always
        receives the same score.
    """
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        """Score one set of hyper-parameters suggested by ``trial``.

        Returns the mean, over the three folds, of the first element returned
        by ``cost_function(Y_valid, predicted_probabilities)``.
        """
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10)
                     )
        scores = list()

        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)

        for train_idx, valid_idx in skf.split(X, Y):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

            # Seed the forest as well: without it the score of a trial varies
            # from run to run even though the folds are fixed.
            rf_md = RandomForestClassifier(random_state = self.seed, **params).fit(X_train, Y_train)
            pred_valid = rf_md.predict_proba(X_valid)[:, 1]

            # cost_function returns a sequence; element 0 is used as the fold
            # score (presumably the best value over candidate cutoffs -- TODO confirm).
            score = cost_function(Y_valid, pred_valid)
            scores.append(score[0])

        return np.mean(scores)


In [12]:
# Seed the sampler so the search proposes the same sequence of trials on every
# run; the default sampler is otherwise randomly initialised.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 42))
study.optimize(Objective(42), n_trials = 20)

[32m[I 2023-03-24 22:27:35,467][0m A new study created in memory with name: no-name-ac974ba7-6a0a-4c3a-b807-3218ce447fd6[0m
[32m[I 2023-03-24 22:27:40,996][0m Trial 0 finished with value: 287666.6666666667 and parameters: {'n_estimators': 445, 'min_samples_split': 24, 'min_samples_leaf': 15, 'max_depth': 10}. Best is trial 0 with value: 287666.6666666667.[0m
[32m[I 2023-03-24 22:27:48,659][0m Trial 1 finished with value: 219166.66666666666 and parameters: {'n_estimators': 838, 'min_samples_split': 20, 'min_samples_leaf': 20, 'max_depth': 4}. Best is trial 0 with value: 287666.6666666667.[0m
[32m[I 2023-03-24 22:27:51,403][0m Trial 2 finished with value: 260166.66666666666 and parameters: {'n_estimators': 241, 'min_samples_split': 30, 'min_samples_leaf': 30, 'max_depth': 8}. Best is trial 0 with value: 287666.6666666667.[0m
[32m[I 2023-03-24 22:28:12,361][0m Trial 3 finished with value: 263166.6666666667 and parameters: {'n_estimators': 1962, 'min_samples_split': 14, 'min_

In [13]:
# Hyper-parameters of the best trial found by the study
study.best_params

{'n_estimators': 1420,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_depth': 10}

In [19]:
## Model ##
# Refit on the full training set with the tuned hyper-parameters. The forest is
# seeded (same value that is passed to Objective during tuning) so the cost
# reported below is reproducible across runs.
md = RandomForestClassifier(random_state = 42, **study.best_trial.params).fit(X, Y)

# Reuse the training feature columns so validation / test always match the fit
feature_cols = list(X.columns)
X_val = val[feature_cols]
X_test = test[feature_cols]

Y_val = val['left']
Y_test = test['left']

## Prediction ##
val_pred = md.predict_proba(X_val)[:, 1]
test_pred = md.predict_proba(X_test)[:, 1]

# The cutoff is taken from the validation set only (element 1 of cost_function's
# return value) and then applied unchanged to the test predictions.
cutoff = cost_function(Y_val, val_pred)[1]

label = np.where(test_pred < cutoff, 0, 1)

# confusion_matrix rows are the true class, columns the predicted class
mat = confusion_matrix(Y_test, label)
false_neg, false_pos, true_pos = mat[1, 0], mat[0, 1], mat[1, 1]

## Metric ##
# Each false negative costs 1000, each false positive costs 1500, and each
# true positive is worth 500.
print('The cost of the rf model is', -1000 * false_neg - 1500 * false_pos + 500 * true_pos)

The cost of the rf model is 109500
