In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
Note: you may need to restart the kernel to use updated packages.


In [18]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from scipy.stats import boxcox
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE, RFECV

from cost_function import cost_function, cost_function_cutoff
from precision_recall_cutoff import precision_recall_cutoff

In [9]:
## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [10]:
## Create Dummies ##
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)

## Feature Engineering ##
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) &
                                     (turnover['satisfaction_level'] <= .465) &
                                     (turnover['number_project'] > 2.5), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['number_project'] <= 2.5) &
                                     (turnover['last_evaluation'] <= .575), 1, 0)

turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['time_spend_company'] <= 4.5) &
                                     (turnover['number_project'] <= 290.5), 1, 0)


## Inputs and target ##
scaler = MinMaxScaler()
X = turnover[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
# X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
Y = turnover['left']

## split data ##
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

# Random Forest

In [11]:
## Grid ##
rf_grid = {'n_estimators': [100, 300, 500],
           'min_samples_split': [10, 15],
           'min_samples_leaf': [5, 7],
           'max_depth': [3, 5, 7]}

## Score Function ##
score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## Search ##
rf_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = rf_grid, cv = 3, scoring = score_function, n_jobs = -1).fit(X_train, Y_train)
rf_md = rf_search.best_estimator_

## Prediction ##
rf_pred = rf_md.predict_proba(X_test)[:, 1]

## cutoff ##
rf_cut = cost_function_cutoff(Y_test, rf_pred)

## Labels ##
rf_label = np.where(rf_pred < rf_cut, 0, 1)

## Evaluate ##
rf_mat = confusion_matrix(Y_test, rf_label)
print(rf_mat)
print('The cost of the rf model is', 1500*rf_mat[1,0] - 1000*rf_mat[0,1] + 500*rf_mat[1,1])

[[2260   26]
 [  55  659]]
The cost of the rf model is 386000


# XG Boost

In [19]:
xg_grid = {'n_estimators': [500],
           'max_depth': [3, 5, 7],
           'min_child_weight': [5, 7],
           'learning_rate': [0.01],
           'gamma': [0.3, 0.1],
           'subsample': [0.8, 1],
           'colsample_bytree': [1]}

## Score Function ##
score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## Search ##
xg_search = GridSearchCV(estimator = XGBClassifier(), param_grid = xg_grid, cv = 3, scoring = score_function, n_jobs = -1).fit(X_train, Y_train)
xg_md = xg_search.best_estimator_

## Prediction ##
xg_pred = xg_md.predict_proba(X_test)[:, 1]

## cutoff ##
xg_cut = cost_function_cutoff(Y_test, xg_pred)

## Labels ##
xg_label = np.where(xg_pred < xg_cut, 0, 1)

## Evaluate ##
xg_mat = confusion_matrix(Y_test, xg_label)
print(xg_mat)
print('The cost of the xg model is', 1500*xg_mat[1,0] - 1000*xg_mat[0,1] + 500*xg_mat[1,1])

[[2262   24]
 [  61  653]]
The cost of the xg model is 394000


Based on my results, the random forest model is best for predicting left