In [2]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE, RFECV

from precision_recall_cutoff import precision_recall_cutoff

In [3]:
## Where the data lives: bucket name and object key of the CSV
bucket_name = 'ryan-greiner-bucket'
file_key = 'predictive_analytics/turnover.csv'

## Resolving the bucket and the object inside it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## Requesting the object; the 'Body' entry of the response is handed to pandas
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the CSV into a DataFrame and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# Feature Engineering

In [4]:
## Create Dummies ##
# One-hot encode the two categorical columns and append them to the remaining columns.
categorical_cols = ['sales', 'salary']
turnover = pd.concat([turnover.drop(columns = categorical_cols),
                      pd.get_dummies(turnover[categorical_cols])], axis = 1)

## Feature Engineering ##
# Binary flags built from threshold rules on the raw (unscaled) columns.
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) &
                                     (turnover['satisfaction_level'] <= .465) &
                                     (turnover['number_project'] > 2.5), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['number_project'] <= 2.5) &
                                     (turnover['last_evaluation'] <= .575), 1, 0)

# NOTE(review): the original compared number_project with 290.5, which is always
# true (project counts are single digits). The 290.5 cut-off matches the scale of
# average_montly_hours, so that column is used here -- confirm against the tree
# the thresholds were read from.
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['time_spend_company'] <= 4.5) &
                                     (turnover['average_montly_hours'] <= 290.5), 1, 0)


## Inputs and target ##
# 'Unnamed: 0' appears to be the row number saved with the CSV, not an employee
# attribute, so it is excluded from the inputs (ignored if the column is absent).
X_raw = turnover.drop(columns = ['left', 'Unnamed: 0'], errors = 'ignore')
Y = turnover['left']

## split data ##
# Split BEFORE scaling so the scaler never sees the test rows; a fixed seed makes
# the split reproducible across Restart & Run All.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X_raw, Y, test_size = .2, stratify = Y,
                                                            random_state = 42)

## scale ##
# Learn min/max on the training rows only, then apply the same transform everywhere.
scaler = MinMaxScaler().fit(X_train_raw)
X_train = pd.DataFrame(scaler.transform(X_train_raw), columns = X_raw.columns, index = X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns = X_raw.columns, index = X_test_raw.index)

# Full scaled input frame, kept under its original name for any later cell that uses it.
X = pd.DataFrame(scaler.transform(X_raw), columns = X_raw.columns, index = X_raw.index)

  self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(


# Random Forest

### RFECV

In [7]:
## Recursive feature elimination with 3-fold CV, driven by a shallow random forest ##
rfe_forest = RandomForestClassifier(n_estimators = 500, max_depth = 3)
selection = RFECV(estimator = rfe_forest, step = 1, min_features_to_select = 2, cv = 3)
selection.fit(X_train, Y_train)

## feature names ##
# support_ is a boolean mask over the input columns; True marks a retained feature.
kept_columns = X_train.columns[selection.support_]
print(kept_columns)

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'interaction_1', 'interaction_2', 'interaction_3'],
      dtype='object')


### Model 1

In [8]:
## New Inputs ##
# Columns printed by the RFECV cell; listed once and reused for both splits.
selected_features = ['satisfaction_level', 'last_evaluation', 'number_project',
                     'average_montly_hours', 'time_spend_company', 'Work_accident',
                     'interaction_1', 'interaction_2', 'interaction_3']

X_train_2 = X_train[selected_features]
X_test_2 = X_test[selected_features]

## Fit the baseline forest and score the hold-out rows ##
md = RandomForestClassifier(n_estimators = 500, max_depth = 3)
md.fit(X_train_2, Y_train)

# Second column of predict_proba, i.e. the probability of left == 1
pred = md.predict_proba(X_test_2)[:, 1]

# NOTE(review): precision_recall_cutoff receives Y_test, so the cutoff is presumably
# chosen using the test labels -- confirm, as that would make the report optimistic.
label = precision_recall_cutoff(Y_test, pred)
print(classification_report(Y_test, label))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.90      0.89      0.90       714

    accuracy                           0.95      3000
   macro avg       0.94      0.93      0.93      3000
weighted avg       0.95      0.95      0.95      3000



### GridSearchCV

In [11]:
## Grid ##
# Candidate hyper-parameter values; every combination is evaluated.
rf_grid = dict(n_estimators = [100, 300, 500],
               min_samples_split = [10, 15],
               min_samples_leaf = [5, 7],
               max_depth = [3, 5, 7])

## Search ##
# 5-fold cross-validated search over the grid, ranked by F1.
rf_search = GridSearchCV(estimator = RandomForestClassifier(),
                         param_grid = rf_grid,
                         scoring = 'f1',
                         cv = 5)
rf_search.fit(X_train_2, Y_train)

# Best combination found by the search
rf_search.best_params_

{'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 300}

### Model 2 

In [12]:
## Forest configured with the hyper-parameters reported by the grid search ##
best_rf_params = {'n_estimators': 300,
                  'max_depth': 7,
                  'min_samples_leaf': 5,
                  'min_samples_split': 10}

md = RandomForestClassifier(**best_rf_params)
md.fit(X_train_2, Y_train)

# Second column of predict_proba, i.e. the probability of left == 1
pred = md.predict_proba(X_test_2)[:, 1]

# Convert probabilities to class labels with the project helper, then report
label = precision_recall_cutoff(Y_test, pred)
print(classification_report(Y_test, label))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2286
           1       0.96      0.91      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.96      3000
weighted avg       0.97      0.97      0.97      3000



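Based on these results, I would use Model 2 (the random forest tuned with GridSearchCV) to predict `left`: on the test set it outperforms Model 1 for the positive class on precision (0.96 vs. 0.90), recall (0.91 vs. 0.89), and F1-score (0.93 vs. 0.90), and it has higher overall accuracy (0.97 vs. 0.95).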