# Imports

In [1]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import boxcox
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE, RFECV

from precision_recall_cutoff import precision_recall_cutoff

Matplotlib is building the font cache; this may take a moment.


# Data

In [2]:
## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# Feature Engineering

In [3]:
## Create Dummies ##
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)

## Feature Engineering ##
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) &
                                     (turnover['satisfaction_level'] <= .465) &
                                     (turnover['number_project'] > 2.5), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['number_project'] <= 2.5) &
                                     (turnover['last_evaluation'] <= .575), 1, 0)

turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['time_spend_company'] <= 4.5) &
                                     (turnover['number_project'] <= 290.5), 1, 0)


## Inputs and target ##
scaler = MinMaxScaler()
X = turnover[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
# X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
Y = turnover['left']

## split data ##
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

# Ada Boost
#### GridSearchCV

In [6]:
## Grid ##
ada_grid = {'n_estimators': [100, 300],
                  'base_estimator__min_samples_split': [10, 15],
                  'base_estimator__min_samples_leaf': [5, 7],
                  'base_estimator__max_depth': [3, 5, 7],
                  'learning_rate': [.01]}

## Search ##
grid_search = GridSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), 
                           param_grid = ada_grid, 
                           cv = 3, scoring = 'f1', 
                           n_jobs = -1).fit(X_train, Y_train)
grid_search.best_params_

{'base_estimator__max_depth': 5,
 'base_estimator__min_samples_leaf': 7,
 'base_estimator__min_samples_split': 15,
 'learning_rate': 0.01,
 'n_estimators': 300}



### Model 1

In [10]:
## Model ##
md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 7, min_samples_split = 15), 
                        learning_rate =.1, 
                        n_estimators = 300).fit(X_train, Y_train)

## Prediction ##
pred = md.predict_proba(X_test)[:, 1]
label = precision_recall_cutoff(Y_test, pred)

## Metric ##
print(classification_report(Y_test, label))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.93      0.93      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.97      0.97      0.97      3000





#### RandomizedSearchCV

In [12]:
random_search = RandomizedSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                                  param_distributions = ada_grid,
                                  cv = 3,
                                  scoring = 'f1',
                                  n_jobs = -1,
                                  n_iter = 10).fit(X_train, Y_train)

random_search.best_params_

{'n_estimators': 300,
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 10,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__max_depth': 5}

### Model 2

In [13]:
## Model ##
md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 5, min_samples_split = 10), 
                        learning_rate =.1, 
                        n_estimators = 300).fit(X_train, Y_train)

## Prediction ##
pred = md.predict_proba(X_test)[:, 1]
label = precision_recall_cutoff(Y_test, pred)

## Metric ##
print(classification_report(Y_test, label))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.93      0.94      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.96      0.96      3000
weighted avg       0.97      0.97      0.97      3000





Both models preformed similarly enough that it is hard to tell which is better for predicting left. Because of this, I would use the second model because it is the simpler model. 