In [8]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

from precision_recall_cutoff import precision_recall_cutoff

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [9]:
## Create dummies for salary and sales ##
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
scaler = MinMaxScaler()

## Scaling data
turnover[['number_project', 'average_montly_hours']] == scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

## Box cox transformation
transformed_time_spend = boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transformed_time_spend[0]

In [10]:
## Interactions ##
turnover['interaction_1'] = turnover['satisfaction_level'] * turnover['time_spend_company']
turnover['interaction_2'] = turnover['last_evaluation'] + turnover['promotion_last_5years']
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.38,0.53,2,157,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.305767,0.53
1,0.8,0.86,5,262,1.098118,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.878494,0.86
2,0.11,0.88,7,272,0.941381,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.103552,0.88
3,0.72,0.87,5,223,1.03233,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.743278,0.87
4,0.37,0.52,2,159,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.297721,0.52


In [11]:
## Inputs and Target ##
X = turnover.drop(columns = 'left', axis = 1)
Y = turnover['left']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

## Random Forest 1 ##

In [12]:
## Model ##
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

## Prediction ##
rf_pred = rf_md.predict_proba(X_test)[:, 1]

## Use local function for labels ##
rf_label = precision_recall_cutoff(Y_test, rf_pred)

## Results ##
print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      2286
           1       0.83      0.87      0.85       714

    accuracy                           0.93      3000
   macro avg       0.90      0.91      0.90      3000
weighted avg       0.93      0.93      0.93      3000



## Random Forest 2 ##

In [13]:
## Remove Interactions ##
X_train_new = X_train.drop(columns = ['interaction_1', 'interaction_2'], axis = 1)
X_test_new = X_test.drop(columns = ['interaction_1', 'interaction_2'], axis = 1)

## Model ##
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_new, Y_train)

## Prediction ##
rf_pred = rf_md.predict_proba(X_test_new)[:, 1]

## Use local function for labels ##
rf_label = precision_recall_cutoff(Y_test, rf_pred)

## Results ##
print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      2286
           1       0.94      0.68      0.79       714

    accuracy                           0.91      3000
   macro avg       0.93      0.83      0.87      3000
weighted avg       0.92      0.91      0.91      3000



Based on my results, I would use the model without the interaction terms.