In [25]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [22]:
turnover['left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [26]:
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)

In [27]:
scaler = MinMaxScaler()

turnover[['number_project', 'average_montly_hours']] == scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

transformed_time_spend = boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transformed_time_spend[0]

In [28]:
## Define input and target ##
X = turnover.drop(columns = 'left', axis = 1)
Y = turnover['left']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [30]:
## Model ##
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 5).fit(X_train, Y_train)

## Prediction ##
rf_pred = rf_md.predict_proba(X_test)[:, 1]

## Label ##
rf_label = np.where(rf_pred < .1, 0, 1)

print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.99      0.67      0.80      2286
           1       0.48      0.98      0.64       714

    accuracy                           0.74      3000
   macro avg       0.74      0.82      0.72      3000
weighted avg       0.87      0.74      0.76      3000



In [31]:
## Model ##
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5), n_estimators = 500, learning_rate = .01).fit(X_train, Y_train)

## Prediction ##
ada_pred = ada_md.predict_proba(X_test)[:, 1]

## Label ##
ada_label = np.where(ada_pred < .1, 0, 1)

print(classification_report(Y_test, ada_label))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50      2286
           1       0.32      1.00      0.48       714

    accuracy                           0.49      3000
   macro avg       0.66      0.66      0.49      3000
weighted avg       0.83      0.49      0.49      3000



Based on my results, Random Forest is the best model to use