In [7]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from scipy.stats import boxcox
# Where the dataset lives: the S3 bucket and the key of the CSV inside it
bucket_name = 'rachaeld-data445'
file_key = 'turnover.csv'

# Handle on the bucket through the boto3 S3 resource
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

# Request the object and take the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Parse the body as CSV and preview the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [8]:
##how many rows fall under each value of the target column `left`
left_counts = turnover['left'].value_counts()
left_counts

0    11428
1     3571
Name: left, dtype: int64

In [9]:
## one-hot encode sales and salary: get_dummies removes the two listed columns
## and appends their indicator columns (sales_*, salary_*) at the end of the frame
turnover = pd.get_dummies(turnover, columns = ['sales', 'salary'])
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
##columns that get rescaled to the 0-1 range
minmax_cols = ['number_project', 'average_montly_hours']

##fit the min-max scaler on those columns, then overwrite them with the scaled values
scaler = MinMaxScaler()
scaler.fit(turnover[minmax_cols])
turnover[minmax_cols] = scaler.transform(turnover[minmax_cols])

##boxcox transformation of time_spend_company
##(no lambda is passed, so boxcox returns a tuple; element 0 is the transformed data)
transform_time_spend = boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transform_time_spend[0]

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [13]:
##defining input and target variables
## 'Unnamed: 0' is the row counter that came in with the CSV (0, 1, 2, ... in
## the previews above); it identifies a row rather than describing an employee,
## so it is kept out of the inputs. errors = 'ignore' lets the cell still run
## if that column has already been removed from the frame.
X = turnover.drop(columns = ['left', 'Unnamed: 0'], errors = 'ignore')
Y = turnover['left']

##splitting the data: 80% train / 20% test, stratified on the target so both
##parts keep the same share of each class; random_state pins the shuffle so a
##rerun produces the same split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y, random_state = 42)

In [16]:
### RANDOM FOREST ### 
## 500 trees of depth 3; random_state fixes the forest's internal randomness
## so the fitted model and the report below are the same on every rerun
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train, Y_train)

##predicting on test: second predict_proba column, i.e. the likelihood of left = 1
RF_prob = RF_md.predict_proba(X_test)[:, 1]

##changing likelihoods to labels: below the .22 cutoff -> 0, otherwise -> 1
RF_pred = np.where(RF_prob < .22, 0, 1)

##printing the classification report
print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      2286
           1       0.67      0.95      0.79       714

    accuracy                           0.88      3000
   macro avg       0.83      0.90      0.85      3000
weighted avg       0.91      0.88      0.88      3000



In [17]:
### ADABOOST ###
## The depth-3 tree is passed positionally: the keyword for the weak learner was
## renamed from base_estimator to estimator in newer scikit-learn releases, and
## the first positional slot works under either name.
## random_state makes the fitted ensemble reproducible across reruns.
ADA_md = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = .01, random_state = 42).fit(X_train, Y_train)

##predicting on test: second predict_proba column, i.e. the likelihood of left = 1
ADA_prob = ADA_md.predict_proba(X_test)[:, 1]

##changing likelihoods to labels: below the .22 cutoff -> 0, otherwise -> 1
## NOTE(review): with this cutoff the report shows recall 1.00 for class 1 but
## only 0.30 for class 0, i.e. most rows end up labelled 1. The AdaBoost scores
## may not sit on the same scale as the random forest's -- confirm that .22 is
## an appropriate cutoff for this model before comparing the two reports.
ADA_pred = np.where(ADA_prob < .22, 0, 1)

##printing the classification report
print(classification_report(Y_test, ADA_pred))

              precision    recall  f1-score   support

           0       1.00      0.30      0.47      2286
           1       0.31      1.00      0.47       714

    accuracy                           0.47      3000
   macro avg       0.65      0.65      0.47      3000
weighted avg       0.84      0.47      0.47      3000



In [None]:
##Based on my results, I would use the random forest model to predict left, because it has the better results overall.