In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

from precision_recall_cutoff import precision_recall_cutoff

## connecting to S3 and pointing at the bucket that holds the data
bucket_name = 'rachaeld-data445'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## locating the csv file inside the bucket
file_key = 'turnover.csv'
bucket_object = bucket.Object(file_key)

## requesting the object and taking the 'Body' entry out of the response
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the csv content into a data-frame and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Replacing the sales and salary columns with their dummy variables;
## get_dummies keeps the remaining columns first and appends the
## '<column>_<level>' indicators after them
categorical_columns = ['sales', 'salary']
turnover = pd.get_dummies(turnover, columns = categorical_columns)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
##engineering features for the decision tree model
##each interaction is a 0/1 flag that is 1 only when all of its conditions hold
satisfaction = turnover['satisfaction_level']
n_projects = turnover['number_project']
evaluation = turnover['last_evaluation']
time_spent = turnover['time_spend_company']
monthly_hours = turnover['average_montly_hours']

##satisfaction within [.115, .465] and more than 2.5 projects
flag_1 = satisfaction.between(.115, .465) & (n_projects > 2.5)
turnover['interaction_1'] = flag_1.astype(int)

##satisfaction at most .465, at most 2.5 projects and evaluation at most .575
flag_2 = (satisfaction <= .465) & (n_projects <= 2.5) & (evaluation <= .575)
turnover['interaction_2'] = flag_2.astype(int)

##satisfaction above .465, at most 4.5 in time_spend_company and at most 290.5 monthly hours
flag_3 = (satisfaction > .465) & (time_spent <= 4.5) & (monthly_hours <= 290.5)
turnover['interaction_3'] = flag_3.astype(int)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [5]:
##defining input and target variables
##every column except 'left' is an input; 'left' is the target
X = turnover.drop(columns = 'left')
Y = turnover['left']

##splitting the data
##NOTE(review): the original statement stopped at `= train`, which fails at run time
##and leaves X_train / Y_train undefined for the cells below. The call is reconstructed
##to mirror the stratified 80/20 split used in the feature-importance loop -- confirm
##the intended test_size.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [7]:
##defining the list to store the feature importances of every run
importance_runs = list()

for i in range(0, 10):
    ##splitting the train data
    ##the run number seeds the split, so the 10 runs differ from each other
    ##but the whole cell gives the same table every time it is executed
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = .2, stratify = Y_train, random_state = i)

    ##building the RF model (seeded with the run number for the same reason)
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(X_training, Y_training)

    ##extracting the feature importances
    importance_runs.append(RF.feature_importances_)

##transforming list into data-frame: one row per run, one column per feature;
##the labels come from the frame the forests were fitted on, so they always
##line up with feature_importances_
results = pd.DataFrame(importance_runs, columns = X_train.columns)

In [8]:
## displaying the importances: one row per run of the loop, one column per input variable
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.196756,0.038168,0.096309,0.069272,0.079123,0.009175,0.000448,9.5e-05,0.000477,0.000143,...,8.1e-05,6.5e-05,3.7e-05,6.8e-05,0.00516,0.004524,0.00069,0.044494,0.195893,0.25839
1,0.180905,0.039681,0.125308,0.065694,0.08139,0.01228,0.000465,2.5e-05,0.000374,8.7e-05,...,7.6e-05,0.000109,9.2e-05,0.000121,0.004355,0.005887,0.001304,0.044317,0.200738,0.236215
2,0.196146,0.03891,0.113055,0.064285,0.086518,0.009994,0.001031,5e-05,0.000234,8.6e-05,...,8.9e-05,9.5e-05,9.6e-05,5.9e-05,0.003808,0.005855,0.000894,0.045976,0.196013,0.23635
3,0.179744,0.043437,0.12857,0.060493,0.083236,0.012359,0.000965,6.4e-05,0.000335,8.3e-05,...,6.2e-05,4e-05,3.6e-05,0.000188,0.004198,0.003751,0.000614,0.037052,0.18319,0.261262
4,0.18556,0.045073,0.132134,0.070658,0.079979,0.011952,0.000522,5.4e-05,0.000318,5.6e-05,...,7.5e-05,3.8e-05,6.9e-05,0.000139,0.006209,0.005722,0.001375,0.031979,0.185665,0.241946
5,0.194018,0.043509,0.118736,0.059071,0.072487,0.010002,0.000504,2.8e-05,0.000446,8e-05,...,9.3e-05,9.7e-05,4.1e-05,0.0001,0.005078,0.006022,0.001007,0.042552,0.179071,0.26667
6,0.189933,0.040367,0.110891,0.070112,0.077295,0.010423,0.000409,7.1e-05,0.000662,6.8e-05,...,4.3e-05,7.5e-05,1.1e-05,0.000271,0.004336,0.006074,0.000516,0.04825,0.197063,0.242659
7,0.180268,0.040549,0.111167,0.059558,0.079288,0.0124,0.000481,7.1e-05,0.00072,7e-05,...,0.000108,8.3e-05,9e-05,0.000107,0.005122,0.008034,0.001134,0.040625,0.211619,0.247919
8,0.198249,0.04063,0.122431,0.057345,0.081254,0.011973,0.000904,3.7e-05,0.000413,4.5e-05,...,3.5e-05,6.7e-05,0.000101,8.9e-05,0.006306,0.006306,0.001602,0.045702,0.184211,0.241301
9,0.214287,0.041867,0.111967,0.058493,0.091061,0.008951,0.000905,5.4e-05,0.000395,0.000199,...,4.5e-05,3e-05,1.7e-05,0.000115,0.004863,0.009452,0.00122,0.044325,0.164603,0.246014


In [9]:
## collapsing the runs into a single mean importance per feature
mean_importance = results.apply(np.mean, axis = 0)

## back to a data-frame: features on the index, the means in a column labelled 0
results = mean_importance.to_frame()
results

Unnamed: 0,0
satisfaction_level,0.191587
last_evaluation,0.041219
number_project,0.117057
average_montly_hours,0.063498
time_spend_company,0.081163
Work_accident,0.010951
promotion_last_5years,0.000663
sales_IT,5.5e-05
sales_RandD,0.000438
sales_accounting,9.2e-05


In [11]:
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results

Unnamed: 0,Feature,Importance
0,satisfaction_level,0.191587
1,last_evaluation,0.041219
2,number_project,0.117057
3,average_montly_hours,0.063498
4,time_spend_company,0.081163
5,Work_accident,0.010951
6,promotion_last_5years,0.000663
7,sales_IT,5.5e-05
8,sales_RandD,0.000438
9,sales_accounting,9.2e-05


In [12]:
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.247873
0,satisfaction_level,0.191587
21,interaction_2,0.189807
2,number_project,0.117057
4,time_spend_company,0.081163
3,average_montly_hours,0.063498
20,interaction_1,0.042527
1,last_evaluation,0.041219
5,Work_accident,0.010951
18,salary_low,0.006163
