In [1]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

from precision_recall_cutoff import precision_recall_cutoff

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Create Dummies ##
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
## Feature Engineering ##
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) &
                                     (turnover['satisfaction_level'] <= .465) &
                                     (turnover['number_project'] > 2.5), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['number_project'] <= 2.5) &
                                     (turnover['last_evaluation'] <= .575), 1, 0)

turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= .465) &
                                     (turnover['time_spend_company'] <= 4.5) &
                                     (turnover['number_project'] <= 290.5), 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [4]:
## Inputs and target ##
X = turnover.drop(columns = 'left', axis = 1)
Y = turnover['left']

## split data ##
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [16]:
## list to store importances ##
results, importance = list(), list()

for i in range(0, 10):
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = .2, stratify = Y_train)
    
    ## model ##
    md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    importance.append(md.feature_importances_)
    results.append(md.feature_importances_)
    
results = pd.DataFrame(results)
results.columns = X.columns

In [17]:
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.233142,0.044331,0.143993,0.077592,0.098247,0.017945,0.001568,2.4e-05,0.000792,0.000157,...,7.2e-05,4.9e-05,0.00028,0.000236,0.008647,0.011615,0.002031,0.060745,0.001925,0.29545
1,0.229071,0.049476,0.145846,0.095824,0.090485,0.016759,0.001025,3.2e-05,0.000331,0.000121,...,0.000123,0.000114,0.000111,0.000196,0.007151,0.008105,0.001355,0.051728,0.003485,0.297705
2,0.246429,0.049593,0.147967,0.0727,0.106369,0.018147,0.001049,3.2e-05,0.000398,2.1e-05,...,8e-05,6.1e-05,8.3e-05,0.000227,0.006936,0.01136,0.001656,0.046867,0.002862,0.286438
3,0.239515,0.05296,0.168371,0.088165,0.092102,0.013488,0.000898,7.2e-05,0.000309,0.000153,...,0.000122,4.1e-05,4.7e-05,8.7e-05,0.006748,0.009163,0.002541,0.052045,0.00296,0.26949
4,0.238386,0.046562,0.152259,0.093358,0.102368,0.015392,0.001384,8.7e-05,0.000252,5.1e-05,...,0.000116,4.7e-05,0.00015,0.000201,0.00577,0.011239,0.00152,0.046954,0.002872,0.27996
5,0.22294,0.049172,0.140245,0.102261,0.107948,0.020117,0.0008,3.2e-05,0.000526,7e-05,...,6.4e-05,6.2e-05,2.8e-05,0.000142,0.008476,0.01315,0.002594,0.054037,0.004278,0.272146
6,0.228499,0.054994,0.142044,0.100813,0.100725,0.018997,0.001174,2.8e-05,0.000428,4.8e-05,...,0.00013,2.4e-05,9.1e-05,0.000145,0.00684,0.012075,0.002582,0.053951,0.003035,0.272164
7,0.231101,0.046863,0.15888,0.091975,0.105009,0.010556,0.001631,8.8e-05,0.000449,0.000133,...,0.000245,5.5e-05,5.3e-05,0.000164,0.009275,0.008391,0.001666,0.054049,0.003179,0.275261
8,0.228919,0.052477,0.155038,0.089581,0.09797,0.021282,0.001355,3.5e-05,0.0003,4.2e-05,...,7e-05,5.4e-05,7e-05,0.000276,0.006318,0.008728,0.001862,0.047696,0.003542,0.283644
9,0.26245,0.052936,0.146039,0.078869,0.109791,0.016384,0.001751,0.00016,0.000473,6.9e-05,...,7e-05,7.2e-05,4.6e-05,0.000209,0.008045,0.009207,0.001558,0.050871,0.002593,0.257421


In [18]:
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results

Unnamed: 0,0
satisfaction_level,0.236045
last_evaluation,0.049936
number_project,0.150068
average_montly_hours,0.089114
time_spend_company,0.101101
Work_accident,0.016907
promotion_last_5years,0.001263
sales_IT,5.9e-05
sales_RandD,0.000426
sales_accounting,8.7e-05


In [20]:
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results

Unnamed: 0,Feature,Importance
0,satisfaction_level,0.236045
1,last_evaluation,0.049936
2,number_project,0.150068
3,average_montly_hours,0.089114
4,time_spend_company,0.101101
5,Work_accident,0.016907
6,promotion_last_5years,0.001263
7,sales_IT,5.9e-05
8,sales_RandD,0.000426
9,sales_accounting,8.7e-05


In [23]:
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.278968
0,satisfaction_level,0.236045
2,number_project,0.150068
4,time_spend_company,0.101101
3,average_montly_hours,0.089114
20,interaction_1,0.051894
1,last_evaluation,0.049936
5,Work_accident,0.016907
18,salary_low,0.010303
17,salary_high,0.007421


In [24]:
## This is the code I have been using on Kaggle to sort and display importances ##

## Create Data Frame of Importances ##
imp_data = pd.DataFrame(importance)
importances = pd.DataFrame({
    'Variable': X.columns.values,
    'Importance': np.nan
})

## Set Average Values into the Data Frame ##
for i in range(len(X.columns.values)):
    importances.loc[i, 'Importance'] = np.mean(imp_data[i])

## Sort Results ##
importances.sort_values(['Importance'], ascending = False, inplace = True)
importances

Unnamed: 0,Variable,Importance
22,interaction_3,0.278968
0,satisfaction_level,0.236045
2,number_project,0.150068
4,time_spend_company,0.101101
3,average_montly_hours,0.089114
20,interaction_1,0.051894
1,last_evaluation,0.049936
5,Work_accident,0.016907
18,salary_low,0.010303
17,salary_high,0.007421
