<center> <h1>Modeling</h1> </center>

## Basic imports

In [11]:
# Main Libs
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split

# DataViz libs
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook setup: silence every warning and let pandas display all columns.
# NOTE(review): ignoring all warnings also hides deprecation notices from the
# libraries used below - confirm this is intended.
from warnings import simplefilter
simplefilter(action='ignore')

pd.options.display.max_columns = None

## Dataset

In [2]:
# Load the employees CSV (path is relative to the notebook's working directory)
employees = pd.read_csv(filepath_or_buffer='../data/employees.csv')

In [3]:
# Preview the first three rows of the employees dataset
employees.head(n=3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0


In [4]:
# Inspect the employees dataset metadata: column names, non-null counts and dtypes
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [5]:
# Collect the names of every object-dtype (text) column; displayed as the cell output
categorical_vars_list = list(employees.select_dtypes(include=['object']).columns)
categorical_vars_list

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [6]:
# For each categorical column, print a dash-padded header (34 chars wide)
# followed by the frequency of each distinct value
for column_name in categorical_vars_list:
    print(column_name.center(34, '-'))
    counts = employees[column_name].value_counts()
    print(counts, '\n\n')

------------Attrition-------------
No     1233
Yes     237
Name: Attrition, dtype: int64 


----------BusinessTravel----------
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64 


------------Department------------
Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64 


----------EducationField----------
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64 


--------------Gender--------------
Male      882
Female    588
Name: Gender, dtype: int64 


-------------JobRole--------------
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director     

In [7]:
# Optimize datatypes.
# NOTE: run this cell once per fresh load of `employees`. The .map() calls look
# up the original text labels, so re-running them on already-converted columns
# would turn every value into NaN.

# Low-cardinality text columns -> pandas 'category'
for column_name in ['BusinessTravel', 'Department', 'EducationField',
                    'JobRole', 'MaritalStatus']:
    employees[column_name] = employees[column_name].astype('category')

# 'Yes'/'No' flags -> bool
yes_no_to_bool = {'Yes': True, 'No': False}
for column_name in ['Attrition', 'OverTime']:
    employees[column_name] = employees[column_name].map(yes_no_to_bool)

# Over18 ('Y'/'N') -> bool
employees['Over18'] = employees['Over18'].map({'Y': True, 'N': False})

# New boolean column GenderFemale derived from Gender (appended as the last column)
employees['GenderFemale'] = employees['Gender'].map({'Female': True, 'Male': False})

In [8]:
# Remove the 'Gender' and 'Over18' columns
# ('Gender' is already represented by the boolean 'GenderFemale' column)
employees = employees.drop(columns=['Gender', 'Over18'])

In [9]:
# Check the dataset's dimensions as a (rows, columns) tuple after the column changes above
employees.shape

(1470, 34)

## Model with `PyCaret`

In [10]:
# Import everything from PyCaret's classification module.
# The cells below use setup, compare_models, tune_model and evaluate_model from it.
# NOTE(review): a star import hides where names come from; consider importing
# these names explicitly in the top import cell once no other names are needed.
from pycaret.classification import *

In [19]:
# Randomly keep 90% of the rows for modeling (fixed seed) and hold out the
# remaining 10% as unseen data; both frames get a fresh 0..n-1 index.
data = employees.sample(frac = .9, random_state = 7)
data_unseen = employees.drop(index = data.index).reset_index(drop = True)
data = data.reset_index(drop = True)

print('Data for Modeling:           ', data.shape, sep = '')
print('Unseen Data For Predictions:  ', data_unseen.shape, sep = '')

Data for Modeling:           (1323, 34)
Unseen Data For Predictions:  (147, 34)


In [20]:
# Set up the PyCaret classification experiment on the modeling split.
# NOTE(review): 'EmployeeNumber' looks like an identifier, and 'EmployeeCount' /
# 'StandardHours' show a single value in the preview above - confirm whether they
# should be excluded from the features (e.g. via setup's ignore_features).
setup_options = dict(
    data = data,
    target = 'Attrition',
    fix_imbalance = True,
    polynomial_features = True,
    remove_multicollinearity = True,
    feature_selection = True,
    session_id = 0,
)
exp_employee_attrition_class = setup(**setup_options)

Unnamed: 0,Description,Value
0,session_id,0
1,Target,Attrition
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(1323, 34)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,24
8,Ordinal Features,False
9,High Cardinality Features,False


In [21]:
# Compare the candidate models and keep the top-ranked one.
# NOTE(review): the leaderboard below appears to be ordered by Accuracy, which can
# be misleading for an imbalanced target such as Attrition - confirm that Accuracy
# is the intended selection metric.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8607,0.8092,0.2665,0.8538,0.3966,0.3442,0.4207,0.039
gbc,Gradient Boosting Classifier,0.8585,0.8115,0.4048,0.6819,0.5032,0.4272,0.4495,0.088
lightgbm,Light Gradient Boosting Machine,0.8542,0.7957,0.3257,0.6934,0.4284,0.3616,0.3997,0.031
rf,Random Forest Classifier,0.8499,0.7951,0.2121,0.7971,0.3243,0.2742,0.351,0.042
ada,Ada Boost Classifier,0.8369,0.8166,0.5261,0.5441,0.5298,0.4321,0.4353,0.031
dummy,Dummy Classifier,0.8218,0.5,0.0,0.0,0.0,0.0,0.0,0.005
ridge,Ridge Classifier,0.7884,0.0,0.7029,0.4464,0.5439,0.416,0.4354,0.006
lda,Linear Discriminant Analysis,0.784,0.8307,0.7029,0.4411,0.5397,0.4096,0.4297,0.014
dt,Decision Tree Classifier,0.7766,0.6218,0.3816,0.3789,0.3709,0.2383,0.2426,0.011
lr,Logistic Regression,0.7474,0.7973,0.6919,0.3867,0.4946,0.3446,0.3714,0.219


In [23]:
# Check best_model params (not tuned)
print(best_model)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=0, verbose=0,
                     warm_start=False)


In [24]:
# Evaluate best (untuned) model - call intentionally left commented out;
# uncomment the line below to run it
# evaluate_model(best_model)

In [25]:
# Tune the hyperparameters of the selected model; the per-fold
# cross-validation metrics are shown as the cell output.
# NOTE(review): compare these scores against the untuned leaderboard row before
# preferring the tuned model - tuning is not guaranteed to improve them.
best_model_tunned = tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8925,0.8916,0.4706,0.8889,0.6154,0.5597,0.598
1,0.7849,0.6246,0.1765,0.3333,0.2308,0.1193,0.1275
2,0.8065,0.7895,0.2941,0.4545,0.3571,0.2493,0.2575
3,0.8495,0.7655,0.5294,0.6,0.5625,0.472,0.4734
4,0.7957,0.7964,0.4706,0.4444,0.4571,0.3314,0.3316
5,0.8925,0.8352,0.5625,0.75,0.6429,0.5811,0.5894
6,0.8261,0.8553,0.5,0.5,0.5,0.3947,0.3947
7,0.8587,0.7286,0.3125,0.7143,0.4348,0.3679,0.4091
8,0.8696,0.7303,0.4375,0.7,0.5385,0.4672,0.4847
9,0.8696,0.7122,0.375,0.75,0.5,0.4344,0.469


In [26]:
# Check the tuned model's hyperparameters
print(best_model_tunned)

ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                     criterion='entropy', max_depth=11, max_features='log2',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0002, min_impurity_split=None,
                     min_samples_leaf=5, min_samples_split=5,
                     min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
                     oob_score=False, random_state=0, verbose=0,
                     warm_start=False)


In [27]:
# Open the interactive evaluation widget (plot-type selector) for the tuned model
evaluate_model(best_model_tunned)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…