# IBM HR analytics

In [None]:
try:
    import evidently
except:
    !pip install git+https://github.com/evidentlyai/evidently.git

In [None]:
# Install CatBoost; `CatBoostClassifier` is imported from it further down.
!pip install catboost

In [None]:
# Import the required libraries
import math
import numpy as np
import os 
import pandas as pd

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from catboost import CatBoostClassifier

In [None]:
from evidently.report import Report
from evidently.metric_preset import ClassificationPreset
from evidently.pipeline.column_mapping import ColumnMapping

## Source data

Download the dataset from Kaggle: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

You can either download the data from Kaggle and upload it here manually, or fetch it with the Kaggle API: https://www.kaggle.com/docs/api

In [None]:
# Load the raw attrition CSV; comma separator and first-row header are the
# pandas defaults, so they are not spelled out.
dataset = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Preview the first rows of the raw data.
dataset.head()

In [None]:
# Summary statistics of the raw data.
dataset.describe()

## Feature engineering

In [None]:
# Name of the raw label column in the source data.
target_name = "Attrition"

In [None]:
# Numeric columns of the raw dataset, grouped by theme for readability.
numerique_features = [
    # demographics and commute
    "Age", "DailyRate", "DistanceFromHome", "Education",
    # satisfaction, involvement and pay
    "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
    "JobSatisfaction", "MonthlyIncome", "MonthlyRate",
    # career history and performance
    "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
    "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears",
    "TrainingTimesLastYear", "WorkLifeBalance",
    # tenure
    "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

In [None]:
# Raw string-valued columns; each is encoded below and then dropped.
categorical_features = [
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
]

In [None]:
# Work on an independent copy so the raw `dataset` stays untouched
# (`DataFrame.copy` is deep by default).
processed_dataset = dataset.copy()

#### Drop constant features

In [None]:
# Remove the columns this section treats as constant.
processed_dataset = processed_dataset.drop(
    columns=['EmployeeCount', 'StandardHours', 'Over18'],
)

#### BusinessTravel

In [None]:
# Distribution of BusinessTravel categories.
processed_dataset['BusinessTravel'].value_counts()

In [None]:
# One-hot encode BusinessTravel into `b_travel_*` indicator columns and
# append them to the working frame.
business_travel_dummies = pd.get_dummies(
    processed_dataset['BusinessTravel'],
    prefix='b_travel',
)
processed_dataset = pd.concat(
    [processed_dataset, business_travel_dummies],
    axis='columns',
)

#### Department

In [None]:
# Distribution of Department categories in the raw data.
dataset['Department'].value_counts()

In [None]:
# One-hot encode Department into `department_*` indicator columns and
# append them to the working frame.
department_dummies = pd.get_dummies(
    processed_dataset['Department'],
    prefix='department',
)
processed_dataset = pd.concat(
    [processed_dataset, department_dummies],
    axis='columns',
)

#### EducationField

In [None]:
# Distribution of EducationField categories in the raw data.
dataset['EducationField'].value_counts()

In [None]:
# NOTE(review): this cell sits under the "EducationField" heading and produces
# `edu_field_*` columns, but it encodes the `Department` column, so the result
# duplicates the `department_*` dummies and EducationField never reaches the
# model (the raw column is dropped with the other categorical features).
# Switching the source to `EducationField` changes the generated column names,
# so the hard-coded `features` and `column_mapping.categorical_features` lists
# (which reference 'edu_field_Human Resources', 'edu_field_Research &
# Development' and 'edu_field_Sales') must be updated in the same change.
edu_field_dummies = pd.get_dummies(processed_dataset.Department, prefix = 'edu_field')
processed_dataset = pd.concat([processed_dataset, edu_field_dummies], axis=1)

#### Gender

In [None]:
# Distribution of Gender categories in the raw data.
dataset['Gender'].value_counts()

In [None]:
# Encode Gender as an integer flag: 'Male' -> 0, 'Female' -> 1, and any other
# value -> -1.
processed_dataset['gender_bin'] = processed_dataset['Gender'].apply(
    lambda gender: {'Male': 0, 'Female': 1}.get(gender, -1)
)

#### JobRole

In [None]:
# Distribution of JobRole categories in the raw data.
dataset['JobRole'].value_counts()

In [None]:
# One-hot encode JobRole into `job_role_*` indicator columns and append them
# to the working frame.
job_role_dummies = pd.get_dummies(
    processed_dataset['JobRole'],
    prefix='job_role',
)
processed_dataset = pd.concat(
    [processed_dataset, job_role_dummies],
    axis='columns',
)

#### MaritalStatus

In [None]:
# Distribution of MaritalStatus categories in the raw data.
dataset['MaritalStatus'].value_counts()

In [None]:
# One-hot encode MaritalStatus into `marital_*` indicator columns and append
# them to the working frame.
marital_dummies = pd.get_dummies(
    processed_dataset['MaritalStatus'],
    prefix='marital',
)
processed_dataset = pd.concat(
    [processed_dataset, marital_dummies],
    axis='columns',
)

#### OverTime

In [None]:
# Distribution of OverTime categories in the raw data.
dataset['OverTime'].value_counts()

In [None]:
# One-hot encode OverTime into `overtime_*` indicator columns and append them
# to the working frame.
overtime_dummies = pd.get_dummies(
    processed_dataset['OverTime'],
    prefix='overtime',
)
processed_dataset = pd.concat(
    [processed_dataset, overtime_dummies],
    axis='columns',
)

In [None]:
# The raw categorical columns have been encoded above; drop the originals.
processed_dataset = processed_dataset.drop(columns=categorical_features)

### Target

In [None]:
# Encode the Attrition label as an integer: 'No' -> 0, 'Yes' -> 1, and any
# other value -> -1.
processed_dataset['target'] = processed_dataset['Attrition'].apply(
    lambda answer: {'No': 0, 'Yes': 1}.get(answer, -1)
)

## Dataset generation

In [None]:
# Model input columns, in the order they are fed to the classifiers.
# NOTE(review): 'EmployeeNumber' looks like a row identifier rather than a
# predictive signal — confirm it is meant to be a feature.
features = (
    # raw numeric columns
    [
        "Age", "DailyRate", "DistanceFromHome", "Education", "EmployeeNumber",
        "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
        "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
        "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
        "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
        "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
        "YearsSinceLastPromotion", "YearsWithCurrManager",
    ]
    # business travel indicators
    + ["b_travel_Non-Travel", "b_travel_Travel_Frequently", "b_travel_Travel_Rarely"]
    # department indicators
    + ["department_Human Resources", "department_Research & Development", "department_Sales"]
    # `edu_field_*` indicators
    + ["edu_field_Human Resources", "edu_field_Research & Development", "edu_field_Sales"]
    # binary gender flag
    + ["gender_bin"]
    # job role indicators
    + [
        "job_role_Healthcare Representative", "job_role_Human Resources",
        "job_role_Laboratory Technician", "job_role_Manager",
        "job_role_Manufacturing Director", "job_role_Research Director",
        "job_role_Research Scientist", "job_role_Sales Executive",
        "job_role_Sales Representative",
    ]
    # marital status indicators
    + ["marital_Divorced", "marital_Married", "marital_Single"]
    # overtime indicators
    + ["overtime_No", "overtime_Yes"]
)

#### Train & Holdout Test Split

In [None]:
# Seed used for the train/holdout split.
RANDOM_STATE = 1_603

In [None]:
# Hold out 25% of the rows for testing, stratified on the encoded target so
# both splits keep the same class balance.
train_data, test_data, train_y, test_y = train_test_split(
    processed_dataset[features],
    processed_dataset['target'],
    test_size=0.25,
    stratify=processed_dataset['target'],
    random_state=RANDOM_STATE,
)

## Modeling

### Baseline

In [None]:
# Baseline model: a 500-tree random forest using all available cores.
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=11,
    n_jobs=-1,
)

In [None]:
# Fit the baseline on the training split (feature columns only).
rf.fit(
    train_data[features],
    train_y,
)

### Baseline Classification Report

In [None]:
# Class probabilities from the baseline, one column per class. The columns
# are labelled 'no'/'yes', matching the 0/1 encoding of the target.
train_probas = pd.DataFrame(
    rf.predict_proba(train_data[features]),
    columns=['no', 'yes'],
)
test_probas = pd.DataFrame(
    rf.predict_proba(test_data[features]),
    columns=['no', 'yes'],
)

In [None]:
# Build the frames Evidently consumes: features + string label + predicted
# probabilities. The index is reset so rows line up positionally with the
# freshly created probability frames, and the labels are assigned as a plain
# list (not a Series) so they are matched by position rather than by the old
# index still carried by train_y / test_y.
train_data.reset_index(drop=True, inplace=True)
train_data['Attrition'] = ['yes' if label != 0 else 'no' for label in train_y]
rf_merged_train = pd.concat([train_data, train_probas], axis='columns')

test_data.reset_index(drop=True, inplace=True)
test_data['Attrition'] = ['yes' if label != 0 else 'no' for label in test_y]
rf_merged_test = pd.concat([test_data, test_probas], axis='columns')

In [None]:
# Describe the merged frames to Evidently: which column holds the label,
# which columns hold the per-class probabilities, and how to treat each
# feature.
column_mapping = ColumnMapping()

# Raw numeric inputs.
column_mapping.numerical_features = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeNumber',
    'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Encoded (indicator / binary) inputs.
column_mapping.categorical_features = [
    'b_travel_Non-Travel', 'b_travel_Travel_Frequently', 'b_travel_Travel_Rarely',
    'department_Human Resources', 'department_Research & Development', 'department_Sales',
    'edu_field_Human Resources', 'edu_field_Research & Development', 'edu_field_Sales',
    'gender_bin',
    'job_role_Healthcare Representative', 'job_role_Human Resources',
    'job_role_Laboratory Technician', 'job_role_Manager',
    'job_role_Manufacturing Director', 'job_role_Research Director',
    'job_role_Research Scientist', 'job_role_Sales Executive',
    'job_role_Sales Representative',
    'marital_Divorced', 'marital_Married', 'marital_Single',
    'overtime_No', 'overtime_Yes',
]

# Label column, probability columns (named after the class labels) and the
# label treated as the positive class.
column_mapping.target = 'Attrition'
column_mapping.prediction = ['yes', 'no']
column_mapping.pos_label = 'yes'

In [None]:
# Classification quality of the baseline: training split as reference,
# holdout split as current.
classification_performance_report = Report(metrics=[ClassificationPreset()])
classification_performance_report.run(
    reference_data=rf_merged_train,
    current_data=rf_merged_test,
    column_mapping=column_mapping,
)

# Last expression of the cell, so the report is rendered inline.
classification_performance_report

In [None]:
#classification_performance_report.save_html('ibm_hr_attrition_baseline_performance.html')

## Better model

In [None]:
# Gradient-boosted model; class 1 is weighted 6x relative to class 0.
cat = CatBoostClassifier(
    iterations=1600,
    learning_rate=0.008,
    class_weights={0: 1, 1: 6},
    random_state=11,
)

In [None]:
# Fit CatBoost on the training split. Selecting `features` keeps any extra
# columns present in train_data out of the model input.
cat.fit(
    train_data[features],
    train_y,
)

In [None]:
# Class probabilities from CatBoost, one column per class. The columns are
# labelled 'no'/'yes', matching the 0/1 encoding of the target.
train_probas = pd.DataFrame(
    cat.predict_proba(train_data[features]),
    columns=['no', 'yes'],
)
test_probas = pd.DataFrame(
    cat.predict_proba(test_data[features]),
    columns=['no', 'yes'],
)

In [None]:
# Build the frames Evidently consumes for the CatBoost model: features +
# string label + predicted probabilities.
#
# The label is written to 'Attrition' because that is the column
# `column_mapping.target` points at. Writing it to 'target' (as before) only
# worked when the baseline cell had already left an 'Attrition' column behind,
# and it added a stray 'target' column that the baseline frames do not have.
#
# Labels are assigned as a plain list so they are matched by position: the
# frame index is reset here, while train_y / test_y keep their old index.
train_data.reset_index(inplace=True, drop=True)
train_data['Attrition'] = ['no' if x == 0 else 'yes' for x in train_y]
cat_merged_train = pd.concat([train_data, train_probas], axis = 1)

test_data.reset_index(inplace=True, drop=True)
test_data['Attrition'] = ['no' if x == 0 else 'yes' for x in test_y]
cat_merged_test = pd.concat([test_data, test_probas], axis = 1)

In [None]:
# Classification quality of the CatBoost model: training split as reference,
# holdout split as current.
classification_performance_report = Report(metrics=[ClassificationPreset()])
classification_performance_report.run(
    reference_data=cat_merged_train,
    current_data=cat_merged_test,
    column_mapping=column_mapping,
)

# Last expression of the cell, so the report is rendered inline.
classification_performance_report

In [None]:
#classification_performance_report.save_html('ibm_hr_attrition_better_model_performance.html')

## Models comparison

In [None]:
# Compare the two models on the same holdout split: baseline predictions as
# reference, CatBoost predictions as current. This re-runs the report object
# created in the previous section.
classification_performance_report.run(
    reference_data=rf_merged_test,
    current_data=cat_merged_test,
    column_mapping=column_mapping,
)

# Last expression of the cell, so the report is rendered inline.
classification_performance_report

In [None]:
#classification_performance_report.save_html('ibm_hr_attrition_model_comparison.html')

# Support Evidently
Enjoyed the tutorial? Star Evidently on GitHub to contribute back! This helps us continue creating free open-source tools for the community. https://github.com/evidentlyai/evidently