# How likely is an employee to leave the company?

## 1 - About the project

## 2 - Loading the libraries

In [None]:
pip install -U plotly

In [None]:
!pip install plotly==4.14.1

In [None]:

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


#Data Analysis
import numpy as np 
import pandas as pd

# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
#pd.options.plotting.backend = "plotly" 

# Machine Learning 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

#Saving results
import pickle


In [None]:
# Read the IBM HR attrition CSV into a DataFrame and preview the first rows.
csv_path = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
employees = pd.read_csv(csv_path)

employees.head()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
employees.info()

In [None]:
# (rows, columns) of the raw frame.
employees.shape

In [None]:
# dtype of each column; object columns are the text-valued ones.
employees.dtypes

In [None]:
# Number of missing values per column.
employees.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
employees.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
employees.describe()

The statistics on the data show that:
* The mean age of the employees is 36 years old
* On average, they have worked at the company for 7 years
* The longest-serving employee has been at the company for 40 years, the newest for less than one year.
* On average, employees have been in their current role for 4 years
* The last promotion happened, on average, 2 years ago.
* Years with the current manager seems to follow the trend of the years-in-current-role variable, with an average of 4 years and a maximum of 17 years
* The company seems to invest in employee training, since on average employees had around 3 trainings last year.


In [None]:
# Distinct raw values of the target column before it is encoded.
employees['Attrition'].unique()

In [None]:
# Distinct raw values of OverTime before it is encoded.
employees['OverTime'].unique()

In [None]:
# Distinct raw values of Over18 before it is encoded.
employees['Over18'].unique()

In [None]:
# Distinct categories present in EducationField.
employees['EducationField'].unique()

## Data Visualization

In [None]:
# Encode the target as 1 (left) / 0 (stayed).
# Accepting the already-encoded value 1 alongside 'Yes' keeps the cell idempotent:
# re-running it no longer turns every label into 0.
employees['Attrition'] = employees['Attrition'].isin(['Yes', 1]).astype(int)

In [None]:
# Encode the two yes/no flags as 1/0.
# The already-encoded value 1 is accepted as well, so re-running the cell leaves the
# columns unchanged instead of zeroing them.
employees['OverTime'] = employees['OverTime'].isin(['Yes', 1]).astype(int)
employees['Over18'] = employees['Over18'].isin(['Y', 1]).astype(int)

In [None]:
# One 30-bin histogram per numeric column, drawn on seaborn's dark-grid theme.
sns.set_style("darkgrid")
employees.hist(figsize=(20, 20), bins=30);

* Most of the employees are around 30-40 years old
* The attrition variable shows the number of employees with the potential to leave the company. This number is around 200 employees and it is the focus of our study. This variable alone is not enough for the analysis, as the reasons why people leave the company may vary.
* Most people live close to work
* There is a greater number of employees with college degrees (number 3 in the Education histogram)
* Regarding salaries, most of the employees earn salaries below 5000 USD
* Most of employees in the research (around 500) only worked in one company. 

Over18, StandardHours and EmployeeCount each show a single value in the histogram and can be dropped from the dataset to simplify further analysis by reducing the number of variables. EmployeeNumber is also unnecessary for this analysis, as it only represents the number of the employee in the company.

### Are the employees unhappy with the company? 

In [None]:
# Class balance of the encoded target (1 = left, 0 = stayed).
employees['Attrition'].value_counts()

In [None]:
# NOTE(review): disabled experiment (plotly bar chart of the attrition counts); nothing
# in this cell executes. TODO: either finish it or delete the cell.
#att = ['237', '1233']


#fig = go.Figure([go.Bar(y=att)])
#fig.show()

In [None]:
# Remove the three single-valued columns and the employee identifier, none of which
# carries information for the analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
employees = employees.drop(
    columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber'],
    errors='ignore',
)

In [None]:
# Split the frame by outcome: rows with Attrition == 1 left, rows with Attrition == 0 stayed.
left = employees.loc[employees['Attrition'].eq(1)]
stayed = employees.loc[employees['Attrition'].eq(0)]

In [None]:
# Head-count and percentage of each outcome group.
n_total = len(employees)
n_left = len(left)
n_stayed = len(stayed)

print('Total = ', n_total)
print('Number of employees that left the company = ', n_left)
print('% of employees that left the company = ', (n_left / n_total) * 100)
print('Employees that stayed in the company = ', n_stayed)
print('% of employees that stayed in the company = ', (n_stayed / n_total) * 100)

In [None]:
# Summary statistics for the employees who left.
left.describe()

In [None]:
# Summary statistics for the employees who stayed, for comparison with the group that left.
stayed.describe()

* As expected, people who left the company earned, on average, less than the people who stayed (750 USD on average for *DailyRate* for employees who left against 812 for those who stayed).
* On average, people who left lived a bit further from work compared to those who stayed (10.63 against 8.91 for the *DistanceFromHome* variable).


Seeing the correlations between the data: 

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
# The frame still contains text columns (BusinessTravel, Department, ...); selecting
# numeric dtypes explicitly avoids the error DataFrame.corr() raises on them in
# pandas >= 2.0 and matches what older pandas did implicitly.
correlations = employees.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize = (20,20))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(correlations, annot=True, ax=ax);

* JobLevel and MonthlyIncome are positively correlated (0.95) as expected

In [None]:
# Number of employees at each age, split by attrition outcome.
plt.figure(figsize=(25, 12))
sns.countplot(data=employees, x='Age', hue='Attrition', palette='seismic_r');

Younger people seem to have more tendency to leave the company. 

In [None]:
# Attrition counts for four job-related variables, stacked as a 4x1 grid of panels.
sns.set_palette("seismic_r")
plt.figure(figsize=[20,20])
# One loop replaces four copy-pasted subplot/countplot pairs; panel order is unchanged.
for position, column in enumerate(['JobRole', 'MaritalStatus', 'JobInvolvement', 'JobLevel'], start=1):
    plt.subplot(4, 1, position)
    sns.countplot(x = column, hue = 'Attrition', data = employees)

* Most people that left the company are in the Sales and Laboratories departments (Sales Executives and Lab Technicians).
* Most people leaving the company are single
* The higher the job level, the less likely an employee is to leave the company, as we can see from the last plot

In [None]:
# KDE (Kernel Density Estimate) of TotalWorkingYears for each outcome group.
# seaborn draws through matplotlib directly, so no pandas plotting backend is
# configured in this cell.
plt.figure(figsize=(12,7))
# `fill` is the current name of the deprecated `shade` argument.
sns.kdeplot(left['TotalWorkingYears'], label = 'Employees that left', fill = True, color = 'red')
sns.kdeplot(stayed['TotalWorkingYears'], label = 'Employees that stayed', fill = True, color = 'c')
# Labels are only rendered once a legend is requested.
plt.legend();


In [None]:
# Switches the backend used by pandas' own .plot()/.hist() methods to plotly, globally.
# NOTE(review): no later cell uses pandas plotting (seaborn/matplotlib are called
# directly), and once this has run, re-executing the earlier `employees.hist(...)` cell
# goes through plotly instead of matplotlib — consider removing this cell.
pd.options.plotting.backend = "plotly" 

In [None]:
# Density of DistanceFromHome for each outcome group.
plt.figure(figsize=(12,7))
# `fill` is the current name of the deprecated `shade` argument.
sns.kdeplot(left['DistanceFromHome'], label = 'Employees that left', fill = True, color = 'k')
sns.kdeplot(stayed['DistanceFromHome'], label = 'Employees that stayed', fill = True, color = 'b')
# Labels are only rendered once a legend is requested.
plt.legend();

In [None]:
# Monthly income distribution for each gender.
plt.figure(figsize=(15, 10))
sns.boxplot(data=employees, x='MonthlyIncome', y='Gender');

In [None]:
# Monthly income distribution for each job role.
plt.figure(figsize=(15, 10))
sns.boxplot(data=employees, x='MonthlyIncome', y='JobRole', palette='Paired');

* Sales representatives have the lowest range of salaries, while Managers and Research Directors have salaries in the highest ranges.

## Preparing the data for Machine Learning

In [None]:
# Preview of the frame after encoding and column removal.
employees.head()

In [None]:
# Collect the text-valued (categorical) columns into their own DataFrame.
categorical_columns = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]
X_categoricals = employees[categorical_columns]
X_categoricals

#### Transforming categorical variables into numerical:

In [None]:
# Fit a one-hot encoder on the categorical columns and keep the dense 0/1 matrix.
onehotencoder = OneHotEncoder()
encoded_sparse = onehotencoder.fit_transform(X_categoricals)
X_categoricals = encoded_sparse.toarray()

In [None]:
# Wrap the encoded array in a DataFrame (default integer column labels and RangeIndex)
# so it can be concatenated with the numeric columns.
X_categoricals = pd.DataFrame(X_categoricals)
type(X_categoricals)

In [None]:
# Categories of BusinessTravel, for cross-checking against the one-hot columns.
employees['BusinessTravel'].unique()

In [None]:
# Display the one-hot encoded frame.
X_categoricals

In [None]:
# Numeric model inputs, one column name per line for readability.
numerical_columns = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
X_numerical = employees[numerical_columns]
X_numerical

Merging both dataframes with the input information for the model:

In [None]:
# Join the one-hot and numeric features side by side into one input frame.
X_conct = pd.concat([X_categoricals, X_numerical], axis = 1)
# The one-hot frame carries integer column labels and the numeric frame string labels.
# scikit-learn >= 1.2 rejects DataFrames whose column names mix types, so every label
# is converted to a string before the frame reaches the scaler.
X_conct.columns = X_conct.columns.astype(str)
X_conct

In [None]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set minima/maxima influence the scaling — consider fitting on the training
# rows only.
scaler = MinMaxScaler()
scaler.fit(X_conct)
X = scaler.transform(X_conct)

Labels for the model:

In [None]:
# Target vector: the 0/1 encoded Attrition column.
y = employees['Attrition']
y

In [None]:
# Hold out 25% of the rows for evaluation.
# random_state makes the split (and every metric quoted below) reproducible;
# stratify=y keeps the minority "left" class at the same proportion in both parts,
# which matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42, stratify = y
)

In [None]:
# Shape of the training features alongside the training labels.
X_train.shape, y_train

In [None]:
# Shape of the held-out features alongside the held-out labels.
X_test.shape, y_test

## Training the Model with Logistic Regression

In [None]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
logistic = LogisticRegression()
logistic.fit(X=X_train, y=y_train)

In [None]:
# Predicted class (0/1) for each held-out row.
y_pred = logistic.predict(X_test)
y_pred

In [None]:
# Fraction of held-out rows classified correctly.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the logistic model: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Heatmap of the confusion matrix.
plt.figure(figsize=(8, 6))
# fmt='d' prints the cell counts as integers; the default '.2g' format abbreviates
# counts of 100 or more into scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# Precision for the positive class (1 = left).
precision_score(y_test, y_pred)

Because this is an unbalanced dataset, evaluating accuracy alone is not enough; we have to check other metrics, especially recall.

In [None]:
# Recall for the positive class (1 = left).
recall_score(y_test, y_pred)

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
# NOTE(review): precision and recall in the neighbouring cells use the default
# positive-class scoring, so this figure is not directly comparable to them.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Per-class precision, recall, F1 and support for the logistic model.
print(classification_report(y_test, y_pred))

* Recall is not very good for class 1 (0.52, or 52%), which means the model is not good at identifying the people who will leave the company. This may be caused by the fact that this is an unbalanced dataset. For identifying people who are going to stay in the company, the model predicts well, with a recall of 95%.

## Training the Model with Random Forest

In [None]:
# Random forest with default hyper-parameters.
# A fixed random_state makes the bootstrap samples and feature sub-sampling — and so
# the metrics reported for this model — repeatable between runs.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

In [None]:
# Predicted class for each held-out row; overwrites the logistic model's y_pred.
y_pred = forest.predict(X_test)

In [None]:
# Fraction of held-out rows the random forest classifies correctly.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the random forest: rows are true classes, columns are predictions.
# scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the arguments in
# that order keeps this matrix oriented the same way as the ones for the other models.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints counts as integers; the default
# '.2g' format abbreviates counts of 100 or more into scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# Per-class precision, recall, F1 and support for the random forest.
print(classification_report(y_test, y_pred))

Random forest showed lower precision and was, in general, a worse algorithm for these predictions. Recall for class 1 was 19%, which is much lower than the recall using logistic regression (52%, also low).

## Training the model with Neural Networks

In [None]:
# (rows, features) of the training matrix; the feature count sets the network's input size.
X_train.shape

Number of neurons for the network: (inputs + outputs)/2 which is approximately 25. 

In [None]:
# Seed TensorFlow so weight initialisation, and therefore the reported metrics, can be
# reproduced between runs.
tf.random.set_seed(42)

# Input width is taken from the training matrix (50 with the current encoding) so the
# network keeps working if the feature set changes.
n_features = X_train.shape[1]

# Three hidden layers of 25 ReLU units each.
neuralmod = tf.keras.models.Sequential()
neuralmod.add(tf.keras.layers.Dense(units = 25, activation='relu', input_shape=(n_features,)))
neuralmod.add(tf.keras.layers.Dense(units = 25, activation = 'relu'))
neuralmod.add(tf.keras.layers.Dense(units = 25, activation = 'relu'))

# Output of classification model: a single sigmoid unit giving the probability of class 1
neuralmod.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [None]:
# Layer-by-layer overview of the network with parameter counts.
neuralmod.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
neuralmod.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Train for 200 epochs on the training split; no batch size or validation data is passed.
neuralmod.fit(X_train, y_train, epochs=200)

In [None]:
# Raw sigmoid outputs for the held-out rows; overwrites the previous model's y_pred.
y_pred = neuralmod.predict(X_test)
y_pred

In [None]:
# Turn the sigmoid outputs into boolean class predictions using a 0.5 cut-off.
y_pred = (y_pred >= 0.5)
y_pred

In [None]:
# Confusion matrix for the neural network: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints counts as integers; the default
# '.2g' format abbreviates counts of 100 or more into scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# Per-class precision, recall, F1 and support for the neural network.
print(classification_report(y_test, y_pred))

This was the best model, but still with a recall for class 1 that indicates the model is having difficulty in detecting the probability of employees leaving the company (recall = 62%). 

## Saving the results

In [None]:
# Persist the fitted scaler, one-hot encoder and logistic model together in one pickle file.
artifacts = [scaler, onehotencoder, logistic]
with open('employees_model.pkl', 'wb') as model_file:
    pickle.dump(artifacts, model_file)

In [None]:
# Read the three artefacts back from the pickle file written by this notebook.
with open('employees_model.pkl', 'rb') as model_file:
    restored = pickle.load(model_file)
min_max, encoder, model = restored

In [None]:
# Show the reloaded scaler, encoder and model objects.
min_max, encoder, model

In [None]:
# Use the first employee record as a stand-in for a new, unseen input (kept as a one-row frame).
X_new = employees.head(1)
X_new

In [None]:
# Categorical columns of the new record, in the same order used to fit the encoder.
new_categorical_columns = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]
X_cat_new = X_new[new_categorical_columns]
X_cat_new

In [None]:
# Apply the reloaded encoder (transform only, no refit) and densify the result.
X_cat_new = encoder.transform(X_cat_new).toarray()

In [None]:
# Display the one-hot vector for the new record.
X_cat_new

In [None]:
# Wrap the encoded array in a DataFrame (default integer labels) for concatenation.
X_cat_new = pd.DataFrame(X_cat_new)
X_cat_new

In [None]:
# Numeric columns of the new record, in the same order used during training.
new_numerical_columns = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
X_numerical_new = X_new[new_numerical_columns]
X_numerical_new

In [None]:
# Join the encoded and numeric parts of the new record into one feature row.
# Both parts get a fresh 0-based index first so they line up by position; without it
# the join only works when the selected record happens to carry index label 0.
X_conct_new = pd.concat(
    [X_cat_new.reset_index(drop=True), X_numerical_new.reset_index(drop=True)],
    axis = 1,
)
# The encoded part has integer column labels and the numeric part string labels.
# scikit-learn >= 1.2 rejects DataFrames whose column names mix types, so every label
# is converted to a string before the frame reaches the scaler.
X_conct_new.columns = X_conct_new.columns.astype(str)
X_conct_new

In [None]:
# Scale the new record with the reloaded scaler (transform only, no refit).
# NOTE: X_new is rebound here from the raw one-row DataFrame to the scaled array.
X_new = min_max.transform(X_conct_new)
X_new

In [None]:
# Predicted class for the new record from the reloaded logistic model.
model.predict(X_new)

In [None]:
# Class probabilities for the new record, ordered as in model.classes_.
model.predict_proba(X_new)

In [None]:
# Class labels in the column order used by predict_proba.
model.classes_