**Predict Attrition Values for the given data set of employees**

In [None]:
# This function is needed for plotly figures to render in the notebook output.

def configure_plotly_browser_state():
  """Display the RequireJS bootstrap that plotly needs in the current output cell.

  Emits an HTML snippet that loads ``require.js`` from the notebook's static
  files and registers the ``base`` and ``plotly`` module paths. Call it at the
  top of every cell that renders a plotly figure.

  Returns:
    None. The only effect is the HTML written to the cell output.
  """
  # Import from the public IPython.display module rather than relying on the
  # `display` builtin injected by the kernel and on IPython.core.display.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

Importing Python libraries

In [None]:
# importing libraries for data handling and analysis
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from scipy.stats import norm, skew
import warnings
warnings.filterwarnings('ignore')

In [None]:
#!pip install chart_studio

In [None]:
# importing libraries for data visualisations
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
color = sns.color_palette()
pd.options.display.max_columns = None
# Standard plotly imports
import plotly.offline as py
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
#py.initnotebookmode(connected=True) # this code, allow us to work with offline plotly version
# Using plotly + cufflinks in offline mode
import cufflinks as cf
cf.set_config_file(offline=True)
import cufflinks
cufflinks.go_offline(connected=True)

In [None]:
# sklearn modules for preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

Upload the dataset in csv format

In [None]:
from google.colab import files
uploaded = files.upload()

TypeError: ignored

In [None]:
import io

# `uploaded` maps each uploaded file name to its raw bytes. Prefer the expected
# name, but fall back to the first uploaded file so a differently named CSV
# does not fail with a KeyError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select the CSV file.")
csv_name = 'org.csv' if 'org.csv' in uploaded else next(iter(uploaded))
df2 = pd.read_csv(io.BytesIO(uploaded[csv_name]))

In [None]:
print("Shape of dataframe is: {}".format(df2.shape))

In [None]:
df_HR = df2.copy()

**Data Description and Exploratory Visualisations**

In this section, we provide data visualizations that summarize or extract relevant characteristics of the features in our dataset. Let's look at each column in detail, get a better understanding of the dataset, and group the columns together.

In [None]:
# Dataset columns
df_HR.columns

In [None]:
# Dataset header
df_HR.head()

In [None]:
# Drop the two constant columns. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps the cell safe to re-run after the columns are gone.
df_HR = df_HR.drop(columns=['EmployeeCount', 'StandardHours'], errors='ignore')

In [None]:
# Human-readable attrition label used on plot axes. The rest of the notebook
# treats Attrition == 1 as "left the company", so index 0 must be 'No' and
# index 1 must be 'Yes'.
ATTRITION_LABELS = ['No', 'Yes']
lt = [ATTRITION_LABELS[flag] for flag in df_HR.Attrition]
df_HR['Target'] = lt

In [None]:
df_HR['Target'].value_counts()

The dataset contains several numerical and categorical columns providing various information on employees' personal and employment details.

In [None]:
#break down the columns by their type (i.e. int64, float64, object)
df_HR.columns.to_series().groupby(df_HR.dtypes).groups

In [None]:
# Columns datatypes and missing values
df_HR.info()

Inference: The data provided has no missing values.

In [None]:
df_HR.describe()

In [None]:
df_HR.hist(figsize=(20,20))
plt.show()

Observations made are:

-Many histograms are tail-heavy; indeed, several distributions are right-skewed (e.g. MonthlyIncome, DistanceFromHome, YearsAtCompany).

-Age distribution is a slightly right-skewed normal distribution with the bulk of the staff between 25 and 45 years old.

-EmployeeCount and StandardHours are constant values for all employees. They're likely to be redundant features.

-Employee Number is likely to be a unique identifier for employees.

In [None]:
# Fit a normal distribution to the ages of each attrition group and report
# the fitted mean and standard deviation (leavers first, then current staff).
age_reports = (
    (1, 'Ex-employees: average age = {:.1f} years old and standard deviation = {:.1f}'),
    (0, 'Current employees: average age = {:.1f} years old and standard deviation = {:.1f}'),
)
for attrition_flag, template in age_reports:
    group_ages = df_HR.loc[df_HR['Attrition'] == attrition_flag, 'Age']
    mu, sigma = norm.fit(group_ages)
    print(template.format(mu, sigma))

In [None]:
##Find some explanation???
'''configure_plotly_browser_state()
from plotly.offline import iplot

plt.figure(figsize=(15,6))
plt.style.use('seaborn-colorblind')
plt.grid(True, alpha=0.5)
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 0, 'Age'], label = 'Active Employee')
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 1, 'Age'], label = 'Ex-Employees')
plt.xlim(left=18, right=60)
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Age Distribution in Percent by Attrition Status');'''

In [None]:
# Education Field of employees
df_HR['EducationField'].value_counts()

In [None]:
## Leaver rate (%) per education field.
## Author's observation: leavers come mostly from the HR and technical fields.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each education field, computed
# in one vectorised pass so the percentage column is numeric rather than object
# dtype. sort=False keeps the fields in order of first appearance.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['EducationField'], sort=False).mean() * 100
df_EducationField = leaver_pct.rename_axis("Field").reset_index(name="% of Leavers")
df_EF = df_EducationField.groupby(by="Field").sum()
df_EF.iplot(kind='bar',title='Leavers by Education Field (%)')

In [None]:
# Gender of employees
df_HR['Gender'].value_counts()

In [None]:
# Attrition rate within each gender: leavers of that gender divided by all
# employees of that gender, expressed in percent.
left_company = df_HR['Attrition'] == 1
is_male = df_HR['Gender'] == 'Male'
is_female = df_HR['Gender'] == 'Female'
male_pct = (df_HR[left_company & is_male].shape[0] / df_HR[is_male].shape[0])*100
female_pct = (df_HR[left_company & is_female].shape[0] / df_HR[is_female].shape[0])*100
print("Normalised gender distribution of ex-employees in the dataset: Male = {:.1f}%; Female {:.1f}%.".format(male_pct, female_pct))

In [None]:
## Leaver rate (%) per gender.
## Author's observation: more males than females leave the company.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each gender, computed in one
# vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['Gender'], sort=False).mean() * 100
df_Gender = leaver_pct.rename_axis("Gender").reset_index(name="% of Leavers")
df_G = df_Gender.groupby(by="Gender").sum()
df_G.iplot(kind='bar',title='Leavers by Gender (%)')

In [None]:
# Marital Status of employees
df_HR['MaritalStatus'].value_counts()

In [None]:
## Leaver rate (%) per marital status.
## Author's observation: most people who leave the company are single.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each marital status, computed
# in one vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['MaritalStatus'], sort=False).mean() * 100
df_Marital = leaver_pct.rename_axis("Marital Status").reset_index(name="% of Leavers")
df_MF = df_Marital.groupby(by="Marital Status").sum()
df_MF.iplot(kind='bar',title='Leavers by Marital Status (%)')

In [None]:
# Distance from Home
print("Distance from home for employees to get to work is from {} to {} miles.".format(df_HR['DistanceFromHome'].min(),
                                                                                       df_HR['DistanceFromHome'].max()))

In [None]:
# Mean commuting distance for employees who stayed (Attrition == 0) and for
# those who left (Attrition == 1).
stayed_mean = df_HR.loc[df_HR['Attrition'] == 0, 'DistanceFromHome'].mean()
left_mean = df_HR.loc[df_HR['Attrition'] == 1, 'DistanceFromHome'].mean()
print('Average distance from home for currently active employees: {:.2f} miles and ex-employees: {:.2f} miles'.format(
    stayed_mean, left_mean))

In [None]:
## Author's observation: employees who leave tend to live farther from work (>= 10)
## than those who stay.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Active Employee'), (1, 'Ex-Employees')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'DistanceFromHome'], label = curve_label)
plt.xlabel('DistanceFromHome')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Distance From Home Distribution in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();

In [None]:
# The organisation consists of several departments
df_HR['Department'].value_counts()

In [None]:
# Employees have different business travel commitmnent depending on their roles and level in the organisation
df_HR['BusinessTravel'].value_counts()

In [None]:
## Leaver rate (%) per business-travel category.
## Author's observation: most people who leave the company travelled frequently.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each travel category, computed
# in one vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['BusinessTravel'], sort=False).mean() * 100
df_BusinessTravel = leaver_pct.rename_axis("Business Travel").reset_index(name="% of Leavers")
df_BT = df_BusinessTravel.groupby(by="Business Travel").sum()
df_BT.iplot(kind='bar',title='Leavers by Business Travel (%)')

In [None]:
# Employees in the database have several roles on-file
df_HR['JobRole'].value_counts()

In [None]:
## Leaver rate (%) per job role.
## Author's observation: sales representatives, HR and laboratory technicians tend to leave.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each job role, computed in one
# vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['JobRole'], sort=False).mean() * 100
df_JobRole = leaver_pct.rename_axis("Job Role").reset_index(name="% of Leavers")
df_JR = df_JobRole.groupby(by="Job Role").sum()
df_JR.iplot(kind='bar',title='Leavers by Job Role (%)')

In [None]:
df_HR['JobLevel'].value_counts()


In [None]:
## Leaver rate (%) per job level.
## Author's observation: most leavers were at job level 1, i.e. at a very early level.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each job level, computed in one
# vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['JobLevel'], sort=False).mean() * 100
df_JobLevel = leaver_pct.rename_axis("Job Level").reset_index(name="% of Leavers")
df_JL = df_JobLevel.groupby(by="Job Level").sum()
df_JL.iplot(kind='bar',title='Leavers by Job Level (%)')

In [None]:
df_HR['JobInvolvement'].value_counts()

In [None]:
## Leaver rate (%) per job-involvement score.
## Author's observation: people with lower job involvement tend to leave the organization.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each involvement score, computed
# in one vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['JobInvolvement'], sort=False).mean() * 100
df_JobInvolvement = leaver_pct.rename_axis("Job Involvement").reset_index(name="% of Leavers")
df_JI = df_JobInvolvement.groupby(by="Job Involvement").sum()
df_JI.iplot(kind='bar',title='Leavers by Job Involvement (%)')

In [None]:
# TrainingTimesLastYear is a count of trainings, not a duration, so the
# message must not report it in years.
print("Number of training times last year varies from {} to {}.".format(
    df_HR['TrainingTimesLastYear'].min(), df_HR['TrainingTimesLastYear'].max()))

In [None]:
##No intuition can be removed,since both have the same trends
'''configure_plotly_browser_state()

plt.figure(figsize=(15,6))
plt.style.use('seaborn-colorblind')
plt.grid(True, alpha=0.5)
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 0, 'TrainingTimesLastYear'], label = 'Active Employee')
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 1, 'TrainingTimesLastYear'], label = 'Ex-Employees')
plt.xlabel('TrainingTimesLastYear')
plt.ylabel('Density')
plt.title('Training Times Last Year Distribution in Percent by Attrition Status');'''

In [None]:
df_HR['NumCompaniesWorked'].value_counts()

In [None]:
## Leaver rate (%) per number of companies previously worked at.
## Author's observation: people who changed jobs frequently are more likely to leave.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each NumCompaniesWorked value,
# computed in one vectorised pass so the percentage column is numeric rather
# than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['NumCompaniesWorked'], sort=False).mean() * 100
df_NumCompaniesWorked = leaver_pct.rename_axis("Num Companies Worked").reset_index(name="% of Leavers")
df_NC = df_NumCompaniesWorked.groupby(by="Num Companies Worked").sum()
df_NC.iplot(kind='bar',title='Leavers by Num Companies Worked (%)')

In [None]:
print("Number of Years at the company varies from {} to {} years.".format(
    df_HR['YearsAtCompany'].min(), df_HR['YearsAtCompany'].max()))

In [None]:
## Author's observation: people who have spent less time at the company are
## more likely to leave.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Active Employee'), (1, 'Ex-Employees')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'YearsAtCompany'], label = curve_label)
plt.xlabel('YearsAtCompany')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Years At Company in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();

In [None]:
print("Number of Years in the current role varies from {} to {} years.".format(
    df_HR['YearsInCurrentRole'].min(), df_HR['YearsInCurrentRole'].max()))

In [None]:
## Author's observation: people who have spent less time in their current role
## are more likely to leave; the longer they stay in a role, the more they tend to stay.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Attrition No '), (1, 'Attrition Yes')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'YearsInCurrentRole'], label = curve_label)
plt.xlabel('YearsInCurrentRole')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Years In Current Role in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();

In [None]:
print("Number of Years since last promotion varies from {} to {} years.".format(
    df_HR['YearsSinceLastPromotion'].min(), df_HR['YearsSinceLastPromotion'].max()))

In [None]:
## Author's observation: employees with a longer gap since their last promotion
## are less likely to leave, while those with a gap of about one year or less
## are more likely to leave.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Active Employee'), (1, 'Ex-Employees')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'YearsSinceLastPromotion'], label = curve_label)
plt.xlabel('YearsSinceLastPromotion')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Years Since Last Promotion in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();


In [None]:
print("Total working years varies from {} to {} years.".format(
    df_HR['TotalWorkingYears'].min(), df_HR['TotalWorkingYears'].max()))

In [None]:
## Author's observations: people with fewer total working years are more likely
## to leave, and so are people with comparatively long experience (>= 38 years).
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Active Employee'), (1, 'Ex-Employees')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'TotalWorkingYears'], label = curve_label)
plt.xlabel('TotalWorkingYears')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Total Working Years in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();


In [None]:
print("Number of Years with current manager varies from {} to {} years.".format(
    df_HR['YearsWithCurrManager'].min(), df_HR['YearsWithCurrManager'].max()))

In [None]:
## Author's observation: more of the people who leave have worked with their
## current manager for a shorter time.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Attrition No'), (1, 'Attrition Yes')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'YearsWithCurrManager'], label = curve_label)
plt.xlabel('YearsWithCurrManager')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Years With Curr Manager in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();


In [None]:

#df_HR['WorkLifeBalance'].value_counts()

In [None]:
##Not important
'''configure_plotly_browser_state()

df_WorkLifeBalance = pd.DataFrame(columns=["WorkLifeBalance", "% of Leavers"])
i=0
for field in list(df_HR['WorkLifeBalance'].unique()):
    ratio = df_HR[(df_HR['WorkLifeBalance']==field)&(df_HR['Attrition']==1)].shape[0] / df_HR[df_HR['WorkLifeBalance']==field].shape[0]
    df_WorkLifeBalance.loc[i] = (field, ratio*100)
    i += 1
    #print("In {}, the ratio of leavers is {:.2f}%".format(field, ratio*100))    
df_WLB = df_WorkLifeBalance.groupby(by="WorkLifeBalance").sum()
df_WLB.iplot(kind='bar',title='Leavers by WorkLifeBalance (%)')'''

In [None]:
df_HR['OverTime'].value_counts()


In [None]:
## Important: leaver rate (%) by overtime status.
## Author's observation: most people who leave the company worked overtime.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each OverTime value, computed
# in one vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['OverTime'], sort=False).mean() * 100
df_OverTime = leaver_pct.rename_axis("OverTime").reset_index(name="% of Leavers")
df_OT = df_OverTime.groupby(by="OverTime").sum()
df_OT.iplot(kind='bar',title='Leavers by OverTime (%)')

In [None]:
print("Employee Hourly Rate varies from ${} to ${}.".format(
    df_HR['HourlyRate'].min(), df_HR['HourlyRate'].max()))

In [None]:
print("Employee Daily Rate varies from ${} to ${}.".format(
    df_HR['DailyRate'].min(), df_HR['DailyRate'].max()))

In [None]:
print("Employee Monthly Rate varies from ${} to ${}.".format(
    df_HR['MonthlyRate'].min(), df_HR['MonthlyRate'].max()))

In [None]:
print("Employee Monthly Income varies from ${} to ${}.".format(
    df_HR['MonthlyIncome'].min(), df_HR['MonthlyIncome'].max()))

In [None]:
## Author's observations: most people who leave have a lower income, and people
## with a higher income are less likely to leave the company.
plt.figure(figsize=(15,6))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
colorblind_style = 'seaborn-colorblind' if 'seaborn-colorblind' in plt.style.available else 'seaborn-v0_8-colorblind'
plt.style.use(colorblind_style)
plt.grid(True, alpha=0.5)
for attrition_flag, curve_label in ((0, 'Active Employee'), (1, 'Ex-Employees')):
    sns.kdeplot(df_HR.loc[df_HR['Attrition'] == attrition_flag, 'MonthlyIncome'], label = curve_label)
plt.xlabel('Monthly Income')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Monthly Income in Percent by Attrition Status')
# kdeplot only attaches the label to the line; draw the legend explicitly so
# the two curves can be told apart.
plt.legend();


In [None]:
'''print("Percentage Salary Hikes varies from {}% to {}%.".format(
    df_HR['PercentSalaryHike'].min(), df_HR['PercentSalaryHike'].max()))'''

In [None]:
##Graph should not be included.
'''plt.figure(figsize=(15,6))
plt.style.use('seaborn-colorblind')
plt.grid(True, alpha=0.5)
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 0, 'PercentSalaryHike'], label = 'Active Employee')
sns.kdeplot(df_HR.loc[df_HR['Attrition'] == 1, 'PercentSalaryHike'], label = 'Ex-Employees')
plt.xlabel('PercentSalaryHike')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Percent Salary Hike in Percent by Attrition Status');'''

In [None]:
'''print("Stock Option Levels varies from {} to {}.".format(
    df_HR['StockOptionLevel'].min(), df_HR['StockOptionLevel'].max()))'''

In [None]:
'''configure_plotly_browser_state()

df_StockOptionLevel = pd.DataFrame(columns=["StockOptionLevel", "% of Leavers"])
i=0
for field in list(df_HR['StockOptionLevel'].unique()):
    ratio = df_HR[(df_HR['StockOptionLevel']==field)&(df_HR['Attrition']==1)].shape[0] / df_HR[df_HR['StockOptionLevel']==field].shape[0]
    df_StockOptionLevel.loc[i] = (field, ratio*100)
    i += 1
    #print("In {}, the ratio of leavers is {:.2f}%".format(field, ratio*100))    
df_SOL = df_StockOptionLevel.groupby(by="StockOptionLevel").sum()
df_SOL.iplot(kind='bar',title='Leavers by Stock Option Level (%)')'''

In [None]:
#df_HR['EnvironmentSatisfaction'].value_counts()

In [None]:
##Probably Not important
'''configure_plotly_browser_state()

df_EnvironmentSatisfaction = pd.DataFrame(columns=["EnvironmentSatisfaction", "% of Leavers"])
i=0
for field in list(df_HR['EnvironmentSatisfaction'].unique()):
    ratio = df_HR[(df_HR['EnvironmentSatisfaction']==field)&(df_HR['Attrition']==1)].shape[0] / df_HR[df_HR['EnvironmentSatisfaction']==field].shape[0]
    df_EnvironmentSatisfaction.loc[i] = (field, ratio*100)
    i += 1
    #print("In {}, the ratio of leavers is {:.2f}%".format(field, ratio*100))    
df_Env = df_EnvironmentSatisfaction.groupby(by="EnvironmentSatisfaction").sum()
df_Env.iplot(kind='bar',title='Leavers by Environment Satisfaction (%)')'''

In [None]:
# Job Satisfaction was captured as: 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
#df_HR['JobSatisfaction'].value_counts()

In [None]:
##Probably not important
'''configure_plotly_browser_state()

df_JobSatisfaction = pd.DataFrame(columns=["JobSatisfaction", "% of Leavers"])
i=0
for field in list(df_HR['JobSatisfaction'].unique()):
    ratio = df_HR[(df_HR['JobSatisfaction']==field)&(df_HR['Attrition']==1)].shape[0] / df_HR[df_HR['JobSatisfaction']==field].shape[0]
    df_JobSatisfaction.loc[i] = (field, ratio*100)
    i += 1
    #print("In {}, the ratio of leavers is {:.2f}%".format(field, ratio*100))    
df_JS = df_JobSatisfaction.groupby(by="JobSatisfaction").sum()
df_JS.iplot(kind='bar',title='Leavers by Job Satisfaction (%)')'''

In [None]:
df_HR['RelationshipSatisfaction'].value_counts()

In [None]:
## Leaver rate (%) per relationship-satisfaction score.
## Author's observation: leavers had an average relationship satisfaction with the company.
configure_plotly_browser_state()

# Percentage of leavers (Attrition == 1) within each satisfaction score, computed
# in one vectorised pass so the percentage column is numeric rather than object dtype.
leaver_pct = df_HR['Attrition'].eq(1).groupby(df_HR['RelationshipSatisfaction'], sort=False).mean() * 100
df_RelationshipSatisfaction = leaver_pct.rename_axis("RelationshipSatisfaction").reset_index(name="% of Leavers")
df_RS = df_RelationshipSatisfaction.groupby(by="RelationshipSatisfaction").sum()
df_RS.iplot(kind='bar',title='Leavers by Relationship Satisfaction (%)')

In [None]:
df_HR['PerformanceRating'].value_counts()

In [None]:
# Attrition rate within each performance rating: leavers with that rating
# divided by all employees with the same rating. The numerator and denominator
# must filter on the same column for the percentage to be a normalised rate.
left_company = df_HR['Attrition'] == 1
rating_pcts = []
for rating in (3, 4):
    has_rating = df_HR['PerformanceRating'] == rating
    rating_pcts.append(df_HR[left_company & has_rating].shape[0] / df_HR[has_rating].shape[0]*100)
print("Normalised percentage of leavers by Performance Rating: 3: {:.2f}%, 4: {:.2f}%".format(*rating_pcts))

In [None]:
'''configure_plotly_browser_state()
##Not important
df_PerformanceRating = pd.DataFrame(columns=["PerformanceRating", "% of Leavers"])
i=0
for field in list(df_HR['PerformanceRating'].unique()):
    ratio = df_HR[(df_HR['PerformanceRating']==field)&(df_HR['Attrition']==1)].shape[0] / df_HR[df_HR['PerformanceRating']==field].shape[0]
    df_PerformanceRating.loc[i] = (field, ratio*100)
    i += 1
    #print("In {}, the ratio of leavers is {:.2f}%".format(field, ratio*100))    
df_PR = df_PerformanceRating.groupby(by="PerformanceRating").sum()
df_PR.iplot(kind='bar',title='Leavers by Performance Rating (%)')'''

In [None]:
# Attrition indicates if the employee is currently active ('No') or has left the company ('Yes')
df_HR['Attrition'].value_counts()

In [None]:
# Share of the workforce that is still employed (Attrition == 0) versus
# the share that has left (Attrition == 1), in percent of all rows.
headcount = df_HR.shape[0]
current_pct = df_HR[df_HR['Attrition'] == 0].shape[0] / headcount*100
former_pct = df_HR[df_HR['Attrition'] == 1].shape[0] / headcount*100
print("Percentage of Current Employees is {:.1f}% and of Ex-employees is: {:.1f}%".format(
    current_pct, former_pct))

In [None]:
##Very less people tend toleave the company
'''configure_plotly_browser_state()
df_HR['Attrition'].iplot(kind='hist', xTitle='Attrition',
                         yTitle='count', title='Attrition Distribution')'''

In [None]:
###from this graph we can conclude that most of the people lie in the age ranging from 25-42
sns.distplot(df_HR.Age)

In [None]:
##With this graph we can conclude that with an increase in age and years in the company, the employees are less likely to get weakened
##Also at a initial stage, the people are more likely to get weakened at a lower age group(20-30)
'''sns.relplot(x='Age', y='YearsAtCompany', hue="Attrition",
            data=df_HR)'''

In [None]:
sns.boxplot(y = df_HR.MonthlyIncome,hue = df_HR.Department,x = df_HR.Attrition)

In [None]:
## Boxes are ordered by the number of employees in each education field.
# Monthly income of leavers (Attrition == 1), one horizontal box per field.
yes = df_HR[df_HR.Attrition == 1]
field_order = ['Life Sciences','Medical','Marketing','Technical Degree','Other','Human Resources']
sns.boxplot(data = yes, x = 'MonthlyIncome', y = 'EducationField', orient = 'h', order = field_order)

In [None]:
# Mean monthly income against years since last promotion for employees who
# stayed (Attrition == 0); only this group is plotted here.
no = df_HR[df_HR['Attrition'] == 0]
sns.lineplot(data = no, x = 'YearsSinceLastPromotion', y = 'MonthlyIncome', hue = 'Attrition', ci = None)
## Author's observation: for those who chose to stay, monthly income increases with the promotion period.
## Author's observation: for those who chose to leave, monthly income has no clear relation to the promotion period.

Working and Removal of Outliers


In [None]:
## Focus on the columns whose box plots show the most outliers.
outlier_columns = ['HourlyRate','TotalWorkingYears','NumCompaniesWorked','MonthlyRate']
for column in outlier_columns:
    # One figure per column, split by the Yes/No attrition label.
    sns.boxplot(x = df_HR['Target'], y = df_HR[column])
    plt.show()

In [None]:
# Show the 10th and 80th percentiles of TotalWorkingYears, then take a working
# copy of the data (df1) for the outlier treatment and display its shape.
for q in (0.10, 0.80):
    print(df_HR['TotalWorkingYears'].quantile(q))
df1 = df_HR.copy()
df1.shape

In [None]:
# Winsorise TotalWorkingYears to the range [1, 16].
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
df1["TotalWorkingYears"] = df_HR["TotalWorkingYears"].clip(lower=1, upper=16)
print(df1['TotalWorkingYears'].skew())

In [None]:
df1["TotalWorkingYears"].describe()

In [None]:
num1 = ['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction','HourlyRate','JobInvolvement','MonthlyIncome','MonthlyRate','NumCompaniesWorked','PercentSalaryHike','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','WorkLifeBalance','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

In [None]:
'''for i in num1:
    sns.boxplot(y = df1[i],x = df1['Target'])
    plt.show()'''

In [None]:
# Winsorise MonthlyIncome to its 10th-80th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['MonthlyIncome'].quantile(0.10)
upper = df_HR['MonthlyIncome'].quantile(0.80)
df1["MonthlyIncome"] = df_HR["MonthlyIncome"].clip(lower=lower, upper=upper)
print(df1['MonthlyIncome'].skew())

In [None]:
# Winsorise NumCompaniesWorked to its 10th-80th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['NumCompaniesWorked'].quantile(0.10)
upper = df_HR['NumCompaniesWorked'].quantile(0.80)
df1["NumCompaniesWorked"] = df_HR["NumCompaniesWorked"].clip(lower=lower, upper=upper)
print(df1['NumCompaniesWorked'].skew())

In [None]:
df1['TrainingTimesLastYear'].describe()

In [None]:
# Winsorise TrainingTimesLastYear to its 10th-90th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['TrainingTimesLastYear'].quantile(0.1)
upper = df_HR['TrainingTimesLastYear'].quantile(0.9)
df1["TrainingTimesLastYear"] = df_HR["TrainingTimesLastYear"].clip(lower=lower, upper=upper)
print(df1['TrainingTimesLastYear'].skew())

In [None]:
# Winsorise YearsAtCompany to its 10th-90th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['YearsAtCompany'].quantile(0.1)
upper = df_HR['YearsAtCompany'].quantile(0.9)
df1["YearsAtCompany"] = df_HR["YearsAtCompany"].clip(lower=lower, upper=upper)
print(df1['YearsAtCompany'].skew())

In [None]:
# Winsorise YearsInCurrentRole to its 20th-90th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['YearsInCurrentRole'].quantile(0.2)
upper = df_HR['YearsInCurrentRole'].quantile(0.9)
df1["YearsInCurrentRole"] = df_HR["YearsInCurrentRole"].clip(lower=lower, upper=upper)
print(df1['YearsInCurrentRole'].skew())
print(df1['YearsInCurrentRole'].describe())

In [None]:
# Winsorise YearsSinceLastPromotion to its 10th-80th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['YearsSinceLastPromotion'].quantile(0.1)
upper = df_HR['YearsSinceLastPromotion'].quantile(0.8)
df1["YearsSinceLastPromotion"] = df_HR["YearsSinceLastPromotion"].clip(lower=lower, upper=upper)
print(df1['YearsSinceLastPromotion'].skew())
print(df1['YearsSinceLastPromotion'].describe())

In [None]:
# Winsorise YearsWithCurrManager to its 10th-98th percentile range.
# Both bounds are applied in a single clip of the original column; clipping the
# original twice in separate assignments would keep only the last bound.
lower = df_HR['YearsWithCurrManager'].quantile(0.1)
upper = df_HR['YearsWithCurrManager'].quantile(0.98)
df1["YearsWithCurrManager"] = df_HR["YearsWithCurrManager"].clip(lower=lower, upper=upper)
print(df1['YearsWithCurrManager'].skew())
print(df1['YearsWithCurrManager'].describe())

In [None]:
# Cap StockOptionLevel at its 92nd percentile (upper tail only), then report
# the skew and summary statistics of the capped column.
cap = df_HR['StockOptionLevel'].quantile(0.92)
stock_levels = df_HR['StockOptionLevel']
df1["StockOptionLevel"] = np.where(stock_levels > cap, cap, stock_levels)
for summary in (df1['StockOptionLevel'].skew(), df1['StockOptionLevel'].describe()):
    print(summary)

In [None]:
num1 = ['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction','HourlyRate','JobInvolvement','MonthlyIncome','MonthlyRate','NumCompaniesWorked','PercentSalaryHike','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','WorkLifeBalance','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

In [None]:
# One box plot per numeric column of the outlier-treated copy (df1),
# split by the Yes/No attrition label.
for column in num1:
    sns.boxplot(x = df1['Target'], y = df1[column])
    plt.show()

In [None]:
#df_HR.drop(['EmployeeCount','StandardHours'],axis=1,inplace = True)

**Correlation**

correlation coefficients only measure linear correlations.

In [None]:
# Calculate correlations
corr = df_HR.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
# Heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr,
            vmax=.5,
            mask=mask,
            # annot=True, fmt='.2f',
            linewidths=.2, cmap="RdYlGn")

"Monthly Rate", "Number of Companies Worked" and "Distance From Home" are positively correlated to Attrition;
while "Total Working Years", "Job Level", and "Years In Current Role" are negatively correlated to Attrition.

**Data Pre-processing**

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Create a label encoder object
le = LabelEncoder()

Perform Label Encoding


In [None]:
print(df_HR.shape)
df_HR.head()

In [None]:
# Label-encode every text column that has at most two distinct values.
# The [1:] slice leaves the first column of df_HR out of the scan.
le_count = 0
candidate_cols = df_HR.columns[1:]
for col in candidate_cols:
    is_text = df_HR[col].dtype == 'object'
    if not is_text or len(df_HR[col].unique()) > 2:
        continue
    # Refit the shared encoder on this column and replace it with integer codes.
    df_HR[col] = le.fit(df_HR[col]).transform(df_HR[col])
    le_count += 1
print('{} columns were label encoded.'.format(le_count))

In [None]:
# convert rest of categorical variable into dummy
df_HR = pd.get_dummies(df_HR, drop_first=True)

In [None]:
# Target is only a display label derived from Attrition, so it must not be used
# as a feature. errors='ignore' keeps the cell safe to re-run once it is gone.
df_HR = df_HR.drop(columns='Target', errors='ignore')

In [None]:
print(df_HR.shape)
df_HR.head()

In [None]:
df_HR.shape

Feature Scaling

In [None]:
# Rescale every feature column to the range [0, 5] with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set minima/maxima influence the scaling.
# Consider fitting on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))
# All columns except the target are scaled.
HR_col = list(df_HR.columns)
HR_col.remove('Attrition')
for col in HR_col:
    # Cast to float first, then refit the scaler on this single column; after
    # the loop `scaler` holds the fit of the last column only.
    df_HR[col] = df_HR[col].astype(float)
    df_HR[[col]] = scaler.fit_transform(df_HR[[col]])
# The target is left unscaled and stored as the smallest float dtype that fits.
df_HR['Attrition'] = pd.to_numeric(df_HR['Attrition'], downcast='float')
df_HR.head()

In [None]:
print('Size of Full Encoded Dataset: {}'. format(df_HR.shape))

Splitting data into training and testing sets

In [None]:
# Keep an independent copy of the 'Attrition' column as the target Series
# (no conversion happens here; the column is only copied).
#df_target = df_HR[['Attrition']].copy()
target = df_HR['Attrition'].copy()

In [None]:
# Shape of the frame the features are sliced from in the next cell.
df_HR.shape

In [None]:
# Target vector: the copy of the 'Attrition' column made above.
y = target
#df_HR.drop('Attrition',axis=1,inplace = True)
# Feature matrix: every column from position 2 onward.
# NOTE(review): this positional slice presumably skips 'Age' and 'Attrition' in the
# first two positions ('Age' is concatenated back in the following cells) — confirm
# the column order, otherwise 'Attrition' could remain among the features.
X = df_HR.iloc[:,2:]

In [None]:
# Prepend 'Age' (presumably removed by the positional slice that built X) as the
# first feature column, then preview the result.
X = pd.concat([df_HR['Age'], X], axis='columns')
X.head()

In [None]:
# Remove the 'Age' column from the feature matrix.
X = X.drop(columns='Age')

In [None]:
# Re-attach 'Age' as the first column of X (the previous cell dropped it).
X = pd.concat([df_HR.Age,X],axis=1)

In [None]:
# Preview the final feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()


In [None]:
# The target is imbalanced (more employees with turnover=0 than turnover=1), so the
# split is stratified on it to keep the same class ratio in the training and test sets.
# 80% / 20% split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=target
)
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

In [None]:
# Rows and columns of the training feature matrix.
X_train.shape

**Model 1: Random Forest Classifier**: 
Random Forest is a popular and versatile machine learning method that can solve both regression and classification problems. It is a form of ensemble learning: it relies on an ensemble of decision trees and aggregates the predictions of classification (or regression) trees. A decision tree is composed of a series of decisions that can be used to classify an observation in a dataset.

Random Forest fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve predictive accuracy and control over-fitting. Random Forest can handle a large number of features, and is helpful for estimating which of your variables are important in the underlying data being modeled.

In [None]:
##Model 1: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the training split.
# random_state is pinned (same value as the train/test split) so the bootstrap
# samples and feature subsets — and therefore the scores reported for this model —
# are reproducible across kernel restarts.
clf = RandomForestClassifier(min_samples_leaf=3, random_state=0)
clf.fit(X_train, y_train)
# Impurity-based importance of each feature, in the column order of X_train.
clf.feature_importances_

In [None]:
#9:85.37 #11:85.38 29:85.38 27: 85.72
# NOTE(review): the figures above look like accuracy readings from earlier runs, but
# their keys are not explained — confirm what they refer to or remove them.

# Mean accuracy of the random forest on the held-out test split, as a percentage.
clf.score(X_test,y_test)*100

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validated scores of the random forest on the full feature matrix
# (cross_val_score clones and refits the estimator for every fold).
score_rf = cross_val_score(clf, X, y)
# Spread of the per-fold scores; the 'average' label was previously misspelled.
print('minimum score', np.min(score_rf))
print('average', np.mean(score_rf))
print('maximum', np.max(score_rf))

In [None]:
###Reports for train data
from sklearn.metrics import classification_report,confusion_matrix
# Predict once on the training split and reuse the labels for both reports.
rf_train_pred = clf.predict(X_train)
print(classification_report(y_train, rf_train_pred))
print(confusion_matrix(y_train, rf_train_pred))

In [None]:
##Reports for test data
# Predict once on the test split and reuse the labels for both reports.
rf_test_pred = clf.predict(X_test)
print(classification_report(y_test, rf_test_pred))
print(confusion_matrix(y_test, rf_test_pred))

**Model 2: Support Vector Classifier**: 
A Support Vector Machine (SVM) is a discriminative classifier formally defined by a separating hyperplane. In other words, given labeled training data (supervised learning), the algorithm outputs an optimal hyperplane that categorizes new examples. In two-dimensional space, this hyperplane is a line dividing the plane into two parts, with each class lying on one side.

In [None]:
##Model 2:Support Vector Classifier
from sklearn.svm import SVC
# SVC with the default kernel; fit() returns the estimator itself, so it is chained.
clf2 = SVC(C=2, gamma=0.01).fit(X_train, y_train)
# Mean accuracy on the held-out test split.
clf2.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters for the SVC: C in 1..9 and a list of gamma values.
params_svc = {'C':list(np.arange(1,10)),'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.01,0.02,0.04]}

# Search on the training split only, so the held-out test set plays no part in
# hyper-parameter selection (the Naive Bayes grid search is also fit on
# X_train / y_train).
clfcv = GridSearchCV(clf2, params_svc)
clfcv.fit(X_train, y_train)
print(clfcv.best_params_)
# Mean cross-validated score of the best parameter combination.
print(clfcv.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validated scores of the support vector classifier on the full feature matrix
# (despite its name, score_rf1 holds the SVC scores, not random-forest scores).
score_rf1 = cross_val_score(clf2, X, y)
# Spread of the per-fold scores; the 'average' label was previously misspelled.
print('minimum score', np.min(score_rf1))
print('average', np.mean(score_rf1))
print('maximum', np.max(score_rf1))

In [None]:
# SVC reports on the training split; predict once and reuse the labels.
svc_train_pred = clf2.predict(X_train)
print(classification_report(y_train, svc_train_pred))
print(confusion_matrix(y_train, svc_train_pred))

In [None]:
# SVC reports on the test split; predict once and reuse the labels.
svc_test_pred = clf2.predict(X_test)
print(classification_report(y_test, svc_test_pred))
print(confusion_matrix(y_test, svc_test_pred))

**Model 3: Logistic Regression**: 
Logistic Regression is a machine learning classification algorithm that is used to predict the probability of a categorical dependent variable. It is not as sophisticated as the ensemble methods or the boosted decision trees used elsewhere in this notebook, so it provides us with a good benchmark.

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
clf3 = LogisticRegression().fit(X_train, y_train)
# Predicted labels for the test split.
pred3 = clf3.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated scores for the logistic regression: minimum, mean, maximum.
scores = cross_val_score(clf3, X, y, cv=10)
print(scores.min())
print(scores.mean())
print(scores.max())

In [None]:
# Logistic regression reports on the training split; predict once and reuse the labels.
lr_train_pred = clf3.predict(X_train)
print(classification_report(y_train, lr_train_pred))
print(confusion_matrix(y_train, lr_train_pred))

In [None]:
# Logistic regression reports on the test split; predict once and reuse the labels.
lr_test_pred = clf3.predict(X_test)
print(classification_report(y_test, lr_test_pred))
print(confusion_matrix(y_test, lr_test_pred))

**Model 4: KNeighbors Classifier**: 
The intuition behind the KNN algorithm is one of the simplest of all the supervised machine learning algorithms. It calculates the distance from a new data point to every training data point. Any distance metric can be used, e.g. Euclidean or Manhattan. It then selects the K nearest data points, where K can be any positive integer. Finally, it assigns the new data point to the class to which the majority of those K data points belong.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k=2, fitted on the training split.
clf4 = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
# Display the fitted estimator (fit() returns the estimator itself).
clf4


In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated scores for the KNN classifier: minimum, mean, maximum.
scores_clf4 = cross_val_score(clf4, X, y, cv=10)
print(scores_clf4.min())
print(scores_clf4.mean())
print(scores_clf4.max())

In [None]:
# KNN reports on the training split; predict once and reuse the labels.
knn_train_pred = clf4.predict(X_train)
print(classification_report(y_train, knn_train_pred))
print(confusion_matrix(y_train, knn_train_pred))

In [None]:
# KNN reports on the test split; predict once and reuse the labels.
knn_test_pred = clf4.predict(X_test)
print(classification_report(y_test, knn_test_pred))
print(confusion_matrix(y_test, knn_test_pred))

**Model 5: XGB Classifier:** 
The implementation of the algorithm was engineered for efficiency of compute time and memory resources. A design goal was to make the best use of available resources to train the model. XGBoost is an implementation of gradient boosted decision trees designed for speed and performance.

In [None]:
##Model 5: XGB Classifier
from xgboost import XGBClassifier
# Gradient-boosted trees with a maximum depth of 9, fitted on the training split.
clf5 = XGBClassifier(max_depth=9).fit(X_train, y_train)
# Display the fitted estimator (fit() returns the estimator itself).
clf5


In [None]:
# 10-fold cross-validated scores for the XGBoost classifier: minimum, mean, maximum.
scores_xg = cross_val_score(clf5, X, y, cv=10)
print(scores_xg.min())
print(scores_xg.mean())
print(scores_xg.max())

In [None]:
# XGBoost reports on the training split. The confusion matrix is printed as well,
# matching the train/test report cells of the other models.
xgb_train_pred = clf5.predict(X_train)
print(classification_report(y_train, xgb_train_pred))
print(confusion_matrix(y_train, xgb_train_pred))


In [None]:
# XGBoost on the test split: mean accuracy, then both reports from a single prediction.
print(clf5.score(X_test, y_test))
xgb_test_pred = clf5.predict(X_test)
print(classification_report(y_test, xgb_test_pred))
print(confusion_matrix(y_test, xgb_test_pred))

**Model 6: Naive Bayes:** Naive Bayes classifiers are a collection of classification algorithms based on Bayes’ Theorem. It is not a single algorithm but a family of algorithms that share a common principle: every pair of features is assumed to be independent of each other, given the class label.

In [None]:
##Model 6:Naive bayes
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with smoothing parameter alpha=9, fitted on the training split.
clf6 = MultinomialNB(alpha=9).fit(X_train, y_train)
# Display the fitted estimator (fit() returns the estimator itself).
clf6


In [None]:
# Grid-search the smoothing parameter alpha over the integers 1..9 on the training split.
params_nb = {'alpha': list(np.arange(1, 10))}

# fit() returns the search object itself, so construction and fitting are chained.
clf_nb = GridSearchCV(clf6, params_nb).fit(X_train, y_train)
print(clf_nb.best_params_)
print(clf_nb.best_score_)

In [None]:
# Naive Bayes reports on the training split; predict once and reuse the labels.
nb_train_pred = clf6.predict(X_train)
print(classification_report(y_train, nb_train_pred))
print(confusion_matrix(y_train, nb_train_pred))

In [None]:
# Naive Bayes reports on the test split; predict once and reuse the labels.
nb_test_pred = clf6.predict(X_test)
print(classification_report(y_test, nb_test_pred))
print(confusion_matrix(y_test, nb_test_pred))