# Problem Statement

This is a fictional data set created by IBM data scientists. We need to uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’.

#### Various features are defined below

#### Education 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
#### EnvironmentSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
#### JobInvolvement  1 'Low' 2 'Medium' 3 'High' 4 'Very High'
#### JobSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
#### PerformanceRating 1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'
#### RelationshipSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
#### WorkLifeBalance 1 'Bad' 2 'Good' 3 'Better' 4 'Best'

## Reading and Understanding the Data

In [None]:
#Importing the packages
#Data processing packages
import numpy as np
import pandas as pd

#Visualization packages
import matplotlib.pyplot as plt
import seaborn as sns


# hide warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the attrition data set into the working DataFrame `hr`.
data_path = '../input/WA_Fn-UseC_-HR-Employee-Attrition.csv'
hr = pd.read_csv(data_path)

In [None]:
# Peek at the first five rows to sanity-check the load.
hr.head(n=5)

In [None]:
# Summary of every column: name, non-null count and dtype.
hr.info()

In [None]:
# (rows, columns) of the loaded frame.
hr.shape

In [None]:
# True when no EmployeeNumber occurs more than once.
not hr.duplicated(subset='EmployeeNumber').any()

In [None]:
# Prints True if at least one cell anywhere in the frame is missing.
print(hr.isna().to_numpy().any())

In [None]:
# Descriptive statistics for all columns; include="all" asks for the
# non-numeric columns as well as the numeric ones.
hr.describe(include="all")

In [None]:
# Full list of column labels.
hr.columns

# Data Cleaning & EDA

No data cleaning is required. We will derive a few new features to support the analysis.

In [None]:
# Labels of the columns stored with the generic object dtype; these are
# treated as the categorical features.
is_object_col = hr.dtypes == 'object'
hr_categorical = hr.columns[is_object_col]
print(hr_categorical)
print(hr_categorical.shape)

In [None]:
# Labels of every column that is NOT object-typed; these are treated as the
# numeric features.
is_non_object_col = ~(hr.dtypes == 'object')
hr_numeric = hr.columns[is_non_object_col]
print(hr_numeric)
print(hr_numeric.shape)

In [None]:
# Pairwise correlation of the numeric features.
# The non-numeric columns are dropped explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0 (older releases dropped them silently), and
# select_dtypes works the same way on every pandas version.
hr.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix.
# Only numeric columns are correlated: DataFrame.corr() raises on string
# columns in pandas >= 2.0 (older releases dropped them silently).
numeric_corr = hr.select_dtypes(include='number').corr()

# Large canvas so the per-cell annotations stay legible.
plt.figure(figsize=(30, 20))
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
# How many employees fall into each Attrition class.
attrition_flag = hr['Attrition']
attrition_flag.value_counts()

237 employees left the company out of a total of 1,470 employees.

In [None]:
# Bucket the commute distance into four named bands.  pd.cut intervals are
# right-inclusive: (-1, 5], (5, 10], (10, 20], (20, 30].
distance_edges = [-1, 5, 10, 20, 30]
distance_names = ['Very near', 'near', 'far', 'very far']
hr["DistanceFromHome_title"] = pd.cut(
    hr['DistanceFromHome'],
    bins=distance_edges,
    labels=distance_names,
)


In [None]:
# Stacked head-count of each Attrition class per distance band.
distance_counts = hr.groupby('DistanceFromHome_title')['Attrition'].value_counts()
distance_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per distance band; margins=True appends an
# 'All' row with the overall share.
distance_share = pd.crosstab(
    hr.DistanceFromHome_title,
    hr.Attrition,
    margins=True,
    normalize='index',
).round(2)
distance_share.style.background_gradient(cmap='spring')

### The employees living "very far" from office are more likely to leave the organisation. 

In [None]:
# Stacked head-count of each distance band within every job role.
role_distance_counts = hr.groupby('JobRole')['DistanceFromHome_title'].value_counts()
role_distance_counts.unstack(level=1).plot.bar(stacked=True, figsize=(7, 6))

In [None]:
# Distance-band counts for every (Attrition, JobRole) pair, one column per band.
band_counts = hr.groupby(["Attrition", "JobRole"])["DistanceFromHome_title"].value_counts()
hr1 = band_counts.unstack()
hr1.plot.bar(align='center', alpha=1.0, figsize=(15, 5))

In [None]:
# Monthly income distribution per education level; split=True draws the two
# Attrition classes as the two halves of each violin.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=hr,
    x='Education',
    y='MonthlyIncome',
    hue='Attrition',
    split=True,
    palette='hot',
    ax=ax,
)
ax.legend(loc='best')
plt.show()

In [None]:
# Stacked head-count of each Attrition class for every exact age.
hr.groupby('Age')['Attrition']\
    .value_counts()\
    .unstack(level=1)\
    .plot.bar(stacked=True, figsize=(15,6))

# Bucket ages by decade.  pd.cut intervals are right-inclusive, so the edges
# sit at 19/29/39/49 to keep 20 in '20s', 30 in '30s', and so on (edges at
# 20/30/40/50 would push each round age into the younger bucket).
# Age 60 is folded into '50s'; ages above 60 fall outside the bins and become NaN.
hr["Age_grp"] = pd.cut(hr.Age,[-1,19,29,39,49,60], labels=['Teenager','20s','30s','40s','50s'])

# Row-normalised attrition share per age bucket; margins=True appends an
# 'All' row with the overall share.
pd.crosstab(hr.Age_grp, hr.Attrition, margins=True, normalize='index').round(2).style.background_gradient(cmap='spring')

### Teenagers are more likely to leave the organisation. 

In [None]:
# Stacked head-count of each Attrition class per job role.
role_counts = hr.groupby('JobRole')['Attrition'].value_counts()
role_counts.unstack(level=1).plot.bar(stacked=True, figsize=(7, 6))

# Row-normalised attrition share per job role ('All' row = overall share).
role_share = pd.crosstab(hr.JobRole, hr.Attrition, margins=True, normalize='index').round(2)
role_share.style.background_gradient(cmap='spring')

### Sales Representatives are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per education level.
(
    hr.groupby('Education')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per education level ('All' row = overall share).
education_share = pd.crosstab(hr.Education, hr.Attrition, margins=True, normalize='index').round(2)
education_share.style.background_gradient(cmap='spring')

###### Education 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'

### Employees with below college education are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per field of education.
field_counts = hr.groupby('EducationField')['Attrition'].value_counts()
field_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per field ('All' row = overall share).
field_share = pd.crosstab(hr.EducationField, hr.Attrition, margins=True, normalize='index').round(2)
field_share.style.background_gradient(cmap='spring')

### Employees with Human Resources & Technical Degree are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per department.
(
    hr.groupby('Department')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per department ('All' row = overall share).
department_share = pd.crosstab(hr.Department, hr.Attrition, margins=True, normalize='index').round(2)
department_share.style.background_gradient(cmap='spring')

### Employees from Sales Department are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per gender.
gender_counts = hr.groupby('Gender')['Attrition'].value_counts()
gender_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per gender ('All' row = overall share).
gender_share = pd.crosstab(hr.Gender, hr.Attrition, margins=True, normalize='index').round(2)
gender_share.style.background_gradient(cmap='spring')

### Gender doesn't play a major role in employee attrition.

In [None]:
# Stacked head-count of each Attrition class per marital status.
(
    hr.groupby('MaritalStatus')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per marital status ('All' row = overall share).
marital_share = pd.crosstab(hr.MaritalStatus, hr.Attrition, margins=True, normalize='index').round(2)
marital_share.style.background_gradient(cmap='spring')

### Singles are more likely to leave the organisation

In [None]:
# Stacked head-count of each Attrition class by overtime flag.
overtime_counts = hr.groupby('OverTime')['Attrition'].value_counts()
overtime_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share by overtime flag ('All' row = overall share).
overtime_share = pd.crosstab(hr.OverTime, hr.Attrition, margins=True, normalize='index').round(2)
overtime_share.style.background_gradient(cmap='spring')

### Employees who are doing overtime are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per business-travel category.
(
    hr.groupby('BusinessTravel')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per travel category ('All' row = overall share).
travel_share = pd.crosstab(hr.BusinessTravel, hr.Attrition, margins=True, normalize='index').round(2)
travel_share.style.background_gradient(cmap='spring')

### Employees who travel frequently (`Travel_Frequently`) are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per job-satisfaction score.
job_sat_counts = hr.groupby('JobSatisfaction')['Attrition'].value_counts()
job_sat_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per score ('All' row = overall share).
job_sat_share = pd.crosstab(hr.JobSatisfaction, hr.Attrition, margins=True, normalize='index').round(2)
job_sat_share.style.background_gradient(cmap='spring')

#### JobSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

### Employees with low job satisfaction are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per environment-satisfaction score.
(
    hr.groupby('EnvironmentSatisfaction')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per score ('All' row = overall share).
env_sat_share = pd.crosstab(hr.EnvironmentSatisfaction, hr.Attrition, margins=True, normalize='index').round(2)
env_sat_share.style.background_gradient(cmap='spring')

#### EnvironmentSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

### Employees with low Environment satisfaction are more likely  to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per job-involvement score.
involvement_counts = hr.groupby('JobInvolvement')['Attrition'].value_counts()
involvement_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per score ('All' row = overall share).
involvement_share = pd.crosstab(hr.JobInvolvement, hr.Attrition, margins=True, normalize='index').round(2)
involvement_share.style.background_gradient(cmap='spring')

#### JobInvolvement 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

### Employees with low Job Involvement are more likely  to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per performance rating.
(
    hr.groupby('PerformanceRating')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per rating ('All' row = overall share).
rating_share = pd.crosstab(hr.PerformanceRating, hr.Attrition, margins=True, normalize='index').round(2)
rating_share.style.background_gradient(cmap='spring')

### Performance rating doesn't affect the attrition.

In [None]:
# Stacked head-count of each Attrition class per relationship-satisfaction score.
relationship_counts = hr.groupby('RelationshipSatisfaction')['Attrition'].value_counts()
relationship_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per score ('All' row = overall share).
relationship_share = pd.crosstab(hr.RelationshipSatisfaction, hr.Attrition, margins=True, normalize='index').round(2)
relationship_share.style.background_gradient(cmap='spring')

#### RelationshipSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

### Employees with low Relationship Satisfaction are more likely  to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per work-life-balance score.
(
    hr.groupby('WorkLifeBalance')['Attrition']
    .value_counts()
    .unstack(level=1)
    .plot.bar(stacked=True)
)

# Row-normalised attrition share per score ('All' row = overall share).
balance_share = pd.crosstab(hr.WorkLifeBalance, hr.Attrition, margins=True, normalize='index').round(2)
balance_share.style.background_gradient(cmap='spring')

#### WorkLifeBalance 1 'Bad' 2 'Good' 3 'Better' 4 'Best'

### Employees with Bad Work Life Balance are more likely  to leave the organisation.

In [None]:
# Split employees by number of previous employers.  pd.cut intervals are
# right-inclusive, so the bins are (-1, 4] -> 0..4 and (4, 9] -> 5..9.  The
# upper bin contains exactly 5, hence the label '5 or more' rather than
# 'More than 5'.  Values above 9 fall outside the bins and become NaN.
hr["NumCompaniesWorked_grp"] = pd.cut(hr.NumCompaniesWorked,[-1,4,9], labels=['Less than 5','5 or more'])


# Stacked head-count of each Attrition class per group.
hr.groupby('NumCompaniesWorked_grp')['Attrition']\
    .value_counts()\
    .unstack(level=1)\
    .plot.bar(stacked=True)

# Row-normalised attrition share per group; margins=True appends an 'All'
# row with the overall share.
pd.crosstab(hr.NumCompaniesWorked_grp, hr.Attrition, margins=True, normalize='index').round(2).style.background_gradient(cmap='spring')

### Employees who have worked in 5 or more companies are more likely to leave the organisation.

In [None]:
# Stacked head-count of each Attrition class per stock-option level.
stock_counts = hr.groupby('StockOptionLevel')['Attrition'].value_counts()
stock_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per level ('All' row = overall share).
stock_share = pd.crosstab(hr.StockOptionLevel, hr.Attrition, margins=True, normalize='index').round(2)
stock_share.style.background_gradient(cmap='spring')


### Employees with 0 Stock option level are more likely to leave the organisation

In [None]:
# Bucket monthly income into 5,000-wide bands.  pd.cut intervals are
# right-inclusive, which is what the band labels spell out.
income_edges = [-1, 5000, 10000, 15000, 20000]
income_names = ['0-5,000', '5,001-10,000', '10,001-15,000', '15,001-20,000']
hr["MonthlyIncome_grp"] = pd.cut(hr['MonthlyIncome'], bins=income_edges, labels=income_names)

# Stacked head-count of each Attrition class per income band.
income_counts = hr.groupby('MonthlyIncome_grp')['Attrition'].value_counts()
income_counts.unstack(level=1).plot.bar(stacked=True)

# Row-normalised attrition share per income band ('All' row = overall share).
income_share = pd.crosstab(hr.MonthlyIncome_grp, hr.Attrition, margins=True, normalize='index').round(2)
income_share.style.background_gradient(cmap='spring')


### Employees with monthly income slab of Rupees 0-5,000  are more likely to leave the organisation.

In [None]:
# Monthly income distribution per gender; split=True draws the two Attrition
# classes as the two halves of each violin.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=hr,
    x='Gender',
    y='MonthlyIncome',
    hue='Attrition',
    split=True,
    palette='prism',
    ax=ax,
)
ax.legend(loc='best')
plt.show()

### Both Male & Female employees with monthly income slab of Rupees 0-5,000 are more likely to leave the organisation.