In [None]:
## HR Data Analysis to understand reasons why employees are leaving and draw plots for different data fields

#### HR data contains different attributes about employees in an organisation, such as Age, Business Travel, Daily Rate, Department,
#### Distance from home, Education, Education Field, Job Level, Marital Status, Monthly Income, Monthly Rate, Number of companies worked,
#### Overtime, Percentage salary hike, etc.
#### There is an Attrition field which states whether an employee has left the organisation or not

#### In total data has 1470 rows and 35 columns

In [None]:

#### Call libraries

import numpy as np ## Numerical arrays (imported by convention; not referenced by the active code in the cells shown)
import pandas as pd ## For reading data files and DataFrame manipulation
import matplotlib.pyplot as plt ## For creating figures/axes and showing plots
import seaborn as sns ## For statistical plots
import os ## For setting directory and environment variables (no active code in the cells shown uses it)


In [None]:
#### Load the HR attrition dataset and preview its first rows

# The path is relative to the notebook's working directory.
attrition_csv = '../input/WA_Fn-UseC_-HR-Employee-Attrition.csv'
hrdata = pd.read_csv(attrition_csv)

# Last expression of the cell, so it is rendered as the cell output.
hrdata.head()

In [None]:
#### View Data Statistics

##### Full summary (column names, dtypes, non-null counts)
hrdata.info()

##### How many columns?
## Printed explicitly: a bare expression in the middle of a cell is never displayed,
## only the last expression of a cell is.
print(hrdata.shape[1])

## Convert Attrition column to numeric
## get_dummies(drop_first=True) drops the first (sorted) category and keeps an indicator
## for the remaining one -- presumably 'Yes' (employee left); verify against the data.
## .iloc[:, 0] selects that indicator as a Series instead of assigning a whole DataFrame
## to a single column, and astype(int) keeps the flag 0/1 even when get_dummies returns
## booleans (its default dtype since pandas 2.0).
hrdata['Attrition_no'] = pd.get_dummies(hrdata['Attrition'], drop_first=True).iloc[:, 0].astype(int)

## Convert BusinessTravel column to numeric (ordinal code: 0 = no travel ... 2 = frequent travel)
travel_codes = {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}
## infer_objects() restores a numeric dtype on pandas versions where replace() no longer
## downcasts object columns automatically; it leaves the text columns untouched.
hrdata = hrdata.replace({'BusinessTravel': travel_codes}).infer_objects()


In [None]:
## Distribution Plots

#### Distribution plots for Age, Monthly Rate, Number of Years at Company and Percentage Salary Hike


def plot_distribution(column):
    """Draw the seaborn distribution plot of ``hrdata[column]`` and display it."""
    # NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
    # are the replacements); it is kept here so the rendered output does not change.
    sns.distplot(hrdata[column])
    plt.show()


plot_distribution('Age')

### Inference: Age distribution is well spread, with the maximum number of employees around 25-40 years

plot_distribution('MonthlyRate')

plot_distribution('YearsAtCompany')

### Inference: Years-at-company distribution is right-skewed (mass on the left, long right tail):
###            the maximum number of employees have less than ~5-8 years of experience

plot_distribution('PercentSalaryHike')

### Inference: Salary hike distribution is also right-skewed, with 11% to 14% being the most frequent hike

In [None]:
## Count Plots

## The column is passed by keyword (x=..., data=...) in every call below: since
## seaborn 0.12 the first positional argument of countplot is `data`, not `x`, so the
## positional form no longer counts the categories of the column.

#### Let us plot department wise count

sns.countplot(x='Department', data=hrdata)
plt.show()

### Inference: R&D has maximum no of employees

#### Let us view the plot for Education Field wise count

sns.countplot(x='EducationField', data=hrdata)
plt.show()

### Inference: Life sciences is the most common education field, followed by medical

#### Let us plot attrition count for given data

sns.countplot(x='Attrition', data=hrdata)
plt.show()

### Inference: Attrition count is around ~200 employees



In [None]:

## Bar Plots

#### Attrition rate (bar height = mean of the Attrition_no indicator) by Department,
#### Marital Status, Gender and Business Travel, drawn as a 2x2 grid of bar plots

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
grouping_columns = ['Department', 'MaritalStatus', 'Gender', 'BusinessTravel']
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for panel, column in zip(ax.flat, grouping_columns):
    sns.barplot(x=column, y='Attrition_no', data=hrdata, ax=panel)
plt.show()

### Inference: Attrition is high in Sales followed by HR.
###            Singles have maximum attrition rate compared to Married and Divorced
###            Males have a higher attrition rate than females
###            Attrition rate is higher for frequent business travellers (Sales travel more frequently)

#### Attrition rate per job role, as horizontal bars on a tall figure
fig, ax = plt.subplots(figsize=(10, 15))
sns.barplot(x='Attrition_no', y='JobRole', data=hrdata, orient='h', ax=ax)
plt.show()

### Inference: Attrition is high for Sales representative role, followed by Lab technician and HR

In [None]:
#### Mean monthly income per gender
fig, ax = plt.subplots(figsize=(5, 5))
# Draw on the axes created above (explicit instead of relying on the current axes).
sns.barplot(data=hrdata, x='Gender', y='MonthlyIncome', ax=ax)
plt.show()

In [None]:
#### Mean monthly income per job role (horizontal bars)
sns.barplot(
    data=hrdata,
    x='MonthlyIncome',
    y='JobRole',
    orient='h',
)
plt.show()

### Inference: Monthly income is highest for Manager, followed by Research Director

In [None]:
## Box Plots

## x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
## so boxplot(series_x, series_y) raises a TypeError there.

#### Let us see relation between Attrition and Job satisfaction score

sns.boxplot(x='JobSatisfaction', y='Attrition', data=hrdata)
plt.show()

### Inference: Attrition is high when job satisfaction score is less than 3

#### Let us see relation between Attrition and Distance from home

sns.boxplot(x='DistanceFromHome', y='Attrition', data=hrdata)
plt.show()

### Inference: If distance from home to office is more it does lead to attrition

In [None]:
## Scatter Plots

## x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
## so jointplot(series_x, series_y, ...) raises a TypeError there.

#### Let us see scatter plot between Age and Education level

sns.jointplot(x='Age', y='Education', data=hrdata, kind="scatter")
plt.show()

#### Let us see Age against Monthly Income with a fitted regression line

sns.jointplot(x='Age', y='MonthlyIncome', data=hrdata, kind="reg")
plt.show()

In [None]:
## Pair Plots

#### Pair plot of tenure-related factors (number of companies worked, years at company,
#### total working years, years with current manager, years in current role), coloured by Attrition
hrdata_col1 = [
    'NumCompaniesWorked',
    'YearsAtCompany',
    'TotalWorkingYears',
    'YearsWithCurrManager',
    'YearsInCurrentRole',
    'Attrition',
]
# Attrition is kept in the subset only so it can be used as the hue variable.
tenure_subset = hrdata[hrdata_col1]
sns.pairplot(tenure_subset, hue='Attrition', kind="reg", diag_kind="kde")
plt.show()



In [None]:
#### Pair plot of distance from home, business travel, environment satisfaction and job level,
#### coloured by Attrition
hrdata_col2 = [
    'DistanceFromHome',
    'BusinessTravel',   # numeric here: recoded to 0/1/2 in the earlier encoding cell
    'EnvironmentSatisfaction',
    'JobLevel',
    'Attrition',
]
# Attrition is kept in the subset only so it can be used as the hue variable.
conditions_subset = hrdata[hrdata_col2]
sns.pairplot(conditions_subset, hue='Attrition', kind="reg", diag_kind="kde")
plt.show()



In [None]:
#### Pair plot of appraisal-related factors (monthly income, percentage salary hike, performance
#### rating, years since last promotion, stock option level), coloured by Attrition
hrdata_col3 = [
    'MonthlyIncome',
    'PercentSalaryHike',
    'PerformanceRating',
    'YearsSinceLastPromotion',
    'StockOptionLevel',
    'Attrition',
]
# Attrition is kept in the subset only so it can be used as the hue variable.
appraisal_subset = hrdata[hrdata_col3]
sns.pairplot(appraisal_subset, hue='Attrition', kind="reg", diag_kind="kde")
plt.show()

In [None]:
#### Let us plot heatmap for correlation between the numeric attributes of employee data

f,ax = plt.subplots(figsize=(20, 20))
# Keep only numeric/boolean columns before correlating: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead. select_dtypes
# works on old and new pandas alike (corr's numeric_only argument only exists from 1.5).
corr = hrdata.select_dtypes(include=['number', 'bool']).corr()
# annot/fmt print each coefficient rounded to one decimal inside its cell.
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,annot=True, linewidths=.5, fmt= '.1f',ax=ax)
plt.show()

In [None]:
#### Correlation heatmap restricted to key variables chosen from the full heatmap
key_columns = [
    'Age',
    'MonthlyIncome',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'TotalWorkingYears',
    'JobLevel',
]
hrdata_1 = hrdata[key_columns]
hcorr = hrdata_1.corr()

f, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above (explicit instead of relying on the current axes).
sns.heatmap(hcorr, xticklabels=hcorr.columns, yticklabels=hcorr.columns, ax=ax)
plt.show()

In [None]:
### Inference:
### Attrition is dependent on key variables like:

#### 1. Business travel (Frequent -> High chances of attrition)
#### 2. Years at Company (Less -> High chances of attrition)
#### 3. Years Since Last Promotion (Less -> High chances of attrition)
#### 4. Years with current manager (Less -> High chances of attrition)
#### 5. Job Role (Sales -> High chances of attrition)
#### 6. Distance from home (More -> High chances of attrition)