In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from matplotlib.pyplot import figure

In [None]:
# Load the HR attrition CSV from the Kaggle input directory into the raw frame.
df = pd.read_csv(
    '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)

In [None]:
# True if at least one cell anywhere in the raw frame is missing.
df.isna().to_numpy().any()

In [None]:
# Print column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) using pandas' default column selection.
df.describe()

In [None]:
# Working copy without four columns that the analysis does not use.
# NOTE(review): presumably these are constant or pure identifier columns — confirm against df.describe().
df_new = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
)

In [None]:
# Rows of df_new for employees with Attrition == 'Yes'. The mask is built on df,
# which shares its index with df_new (df_new is df with columns dropped).
attrition = df_new.loc[df['Attrition'] == 'Yes']
print(attrition.shape[0])

Employees roughly 25-35 years of age exhibit the most slack in performance (i.e., the highest attrition).

In [None]:
# Number of employees who left, per business-travel category.
sns.countplot(data=attrition, x='BusinessTravel')

In [None]:
# Percentage share of each BusinessTravel category: all employees first,
# then only the employees who left.
print("Complete data")
print(df_new['BusinessTravel'].value_counts(normalize=True).mul(100))
print("\nBusinessTravel categorical percentage when attrition=Yes")
print(attrition['BusinessTravel'].value_counts(normalize=True).mul(100))

It can be seen from the above calculation that about 30% of the employees who show attrition travel frequently.

In [None]:
# Number of employees who left, per department (absolute counts).
sns.countplot(data=attrition, x='Department')

Research and Development shows the maximum attrition level followed by Sales followed by Human Resources

In [None]:
# Distinct job roles present in the data.
df_new['JobRole'].unique()

In [None]:
# Wide canvas so the job-role tick labels do not overlap.
figure(figsize=(20, 4))
# Number of employees who left, per job role.
sns.countplot(data=attrition, x='JobRole')

In [None]:
# Print the department of the first row found for each of three job roles.
# NOTE(review): only the first match is inspected; this assumes each role
# belongs to a single department — TODO confirm.
for role in ('Laboratory Technician', 'Sales Executive', 'Research Scientist'):
    print(df.loc[df['JobRole'] == role, 'Department'].iloc[0])

The maximum attrition is seen among laboratory technicians and research scientists from the Research and Development department, and among sales executives and sales representatives from Sales. This is consistent with the department graph above, which showed that the highest attrition was in these departments.


As seen in the graph above, there is no regular trend in the impact of commuting on the performance.

In [None]:
figure(figsize=(15, 4))
# Daily rate of each employee who left, grouped by commuting distance;
# jitter spreads overlapping points horizontally.
sns.stripplot(
    data=attrition,
    x="DistanceFromHome",
    y="DailyRate",
    jitter=True,
)

More employees live closer to the workplace, and they show a tendency towards higher daily rates than employees living farther away.

In [None]:
# Number of employees who left, per environment-satisfaction level.
sns.countplot(data=attrition, x='EnvironmentSatisfaction')

Lesser environment satisfaction results in higher attrition in the lower and higher ranges. The mid range [2-3] shows an opposite trend.

In [None]:
# Head count per gender over the full data, split by attrition status.
sns.countplot(data=df_new, x='Gender', hue='Attrition')

Attrition is more prevalent among men than among women.

In [None]:
# Density of JobInvolvement among employees who left.
# sns.distplot is deprecated in seaborn; kdeplot draws the same density curve
# that distplot(..., hist=False) produced.
sns.kdeplot(attrition['JobInvolvement'])

In [None]:
# Job satisfaction against job level for employees who left
# (lineplot aggregates repeated x values).
sns.lineplot(data=attrition, x='JobLevel', y='JobSatisfaction')

In [None]:
# Density of JobLevel among employees who left.
# sns.distplot is deprecated in seaborn; kdeplot draws the same density curve
# that distplot(..., hist=False) produced.
sns.kdeplot(attrition['JobLevel'])

The higher the job level, the lower the rate of attrition among employees.

In [None]:
# Density of JobSatisfaction among employees who left.
# sns.distplot is deprecated in seaborn; kdeplot draws the same density curve
# that distplot(..., hist=False) produced.
sns.kdeplot(attrition['JobSatisfaction'])

Employees with JobInvolvement level between 2 and 3 show higher level of attrition. 
No such general trend pertaining to JobSatisfaction is seen.

In [None]:
# Age density of employees who left.
# sns.distplot is deprecated in seaborn; kdeplot draws the same density curve
# that distplot(..., hist=False) produced.
sns.kdeplot(attrition['Age'])

In [None]:
figure(figsize=(10, 4))
# Employees who left AND worked overtime.
overtime = attrition.loc[attrition['OverTime'] == 'Yes']
# Their head count per age.
sns.countplot(data=overtime, x='Age')

Employees in the age range of 26-35 approximately working overtime show maximum attrition.
It can therefore be concluded that attrition is predominant in this age range due to overtime work.

In [None]:
figure(figsize=(15, 4))
# Density curve of MonthlyRate for employees who left ...
sns.kdeplot(attrition['MonthlyRate'])
# ... with a rug of the individual observations along the axis.
sns.rugplot(attrition['MonthlyRate'])

The monthly rate ranging from 7000-23000 shows higher attrition rate.

In [None]:
# Employees who left with job satisfaction of 3 or more ...
edjob = df_new.loc[df['JobSatisfaction'].ge(3) & df['Attrition'].eq('Yes')]
print(len(edjob))
# ... versus employees who left with job satisfaction below 3.
edjob = df_new.loc[df['JobSatisfaction'].lt(3) & df['Attrition'].eq('Yes')]
print(len(edjob))

We see that, despite higher job satisfaction, the number of employees with poor performance is almost equal to the number of employees with lower satisfaction. Therefore, the JobSatisfaction parameter does not govern performance.

In [None]:
# Number of employees who left, per work-life-balance level.
sns.countplot(data=attrition, x='WorkLifeBalance')

A surprising finding is that, on a scale of 1-4, employees with a work-life balance of 2 and 3 show higher attrition.


In [None]:
# Head count per marital status, split by attrition status.
# The full frame is plotted because the `attrition` subset only contains
# Attrition == 'Yes', which makes a hue split on Attrition degenerate (one level).
sns.countplot(data=df_new, x='MaritalStatus', hue='Attrition')

From the graph we see that single employees suffer from attrition more than married and divorced employees.

In [None]:
# Count of employees who left and worked overtime, per marital status.
for status in ("Single", "Married", "Divorced"):
    worked_overtime = (attrition.MaritalStatus == status) & (attrition.OverTime == 'Yes')
    print(status, len(attrition.loc[worked_overtime]))

This count tallies with the attrition level associated with single employees. Their overtime leads to degraded performance.

In [None]:
# Peek at the two tenure columns used to derive past experience.
df_new.loc[:, ['TotalWorkingYears', 'YearsAtCompany']].head()

In [None]:
# Years worked before joining this company: total career years minus years here.
df_new['PastExperience'] = df_new['TotalWorkingYears'].sub(df_new['YearsAtCompany'])

In [None]:
# Preview the first rows of the working frame, which now carries the derived PastExperience column.
df_new.head()

In [None]:
# Among employees with prior experience (PastExperience > 0):
# how many left, then how many stayed.
print(len(df_new.loc[df_new['PastExperience'].gt(0) & df_new['Attrition'].eq('Yes')]))
print(len(df_new.loc[df_new['PastExperience'].gt(0) & df_new['Attrition'].eq('No')]))

We see that employees who have past experience do not generally slack in performance.


In [None]:
figure(figsize=(10, 4))
# Number of employees who left, per number of years spent at the company.
sns.countplot(data=attrition, x='YearsAtCompany')

The more years an employee has spent at this company, the better the performance.

In [None]:
# Number of employees who left, per number of years in the current role.
sns.countplot(data=attrition, x='YearsInCurrentRole')

On average, fewer years in the current job role account for a higher level of performance drop.

In [None]:
# Density of RelationshipSatisfaction among employees who left.
# sns.distplot is deprecated in seaborn; kdeplot draws the same density curve
# that distplot(..., hist=False) produced.
sns.kdeplot(attrition['RelationshipSatisfaction'])

A higher relationship satisfaction score is associated with more attrition among employees.


In [None]:
# Relationship satisfaction against years with the current manager, for employees who left.
sns.lineplot(data=attrition, x='YearsWithCurrManager', y='RelationshipSatisfaction')

There is no uniform trend between the number of years an employee has spent under the current manager and the employee's relationship satisfaction.

In [None]:
# Number of employees with and without overtime, over the whole data set.
# Summing the boolean mask counts the matching rows; len() of the mask would
# return the total row count regardless of the condition.
print((df_new['OverTime'] == 'Yes').sum())
print((df_new['OverTime'] == 'No').sum())

# The same two counts restricted to employees who left.
print((attrition['OverTime'] == 'Yes').sum())
print((attrition['OverTime'] == 'No').sum())

In [None]:
# Distribution of job involvement for employees who left, with vs. without overtime.
sns.boxplot(data=attrition, x='OverTime', y='JobInvolvement')

No conclusion can be drawn between the level of job involvement and overtime factor.

In [None]:
# Number of employees who left, per count of trainings attended last year.
sns.countplot(data=attrition, x='TrainingTimesLastYear')

The number of employees suffering from attrition reduces drastically as the number of training times last year increases from 2 onwards.

In [None]:
# Job level against education level over the full data.
sns.lineplot(data=df_new, x='Education', y='JobLevel')

A linear relationship exists between education and job level.

In [None]:
# Number of employees who left, per stock-option level.
sns.countplot(data=attrition, x='StockOptionLevel')

The lower the stock option level, the greater the slack in performance.

In [None]:
# Monthly income of every employee, grouped by attrition status.
sns.stripplot(data=df_new, x='Attrition', y='MonthlyIncome')

It is seen that out of a small section of employees that suffer from attrition, employees with salaries below 10,000 show maximum attrition. 

In [None]:
figure(figsize=(10, 4))
# Number of employees who left, per years elapsed since their last promotion.
sns.countplot(data=attrition, x='YearsSinceLastPromotion')

It is observed that employees with less frequent promotions fare better.

In [None]:
# Number of employees who left, per performance rating.
sns.countplot(data=attrition, x='PerformanceRating')

Employees with performance rating 3 out of 4 show degraded performance compared to employees who have full rating.

In [None]:
# Number of employees who left, per percentage salary hike.
sns.countplot(data=attrition, x='PercentSalaryHike')

The above graph shows that a greater salary hike leads to better performance compared to a poor salary hike. This shows that salary hikes and higher appreciation (performance rating) improve employee performance.

In [None]:
# Ratio of prior experience (years) to the number of companies worked for.
# NOTE(review): as computed, this is years of prior experience per company, so a
# larger value means LESS frequent job changes — confirm the intended interpretation.
df_new['JobChangeRate'] = df_new['PastExperience'] / df_new['NumCompaniesWorked']

# Dividing by NumCompaniesWorked == 0 yields inf (or NaN for 0/0).
# Cap inf at the largest finite ratio; max() skips NaN, and NaN rows are left as NaN.
m = df_new.loc[df_new['JobChangeRate'] != np.inf, 'JobChangeRate'].max()
print(m)

# Assign the result back: an inplace replace on df_new['JobChangeRate'] acts on a
# temporary Series and does not update the frame under pandas copy-on-write.
df_new['JobChangeRate'] = df_new['JobChangeRate'].replace(np.inf, m)
df_new['JobChangeRate']

The employees suffering from attrition are dominated by lower job change rates implying that more varying experience (higher job change rate) helps improve performance.

In [None]:
figure(figsize=(8, 4))
# Re-select the employees who left from df_new so the derived JobChangeRate column is present.
a = df_new.loc[df_new.Attrition == 'Yes']
# sns.distplot is deprecated in seaborn; histplot is the replacement for
# distplot(..., kde=False) and draws a plain histogram.
sns.histplot(a['JobChangeRate'])

As the rate of job change increases performance improves (lesser attrition). 

In [None]:
# Percentage salary hike against job-involvement level over the full data.
sns.lineplot(data=df_new, x='JobInvolvement', y='PercentSalaryHike')

The percentage salary hike dips as the job involvement level increases up to 3 and then rises slightly.

In [None]:
# Percentage salary hike against performance rating over the full data.
sns.lineplot(data=df_new, x='PerformanceRating', y='PercentSalaryHike')

A linear relationship is observed between performance rating and salary hike.