In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![turnover](https://blog.bonus.ly/hubfs/employee-turnover.png)

# HR analysis
**HR analytics** is the process of collecting and analyzing Human Resource (**HR**) data in order to improve an organization's workforce performance. ... **HR analytics** provides data-backed insight on what is working well and what is not so that organizations can make improvements and plan more effectively for the future.

### Introduction :
Why are our best and most experienced employees leaving prematurely? Have fun with this database and try to predict which valuable employees will leave next. Fields in the dataset include:

- Satisfaction level (satisfaction_level)
- Score of the last performance review (last_evaluation)
- Number of projects done by the employee (number_project)
- Average working hours per month (average_monthly_hours)
- Years spent at the company (time_spend_company)
- Whether the employee had a work accident (Work_accident)
- Whether the employee was promoted in the last five years (promotion_last_5years)
- Staff department (department)
- Salary level (salary)
- Whether the employee left the company (left)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
# Load the HR dataset.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped
# and a warning is emitted for each of them.
df = pd.read_csv('../input/hr-analytics-dataset/HR.csv', on_bad_lines='warn')
# Column names, dtypes and non-null counts; reveals columns with missing values.
df.info()

In [None]:
# Selecting a single column from a DataFrame yields a pandas Series
type(df['satisfaction_level'])

In [None]:
# Summary statistics of the numeric columns: count, mean, standard deviation,
# minimum, lower quartile, median (second quartile), upper quartile, maximum
df.describe()

In [None]:
# Mode: the most frequent value of every column
# (several rows are returned when a column has tied modes)
df.mode()

In [None]:
# Sample variance (ddof=1) of every numeric column.
# The frame also holds text columns (salary, department); pandas >= 2.0 raises
# a TypeError for them instead of silently skipping, so restrict the reduction
# to numeric columns explicitly.
df.var(numeric_only=True)

In [None]:
# Skewness coefficient: negative means a longer left tail, positive a longer right tail
df['satisfaction_level'].skew()

**The skewness coefficient is less than 0, indicating negative skew: the long tail is on the left and most of the values are greater than the mean. In other words, most people are satisfied.**

In [None]:
# Kurtosis coefficient. pandas reports excess kurtosis (Fisher's definition),
# so a normal distribution scores 0.
df['satisfaction_level'].kurt()

**The kurtosis coefficient is less than 0, indicating that the distribution is flatter (less peaked, with lighter tails) than the normal distribution.**

In [None]:
# Moments of the standard normal distribution.
# 'mvsk' requests mean, variance, skewness and kurtosis, in that order.
stats.norm.stats(moments='mvsk')

In [None]:
# Probability density of the standard normal distribution at x = 0
stats.norm.pdf(0.0)

In [None]:
# Percent point function (inverse of the CDF): the value x below which the
# given cumulative probability (here 0.5) lies
stats.norm.ppf(0.5)

In [None]:
# Cumulative distribution function: probability accumulated from negative
# infinity up to x = 0
stats.norm.cdf(0)

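**The two functions above are inverses of each other: `ppf` maps a cumulative probability to a value, and `cdf` maps a value back to its cumulative probability.**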

In [None]:
# Draw 10 random numbers from the standard normal distribution.
# A fixed random_state keeps the output identical after "Restart & Run All".
normal_draws = stats.norm.rvs(size=10, random_state=42)
normal_draws

In [None]:
# Chi-square distribution with 1 degree of freedom:
# mean, variance, skewness and kurtosis
stats.chi2.stats(moments='mvsk', df=1)

In [None]:
# Student's t distribution with 100 degrees of freedom:
# mean, variance, skewness and kurtosis
stats.t.stats(moments='mvsk',df=100)

In [None]:
# F distribution with 1 numerator and 100 denominator degrees of freedom:
# mean, variance, skewness and kurtosis
stats.f.stats(moments='mvsk', dfn=1, dfd=100)

In [None]:
# Draw 10 random rows. The fixed random_state makes the sample reproducible,
# so the displayed rows do not change between runs.
df.sample(n=10, random_state=42)

In [None]:
# Draw a random fraction of the rows (0.0005 = 0.05% of the frame).
# The fixed random_state makes the sample reproducible between runs.
df.sample(frac=0.0005, random_state=42)

### Perform outlier analysis and distribution analysis on the satisfaction_level column

In [None]:
# Boolean mask marking which satisfaction_level entries are missing (NaN)
sl = df['satisfaction_level']
sl.isnull()

In [None]:
# Show only the missing satisfaction_level entries
sl[sl.isnull()]

In [None]:
# Show the complete rows whose satisfaction_level is missing
df[df['satisfaction_level'].isnull()]

In [None]:
# Drop the missing values (dropna, not fillna), then confirm that none remain:
# the second expression should display an empty Series
sl = sl.dropna()
sl[sl.isnull()]

**Missing values removed.**

In [None]:
# Mean satisfaction level
sl.mean()

In [None]:
# Largest satisfaction level
sl.max()

In [None]:
# Smallest satisfaction level
sl.min()

In [None]:
# Sample variance of the satisfaction level (pandas default ddof=1)
sl.var()

In [None]:
# Median (second quartile) of the satisfaction level
sl.median()

In [None]:
# Lower quartile (25th percentile)
sl.quantile(q=0.25)

In [None]:
# Upper quartile (75th percentile)
sl.quantile(q=0.75)

**The statistics above show that the values still look reasonable after the missing values were removed.**

In [None]:
# Skewness of the satisfaction level
sl.skew()

In [None]:
# Excess kurtosis of the satisfaction level (0 for a normal distribution)
sl.kurt()

In [None]:
# Histogram: split the continuous values into 10 bins of width 0.1 over [0, 1].
# Returns a pair (counts per bin, bin edges).
np.histogram(sl.values, bins=np.arange(0.0, 1.1, 0.1))

**Most of the values are concentrated in the upper half of the range, which agrees with the skewness coefficient being less than 0.**

### Perform outlier analysis on the last_evaluation column

In [None]:
# Check the last_evaluation column for missing values
le = df['last_evaluation']
le[le.isnull()]

In [None]:
# Mean of the raw last_evaluation column (nothing filtered yet)
le.mean()

In [None]:
# Sample standard deviation (ddof=1) of the raw column
le.std()

In [None]:
# Median of the raw column; unlike the mean it is barely affected by extreme values
le.median()

In [None]:
# Maximum of the raw column: a quick check for implausibly large values
le.max()

It is concluded from the analysis results that there must be an abnormally large value that has affected all the data.

In [None]:
# Select the evaluation scores greater than 1, which this analysis treats as outliers
le[le>1]

**Only one value deviates from normal.**

In [None]:
# Keep only scores of at most 1. NaN compares False, so missing values are
# dropped by this filter as well.
le = le[le<=1]
le

### Another way to remove outliers: the interquartile-range (IQR) rule based on the upper and lower quartiles

In [None]:
# IQR rule (Tukey's fences): keep the values strictly inside
# (Q1 - k * IQR, Q3 + k * IQR) and discard everything else.
le_s = df['last_evaluation']
q_low = le_s.quantile(q=0.25)
q_high = le_s.quantile(q=0.75)
q_interval = q_high - q_low
# k = 1.5, 2 and 3 all give the same result here, because the outlier lies far
# outside every one of those fences
k = 1.5
# Build one combined mask. Chaining two [] filters would apply a mask computed
# on the unfiltered series to an already-filtered one, which only works through
# implicit index realignment.
inside_fences = (le_s > q_low - k * q_interval) & (le_s < q_high + k * q_interval)
le_s = le_s[inside_fences]
le_s

**The outlier in row 15000 is removed by this method as well.**

In [None]:
# Number of values kept by the IQR rule
len(le_s)

In [None]:
# Number of values kept by the `<= 1` threshold, for comparison with the IQR rule
len(le)

In [None]:
# Histogram of the filtered scores over 10 bins of width 0.1 on [0, 1]
np.histogram(le.values, bins=np.arange(0.0, 1.1, 0.1))

In [None]:
# Mean of last_evaluation after removing scores above 1
le.mean()

In [None]:
# Sample standard deviation after removing scores above 1
le.std()

In [None]:
# Median after removing scores above 1
le.median()

In [None]:
# Maximum after filtering; by construction it cannot exceed 1
le.max()

In [None]:
# Skewness after removing scores above 1
le.skew()

In [None]:
# Excess kurtosis after removing scores above 1
le.kurt()

**After removing the outlier, all of the statistics look reasonable.**

### Analysis of static structure of number_project column

In [None]:
# Check the number_project column for missing values
np_s = df['number_project']
np_s[np_s.isnull()]

In [None]:
# Mean number of projects per employee
np_s.mean()

In [None]:
# Sample standard deviation of the number of projects
np_s.std()

In [None]:
# Median number of projects
np_s.median()

In [None]:
# Largest number of projects
np_s.max()

In [None]:
# Smallest number of projects
np_s.min()

In [None]:
# Skewness of the number of projects
np_s.skew()

The skewness coefficient is positive, indicating that most employees have fewer projects than the average.

In [None]:
# Excess kurtosis of the number of projects
np_s.kurt()

In [None]:
# number_project is discrete, so count how many employees have each value;
# sort_index orders the result by project count rather than by frequency
np_s.value_counts().sort_index()

In [None]:
# Same counts expressed as proportions of the total (normalize=True)
np_s.value_counts(normalize=True).sort_index()

### Analyze the average_monthly_hours column

In [None]:
# Mean of the average monthly working hours
amh = df['average_monthly_hours']
amh.mean()

In [None]:
# Sample standard deviation of the monthly hours
amh.std()

In [None]:
# Largest monthly-hours value
amh.max()

In [None]:
# Smallest monthly-hours value
amh.min()

In [None]:
# Skewness of the monthly hours
amh.skew()

In [None]:
# Excess kurtosis of the monthly hours
amh.kurt()

In [None]:
# IQR rule on the monthly hours: keep the values strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
# Always start from the raw column so that re-running this cell gives the same
# result instead of filtering an already-filtered series again.
amh = df['average_monthly_hours']
q_low = amh.quantile(0.25)
q_high = amh.quantile(0.75)
q_inter = q_high - q_low
# One combined mask instead of two chained [] filters, so both conditions are
# evaluated against the same, aligned series.
amh = amh[(amh > q_low - 1.5 * q_inter) & (amh < q_high + 1.5 * q_inter)]
len(amh)

In [None]:
# Histogram whose bins start at the minimum and are 10 hours wide; the upper
# bound is max + 10 so that the last edge is not below the maximum
np.histogram(amh.values, bins=np.arange(amh.min(), amh.max()+10, 10))

In [None]:
# Same binning done by pandas; sort_index orders the intervals from low to high
amh.value_counts(bins=np.arange(amh.min(), amh.max()+10, 10)).sort_index()

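Both cells compute histogram counts, but they treat the bin edges differently. `np.histogram` uses left-closed, right-open bins `[a, b)` (only the last bin also includes its right edge), whereas `value_counts(bins=...)` uses left-open, right-closed bins `(a, b]` (with the lowest edge included in the first bin). Values lying exactly on an edge are therefore counted in different bins.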

### Analyze the column of time_spend_company

In [None]:
# Number of employees for each time_spend_company value, ordered by that value
tsc = df['time_spend_company']
tsc.value_counts().sort_index()

**Most employees work for about three years.**

### Analysis of the Work_accident column

In [None]:
# Proportion of each Work_accident value
wa = df['Work_accident']
wa.value_counts(normalize=True).sort_index()

**Most people have no accidents during their work, and the accident rate is 0.144**

### Analyze the left column

In [None]:
# Number of rows for each value of the `left` column
l = df['left']
l.value_counts().sort_index()

### Analyze the promotion_last_5years column

In [None]:
# Number of rows for each promotion_last_5years value, most frequent first
pl5 = df['promotion_last_5years']
pl5.value_counts()

Most people did not get a promotion in the last five years; only a small share of employees were promoted.

### Analysis of salary column

In [None]:
# Number of rows per salary level; unexpected labels show up as extra categories
sa = df['salary']
sa.value_counts()

**There is a row of outliers.**

In [None]:
# Salary levels without the invalid 'nme' label: filter with a boolean mask,
# then drop whatever missing values remain.
sa[sa != 'nme'].dropna()

In [None]:
# Share of each valid salary level (invalid 'nme' label and missing values excluded)
valid_salary = sa[sa != 'nme'].dropna()
valid_salary.value_counts(normalize=True)

**Low salaries still account for the largest share.**

### Analyze the department column

In [None]:
# Share of employees per department, largest first
dep = df['department']
dep.value_counts(normalize=True)

**Here `sale` is presumably a typo for `sales`; it is treated as an outlier and simply removed.**

In [None]:
# Remove the invalid 'sale' label with a boolean mask, drop the remaining
# missing values, then recompute the department shares.
dep = dep[dep != 'sale'].dropna()
dep.value_counts(normalize=True)

### Aggregate all data

In [None]:
# Drop every row that has a missing value in any column, then inspect the
# last rows of the result
df = df.dropna(axis=0, how='any')
df.tail()

In [None]:
# Data filtering: remove the outlier rows found in the per-column analysis.
# - last_evaluation: keep scores <= 1, matching the `le <= 1` rule used in the
#   column analysis; a strict `< 1` would also discard legitimate scores of
#   exactly 1.
# - salary: drop the invalid 'nme' label.
# - department: drop the invalid 'sale' label.
# The three conditions are combined into one mask. Chained [] filters would
# apply masks built on the unfiltered frame to an already-filtered frame and
# trigger pandas' "Boolean Series key will be reindexed" warning.
valid_rows = (
    (df['last_evaluation'] <= 1)
    & (df['salary'] != 'nme')
    & (df['department'] != 'sale')
)
df = df[valid_rows]

In [None]:
# Mean of every numeric column per department.
# numeric_only=True is needed because the text column `salary` cannot be
# averaged; pandas >= 2.0 raises a TypeError instead of dropping it silently.
df.groupby('department').mean(numeric_only=True)

In [None]:
# Mean last_evaluation score per department, computed on a two-column slice
evaluation_by_department = df[['last_evaluation', 'department']]
evaluation_by_department.groupby('department').mean()

### Custom aggregate function

In [None]:
def value_range(group):
    """Return the spread of a group: its largest value minus its smallest."""
    return group.max() - group.min()


# Custom aggregation: range of the monthly hours within each department
df.groupby('department')['average_monthly_hours'].apply(value_range)

## Visualization

In [None]:
# Bar graph: number of employees per salary level.

# Seaborn style, context (font scaling) and palette are read when the axes are
# created, so they must be set BEFORE the first plotting call. Setting them
# after plt.title() would leave this figure in the default style.
sns.set_style(style='darkgrid')
sns.set_context(context='poster', font_scale=0.6)
sns.set_palette('summer')

# Compute the counts once and reuse them for bars, ticks and labels
salary_counts = df['salary'].value_counts()
positions = np.arange(len(salary_counts))

fig, ax = plt.subplots()
ax.set_title('SALARY')
ax.set_xlabel('salary level')
ax.set_ylabel('Number')
ax.bar(positions, salary_counts.values, width=0.5)
# Label every bar position with its salary level
ax.set_xticks(positions)
ax.set_xticklabels(salary_counts.index)
# Axis ranges derived from the data instead of hard-coded limits, leaving
# head-room above the tallest bar for its label
ax.set_xlim(-0.5, len(salary_counts) - 0.5)
ax.set_ylim(0, salary_counts.max() * 1.2)
# Write the count above every bar
label_offset = salary_counts.max() * 0.01
for x, y in zip(positions, salary_counts.values):
    ax.text(x, y + label_offset, y, ha='center', va='bottom')
plt.show()

In [None]:
# Histograms with a kernel density estimate for three continuous columns.
# sns.distplot is deprecated and has been removed from recent seaborn releases.
# histplot(..., kde=True, stat='density') draws the equivalent figure
# (density-normalised bars with a KDE curve on top).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
columns = ['satisfaction_level', 'last_evaluation', 'average_monthly_hours']
for ax, column in zip(axes, columns):
    sns.histplot(df[column], bins=10, kde=True, stat='density', ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the years spent at the company.
# whis=1.5 (the default): each whisker extends to the farthest data point that
# lies within 1.5 x IQR of the lower / upper quartile; points beyond the
# whiskers are drawn individually as outliers.
sns.boxplot(y=df['time_spend_company'], saturation=0.75, whis=1.5)

In [None]:
# Line chart: turnover rate (mean of `left`) for each number of years at the company.
# numeric_only=True skips the text columns, which cannot be averaged
# (pandas >= 2.0 raises a TypeError otherwise).
sub_df = df.groupby('time_spend_company').mean(numeric_only=True)
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# as positional arguments.
sns.pointplot(x=sub_df.index, y=sub_df['left'])
plt.show()

**It can be seen that the turnover rate is highest for employees who have been with the company for about five years.**

In [None]:
# Same curve built directly from the raw rows: pointplot aggregates `left`
# with its default estimator (the mean) for every time_spend_company value.
# Because each x value now has many observations, error bars are drawn as well.
sns.pointplot(x='time_spend_company', y='left', data=df)

In [None]:
# Pie chart: share of employees per department.
# Compute the shares once; labels and values then come from the same Series
# and are guaranteed to be in the same order.
value = df['department'].value_counts(normalize=True)
label = value.index
# Request one colour per slice. The default "Reds" palette has only 6 colours,
# so with more departments matplotlib would cycle and repeat colours.
color = sns.color_palette("Reds", n_colors=len(label))
# Highlight the 'sales' slice by pulling it out of the pie
explodes = [0.1 if i == 'sales' else 0 for i in label]
plt.pie(value, labels=label, autopct="%1.1f%%", colors=color, explode=explodes)
plt.show()

**To the End.**

![greetings](https://www.animatedimages.org/data/media/466/animated-thank-you-image-0111.gif)