The dataset was downloaded from Kaggle: https://www.kaggle.com/giripujar/hr-analytics

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the Kaggle HR export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/hr-analytics/HR_comma_sep.csv')
df.head(n=5)

### Preliminary data exploration

In [None]:
# Number of employees lost by the company (rows flagged left == 1).
int(df['left'].eq(1).sum())

In [None]:
# Number of employees retained by the company (rows flagged left == 0).
int(df['left'].eq(0).sum())

In [None]:
# Per-group averages of the numeric features, split by the `left` flag.
# numeric_only=True skips text columns such as `salary`; since pandas 2.0 the
# default is numeric_only=False, which raises a TypeError on string columns.
df.groupby('left').mean(numeric_only=True)

Based on the group means above, the features `satisfaction_level`, `average_montly_hours` (the column name is spelled this way in the dataset) and `promotion_last_5years` appear to have the strongest relationship with whether an employee is retained.

### Employee retention salary-wise: Stacked Bar Chart

In [None]:
# Tally employees per salary band, separately for leavers and stayers,
# then line the two tallies up side by side in one frame.
df2 = df.loc[:, ['left', 'salary']]
left = df2.loc[df2['left'] == 1, 'salary'].value_counts()
retained = df2.loc[df2['left'] == 0, 'salary'].value_counts()
counts = pd.DataFrame({"retained": retained, "left": left})

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def _draw_stacked_bars(ax, frame, colors):
    """Draw one horizontal bar segment per column of ``frame``, stacked left to right."""
    offset = np.zeros(len(frame))
    for idx, name in enumerate(frame.columns):
        values = frame[name].to_numpy(dtype=float)
        # Cycle through the palette so frames with more columns than colours still plot.
        ax.barh(frame.index, values, left=offset, color=colors[idx % len(colors)])
        offset = offset + values


def stacked_chart(df, title):
    """Plot a stacked bar chart of counts and a second one of row-wise percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per category (index) and one numeric column per group.
        Missing values are treated as zero. The frame is not modified.
    title : str
        Title of the absolute-count chart; the percentage chart uses
        ``"<title> (percentage)"``.

    Returns
    -------
    None
        Two matplotlib figures are created and shown.
    """
    colors = ['#1D2F6F', '#8390FA']
    fields = df.columns.tolist()

    # fillna returns a new frame, so the caller's DataFrame is left untouched.
    # value_counts() yields NaN for categories absent from one group; a NaN would
    # otherwise poison the running bar offsets.
    counts = df.fillna(0)

    # Chart 1: absolute counts.
    fig, ax = plt.subplots(1, figsize=(12, 10))
    _draw_stacked_bars(ax, counts, colors)
    ax.set_title(title, loc='left')
    ax.legend(fields, bbox_to_anchor=(0.55, 1, 0, 0), ncol=4, frameon=False)
    ax.set_xlabel('No of Employees')

    # Chart 2: each row rescaled to sum to 1.
    totals = counts.sum(axis=1)
    # Rows whose total is zero would divide by zero; render them as empty bars.
    shares = counts.div(totals.where(totals != 0), axis=0).fillna(0)

    fig, ax = plt.subplots(1, figsize=(12, 10))
    _draw_stacked_bars(ax, shares, colors)
    ax.set_title('{} (percentage)'.format(title), loc='left')
    ax.legend(fields, bbox_to_anchor=(0.55, 1, 0, 0), ncol=4, frameon=False)

    # Label the 0..1 axis as 0%..100% in steps of ten.
    ax.set_xticks(np.linspace(0, 1, 11))
    ax.set_xticklabels(['{}%'.format(pct) for pct in range(0, 101, 10)])
    plt.show()


In [None]:
# Plot leaver/stayer counts per salary band.
stacked_chart(counts, title="Employee retention: Salary-wise")

The charts above show that the attrition rate is higher at lower salary levels.

### Department-wise employee retention rate

In [None]:
# Keep only the target flag and the department for the per-department breakdown.
df3 = df.loc[:, ['left', 'Department']]

In [None]:
# Distinct department names, in order of first appearance.
depts = df3['Department'].unique().tolist()

In [None]:
# Tally employees per department, separately for leavers and stayers,
# and combine both tallies into a single frame for display and plotting.
left = df3.loc[df3['left'] == 1, 'Department'].value_counts()
retained = df3.loc[df3['left'] == 0, 'Department'].value_counts()
dept_counts = pd.DataFrame({
    "retained": retained,
    "left": left,
})
dept_counts

In [None]:
# Plot leaver/stayer counts per department.
stacked_chart(dept_counts, title="Employee retention: department-wise")

### Logistic Regression model

In [None]:
# Modelling subset: the selected predictors plus the `left` target.
df1 = df.loc[:, [
    'salary',
    'satisfaction_level',
    'average_montly_hours',
    'promotion_last_5years',
    'left',
]]

In [None]:
# Display the modelling subset (salary, satisfaction, hours, promotion flag, target).
df1

In [None]:
# One-hot encode the salary band: one indicator column per distinct value.
dummies = pd.get_dummies(df1['salary'])
dummies

In [None]:
# Append the salary indicator columns to the right of the modelling frame.
df1 = pd.concat(objs=[df1, dummies], axis=1)

In [None]:
# Drop the raw `salary` text column and the `medium` indicator; the remaining
# `high` / `low` indicators are the ones used as model features below.
df1 = df1.drop(columns=['salary', 'medium'])
df1

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 # Hold out 20% of the rows for evaluation. The previous test_size=0.001 kept
 # only 0.1% of the data for testing, far too few rows for model.score to be a
 # meaningful accuracy estimate. Stratifying on the target keeps the
 # left/retained ratio the same in both splits.
 X_train, X_test, y_train, y_test = train_test_split(
     df1[['satisfaction_level', 'average_montly_hours', 'promotion_last_5years', 'high', 'low']],
     df1.left,
     test_size=0.2,
     stratify=df1.left,
     random_state=1,
 )

In [None]:
# Display the held-out feature rows produced by train_test_split.
X_test

In [None]:
from sklearn.linear_model import LogisticRegression
# Instantiate the classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the fitted model on the held-out split.
model.score(X=X_test, y=y_test)