In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
!pip install dython
import dython
from dython.nominal import associations
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> Download employee retention dataset from here:
> https://www.kaggle.com/giripujar/hr-analytics

In [None]:
# Load the HR analytics CSV into a pandas DataFrame named `data`.
csv_path = '/kaggle/input/hr-analytics/HR_comma_sep.csv'
data = pd.read_csv(csv_path)

> Now do some exploratory data analysis to figure out which variables have a direct and clear
> impact on employee retention (i.e., whether they leave the company or continue to work)

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
data.head(n=5)

In [None]:
# Summary statistics for the dataset, shown as the cell's rich output.
data.describe()

In [None]:
# Print basic info about the dataset (column names, dtypes, non-null counts).
# `DataFrame.info()` writes the report itself and returns None, so it must not be
# wrapped in `print()` -- doing so appends a stray "None" to the output.
data.info()
# no null values are present in this dataset

In [None]:
# Dataset dimensions as (rows, columns); the recorded run had 14999 rows and 10 columns.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# Build an association matrix / heatmap across every column of the dataset,
# so the relationship of each variable with `left` can be read off one row.
cols = data.columns
associations(data.loc[:, cols], figsize=(10, 10))

**From the correlation matrix generated above, we can list the five variables most strongly associated with an employee leaving by reading the 'left' row of the matrix (names as they appear in the dataset):**
> 1. satisfaction_level
> 2. salary
> 3. Work_accident
> 4. time_spend_company
> 5. Department

> Plot bar charts showing impact of employee salaries on retention

In [None]:
# Count employees for every (left, salary) combination, show the table,
# then draw it as grouped bars: one group per `left` value, one bar per salary band.
emp_data = pd.crosstab(index=data['left'], columns=data['salary'])
print(emp_data)
emp_data.plot.bar()

We are mainly concerned with the employees who left (left = 1), i.e. the second group of bars in the chart above. There we can see that employees with a high salary rarely left, whereas many employees with low and medium salaries did.

> Plot bar charts showing correlation between department and employee retention

In [None]:
# Count stayers (left = 0) and leavers (left = 1) per department, show the table,
# then draw it as grouped bars: one group per department.
dept_data = pd.crosstab(index=data['Department'], columns=data['left'])
print(dept_data)
dept_data.plot.bar()

From the bar chart above we can see that, in absolute numbers, the most employees left from the sales and technical departments, whereas the fewest left from management and R&D.

> Now build logistic regression model using variables that were narrowed down in step 1 

In [None]:
# Make a new dataframe holding the target (`left`) plus the 5 most important
# variables identified earlier.
feature_cols = ['satisfaction_level', 'salary', 'Work_accident', 'time_spend_company', 'Department']
narrowed_data = data.loc[:, ['left'] + feature_cols].copy()

# Salary bands are genuinely ordered, so an ordinal code (low < medium < high) is fine.
# An explicit dict + `.map` is used instead of `.replace(list, list)`: the latter
# relies on implicit object->int downcasting (deprecated in pandas 2.2+) and would
# silently leave any unexpected label in place as a string.
salary_codes = {'low': 1, 'medium': 2, 'high': 3}
unknown_salaries = set(narrowed_data['salary'].unique()) - set(salary_codes)
assert not unknown_salaries, f"unmapped salary labels: {unknown_salaries}"
narrowed_data['salary'] = narrowed_data['salary'].map(salary_codes).astype(int)
print(narrowed_data.head())

# Department is a nominal variable: numbering the departments 1..10 would make the
# logistic regression treat their alphabetical order as a magnitude. One-hot
# encoding gives every department its own coefficient instead.
X = pd.get_dummies(narrowed_data.loc[:, feature_cols], columns=['Department'], dtype=int)
y = narrowed_data.loc[:, 'left']

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test parts; the fixed seed keeps the
# split identical from run to run.
split_parts = train_test_split(X, y, random_state=7)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a liblinear logistic regression on the training split (`fit` returns the
# estimator itself, so construction and fitting can be chained), then predict
# the held-out rows and show the predicted labels.
log = LogisticRegression(solver='liblinear').fit(X_train, y_train)
y_pred = log.predict(X_test)
print(y_pred)

> Measure the accuracy (precision, recall, F1 and ROC) of the model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy on the test split as reported by the estimator itself ...
test_accuracy = log.score(X_test, y_test)
print(test_accuracy)
# ... and recomputed from the stored predictions; left as the final expression
# so the notebook displays it as the cell output.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# ROC curve of the fitted classifier on the held-out split.
# `metrics.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the curve is built from `roc_curve` / `auc` and drawn with matplotlib, which
# works regardless of the installed scikit-learn version.
from sklearn import metrics

# Column 1 of predict_proba is the probability of the second class in
# `log.classes_`, i.e. the positive class (left == 1) for a 0/1 target.
positive_scores = log.predict_proba(X_test)[:, 1]
fpr, tpr, _thresholds = metrics.roc_curve(y_test, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"LogisticRegression (AUC = {roc_auc:.2f})")
# Diagonal reference: the expected curve of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC curve (test set)')
ax.legend(loc='lower right')
plt.show()