## Importing and understanding the data 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the HR analytics CSV, relative to the notebook.
DATA_PATH = '../input/hr-analytics/HR_comma_sep.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first ten rows of the raw frame.
df.head(n=10)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

## Exploratory Data Analysis

Next, we perform some exploratory data analysis to find out which variables have a direct and clear impact on employee retention (i.e., whether employees leave the company or continue to work there).

In [None]:
# Summary statistics of the `left` column.
df.loc[:, 'left'].describe()

In [None]:
# Row counts for each value of the `left` flag (1 = left, 0 = retained).
n_lost = len(df.loc[df['left'] == 1])
n_retained = len(df.loc[df['left'] == 0])
print("No of employees lost by the company: ", n_lost)
print("No of employees retained by the company: ", n_retained)

In [None]:
# Mean of every numeric attribute for retained (0) vs. left (1) employees.
# numeric_only=True keeps non-numeric columns (e.g. Department, salary) out
# of the aggregation; recent pandas versions raise a TypeError otherwise.
df.groupby('left').mean(numeric_only=True)

In [None]:
import seaborn as sns

# Correlation heatmap over the numeric columns of the full frame.
# numeric_only=True stops non-numeric columns (e.g. Department, salary) from
# reaching corr(), which recent pandas versions refuse to correlate.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.heatmap(
    df.corr(numeric_only=True),
    linewidths=2,
    cmap="plasma",
    annot=True,
    ax=ax,  # draw on the axes created above instead of relying on pyplot state
)

In [None]:
# Subset of attributes (plus the `left` flag) kept for a second heatmap.
numeric_columns = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'left',
]
df1 = df[numeric_columns]

### Heatmap of continuous value attributes only

In [None]:
# Correlation heatmap restricted to the columns held in df1.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
sns.heatmap(data=df1.corr(), annot=True, cmap="plasma", linewidths=2)

### Pairplot with Employee Retention as hue

In [None]:
# Pairwise plots of the frame's variables, coloured by the `left` flag.
sns.pairplot(data=df, hue="left")

We see that the plots of each pair of the following attributes separate the employees who were retained from those who left:
- satisfaction level
- average monthly hours
- last evaluation

## Understanding impact of employee salaries on retention using bar charts

### A. Comparing Employee Retention with Salary

In [None]:
# Head-counts and shares of leavers vs. stayers within each salary band.
df1 = df[['left', 'salary']]
left = df1.loc[df['left'] == 1, 'salary'].value_counts()
retained = df1.loc[df['left'] == 0, 'salary'].value_counts()

# Total employees per salary band, shared by both share computations.
band_total = left + retained
left_percent = left / band_total
retain_percent = retained / band_total

counts1 = pd.DataFrame({
    "retained": retained,
    "left": left,
    "retained_percent": retain_percent,
    "left_percent": left_percent,
})
counts1

In [None]:
# Stacked bars of retained vs. left employees per salary band:
# the left panel shows head-counts, the right panel shows each band's shares.
x = counts1.index

# One 1x2 grid whose axes are drawn on directly. The previous 2x2 grid was
# never used as such: plt.subplot(1, 2, k) placed new axes over it.
f, axs = plt.subplots(1, 2, figsize=(15, 5))

# (retained column, left column, word used in the title and y-label)
panels = [
    ("retained", "left", "Number"),
    ("retained_percent", "left_percent", "Percent"),
]
for ax, (retained_col, left_col, measure) in zip(axs, panels):
    y1 = counts1[retained_col]
    y2 = counts1[left_col]
    ax.bar(x, y1, color='r')
    # Stack the "left" bar on top of the "retained" bar.
    ax.bar(x, y2, bottom=y1, color='b')
    ax.set_title(f"{measure} of Employee Retention grouped by Salary")
    ax.set_xlabel("Salary")
    ax.set_ylabel(f"{measure} of Employees Retained/Left")
    ax.legend(['retained', 'left'])

plt.show()

Here, we see that the lower the salary, the more employees left their jobs at the company.

## Plotting bar charts to show the correlation between department and employee retention

### B. Comparing Employee Retention with Department

In [None]:
# Per-department head-counts and shares of leavers vs. stayers.
df1 = df[['left', 'Department']]
gone_rows = df1[df['left'] == 1]
kept_rows = df1[df['left'] == 0]
left = gone_rows['Department'].value_counts()
retained = kept_rows['Department'].value_counts()

# Total employees per department, shared by both share computations.
headcount = left + retained
left_percent = left / headcount
retain_percent = retained / headcount

counts2 = pd.DataFrame(
    {
        "retained": retained,
        "left": left,
        "retained_percent": retain_percent,
        "left_percent": left_percent,
    }
)
counts2

In [None]:
# Stacked bars of retained vs. left employees per department:
# the left panel shows head-counts, the right panel shows each department's shares.
x = counts2.index

# One 1x2 grid whose axes are drawn on directly. The previous 2x2 grid was
# never used as such: plt.subplot(1, 2, k) placed new axes over it.
f, axs = plt.subplots(1, 2, figsize=(15, 5))

# (retained column, left column, word used in the title and y-label)
panels = [
    ("retained", "left", "Number"),
    ("retained_percent", "left_percent", "Percent"),
]
for ax, (retained_col, left_col, measure) in zip(axs, panels):
    y1 = counts2[retained_col]
    y2 = counts2[left_col]
    ax.bar(x, y1, color='r')
    # Stack the "left" bar on top of the "retained" bar.
    ax.bar(x, y2, bottom=y1, color='b')
    ax.set_title(f"{measure} of Employee Retention grouped by Department")
    ax.set_xlabel("Department")
    ax.set_ylabel(f"{measure} of Employees Retained/Left")
    # Rotate the x tick labels so they do not overlap.
    ax.tick_params(axis='x', labelrotation=80)
    ax.legend(['retained', 'left'])

plt.show()

Here, we see that the percentage of employees retained is slightly higher for the Management and RandD departments.

### C. Comparing Employee Retention with Work Accident

In [None]:
# Head-counts and shares of leavers vs. stayers, split by the Work_accident flag.
df1 = df[['left', 'Work_accident']]
left = df1[df['left'] == 1].Work_accident.value_counts()
retained = df1[df['left'] == 0].Work_accident.value_counts()
left_percent = left / (left + retained)
retain_percent = retained / (left + retained)
counts3 = {"retained": retained, "left": left, "retained_percent": retain_percent, "left_percent": left_percent}
counts3 = pd.DataFrame(counts3)
# Label rows by their flag value rather than by position: value_counts()
# orders by frequency, so a positional ["No", "Yes"] assignment would silently
# swap the labels whenever 1 happened to be the more frequent value.
counts3 = counts3.sort_index().rename(index={0: "No", 1: "Yes"})
counts3

In [None]:
# Stacked bars of retained vs. left employees by work-accident flag:
# the left panel shows head-counts, the right panel shows each group's shares.
x = counts3.index

# One 1x2 grid whose axes are drawn on directly. The previous 2x2 grid was
# never used as such: plt.subplot(1, 2, k) placed new axes over it.
f, axs = plt.subplots(1, 2, figsize=(15, 5))

# (retained column, left column, word used in the title and y-label)
panels = [
    ("retained", "left", "Number"),
    ("retained_percent", "left_percent", "Percent"),
]
for ax, (retained_col, left_col, measure) in zip(axs, panels):
    y1 = counts3[retained_col]
    y2 = counts3[left_col]
    ax.bar(x, y1, color='r')
    # Stack the "left" bar on top of the "retained" bar.
    ax.bar(x, y2, bottom=y1, color='b')
    ax.set_title(f"{measure} of Employee Retention grouped by Work Accident")
    ax.set_xlabel("Work Accident")
    ax.set_ylabel(f"{measure} of Employees Retained/Left")
    ax.legend(['retained', 'left'])

plt.show()

### D. Comparing Employee Retention with promotion in last 5 years

In [None]:
# Head-counts and shares of leavers vs. stayers, split by the
# promotion_last_5years flag.
df1 = df[['left', 'promotion_last_5years']]
left = df1[df['left'] == 1].promotion_last_5years.value_counts()
retained = df1[df['left'] == 0].promotion_last_5years.value_counts()
left_percent = left / (left + retained)
retain_percent = retained / (left + retained)
counts4 = {"retained": retained, "left": left, "retained_percent": retain_percent, "left_percent": left_percent}
counts4 = pd.DataFrame(counts4)
# Label rows by their flag value rather than by position: value_counts()
# orders by frequency, so a positional ["No", "Yes"] assignment would silently
# swap the labels whenever 1 happened to be the more frequent value.
counts4 = counts4.sort_index().rename(index={0: "No", 1: "Yes"})
counts4

In [None]:
# Stacked bars of retained vs. left employees by promotion flag:
# the left panel shows head-counts, the right panel shows each group's shares.
x = counts4.index

# One 1x2 grid whose axes are drawn on directly. The previous 2x2 grid was
# never used as such: plt.subplot(1, 2, k) placed new axes over it.
f, axs = plt.subplots(1, 2, figsize=(15, 5))

# (retained column, left column, word used in the title and y-label)
panels = [
    ("retained", "left", "Number"),
    ("retained_percent", "left_percent", "Percent"),
]
for ax, (retained_col, left_col, measure) in zip(axs, panels):
    y1 = counts4[retained_col]
    y2 = counts4[left_col]
    ax.bar(x, y1, color='r')
    # Stack the "left" bar on top of the "retained" bar.
    ax.bar(x, y2, bottom=y1, color='b')
    ax.set_title(f"{measure} of Employee Retention grouped by Promotion in last 5 years")
    # The x-axis shows the promotion flag; it was previously mislabelled
    # "Work Accident".
    ax.set_xlabel("Promotion in last 5 years")
    ax.set_ylabel(f"{measure} of Employees Retained/Left")
    ax.legend(['retained', 'left'])

plt.show()

Though the percentage of employees retained is higher among those who received a promotion in the last 5 years, very few employees were promoted.

### E. Comparing Employee Retention with continuous valued attributes, namely:

1. satisfaction_level
2. last_evaluation
3. average_montly_hours
4. time_spend_company
5. number_project

In [None]:
# Distribution of satisfaction_level for each value of `left`.
sns.boxplot(x='left', y='satisfaction_level', data=df)

In [None]:
# Distribution of last_evaluation for each value of `left`.
box = sns.boxplot(x='left', y='last_evaluation', data=df)

In [None]:
# Distribution of average_montly_hours for each value of `left`.
sns.boxplot(x='left', y='average_montly_hours', data=df)

In [None]:
# Distribution of time_spend_company for each value of `left`.
sns.boxplot(x='left', y='time_spend_company', data=df)

In [None]:
# Distribution of number_project for each value of `left`.
sns.boxplot(x='left', y='number_project', data=df)

Thus, we see that Satisfaction_level has a significant influence on Employee Retention

## Building a Logistic Regression Model

In [None]:
# Columns kept for the logistic-regression model: five predictors plus the
# `left` target.
model_columns = [
    'salary',
    'Department',
    'satisfaction_level',
    'average_montly_hours',
    'promotion_last_5years',
    'left',
]
df1 = df[model_columns]

In [None]:
# One-hot encode the two categorical predictors.
# Note: df1 is overwritten in place, so this cell expects the un-encoded df1.
categorical_columns = ['Department', 'salary']
df1 = pd.get_dummies(data=df1, columns=categorical_columns)

In [None]:
# First five rows of the encoded frame.
df1.head(n=5)

In [None]:
# Feature matrix: every column except the target, forced to float.
# get_dummies can emit boolean indicator columns (the default in recent
# pandas); mixed with int/float columns these would turn a plain np.asarray()
# result into an object array, so the dtype is requested explicitly.
X = df1.drop(columns='left').to_numpy(dtype=float)

# Target kept as a single-column 2-D array (n_samples, 1), the same shape as
# before, so later `.ravel()` calls keep working.
y = df1[['left']].to_numpy()

In [None]:
from sklearn import preprocessing
# NOTE(review): feature scaling is currently disabled — the scaler line below
# is commented out, which leaves the `preprocessing` import unused. Either
# re-enable scaling or drop both lines.
# scaler = preprocessing.StandardScaler().fit(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split (target flattened to 1-D) and
# predict labels for the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train.ravel())
log_pred = model.predict(X_test)

## Evaluation of the model

In [None]:
# Score the hold-out predictions: accuracy, F1, confusion matrix and the
# per-class report.
acc = accuracy_score(y_test, log_pred)
f1 = f1_score(y_test, log_pred)
print(" accuracy = ", acc)
print(" f1_score = ", f1)
for report in (
    confusion_matrix(y_test, log_pred),
    classification_report(y_test, log_pred),
):
    print(report)

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

# ROC analysis of the logistic-regression model on the held-out split.
# The classifier fitted in the modelling cell is reused here; it was
# previously re-created and refit with identical settings and data.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)
lr_probs = y_proba[:, 1]  # second probability column, used as the ROC score

# "No skill" baseline: the same constant score for every sample.
ns_probs = [0 for _ in range(len(y_test))]
ns_auc = roc_auc_score(y_test, ns_probs)
print("ROC AUC SCORE: ", roc_auc_score(y_test, lr_probs))

ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The curve labels only become visible once a legend is drawn.
plt.legend()
plt.show()