In [None]:
#import
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# List every file found under the Kaggle input directory, one path per line.
import os

input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

Load the data:

In [None]:
# Read the HR dataset from the Kaggle input directory into a DataFrame.
HR_data= pd.read_csv("/kaggle/input/hr-dataset/HR.csv")
# Preview the first rows of the loaded frame.
HR_data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
HR_data.shape

In [None]:
# The "sales" column is used as the employee's department from here on,
# so give it a descriptive name.
HR_data = HR_data.rename(columns=dict(sales="department"))
# Display the column index to confirm the rename.
HR_data.columns

# Data analysis:


In [None]:
# Print pandas' concise summary of the frame (DataFrame.info).
HR_data.info()

In [None]:
# Print the distinct values of each categorical feature
for categorical_feature in ('department', 'salary'):
    print(f'{categorical_feature}:', HR_data[categorical_feature].unique())

In [None]:
# Descriptive statistics of the frame via DataFrame.describe().
HR_data.describe()

In [None]:
# Pairwise correlation between the numeric features only.
# At this point the frame still contains string columns ('department',
# 'salary'); pandas >= 2.0 raises on them instead of silently skipping them,
# so the numeric columns are selected explicitly (works on every pandas version).
corr = HR_data.select_dtypes(include='number').corr()
# axis=None scales the colour gradient over the whole matrix rather than per column.
corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# Class balance of the target: bar chart plus the number of rows per value of `left`
sns.countplot(data=HR_data, x='left')
left_counts = HR_data.groupby('left').size()
print(left_counts)


In [None]:
# Salary level vs. leaving: counts per class (top panel) and seaborn's
# point estimate of `left` per salary level (bottom panel)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(11.7, 9.27))
sns.countplot(data=HR_data, x='salary', hue='left', ax=ax[0])
sns.pointplot(data=HR_data, x='salary', y='left', ax=ax[1])
ax[1].set_ylabel('left probability')


In [None]:
# Number of projects vs. leaving: counts per class (top panel) and seaborn's
# point estimate of `left` per project count (bottom panel)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(11.7, 9.27))
sns.countplot(data=HR_data, x='number_project', hue='left', ax=ax[0])
sns.pointplot(data=HR_data, x='number_project', y='left', ax=ax[1])
ax[1].set_ylabel('left probability')

In [None]:
# Department vs. leaving: counts per class (top panel) and seaborn's
# point estimate of `left` per department (bottom panel)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(11.7, 9.27))
sns.countplot(data=HR_data, x='department', hue='left', ax=ax[0])
sns.pointplot(data=HR_data, x='department', y='left', ax=ax[1])
ax[1].set_ylabel('left probability')
# Exact number of rows for every (department, left) combination
HR_data.groupby(['department', 'left']).size()


In [None]:
# Time spent at the company vs. leaving: counts per class (top panel) and
# seaborn's point estimate of `left` per tenure value (bottom panel)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(11.7, 9.27))
sns.countplot(data=HR_data, x='time_spend_company', hue='left', ax=ax[0])
sns.pointplot(data=HR_data, x='time_spend_company', y='left', ax=ax[1])
ax[1].set_ylabel('left probability')

In [None]:
# Promotion in the last 5 years vs. leaving: counts per class (top panel) and
# seaborn's point estimate of `left` per promotion flag (bottom panel)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(11.7, 9.27))
sns.countplot(data=HR_data, x='promotion_last_5years', hue='left', ax=ax[0])
sns.pointplot(data=HR_data, x='promotion_last_5years', y='left', ax=ax[1])
ax[1].set_ylabel('left probability')

In [None]:
# Stacked histogram of satisfaction level, one series per value of `left`
fig, ax = plt.subplots(figsize=(6.7, 4.27))
satisfaction_by_left = HR_data.pivot(columns='left')['satisfaction_level']
satisfaction_by_left.plot.hist(stacked=True, ax=ax)
ax.set_xlabel('satisfaction level')

In [None]:
# Top panel: stacked histogram of average monthly hours, one series per value of `left`.
# Bottom panel: promotion flag against average monthly hours, coloured by `left`.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7.7, 11.27))
hours_by_left = HR_data.pivot(columns='left')['average_montly_hours']
hours_by_left.plot.hist(stacked=True, ax=ax[0])
ax[0].set_xlabel('average montly hours')
sns.scatterplot(
    data=HR_data,
    x="average_montly_hours",
    y="promotion_last_5years",
    hue='left',
    ax=ax[1],
)
ax[1].set_title('promotion and average montly hours VS left ')

In [None]:
# Stacked histogram of the last evaluation score, one series per value of `left`
fig, ax = plt.subplots(figsize=(6.7, 4.27))
evaluation_by_left = HR_data.pivot(columns='left')['last_evaluation']
evaluation_by_left.plot.hist(stacked=True, ax=ax)
ax.set_xlabel('last evaluation')


In [None]:
#Understanding the correlation between work accidents and the employee leaving
# Top panel: counts per class; bottom panel: seaborn's point estimate of `left`
# for each value of the work-accident flag.
fig, ax = plt.subplots(2, 1, figsize=(11.7, 9.27))
sns.countplot(x='Work_accident', hue='left', data=HR_data, ax=ax[0])
sns.pointplot(x='Work_accident', y='left', data=HR_data, ax=ax[1])
# Label the y-axis like the other feature-vs-left point plots in this notebook.
ax[1].set_ylabel('left probability')


# Preprocessing

In [None]:
# Preprocess on an independent copy so the frame used for the plots stays untouched
data = HR_data.copy()
# Number of missing values in every column
data.isna().sum()

Handle categorical data:

In [None]:
# Encode salary as an ordinal feature by its level: low-1, medium-2, high-3.
# The mapping is spelled out explicitly: Series.unique() returns labels in
# order of first appearance, so zipping it with [1, 2, 3] only produced the
# intended codes if the rows happened to appear in low/medium/high order.
salary = {"low": 1, "medium": 2, "high": 3}
# Map from the untouched raw column so re-running this cell gives the same result.
data["salary"] = HR_data["salary"].map(salary)
# .map() turns unknown labels into NaN; fail loudly instead of training on them.
assert data["salary"].notna().all(), "unexpected salary label in HR_data['salary']"
data.info()

In [None]:
# One indicator column per department, replacing the original text column
dummies_department = data['department'].str.get_dummies()
data = data.drop(columns=['department']).join(dummies_department, how="outer")
data.head()

In [None]:
# Normalize average monthly hours by the number of projects, then drop the
# raw hours column in favour of the new per-project rate
data = data.assign(
    monthly_hours_rate=data['average_montly_hours'] / data['number_project']
).drop(columns=['average_montly_hours'])

data.head()

In [None]:
# Correlation matrix of the preprocessed features, shaded with one colour
# scale for the whole table (axis=None)
corr = data.corr()
corr.style.background_gradient(axis=None, cmap='coolwarm')

# Prediction model and Evaluation

In [None]:
import xgboost as xgb
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import cross_validate
import numpy as np

# Features are every column except the target `left`
x = data.drop(columns=['left'])
y = data['left']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=13, test_size=0.2
)

print(len(X_test))
print(len(y_train))



Using Decision Tree Classifier:

In [None]:
#Fit
# A fixed seed makes the fitted tree, and every metric below, reproducible
# across kernel restarts (same seed as the train/test split).
dtree = tree.DecisionTreeClassifier(random_state=13)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

#Evaluation
# Per-class precision / recall / F-score; the fourth value is the per-class
# support (number of true samples of each class).
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)

# Compute the confusion matrix once and reuse it for the text and the heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
# fmt='d' prints the integer counts in full; seaborn's default annotation
# format would abbreviate large counts (e.g. 2.3e+03).
sns.heatmap(conf_matrix, annot=True, fmt='d')

# Precision, recall and F-score are reported as the unweighted mean over the classes.
print('Accuracy of Decision Tree Classifier on test set: {:.2f}'
     .format(dtree.score(X_test, y_test)))
print('Precision of Decision Tree Classifier on test set: {:.2f}'
     .format(np.mean(precision)))
print('Recall of Decision Tree Classifier on test set: {:.2f}'
     .format(np.mean(recall)))
print('F score of Decision Tree Classifier on test set: {:.2f}'
     .format(np.mean(fscore)))


Using XGBoost:

In [None]:
#Fit
# Gradient-boosted tree classifier with xgboost's default hyper-parameters.
xg_reg = xgb.XGBClassifier()
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)

#Evaluation
# Per-class precision / recall / F-score; the fourth value is the per-class
# support (number of true samples of each class).
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)

# Compute the confusion matrix once and reuse it for the text and the heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
# fmt='d' prints the integer counts in full; seaborn's default annotation
# format would abbreviate large counts (e.g. 2.3e+03).
sns.heatmap(conf_matrix, annot=True, fmt='d')

# Precision, recall and F-score are reported as the unweighted mean over the classes.
print('Accuracy of XGBClassifier on test set: {:.2f}'
     .format(xg_reg.score(X_test, y_test)))
print('Precision of XGBClassifier on test set: {:.2f}'
     .format(np.mean(precision)))
print('Recall of XGBClassifier on test set: {:.2f}'
     .format(np.mean(recall)))
print('F score of XGBClassifier on test set: {:.2f}'
     .format(np.mean(fscore)))



Using Random Forest:

In [None]:
#Fit
# `left` is a class label, so use the classifier variant of the forest.
# The previous regressor needed its output rounded to get labels, and its
# .score() returns R^2, which was being printed as "Accuracy".
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest and its metrics are reproducible.
rfo = RandomForestClassifier(n_estimators=5, random_state=13)
rfo.fit(X_train, y_train)
y_pred = rfo.predict(X_test)

#Evaluation
# Per-class precision / recall / F-score; the fourth value is the per-class
# support (number of true samples of each class).
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)

# Compute the confusion matrix once and reuse it for the text and the heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
# fmt='d' prints the integer counts in full; seaborn's default annotation
# format would abbreviate large counts (e.g. 2.3e+03).
sns.heatmap(conf_matrix, annot=True, fmt='d')

# Accuracy is the fraction of correctly predicted labels; precision, recall and
# F-score are reported as the unweighted mean over the classes.
print('Accuracy of Random Forest on test set: {:.2f}'
     .format(accuracy_score(y_test, y_pred)))
print('Precision of Random Forest on test set: {:.2f}'
     .format(np.mean(precision)))
print('Recall of Random Forest on test set: {:.2f}'
     .format(np.mean(recall)))
print('F score of Random Forest on test set: {:.2f}'
     .format(np.mean(fscore)))


In [None]:
# Explain the fitted XGBoost classifier (`xg_reg`) with SHAP.
import shap
# Load SHAP's JavaScript support into the notebook front end.
shap.initjs()
model=xg_reg
explainer = shap.TreeExplainer(model)
# SHAP values are computed on the full feature frame `x`, i.e. on the
# training and the test rows together.
shap_values = explainer.shap_values(x)
# Summary plot restricted to 7 features (max_display=7).
shap.summary_plot(shap_values,x, max_display=7)

In [None]:
# Bar-style summary of the SHAP values computed in the previous cell,
# restricted to 7 features (max_display=7).
# NOTE(review): X_test.columns is passed as the second positional argument;
# presumably shap interprets a 1-D index there as the feature names — confirm
# against the shap documentation for the installed version.
shap.summary_plot(shap_values, X_test.columns, plot_type="bar", max_display=7)