In [None]:
# import packages
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as stats
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the HR analytics dataset from the Kaggle input directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/hr-analytics/HR_comma_sep.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

In [None]:
# Count missing values per column
df.isna().sum()

**Data Exploration and Visualization**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Rows for employees who left the company (left == 1)
left = df.loc[df['left'] == 1]
left.shape

In [None]:
# Rows for employees who stayed with the company (left == 0)
retained = df.loc[df['left'] == 0]
retained.shape

**Mean of features grouped by `left`**

In [None]:
# Per-class mean of every numeric feature.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns (Department, salary) raises instead of silently dropping them.
df.groupby('left').mean(numeric_only=True)

Insights from above:

The "satisfaction_level" is noticeably lower for employees who left the firm (0.44) than for those who were retained (0.66). The "average_montly_hours" worked by employees who left is higher than for retained ones. Employees who received a promotion ("promotion_last_5years") are more likely to be retained by the firm.


**Impact of salaries on employee retention**

In [None]:
# Bar chart of leaver / stayer counts for each salary band
pd.crosstab(index=df['salary'], columns=df['left']).plot.bar()

From the above plot we can observe that employees with a high salary are likely to be retained by the firm.

**Correlation between department and employee retention**

In [None]:
# Bar chart of leaver / stayer counts for each department
pd.crosstab(index=df['Department'], columns=df['left']).plot.bar()

From the plot we can see that employee retention has a slight correlation with department, but it is not a major one.

**Check for correlation**

In [None]:
# Pairwise Pearson correlation of the numeric features.
# The frame also holds string columns (Department, salary); on pandas >= 2.0
# DataFrame.corr() no longer drops them silently, so restrict to numeric
# columns explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Compute the correlation matrix once (numeric columns only, so the call also
# works on pandas >= 2.0) instead of recomputing it for the data and for each
# set of tick labels.
corr_matrix = df.select_dtypes(include='number').corr()
# `linewidths` is seaborn's documented keyword for the cell separator width.
sns.heatmap(corr_matrix, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns,
            annot=True, linewidths=4.8, cmap="autumn")

From the correlation check we can observe that the features last_evaluation, number_project and work_accident have only a weak correlation with 'left'.

In [None]:
# Keep only the columns that will be used as model inputs
df2 = df.loc[:, ['satisfaction_level', 'average_montly_hours', 'promotion_last_5years', 'salary']]
df2.head(n=5)

Creating (n-1) dummy variables for Salary categorical feature

In [None]:
# One indicator column per salary level, appended to the selected features
dummies = pd.get_dummies(df['salary'], prefix='salary')
df3 = pd.concat(objs=[df2, dummies], axis='columns')
df3.head(n=5)

In [None]:
# Drop the original categorical 'salary' column now that its dummies exist,
# and drop 'salary_high' as the reference level to avoid multicollinearity.
df4 = df3.drop(columns=['salary', 'salary_high'])
df4.head(n=5)

**Building Logistic Regression Model**

In [None]:
# Feature matrix and target vector (the 'left' column) used by every model below
X, y = df4, df['left']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# random_state makes the partition (and therefore every reported score)
# reproducible across kernel restarts; stratify=y keeps the proportion of
# leavers the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier with default settings on the training partition
reg = LogisticRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test features
reg.predict(X_test)

**Accuracy of a Model**

In [None]:
# Mean accuracy of the logistic regression model on each partition.
# (A leading bare `reg.score(...)` call was removed: its result was neither
# displayed nor stored.)
print("Accuracy on Training set: ",reg.score(X_train,y_train))
print("Accuracy on Testing set: ",reg.score(X_test,y_test))

Logistic Regression Model Error Table

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Hard class predictions of the logistic regression model on the test partition
y_pred = reg.predict(X_test)
print("\t\tError Table")
# NOTE: these are regression metrics. Assuming `left` is coded 0/1, the absolute
# and squared errors are both 0 or 1 per row, so MAE and MSE both equal the
# misclassification rate.
print('Mean Absolute Error     : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error      : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R Squared Error         : ', metrics.r2_score(y_test, y_pred))

# Per-class precision / recall / F1 describe classifier quality directly.
print("\n\t\tClassification Report")
print(metrics.classification_report(y_test, y_pred))

**Building Decision Tree classifier Model**

In [None]:
from sklearn import tree

# Train on the X_train / y_train partition that already exists instead of
# drawing a new random split here. The ROC comparison further down scores every
# model on one shared X_test; re-splitting per model would put rows an earlier
# model was trained on into that test set and inflate its score.
# random_state fixes the tree's internal tie-breaking so results are repeatable.
modeltree = tree.DecisionTreeClassifier(random_state=42)
modeltree.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted decision tree on the train and test partitions
tree_train_accuracy = modeltree.score(X_train, y_train)
tree_test_accuracy = modeltree.score(X_test, y_test)
print('Model Accuracy on train data:', tree_train_accuracy)
print('Model Accuracy on test data :', tree_test_accuracy)

In [None]:
# Hard class predictions of the decision tree on the test partition
y_pred = modeltree.predict(X_test)
print("\t\tError Table")
# NOTE: these are regression metrics. Assuming `left` is coded 0/1, MAE and MSE
# both equal the misclassification rate.
print('Mean Absolute Error     : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error      : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R Squared Error         : ', metrics.r2_score(y_test, y_pred))

# Per-class precision / recall / F1 describe classifier quality directly.
print("\n\t\tClassification Report")
print(metrics.classification_report(y_test, y_pred))

**Random Forest Classifier Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train on the X_train / y_train partition that already exists instead of
# drawing a new random split here. The ROC comparison further down scores every
# model on the current X_test; a fresh split at this point would place rows the
# earlier models were trained on into that test set.
# random_state fixes the forest's bootstrap / feature sampling for repeatability.
randommodel = RandomForestClassifier(random_state=42)
randommodel.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted random forest on the train and test partitions
forest_train_accuracy = randommodel.score(X_train, y_train)
forest_test_accuracy = randommodel.score(X_test, y_test)
print('Random Model Accuracy on train data:', forest_train_accuracy)
print('Random model Accuracy on test data: ', forest_test_accuracy)

In [None]:
# Hard class predictions of the random forest on the test partition
y_pred = randommodel.predict(X_test)
print('\t\tError Table')
# NOTE: these are regression metrics. Assuming `left` is coded 0/1, MAE and MSE
# both equal the misclassification rate.
print('Mean Absolute Error       :', metrics.mean_absolute_error(y_test,y_pred))
print('Mean Squared Error        :', metrics.mean_squared_error(y_test,y_pred))
print('Root Mean Squared Error   :', np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
# This line reports r2_score; it was previously mislabelled as a second
# 'Mean Absolute Error'.
print('R Squared Error           :', metrics.r2_score(y_test,y_pred))

# Per-class precision / recall / F1 describe classifier quality directly.
print('\n\t\tClassification Report')
print(metrics.classification_report(y_test, y_pred))

**Area Under the Receiver Operating Characteristic Curve (AUC-ROC) Evaluation Metric**

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# NOTE: all three models are scored on whatever X_test / y_test is currently
# bound. The comparison is only fair if none of the models saw these rows
# during training.

# Logistic regression: score from column 1 of predict_proba, then ROC points and AUC
y_score1 = reg.predict_proba(X_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, y_score1)
reg_roc_auc = roc_auc_score(y_test, y_score1)

# Decision tree
y_score2 = modeltree.predict_proba(X_test)[:, 1]
false_positive_rate2, true_positive_rate2, threshold2 = roc_curve(y_test, y_score2)
tree_roc_auc = roc_auc_score(y_test, y_score2)

# Random forest
y_score3 = randommodel.predict_proba(X_test)[:, 1]
false_positive_rate3, true_positive_rate3, threshold3 = roc_curve(y_test, y_score3)
random_roc_auc = roc_auc_score(y_test, y_score3)

# Report the area under each curve
print('roc_auc_score for Logistic Regression: ', reg_roc_auc)
print('roc_auc_score for DecisionTree: ', tree_roc_auc)
print('roc_auc_score for RandomForest: ', random_roc_auc)


**Plotting the ROC Curve**

In [None]:
plt.figure(figsize=(9, 6))

# (false positive rates, true positive rates, legend label) per curve, in
# drawing order; the last entry is the diagonal reference line.
roc_series = [
    (false_positive_rate1, true_positive_rate1,
     'Logistic Regression(area = %0.3f)' % reg_roc_auc),
    (false_positive_rate2, true_positive_rate2,
     'Decision Tree (area = %0.3f)' % tree_roc_auc),
    (false_positive_rate3, true_positive_rate3,
     'Random Forest (area = %0.3f)' % random_roc_auc),
    ([0,1], [0,1], 'Base Rate'),
]
for fpr, tpr, legend_label in roc_series:
    plt.plot(fpr, tpr, linestyle='--', label=legend_label)

# Axis limits, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Area Under-Receiving Operating Characteristic Graph')
plt.legend(loc="lower right")
plt.show()

**Conclusion**

The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.

It is evident from the plot that the AUC of the Decision Tree ROC curve (area = 0.977) is higher than that of Logistic Regression (area = 0.775) and almost equal to that of the Random Forest ROC curve (area = 0.966).

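Therefore, we can say that the Decision Tree did a better job of classifying the positive class in the dataset. Note that this comparison is only meaningful when all three models are scored on a test set that none of them was trained on.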
