# Import Libraries

In [None]:
import os
# Collect every file path under the Kaggle input directory, then echo them
# one per line so the dataset location can be checked.
kaggle_inputs = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in kaggle_inputs:
    print(path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import tensorflow.keras.backend as K
# NOTE(review): `cfg` is not referenced again anywhere in the visible notebook;
# presumably a leftover from a TF1-style session setup — confirm before removing.
cfg = tf.compat.v1.ConfigProto()

import warnings
# Suppress all warnings for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

# Reading CSV

In [None]:
# Load the HR analytics CSV into a DataFrame.
hr_data = pd.read_csv(
    '../input/hr-analytics/HR_comma_sep.csv',
    encoding='utf-8',
)

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the raw data.
hr_data.head()

Move the target column (`left`) to the last position for convenience.

In [None]:
# Move the target column to the end of the frame. The order is built from the
# current columns instead of a hard-coded index, so the cell keeps working if
# the number of columns changes.
target = 'left'
cols = [name for name in hr_data.columns if name != target] + [target]
hr_data = hr_data[cols]

Check whether the column has been successfully moved to the last position.

In [None]:
# Preview the frame again to confirm `left` is now the last column.
hr_data.head()

In [None]:
# Column dtypes and non-null counts.
hr_data.info()

In [None]:
# Summary statistics for the numeric columns.
hr_data.describe()

We have 3 categorical columns (Department, salary, left). Hence we'll do a quick analysis to see how they relate to our target variable (left).
First of all, let's analyze employees according to their department for better understanding.

In [None]:
# Per-department averages. `numeric_only=True` restricts the aggregation to
# numeric columns; without it pandas >= 2.0 raises a TypeError on the string
# `salary` column that is still present at this point.
hr_data.groupby('Department').mean(numeric_only=True)

Now let's have a quick analysis of employees according to their salaries.

In [None]:
# Per-salary-band averages. `numeric_only=True` restricts the aggregation to
# numeric columns; without it pandas >= 2.0 raises a TypeError on the string
# `Department` column that is still present at this point.
hr_data.groupby('salary').mean(numeric_only=True)

We'll finally do a quick analysis of left column

In [None]:
# Averages for leavers vs. stayers. `numeric_only=True` restricts the
# aggregation to numeric columns; without it pandas >= 2.0 raises a TypeError
# on the string `Department` and `salary` columns.
hr_data.groupby('left').mean(numeric_only=True)

# Cleaning of data

In [None]:
# Number of missing values in each column.
hr_data.isnull().sum()

In [None]:
# Flag rows that exactly repeat an earlier row and report how many there are.
duplicate_rows = hr_data.duplicated()
print("Number of duplicates : ", duplicate_rows.sum())

In [None]:
# Keep the first occurrence of each repeated row, then re-count to confirm
# that no duplicates remain.
hr_data = hr_data.drop_duplicates(keep='first')
print("Number of duplicates : ", hr_data.duplicated().sum())

# Data Visualization

First of all we'll check the distribution of data using countplot.
Let's see the data distribution of our categorical columns in descending order.

In [None]:
# Class balance of the target. The column is passed by name together with
# `data=`, matching the other countplot calls in this notebook; newer seaborn
# releases no longer treat a bare positional Series as `x`.
sns.countplot(x = 'left', palette = "Set2", data = hr_data)

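As we can see, the target column (left) is imbalanced: far more employees stayed than left, which can bias our ML model towards the majority class. Note that standardizing the features later in this notebook puts them on a common scale but does not by itself correct this imbalance.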

In [None]:
# Leavers vs. stayers within each salary band.
sns.countplot(data = hr_data, x = 'salary', hue = 'left', palette = "Set2")

We can observe that employees with low and medium salaries are more likely to leave the organization than employees with higher salaries. However, the first two groups of bars also show that there are more employees with low and medium salaries than with high ones.

In [None]:
# Leavers vs. stayers per department, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize = (15, 7))
sns.countplot(data = hr_data, x = 'Department', hue = 'left', palette = "Set2", ax = ax)

The points observed are mentioned below:
* The distribution of data is almost normal as we can see.
* This tells that employees working in the sales department are more than any other department.
* The employees working in management department are less than any other department.
* Most employees left are from sales department.
* The department with least number of employees leaving is management.


In [None]:
# Pairwise relationships between the columns, coloured by the target.
sns.pairplot(data = hr_data, hue = 'left')

The pairplot tells us the story of the data. I've listed some points below regarding the employees who left:
* The people who left the organization had a satisfaction level of less than 0.4.
* The number of projects done by employees who left was 2 or fewer.
* The average monthly hours spent by the employees who left were 150 or below. It seems they weren't that interested, for some reason.
* The time spent at the company by the employees who left was 3 months or below.
* The employees that have a promotional value greater than 0.3 are more likely to stay in the company.

In [None]:
plt.figure(figsize = (13, 7))
# Correlate numeric columns only: `Department` and `salary` are still strings
# at this point, and DataFrame.corr() raises on them in pandas >= 2.0.
numeric_corr = hr_data.select_dtypes(include = 'number').corr()
sns.heatmap(numeric_corr, annot = True, cmap = 'Greens')

The heatmap above shows the pairwise correlation between variables, using both the annotated values and the colour intensity.
* The factor that most influences whether an employee stays in the company or not is the time spent at the company.
* Satisfaction level also does not seem to have a strong positive relation with the column (left).

# Data Preprocessing

Data preprocessing is the most important part of building a machine learning model. An ML algorithm only understands data in numerical format, and many algorithms work best when that data is standardized or normalized.
First of all, let's encode the data by creating dummy variables.

In [None]:
# One-hot encode the `salary` column; the other columns are passed through.
hr_data = pd.get_dummies(data = hr_data, columns = ['salary'])

Now we'll do feature selection here for training and testing purpose

In [None]:
# Target: the `left` column.
y = hr_data['left']
# Features: every remaining column except the two excluded from modelling.
dropped_columns = ['left', 'Department', 'Work_accident']
X = hr_data.drop(dropped_columns, axis = 1)

As mentioned earlier, it's essential to standardize our data if we want a machine learning model that performs well on unseen data.

In [None]:
# Learn each feature's mean and standard deviation, then rescale the features.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set statistics leak into the scaling; consider fitting
# it on the training split only.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

Now that our data is both encoded and standardized, we'll split it into training and test sets so we can train our models and then test them.

In [None]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the proportion of
# leavers the same in both splits, which matters because the target classes
# are imbalanced; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.20, random_state = 42, stratify = y
)

# Model Building

I'll choose 3 machine learning models for my data: Logistic Regression, Decision Tree and Random Forest.

In [None]:
# Candidate classifiers, keyed by the label shown in the results table.
# The tree-based models get a fixed seed so their scores are reproducible.
models = {
    '        Logistic Regression': LogisticRegression(),
    '        Decision Tree': DecisionTreeClassifier(random_state = 42),
    '        Random Forest Classifier': RandomForestClassifier(random_state = 42),
}


# Per-model scores on the held-out test set, keyed like `models`.
accuracy, precision, recall = {}, {}, {}

for name, model in models.items():

    model.fit(X_train, y_train)
    # After the loop `y_pred` stays bound to the last model's predictions.
    y_pred = model.predict(X_test)

    # scikit-learn metrics expect (y_true, y_pred). With the arguments swapped,
    # precision_score actually returns recall, so the order matters here.
    accuracy[name] = accuracy_score(y_test, y_pred)
    precision[name] = precision_score(y_test, y_pred)
    recall[name] = recall_score(y_test, y_pred)

Let's see which of our models performs best here.

In [None]:
# Tabulate the collected scores, one row per classifier.
hr_data_models = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
    },
    index = list(models.keys()),
)
hr_data_models

In [None]:
# Confusion matrix for the Random Forest. The predictions are recomputed from
# the fitted model instead of being read from the loop variable left behind by
# the training cell, so this cell does not depend on which model ran last.
rf_pred = models['        Random Forest Classifier'].predict(X_test)
cm = confusion_matrix(y_test, rf_pred)
conf_mat = pd.DataFrame(data = cm, columns = ['Predicted Not Left', 'Predicted Left'], index = ['Actual Not Left', 'Actual Left'])
sns.heatmap(conf_mat, annot = True, fmt='d', cmap="YlGnBu")

In [None]:
# Unpack the 2x2 confusion matrix (rows = actual class, columns = predicted).
TN = cm[0, 0]
TP = cm[1, 1]
FN = cm[1, 0]
FP = cm[0, 1]

# Compute each rate once and reuse it in the report below.
overall_accuracy = (TP + TN) / float(TP + TN + FP + FN)
sensitivity = TP / float(TP + FN)
specificity = TN / float(TN + FP)
positive_predictive_value = TP / float(TP + FP)
negative_predictive_value = TN / float(TN + FN)

# The accuracy label spells out the formula with explicit parentheses so it
# matches the value that is actually computed.
print('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN) = ', overall_accuracy, '\n', '\n',
        'Sensitivity or True Positive Rate = TP/(TP+FN) = ', sensitivity, '\n',
        'Specificity or True Negative Rate = TN/(TN+FP) = ', specificity, '\n', '\n',
        'Positive Predictive value = TP/(TP+FP) = ', positive_predictive_value, '\n',
        'Negative predictive Value = TN/(TN+FN) = ', negative_predictive_value, '\n',)

**Neural Networks**

I've used a Multilayer Perceptron and an Artificial Neural Network in this notebook. Let's train them and see their performance.

In [None]:
# Multilayer perceptron with a fixed seed so the scores are reproducible.
mlp = MLPClassifier(max_iter = 500, random_state = 42)
mlp.fit(X_train, y_train)
mlp_y_pred = mlp.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred); with the arguments swapped,
# precision_score would report recall instead of precision.
print('The accuracy score of MLP is : ', accuracy_score(y_test, mlp_y_pred))
print('The precision score of MLP is : ', precision_score(y_test, mlp_y_pred))

In [None]:
# Seed TensorFlow's global random generator to reduce run-to-run variation.
tf.random.set_seed(42)

# Small feed-forward network: two hidden ReLU layers and a single sigmoid
# output unit for the binary target.
ann = tf.keras.models.Sequential()

ann.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann.add(tf.keras.layers.Dense(units=1,activation='sigmoid'))

ann.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
ann.fit(X_train,y_train,batch_size=32,epochs=10)

# The single output unit yields one probability per row in a column array.
# Threshold at 0.5 and flatten to a 1-D integer label vector so the sklearn
# metrics receive the same shape as y_test.
ann_y_pred = (ann.predict(X_test) > 0.5).astype(int).ravel()

# These scores belong to the Keras network, so they are labelled ANN, not MLP.
print('The accuracy score of ANN is : ', accuracy_score(y_test, ann_y_pred))
print('The precision score of ANN is : ', precision_score(y_test, ann_y_pred))

# Conclusion

First of all we did some exploratory data analysis of our dataset and then we did some cleaning. Later we did some preprocessing and then trained 3 Machine Learning algorithms and then 2 Neural Networks. The results showed that **Random Forest Classifier** outperformed all of the other models with 98 percent accuracy, and the model that performed worst was Logistic Regression.