# Employee Attrition Prediction

## Loading Libraries

In [None]:
# Basic Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


In [None]:
# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [None]:
#import the necessary modelling algos.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE


In [None]:
from sklearn.tree import ExtraTreeClassifier

In [None]:
#preprocess.
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder,OneHotEncoder

In [None]:
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score,
                             f1_score,
                             roc_auc_score,
                             roc_curve,
                             confusion_matrix)
from sklearn.model_selection import (cross_val_score,
                                     GridSearchCV,
                                     RandomizedSearchCV,
                                     learning_curve,
                                     validation_curve,
                                     train_test_split)

from sklearn.pipeline import make_pipeline # For performing a series of operations

from sklearn.metrics import plot_confusion_matrix


In [None]:
df = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

### Getting a Feel of the data

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.dtypes.unique() # There are the only available datatypes in our dataset

### 5 number summary

In [None]:
df.describe()

In [None]:
df.Attrition.describe()

In [None]:
df.Attrition.value_counts()

In [None]:
df.BusinessTravel.value_counts()

In [None]:
sns.distplot(df.Age) # Age is unimodal

In [None]:
df.Age.describe() # Age is Normally Distributed

In [None]:
df.columns

In [None]:
# Skewness is only defined for numeric columns; ask for them explicitly so the
# string columns (Attrition, Department, ...) do not make the call fail.
df.skew(numeric_only=True)

In [None]:
num_cat = df.select_dtypes(exclude='O')
num_cat_cols = num_cat.columns
num_cat_cols

In [None]:
# Distribution plots of the numeric features, laid out row by row on a 6x2 grid.
# NOTE(review): 'YearsSinceLastPromotion' appears twice in this list - looks like
# a copy-paste slip; confirm which feature the second panel was meant to show.
dist_columns = [
    'TotalWorkingYears', 'MonthlyIncome',
    'YearsAtCompany', 'DistanceFromHome',
    'YearsInCurrentRole', 'YearsWithCurrManager',
    'YearsSinceLastPromotion', 'PercentSalaryHike',
    'YearsSinceLastPromotion', 'TrainingTimesLastYear',
    'DailyRate', 'HourlyRate',
]
fig, ax = plt.subplots(6, 2, figsize=(9, 9))
# ax.flat walks the grid in row-major order: ax[0,0], ax[0,1], ax[1,0], ...
for panel, column in zip(ax.flat, dist_columns):
    sns.distplot(df[column], ax=panel)
plt.tight_layout()

In [None]:
cat_df = df.select_dtypes(include='O')
cat_df.head()

In [None]:
cat_df.columns

In [None]:
# function to plot all categorical variables
def plot_cat(attr):
    """Show an interactive Plotly bar chart of the value counts of ``df[attr]``.

    attr: name of a column in the module-level ``df``.
    """
    counts = df[attr].value_counts()
    bars = go.Bar(x=counts.index.values, y=counts.values)
    py.iplot([bars], filename='basic-bar')


    

In [None]:
plot_cat('Attrition')

**The data is imbalanced**

In [None]:
plot_cat(df.BusinessTravel.name)

In [None]:
plot_cat(df.EducationField.name)

In [None]:
plot_cat(df.Department.name)

In [None]:
plot_cat(df.Gender.name)

In [None]:
plot_cat(df.MaritalStatus.name)

In [None]:
plot_cat(df.JobRole.name)

In [None]:
plot_cat(df.Over18.name)

In [None]:
df.Over18.describe()

In [None]:
df.Over18.value_counts()

In [None]:
plot_cat(df.OverTime.name)

In [None]:
# def plot_num(attr):
#     sns.factorplot(data=df,kind='count',size=5,aspect=1.5,x=attr)
    

In [None]:
# for i in num_cat_cols:
#     plot_num(i)


## 2. Checking Correlation
    1.Using heatmap

In [None]:
# Correlation is computed on the numeric columns only; selecting them explicitly
# keeps this working on pandas versions that no longer drop string columns silently.
cor_mat = df.select_dtypes(include='number').corr()

In [None]:
np.amin(cor_mat) # No serious -ve correlation can be seen

In [None]:

# Boolean mask that is True strictly above the diagonal: seaborn hides masked
# cells, so only the lower triangle and the diagonal are drawn. A real boolean
# mask (rather than a copy of the correlation values) also hides upper-triangle
# cells whose correlation happens to be exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig=plt.gcf()
fig.set_size_inches(30,15)
sns.heatmap(data=cor_mat,mask=mask,fmt='.2f',linewidths=0.1,square=True,annot=True,cbar=True)

**Observations**

    1. Job level has a positive correlation with Age (0.51) - Aged employees are seniors
    2. Monthly Income has a positive correlation with Age (0.50) and Job Level (0.95) - Aged Employees are seniors and have higher salary
    3. Performance Rating has a strong +ve correlation with percent salary hike (0.77) - Hike is based on performance
    4. Total working years has a strong +ve correlation with Age (0.68),Job level(0.78) and Monthly Income (0.77) which is  obvious
    5. Years in current role is highly correlated with years at company (0.76)
    6. Years since last promotion is well correlated with years at company (0.62) - the more years at the company, the higher the chances of promotion
    7. Years with current manager is highly correlated with years at company(0.77) and years in current role (0.71)
    8. Years with current manager is also correlated with years since last promotion (0.51)
    9. Years at company has a +ve correlation with age (0.31), Job level (0.53), Monthly income (0.51) and Total working years (0.63)

#### Till now, we have plotted individual features and visualized the correlation. Now, let's plot against the target variable

In [None]:
# Creating a function to plot against target variable
def plot_target(attr):
    """Plot a feature of the module-level ``df`` against the ``Attrition`` target.

    ``Age`` is drawn as a box plot per attrition class; any other column is
    drawn as attrition counts, with one facet per value of that column.

    attr: name of the column to plot.
    """
    # seaborn renamed factorplot to catplot and its `size` argument to `height`.
    if attr == df.Age.name:
        sns.catplot(data=df, y='Age', x='Attrition', height=5, aspect=1, kind='box')
        return
    sns.catplot(data=df, kind='count', x=df.Attrition.name, col=attr)

In [None]:
plot_target(df.Department.name)

In [None]:
#pd.crosstab(columns=df.Attrition,index=df.Department,values=df.Attrition,aggfunc='mean')
pd.crosstab(columns=df.Attrition,index=df.Department,normalize='index') # normailze = index gives row wise mean

[Cross Table Reference](https://pbpython.com/pandas-crosstab.html)

    1. 19 % HR Employees Leave
    2. 13% R&D Employees Leave
    3. 20% Sales Employees Leave

In [None]:
plot_target('Age')


**People who have higher age - Seniors have less tendency to leave the organization as compared to Young employees.**

**This makes sense, as young employees look to explore opportunities and experiment with their careers, whereas older employees have already been there and done that, and have now settled for good.**

In [None]:
# Mean monthly income per attrition class (catplot is the current name of factorplot).
sns.catplot(data=df,kind='bar',x='Attrition',y='MonthlyIncome')

**Employees with high income don't quit**

In [None]:
plot_target(df.JobSatisfaction.name)

**Job Satisfaction level 1 and level 3 employees quit the most. Why Level 3 Though?**

**Maybe because there are more employees with job satisfaction level 3, hence the trend.** 

In [None]:
age_cross_tab = pd.crosstab(columns=df.Attrition,index=df.Age,margins_name='Total',margins=True)

In [None]:
age_cross_tab['Attrition_Ratio'] = age_cross_tab.Yes/age_cross_tab.Total

In [None]:
age_cross_tab

    . Attrition Ratio is the Highest for Age 19

In [None]:
pd.crosstab(columns=[df.Attrition],index=[df.Gender],margins=True,normalize='index')

    1. 14.7% females left
    2. 17% males left
    3. Overall 16.1% employees left

In [None]:
pd.crosstab(columns=df.Attrition,index=df.JobLevel,margins=True,normalize='index')

**People from job level 1 leave the most followed by job level 3**

In [None]:
pd.crosstab(columns=df.Attrition,index=[df.JobLevel,df.JobSatisfaction],margins=True,normalize='index')

**People with Job Satisfaction 1 - Poor leave the most**

In [None]:
pd.crosstab(columns=[df.Attrition],index=[df.EnvironmentSatisfaction],margins=True,normalize='index')

**Poor (1) Environment Satisfaction also results in employee leaving the company**

In [None]:
pd.crosstab(columns=[df.Attrition],index=df.YearsWithCurrManager,margins=True,normalize='index')

**It can be seen that employees with new managers leave the most. Could it be an unhealthy relationship between the two?**

In [None]:
pd.crosstab(columns=[df.Attrition],index=df.YearsSinceLastPromotion,margins=True,normalize='index')

**Promotion can also be a reason for the employee leaving**

In [None]:
pd.crosstab(columns=[df.Attrition],index=[df.WorkLifeBalance],margins=True,normalize='index')

**More Employees with Poor work life balance leave**

In [None]:
pd.crosstab(columns=[df.Attrition],index=[df.BusinessTravel],margins=True,normalize='index')

## Feature Selection

In [None]:
# Using RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


In [None]:
df.shape

In [None]:
df.head()

### Feature Encoding

### Encoding Salary to low medium high

In [None]:
# Encoding to low / medium / high based on ranges
def encode_salary(salary):
    """Bucket a monthly salary into 'Low', 'Medium' or 'High'.

    The bands are [1009, 7339) -> 'Low', [7339, 13669) -> 'Medium' and
    [13669, 19999] -> 'High'. Anything outside 1009..19999 (including NaN)
    matches no band and yields None.
    """
    if 1009 <= salary < 7339:
        return 'Low'
    if 7339 <= salary < 13669:
        return 'Medium'
    if 13669 <= salary <= 19999:
        return 'High'
    return None
    

In [None]:
df['Income_Cat'] = df['MonthlyIncome'].apply(encode_salary)

In [None]:
df.Income_Cat.value_counts()

In [None]:
df.Income_Cat.shape

In [None]:
df.isnull().sum(axis=0)

In [None]:
df.Income_Cat.value_counts()

In [None]:
dic = {'Low':0,'Medium':1, 'High':2}
df.Income_Cat = df.Income_Cat.map(dic)
df.Income_Cat.head()

In [None]:
df.Income_Cat.value_counts()

In [None]:
df.shape

### Dropping columns which we think aren't important

In [None]:
df.drop(['BusinessTravel','DailyRate','EmployeeCount','EmployeeNumber','HourlyRate','MonthlyRate'
          ,'NumCompaniesWorked','Over18','StandardHours', 'StockOptionLevel','TrainingTimesLastYear','MonthlyIncome'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
def feature_encode(feature):
    """Label-encode column ``feature`` of the module-level ``df`` in place.

    The fitted encoder's classes are printed so the integer codes can be
    traced back to the original category names.

    feature: name of the column to encode.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df[feature])
    df[feature] = encoded
    print(encoder.classes_)

In [None]:
cat_df = df.select_dtypes(include='object')
cat_df.columns

In [None]:
for col in cat_df.columns:
    feature_encode(col)

In [None]:
df.head()

In [None]:
df.Income_Cat.value_counts()

In [None]:
df.dtypes

### We have encoded categorical data. Everything is now numerical, but are of varying magnitudes and range.
#### It is important we perform feature scaling in such cases

### Feature Scaling

In [None]:
# scaler = StandardScaler()

In [None]:
# scaled_df = scaler.fit_transform(df.drop('Attrition',axis=1))
# X= scaled_df
# Y = df['Attrition'].to_numpy()

In [None]:
# X = df.loc[:,df.columns!='Attrition']
# X.head()

In [None]:
df.head()

In [None]:
# Y = df['Attrition']
# Y.head()

## Splitting into Train & Test data

In [None]:
X_2 = df.loc[:, df.columns != "Attrition"].values # All columns except Attrition
y_2 = df.loc[:, df.columns == "Attrition"].values.flatten() # Attrition column and flatten to bring it to row format

In [None]:
X_2

In [None]:
y_2

In [None]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.2, stratify=y_2, random_state=1)

In [None]:
X_train_2

In [None]:
y_train_2

In [None]:
X_train_2[y_train_2 == 1].shape

### We know that the dataset is imbalanced and hence we perform upsampling of the minority class below

# Upsampling minority class

[Upsampling Reference](https://elitedatascience.com/imbalanced-classes)

In [None]:
X_train_u, y_train_u  = resample(X_train_2[y_train_2 == 1],
                                y_train_2[y_train_2==1],
                                 replace = True,
                                 n_samples=X_train_2[y_train_2 == 0].shape[0],
                                random_state=1
                                )

In [None]:
# Combine majority class with upsampled minority class
X_train_u = np.concatenate((X_train_2[y_train_2 == 0], X_train_u))
y_train_u = np.concatenate((y_train_2[y_train_2 == 0], y_train_u))


In [None]:
print("Original shape:", X_train_2.shape, y_train_2.shape)
print("Upsampled shape:", X_train_u.shape, y_train_u.shape)

# Building Models

In [None]:
# Grid-search a random forest on both the original and the upsampled training data
methods_data = {"Original": (X_train_2, y_train_2),
                "Upsampled": (X_train_u, y_train_u)}

# The same search space is used for both data sets.
hyperparam_grid = {
    "randomforestclassifier__n_estimators": [10, 50, 100, 500],
    "randomforestclassifier__max_features": ["sqrt", "log2", 0.4, 0.5],
    "randomforestclassifier__min_samples_leaf": [1, 3, 5],
    "randomforestclassifier__criterion": ["gini", "entropy"]}

for method, (X_fit, y_fit) in methods_data.items():
    forest = RandomForestClassifier(n_estimators=500,
                                    class_weight="balanced",
                                    random_state=123)
    pip_rf = make_pipeline(StandardScaler(), forest)

    gs_rf = GridSearchCV(pip_rf,
                         hyperparam_grid,
                         scoring="f1",
                         cv=10,
                         n_jobs=-1)
    gs_rf.fit(X_fit, y_fit)

    print("\033[1m" + "\033[0m" + "The best hyperparameters for {} data:".format(method))
    for param_name, param_value in gs_rf.best_params_.items():
        # Strip the "<pipeline step>__" prefix from the parameter name.
        print(param_name.partition("__")[2], ": ", param_value)

    best_f1_pct = (gs_rf.best_score_) * 100
    print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format(best_f1_pct))
    

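### Upsampling has the highest CV f1-score with 98.55%.
#### Note: the minority class was upsampled before cross-validation, so copies of the same row can land in both the training and validation folds; this CV score is therefore optimistic, and the held-out test set is the fairer benchmark.
#### We will use upsampled data on other models too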

In [None]:
X_train_u[y_train_u == 0].shape, X_train_u[y_train_u == 1].shape

### After upsampling Our data is divided into 50% Attrition =1 and 50 % Attrition = 0

### Refitting Random Forest with Upsampled Data

In [None]:
# Refit the RF classifier on the upsampled data with the chosen hyper-parameters
rf_settings = dict(n_estimators=500,
                   criterion="gini",
                   max_features='sqrt',
                   min_samples_leaf=1,
                   class_weight="balanced",
                   n_jobs=-1,
                   random_state=123)
clf_rf = make_pipeline(StandardScaler(), RandomForestClassifier(**rf_settings))

clf_rf.fit(X_train_u, y_train_u)

In [None]:
# Plot the confusion matrix of the refitted random forest on the test set
np.set_printoptions(precision=2)
# display_labels needs one label per class. Attrition was label-encoded, so
# class 0 is 'No' and class 1 is 'Yes'.
# NOTE: plot_confusion_matrix was removed in newer scikit-learn releases;
# ConfusionMatrixDisplay.from_estimator is its replacement there.
disp = plot_confusion_matrix(clf_rf,X_test_2,y_test_2,display_labels=['No', 'Yes'],cmap=plt.cm.Blues)

In [None]:
# Build Gradient Boosting classifier
# The loss is left at its default (the log-loss, formerly spelled "deviance"):
# the old spelling is rejected by newer scikit-learn releases, and the default
# gives the same model on every version.
pip_gb = make_pipeline(StandardScaler(),
                       GradientBoostingClassifier(random_state=123))

hyperparam_grid = {"gradientboostingclassifier__max_features": ["log2", 0.5],
                   "gradientboostingclassifier__n_estimators": [100, 300, 500],
                   "gradientboostingclassifier__learning_rate": [0.001, 0.01, 0.1],
                   "gradientboostingclassifier__max_depth": [1, 2, 3]}

gs_gb = GridSearchCV(pip_gb,
                      param_grid=hyperparam_grid,
                      scoring="f1",
                      cv=10,
                      n_jobs=-1)

gs_gb.fit(X_train_u, y_train_u)

print("\033[1m" + "\033[0m" + "The best hyperparameters:")
print("-" * 25)
for hyperparam in gs_gb.best_params_.keys():
    # Drop the "<pipeline step>__" prefix before printing the parameter name.
    print(hyperparam[hyperparam.find("__") + 2:], ": ", gs_gb.best_params_[hyperparam])

print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_gb.best_score_) * 100))

### GBT has an F1-score of 95.5%

In [None]:
# Build logistic model classifier
# The grid below searches both "l1" and "l2" penalties, so the solver must
# support both; liblinear does, whereas the default solver rejects "l1".
pip_logmod = make_pipeline(StandardScaler(),
                           LogisticRegression(class_weight="balanced",
                                              solver="liblinear",
                                              random_state=123))

hyperparam_range = np.arange(0.5, 20.1, 0.5)

hyperparam_grid = {"logisticregression__penalty": ["l1", "l2"],
                   "logisticregression__C":  hyperparam_range,
                   "logisticregression__fit_intercept": [True, False]
                  }

# Score with F1 over 10 folds, as for the random forest and gradient boosting
# searches, so the figure printed below is what its label says and the three
# models are comparable.
gs_logmodel = GridSearchCV(pip_logmod,
                           hyperparam_grid,
                           scoring="f1",
                           cv=10,
                           n_jobs=-1)

gs_logmodel.fit(X_train_u, y_train_u)

print("\033[1m" + "\033[0m" + "The best hyperparameters:")
print("-" * 25)
for hyperparam in gs_logmodel.best_params_.keys():
    # Drop the "<pipeline step>__" prefix before printing the parameter name.
    print(hyperparam[hyperparam.find("__") + 2:], ": ", gs_logmodel.best_params_[hyperparam])

print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_logmodel.best_score_) * 100))

### Logistic Regression has an F1-score of 75.10%

### Printing out the Accuracy and f1-score of each model 

In [None]:
estimators = {"RF": clf_rf,
              "LR": gs_logmodel,
              "GBT": gs_gb
             }

# Report accuracy and f1-score of every fitted estimator on the held-out test data
print("The accuracy rate and f1-score on test data are:")
for label, fitted in estimators.items():
    test_pred = fitted.predict(X_test_2)
    accuracy_pct = accuracy_score(y_test_2, test_pred) * 100
    f1_pct = f1_score(y_test_2, test_pred) * 100
    print("{}: {:.2f}%, {:.2f}%.".format(label, accuracy_pct, f1_pct))

### Printing out other important metrics

In [None]:
model_names=['RandomForestClassifier','Logistic Regression','GradientBoostingClassifier']
models = [clf_rf,gs_logmodel,gs_gb]

In [None]:
def compare_models(model):
    """Fit ``model`` on the upsampled training data and record its test metrics.

    Accuracy, precision, recall and ROC AUC on ``X_test_2`` / ``y_test_2`` are
    appended to the module-level lists ``acc``, ``prec``, ``rec`` and ``auroc``.
    The AUC is computed from the hard class predictions, not from probabilities.

    model: estimator exposing ``fit`` and ``predict``; it is (re)fitted here.
    """
    clf=model
    clf.fit(X_train_u,y_train_u)
    pred=clf.predict(X_test_2)

    # Calculating various metrics
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round reports precision as recall and vice versa, and distorts the AUC.
    acc.append(accuracy_score(y_test_2,pred))
    prec.append(precision_score(y_test_2,pred))
    rec.append(recall_score(y_test_2,pred))
    auroc.append(roc_auc_score(y_test_2,pred))

In [None]:
acc=[]
prec=[]
rec=[]
auroc=[]

In [None]:
for model in models:
    compare_models(model)
d={'Modelling Algo':model_names,'Accuracy':acc,'Precision':prec,'Recall':rec,'Area Under ROC Curve':auroc}
met_df=pd.DataFrame(d)
met_df

### Finding the Important features

In [None]:
# NOTE: this rebinds clf_rf from the scaler+forest pipeline to a bare forest.
clf_rf = RandomForestClassifier(n_estimators=500,
                                criterion="gini",
                                max_features='sqrt',
                                min_samples_leaf=1,
                                class_weight="balanced",
                                n_jobs=-1,
                                random_state=123)


clf_rf.fit(StandardScaler().fit_transform(X_train_u), y_train_u)

# Plot features importance, most important first
importances = clf_rf.feature_importances_
indices = np.argsort(importances)[::-1]
# Feature names in the same column order as the training matrix.
feature_names = df.columns[df.columns != "Attrition"]
# Bar positions are derived from the number of features rather than hard-coded,
# so the plot keeps working if columns are added or dropped upstream.
positions = range(1, len(importances) + 1)
plt.figure(figsize=(12, 6))
plt.bar(positions, importances[indices], align="center")
plt.xticks(positions, feature_names[indices], rotation=90)
plt.title("Feature Importance", {"fontsize": 16});

**Conclusion:** Age plays a vital role in attrition according to our analysis and model. This is plausible, as younger employees commonly switch organizations in order to explore fields and find what best suits them.