#### This notebook analyzes why employees leave an organization. It uses the IBM HR Employee Attrition data set available from Kaggle.

In [None]:
# Importing the required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the HR attrition CSV into a DataFrame
hr_data = pd.read_csv(filepath_or_buffer='../input/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [None]:
# Preview the first five rows of the data set
hr_data.head(n=5)

In [None]:
# Per column: True when at least one value is NA / missing
hr_data.isna().any(axis=0)

In [None]:
# Shape of the data as a (number of rows, number of columns) tuple
hr_data.shape

Data set has 1470 rows and 35 Columns

In [None]:
# Data type of every column (used to tell numeric columns from text/categorical ones)
hr_data.dtypes

In [None]:
# Fraction of employees in each Attrition class (normalize=True returns shares, not counts)
hr_data['Attrition'].value_counts(normalize=True)

16% of the employees left the company and 84% did not (a class imbalance problem).

### EDA

In [None]:
# Bar chart of the number of employees in each Attrition class
plt.figure(figsize=(8, 6))
Attrition = hr_data['Attrition'].value_counts()
sb.barplot(x=Attrition.index, y=Attrition.values)
plt.title(label='Distribution of Employee Turnover')
plt.xlabel(xlabel='Employee Turnover', fontsize=16)
plt.ylabel(ylabel='Count', fontsize=16)

We can clearly see that the dataset is imbalanced (84% No, 16% Yes).

In [None]:
# Distribution of employee age
sb.distplot(a=hr_data['Age'])

Age is fairly normally distributed, with an average age around 35. Most of the employees are in the range 25 to 45.

In [None]:
# Distribution of monthly income
sb.distplot(a=hr_data['MonthlyIncome'])

Most of the employees have a monthly income around $5000, and the distribution is right-skewed.

In [None]:
# Distribution of total working years
sb.distplot(a=hr_data['TotalWorkingYears'])

Total Working Years is also right skewed similar to Age

In [None]:
#Method that plots density plots on the column passed as input
def kdePlot(var):
    """Overlay the density of ``var`` for employees who stayed and those who left.

    Reads the module-level ``hr_data`` frame and selects rows by comparing its
    ``Attrition`` column with the raw labels 'No' and 'Yes', so it must be
    called before that column is re-encoded.

    Parameters
    ----------
    var : str
        Name of the ``hr_data`` column to plot (the notebook passes numeric
        columns).
    """
    plt.figure(figsize=(15,4))
    stayed = hr_data.loc[hr_data['Attrition'] == 'No', var]
    left = hr_data.loc[hr_data['Attrition'] == 'Yes', var]
    sb.kdeplot(stayed, color='b', shade=True, label='no Attrition')
    sb.kdeplot(left, color='r', shade=True, label='Attrition')
    plt.title('Employee Attrition with respect to {}'.format(var))
    plt.xlabel(var)
    # Draw the legend explicitly: recent seaborn releases no longer add one
    # just because `label=` was passed, which left the two curves unidentified.
    plt.legend()
    

In [None]:
numerical_df=hr_data.select_dtypes(include=np.number)
# Numeric columns that are skipped when drawing the per-class density plots
remove_columns=['Age','DistanceFromHome','Education','EmployeeCount','EmployeeNumber',
'EnvironmentSatisfaction' ,'HourlyRate','JobInvolvement','JobSatisfaction','MonthlyRate','NumCompaniesWorked',
'PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StandardHours','StockOptionLevel','TotalWorkingYears',
'TrainingTimesLastYear','WorkLifeBalance','YearsSinceLastPromotion']
# Index.drop keeps the remaining columns in their original order and raises
# KeyError if a name listed above is not one of the numeric columns.
numeric_cols_kdeplot=numerical_df.columns.drop(remove_columns).tolist()

In [None]:
# One density figure per selected numeric column
for column_name in numeric_cols_kdeplot:
    kdePlot(column_name)

### Observations

Attrition is high with Employees having joblevel as 1.


Employees whose monthly income is below $5000 have high attrition.


Employees who worked in the company around 0 to 1 years have high attrition.


Employees who are in current role for around 0 to 4 years have high attrition.

In [None]:
# Columns whose per-value attrition shares are drawn as bar charts
BarPlot_columns = [
    'Age',
    'DistanceFromHome',
    'EducationField',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'OverTime',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsInCurrentRole',
]

In [None]:
# Helper that draws the attrition share bar chart for one column
def Bar_plots(var):
    """Plot, for every value of ``var``, the share of rows in each Attrition class.

    Reads the module-level ``hr_data`` frame; each crosstab row is divided by
    its own total, so the bars of one ``var`` value sum to 1.
    """
    counts = pd.crosstab(hr_data[var], hr_data.Attrition)
    row_totals = counts.sum(axis=1).astype(float)
    shares = counts.div(row_totals, axis=0)
    shares.plot(kind="bar", stacked=False, figsize=(8,4))
    plt.xticks(rotation=90)

In [None]:
# One attrition share bar chart per selected column
for column_name in BarPlot_columns:
    Bar_plots(column_name)

### Observations

Attrition is very high among employees aged between 18 and 22. This might be due to students who are doing internships or small contract jobs.

Attrition is more when the distance of office is more from home

Attrition is higher among employees educated in Human Resources, Marketing or a Technical Degree than among those from other fields.

Attrition is high among employees whose job role is Human Resources, Sales Representative or Laboratory Technician.

Employees who work overtime have higher attrition than employees who do not work overtime.

Employees who are working less than 2 years have more attrition

In [None]:
#Replacing Yes with 1 and No with 0 in Attrition Column
# An explicit lookup replaces np.where(col == 'No', 0, 1) for two reasons:
#  * re-running this cell on already-encoded data keeps the 0/1 values
#    (np.where would turn every row into 1, because 0 != 'No');
#  * an unexpected label or a missing value makes astype(int) raise instead of
#    being silently counted as attrition.
attrition_codes = {'No': 0, 'Yes': 1}
hr_data['Attrition'] = (
    hr_data['Attrition']
    .map(lambda label: attrition_codes.get(label, label))
    .astype(int)
)

In [None]:
# Summary statistics, limited to the first 20 described columns
hr_data.describe().iloc[:, 0:20]

In [None]:
# How often each EmployeeCount value occurs
hr_data['EmployeeCount'].value_counts()

In [None]:
# How often each StandardHours value occurs
hr_data['StandardHours'].value_counts()

In [None]:
# Number of distinct EmployeeNumber values
len({*hr_data['EmployeeNumber']})

Dropping the columns EmployeeCount, StandardHours and EmployeeNumber, as these columns do not help in model building.

In [None]:
# Remove the three columns that are not used for modelling
hr_data = hr_data.drop(columns=['EmployeeCount', 'StandardHours', 'EmployeeNumber'])

In [None]:
# Per-class means of the numeric columns (first 20 columns of the result).
# numeric_only=True is required because hr_data still holds text columns:
# pandas >= 2.0 raises TypeError for them instead of silently skipping them.
hr_data.groupby('Attrition').mean(numeric_only=True).iloc[:,:20]

In [None]:
# Per-class means of the numeric columns (result columns 20 to 25).
# numeric_only=True is required because hr_data still holds text columns:
# pandas >= 2.0 raises TypeError for them instead of silently skipping them.
hr_data.groupby('Attrition').mean(numeric_only=True).iloc[:,20:26]

In [None]:
# Correlate the numeric columns only: hr_data still contains text columns, and
# DataFrame.corr() on such a frame raises on pandas >= 2.0. Selecting by dtype
# (instead of corr(numeric_only=True)) also works on older pandas releases.
corr_matrix = hr_data.select_dtypes(include=np.number).corr()
f , ax = plt.subplots(figsize=(20,12))
# Draw on the axes created above rather than relying on the implicit current axes
sb.heatmap(corr_matrix,vmax=0.8, annot=True, ax=ax)

In [None]:
# Split the frame by dtype and keep the column names of each part
numerical_df = hr_data.select_dtypes(include=np.number)
categorical_df = hr_data.select_dtypes(exclude=np.number)
numeric_cols = numerical_df.columns.tolist()
categorical_cols = categorical_df.columns.tolist()

In [None]:
# For every categorical column, print the Attrition shares within each of its values
for category_column in categorical_cols:
    share_by_value = pd.crosstab(hr_data['Attrition'], hr_data[category_column], normalize='columns')
    print(share_by_value)

In [None]:
# One-hot encode the categorical columns (first level of each dropped) and
# place them next to the numeric columns
categorical_df_dummies = pd.get_dummies(hr_data[categorical_cols], drop_first=True)
final_df = pd.concat(objs=[categorical_df_dummies, numerical_df], axis=1)

In [None]:
# Preview the first five rows of the modelling frame
final_df.head(n=5)

In [None]:
# Target vector and feature matrix
y = final_df['Attrition']
X = final_df.drop(columns=['Attrition'])

In [None]:
#Splitting Data in Train and Test Set
from sklearn.model_selection import train_test_split
# Same names as before; confusion_matrix was listed twice in the import.
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             precision_recall_curve, precision_score, roc_auc_score)
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=100)

In [None]:
#Building base model(Predicting that no employee leaves the company)
# Size the all-zero prediction from the data instead of hard-coding 1470 rows,
# so the baseline keeps working if the data set changes.
base=np.zeros(len(hr_data), dtype=int)
# scikit-learn metrics take (y_true, y_pred) in that order
print(accuracy_score(hr_data.Attrition, base))

The base model has 83.8% accuracy. Thus accuracy alone is not the best criterion for checking the performance of the model.

In [None]:
#Method that applies model on the data and Predict the attrition
def model(mod,model_name,x_tr,y_tr,x_tes,y_te):
    """Fit ``mod`` on the training data and print its metrics on the test data.

    Parameters
    ----------
    mod : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    model_name : str
        Label printed as the heading of the report.
    x_tr, y_tr
        Training features and training labels passed to ``mod.fit``.
    x_tes, y_te
        Test features passed to ``mod.predict`` and the true test labels the
        predictions are scored against.

    Returns
    -------
    None
        Accuracy, ROC AUC, confusion matrix and classification report are
        printed, not returned.
    """
    mod.fit(x_tr,y_tr)
    pred_dt=mod.predict(x_tes)
    print("     ",model_name,"      ")
    # scikit-learn metrics expect (y_true, y_pred). With the arguments swapped
    # the confusion matrix is transposed, precision and recall trade places in
    # the report, and ROC AUC is computed with the predictions as ground truth.
    print("Accuracy ",accuracy_score(y_te,pred_dt))
    print("ROC_AUC  ",roc_auc_score(y_te,pred_dt))
    cm=confusion_matrix(y_te,pred_dt)
    print("Confusion Matrix  \n",cm)
    print("                    Classification Report \n",classification_report(y_te,pred_dt))

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings, scored on the held-out split
lr = LogisticRegression()
model(mod=lr, model_name="Logistic Regression", x_tr=X_train, y_tr=y_train, x_tes=X_test, y_te=y_test)

Accuracy is 86% but Precision and Recall for class 1 is low

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fixed seed so the reported metrics are reproducible from run to run
# (same seed value as the train/test split).
dt=DecisionTreeClassifier(min_samples_leaf=20, max_depth=4, random_state=100)
model(dt,"Decision Tree",X_train,y_train,X_test,y_test)

Accuracy is 85% but again Precision and Recall for class 1 is low

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic (bootstrap samples and feature sub-sampling):
# without a seed every run prints different metrics. Same seed value as the split.
rf=RandomForestClassifier(n_estimators=10,max_depth=4,random_state=100)
model(rf,"Random Forest",X_train,y_train,X_test,y_test)

Accuracy is 85% but even though recall is better ,Precision  is low for class 1

### Due to the class imbalance problem, the models are not able to make good predictions for class 1 (Attrition = Yes)

In [None]:
#Performing OverSample using SMOTE(Synthetic Minority Over Sampling Technique)
from imblearn.over_sampling import SMOTE
# SMOTE draws random neighbours when synthesising samples; seed it so the
# oversampled data (and every metric computed from it) is reproducible.
smote=SMOTE(random_state=100)

In [None]:
# fit_sample was deprecated and later removed from imbalanced-learn;
# fit_resample is the supported name for the same operation.
X_sm, y_sm=smote.fit_resample(X,y)

In [None]:
# Split the ORIGINAL data first and oversample the training fold only.
# Resampling before the split leaks information: synthetic points interpolated
# from rows that end up in the test set are trained on, and the test set itself
# is filled with synthetic samples, which inflates every reported metric.
X_train_raw,X_test_sm,y_train_raw,y_test_sm=train_test_split(X,y,test_size=0.2,random_state=100)
X_train_sm,y_train_sm=smote.fit_resample(X_train_raw,y_train_raw)

In [None]:
# Logistic regression trained and scored on the oversampled split
lr_sm = LogisticRegression()
model(mod=lr_sm, model_name="Logistic Regression", x_tr=X_train_sm, y_tr=y_train_sm, x_tes=X_test_sm, y_te=y_test_sm)

> Accuracy is 80%(less than the model with original data set) but Precision and Recall for class 1 is improved

In [None]:
# Fixed seed so the reported metrics are reproducible from run to run
# (same seed value as the train/test split).
dt_sm=DecisionTreeClassifier(min_samples_leaf=20, max_depth=4, random_state=100)
model(dt_sm,"Decision Tree",X_train_sm,y_train_sm,X_test_sm,y_test_sm)

Accuracy is 85% but Precision and Recall for class 1 is improved

In [None]:
# A random forest is stochastic (bootstrap samples and feature sub-sampling):
# without a seed every run prints different metrics. Same seed value as the split.
rf_sm=RandomForestClassifier(n_estimators=10,max_depth=4,random_state=100)
model(rf_sm,"Random Forest",X_train_sm,y_train_sm,X_test_sm,y_test_sm)

Accuracy is high and  Precision and Recall for class 1 is also High