Abstract : HR Analytics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime as dt
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the raw HR attrition dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('HR_Employee_Attrition.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head()

In [None]:
# Reorder the frame so that the target column 'Attrition' comes last,
# after every other column in its existing order.
feature_columns = [name for name in df.columns if name != 'Attrition']
df = df[feature_columns + ['Attrition']]

In [None]:
# Column dtypes and non-null counts of the reordered frame.
df.info()

In [None]:
# Column labels in their current order (Attrition was moved to the end above).
df.columns

In [None]:
# Distinct values of the target column; it is label-encoded in the Data Manipulation section.
df['Attrition'].unique()

In [None]:
# Distinct values of Gender; this column is label-encoded in the Data Manipulation section.
df['Gender'].unique()

In [None]:
# Distinct values of BusinessTravel; this column is label-encoded in the Data Manipulation section.
df['BusinessTravel'].unique()

In [None]:
# Distinct values of Department; this column is label-encoded in the Data Manipulation section.
df['Department'].unique()

In [None]:
# Distinct values of EducationField; this column is label-encoded in the Data Manipulation section.
df['EducationField'].unique()

In [None]:
# Distinct values of JobRole; this column is label-encoded in the Data Manipulation section.
df['JobRole'].unique()

In [None]:
# Distinct values of MaritalStatus; this column is label-encoded in the Data Manipulation section.
df['MaritalStatus'].unique()

In [None]:
# Count how often each Over18 value occurs.
# The column is dropped in the Data Manipulation section: it only states that
# all employees are 18 years and above, so it carries no information.
df['Over18'].value_counts()

In [None]:
# Distinct values of OverTime; this column is label-encoded in the Data Manipulation section.
df['OverTime'].unique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Data Visualization

In [None]:
# Re-display the first rows as a reference for the plots in this section.
df.head()

In [None]:
# Range of ages of employees in the organisation, drawn as a histogram with a KDE overlay.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot replace it);
# it is kept here so the figure stays exactly as before.
age_ax = sns.distplot(df['Age'], color='green')
age_ax.set_title("Age Distribution of Employees")

    Most employees are between 20 and 60 years old, with many being around 35 years of age.

In [None]:
# Number of employees in each business-travel category.
# The column is passed by keyword because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the `x` vector; the keyword form works
# on both old and new seaborn releases.
sns.countplot(x='BusinessTravel', data=df)
plt.title("Types of Employees on the move")

    Very few people need to travel in this organisation.

In [None]:
# Distribution of the years spent in the current role, using six histogram bins.
role_ax = sns.distplot(df['YearsInCurrentRole'], bins=6, color='brown')
role_ax.set_title("Years In Current Role")


        Most people have 0-10 years of experience in the present role. 

In [None]:
# One bar per department and attrition group; barplot aggregates y ('Age') with
# its default estimator (the mean), so bar height is an average age, not a head count.
dept_ax = sns.barplot(data=df, x='Department', y='Age', hue='Attrition')
dept_ax.set_title("Attrition based on age and department")

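    Most people who have left the organisation are from the Sales department, followed by R&D and then HR. Note that the bar plot above shows the mean age per department and attrition group, not head counts, so this ranking should be confirmed with a count of leavers per department.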

In [None]:
# Visual null check: heatmap of the boolean missing-value mask of df
# (one cell per row/column pair).
sns.heatmap(df.isnull())

     No null values are present.

In [None]:
# Scatter of total working years against monthly income with a fitted regression line.
income_ax = sns.regplot(data=df, x='TotalWorkingYears', y='MonthlyIncome', color='red')
income_ax.set_title("Years of Experience vs Income")

       The higher the experience, the higher the income. Most employees have between 0 and 20 total working years.

# Data Manipulation

In [None]:
#Encode the columns
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
cols = ['Attrition','Gender','BusinessTravel','Department','EducationField','JobRole','MaritalStatus','OverTime']
for column_name in cols:
    # A fresh encoder per column, so every column gets its own code table.
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values
# Confirm the resulting dtypes.
df.info()

In [None]:
#drop unrelated columns
# All four are removed from df in place with a single call.
unrelated_columns = ['Over18', 'EmployeeCount', 'EmployeeNumber', 'StandardHours']
df.drop(unrelated_columns, axis=1, inplace=True)

In [None]:
#Find correlation between variables
# Open a large figure first so the annotated correlation matrix stays readable.
plt.figure(figsize=(19, 15))
corr = df.corr()
sns.heatmap(corr, annot=True)

In [None]:
#Drop the following columns as they dont make much impact to target variable.
# Both are removed from df in place with a single call.
low_impact_columns = ['DailyRate', 'HourlyRate']
df.drop(low_impact_columns, axis=1, inplace=True)


In [None]:
#Check for skewness
col = df.columns.values
plt.figure(figsize=(20, 35))
# One distribution panel per column on a 10x5 grid; subplot positions are 1-based.
for position, column_name in enumerate(col, start=1):
    plt.subplot(10, 5, position)
    sns.distplot(df[column_name], color='crimson')
plt.show()

    The data is not symmetrically distributed: most columns, such as YearsAtCompany and PercentSalaryHike, are right-skewed.

In [None]:
#Check for outliers :
# One vertical box plot per column of `col` (the column-label array built in the
# skewness cell) on a 10x5 grid; subplot positions are 1-based.
plt.figure(figsize=(8,20))
for position, column_name in enumerate(col, start=1):
    plt.subplot(10, 5, position)
    # The values are passed as `y=` because seaborn >= 0.12 treats the first
    # positional argument as `data`, not as the plotted vector.
    sns.boxplot(y=df[column_name], palette='rocket', orient='v')
# The layout only needs to be computed once, after every panel exists.
plt.tight_layout()

    Outliers are represented by dots and are present in few columns.

In [None]:
#Use z-score to find outliers :
from scipy.stats import zscore

# Absolute z-score of every cell of df.
z = np.abs(zscore(df))
# Row and column positions of every cell whose absolute z-score exceeds 3.
outlier_positions = np.where(z > 3)
print(outlier_positions)

In [None]:
#remove the outliers.
# Keep a row only when every one of its columns has an absolute z-score below 3.
within_three_sigma = (z < 3).all(axis=1)
df_new = df[within_three_sigma]

In [None]:
# Shape before and after outlier removal, separated by a tab.
print(df.shape, df_new.shape, sep=" \t ")

In [None]:
#Split the outlier-free frame into features (X) and target (y) :

from sklearn.model_selection import train_test_split

X = df_new.drop('Attrition',axis=1)
# Select the target by name instead of by position: `iloc[:, -1:]` only worked
# because an earlier cell moved Attrition to the last column. Selecting the
# column directly also yields the 1-D target that scikit-learn estimators expect,
# rather than a single-column DataFrame.
y = df_new['Attrition']

In [None]:
#Check skewness value.
# Per-column sample skewness of the feature matrix before any transformation.
X.skew()

In [None]:
#Use PowerTransformer to remove skewness.
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='yeo-johnson')
# fit_transform returns a bare array, so the frame is rebuilt around it.
# X's row index is carried over so X and y keep sharing the same labels; the
# columns deliberately stay as integer positions because later cells address
# column 18 by that label.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split below, so test rows influence the fitted transformation.
X = pd.DataFrame(pt.fit_transform(X), index=X.index)

# Skewness after the transformation, for comparison with the previous cell.
X.skew()

In [None]:
# Value frequencies of transformed column 18 (an integer label assigned when X
# was rebuilt from the PowerTransformer output).
# NOTE(review): presumably this column turned out (near-)constant, which is why
# it is dropped next — confirm from the cell output.
X[18].value_counts()

In [None]:
# Drop transformed column 18 from X in place.
# NOTE(review): the label is positional, so it targets a different feature if the
# upstream column drops change; re-running this cell raises KeyError because the
# label is already gone.
X.drop(18,axis=1,inplace=True)

In [None]:
# Dimensions of the feature matrix after dropping column 18.
X.shape

In [None]:
# Show the target's dimensions and a short preview. Only the last expression of
# a cell is displayed, so the shape is printed explicitly instead of being
# evaluated and silently discarded.
print(y.shape)
y.head()

# Model Prediction and Validation

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression as LR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.linear_model import Lasso,Ridge,ElasticNet


In [None]:
# Fit each candidate regressor on the same train split and report its test metrics.
# NOTE(review): the target was label-encoded from a categorical column upstream,
# so these regressors fit a numeric code — confirm that regression (rather than
# classification) is intended.
model = [LR(),DTR(),KNR(),SVR(),Lasso(),Ridge(),ElasticNet()]

# The split uses a fixed random_state, so it is identical for every model:
# build it once instead of once per iteration.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=40)
separator = "*************************************************************************************"

for estimator in model:
    estimator.fit(x_train,y_train)
    y_pred = estimator.predict(x_test)
    r2Score = r2_score(y_test,y_pred)
    # Computed once and reused for both the MSE and the RMSE line.
    mse = mean_squared_error(y_test,y_pred)

    print(separator)
    print(estimator)
    print("R2 Score : ",r2Score)
    print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
    print("Mean Squared Error : " ,mse)
    print("Root Mean Squared Error : " ,np.sqrt(mse))
    print("")
    print(separator)

# Predictions of the last model in the list only (y_pred is overwritten per model).
print(y_pred)

     LinearRegression performed better than the other models. 

In [None]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validated R2 of an untuned linear regression on the full X / y.
baseline_lr = LR()
separator = "*************************************************************************************"
cv_score = cross_val_score(baseline_lr, X, y, cv=4, scoring='r2')

print(separator)
print("Score for ",baseline_lr," : ")
print("Score : ", cv_score)
print("Mean : ", cv_score.mean())
print("Standard Deviation : ", cv_score.std())
print(separator)
print("")

In [None]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LinearRegression.
parameters = {'fit_intercept':[True,False], 'copy_X':[True, False]}
# `normalize` was removed from LinearRegression in scikit-learn 1.2, where
# searching over it raises an invalid-parameter error. Only add it to the grid
# when the installed estimator still exposes it.
if 'normalize' in LR().get_params():
    parameters['normalize'] = [True, False]

# Exhaustive search over the grid on the full X / y, using all CPU cores.
gridsearch = GridSearchCV(LR(),parameters,n_jobs=-1,pre_dispatch=2)
gridsearch.fit(X,y)
gridsearch.best_params_

In [None]:
# Final linear model. `normalize` is no longer passed: the argument was removed
# in scikit-learn 1.2 (passing it raises TypeError there), and the value used
# before, False, was its default on older versions, so the model is unchanged.
lr = LR(copy_X=True,fit_intercept=True)
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state = 167 )
lr.fit(x_train,y_train)

# R2 on the training split, followed by the metrics on the held-out split.
score = lr.score(x_train,y_train)
print("Score for Linear regression : ",score)
y_pred = lr.predict(x_test)
r2Score = r2_score(y_test,y_pred)
# Computed once and reused for both the MSE and the RMSE line.
mse = mean_squared_error(y_test,y_pred)
print("R2 Score for Linear Regression : ",r2Score)
print("Mean Squared Error : " ,mse)
print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
print("Root Mean Squared Error : " ,np.sqrt(mse))



In [None]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validated R2 of the final linear model `lr` on the full X / y.
separator = "*************************************************************************************"
cv_score = cross_val_score(lr, X, y, cv=4, scoring='r2')

print(separator)
print("Score for ",lr," : ")
print("Score : ", cv_score)
print("Mean : ", cv_score.mean())
print("Standard Deviation : ", cv_score.std())
print(separator)
print("")

In [None]:
def calBestRandomStateOf(model):
    """Return the train/test split seed (40..499) that gives `model` its best test R2.

    For every seed, the notebook-level X and y are split 80/20, `model` is
    refitted on the train part and scored with r2_score on the test part.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X); it is refitted on every
        iteration, so it is left fitted on the last split tried.

    Returns
    -------
    int or None
        The seed with the highest R2 (the first one on ties), or None when no
        score compared greater than the running best (e.g. every score is NaN).

    NOTE(review): choosing the split seed by its test-set score makes the
    reported score optimistic; treat the result as exploratory.
    """
    # Start below any attainable R2. R2 can be negative, and starting at 0 left
    # `final_state` unbound (UnboundLocalError) whenever no split scored above 0.
    max_score = float('-inf')
    final_state = None
    for i in range(40,500):
        x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=i)
        model.fit(x_train,y_train)
        pred = model.predict(x_test)
        score = r2_score(y_test,pred)
        if score>max_score:
            max_score = score
            final_state = i

    return final_state

In [None]:
#Adaboost Regressor and RandomforestRegressor
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import RandomForestRegressor as RFR


In [None]:
# Three ensemble regressors with 20 estimators each and fixed seeds.
ada = ABR(n_estimators=20,random_state=200)
gradient = GBR(n_estimators=20,random_state=62)
rfr = RFR(n_estimators=20,random_state=72)

boosting_model = [ada,gradient,rfr]
rule = "-----------------------------------------------------------"

# Each model is fitted and scored on the x_train / x_test split that is already
# in the kernel namespace.
for boost in boosting_model:
    boost.fit(x_train,y_train)
    pred = boost.predict(x_test)
    r2Score = r2_score(y_test,pred)
    print(rule)
    print(boost)
    print(rule)
    print("R2 Score : ", r2Score)
    print("\n")

     Even after trying ensemble models (boosting and random forest), LinearRegression's scores are better than the rest, and hence it will be our final model.

In [None]:
# joblib is a standalone package; the `sklearn.externals.joblib` shim was removed
# in scikit-learn 0.23, so import joblib directly and fall back to the shim only
# on old installations that lack the standalone package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted linear model to disk.
joblib.dump(lr,'linearRegressionModel.obj')

# Reload it to check the round trip. joblib.load unpickles the file, so only
# load model files that come from a trusted source.
linearReg_from_joblib = joblib.load('linearRegressionModel.obj')

# Predictions of the reloaded model on the current test split.
final_output = linearReg_from_joblib.predict(x_test)

In [None]:
#save final output to a csv file :
# Wrap the prediction array in a frame, then write it (with its index) to disk.
output_frame = pd.DataFrame(final_output)
output_frame.to_csv("HR_Output.csv")