In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
# Installs a blanket "ignore" filter: no warning of any category is shown for the
# rest of the session.
# NOTE(review): this also hides deprecation/behaviour-change warnings raised by the
# libraries used below — consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Reading Data

In [None]:
# Read the IBM HR attrition CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# Exploratory Data Analysis

In [None]:
# Dimensions of the loaded table as (rows, columns).
df.shape

In [None]:
# Summary statistics for the columns of the loaded table.
df.describe()

In [None]:
# Number of distinct values per column, smallest first, so constant columns
# (count 1) and identifier-like columns (count == number of rows) stand out.
distinct_counts = df.nunique()
distinct_counts.sort_values()

In [None]:
# Over18, StandardHours and EmployeeCount each hold a single constant value, so they
# carry no signal for a model. EmployeeNumber is different for every record (it acts
# as a row identifier), so it is dropped as well.
uninformative_columns = ['Over18', 'StandardHours', 'EmployeeCount', 'EmployeeNumber']
df.drop(columns=uninformative_columns, inplace=True)

In [None]:
# Visual check for missing data: each cell of the heatmap is the True/False
# result of df.isnull() for that row and column.
missing_mask = df.isnull()
plt.figure(figsize=(20, 10))
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')
plt.show()

There are no null values in the dataset.

In [None]:
# Encode the three two-level text columns as 0/1 integers.
#
# The result is assigned back to the frame instead of calling
# `df.<col>.replace(..., inplace=True)`: that chained form operates on a temporary
# Series under pandas Copy-on-Write and can leave `df` untouched. The explicit
# astype(int) guarantees an integer dtype even when `replace` keeps the column as
# object, so these columns are treated as numeric by the dtype split and the
# one-hot encoding further down.
# Re-running the cell is safe: already-encoded values are simply left as they are.
binary_encodings = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
    'OverTime': {'Yes': 1, 'No': 0},
}
for column, mapping in binary_encodings.items():
    df[column] = df[column].replace(mapping).astype(int)

In [None]:
# Split the frame by dtype: object columns on one side, everything else on the other.
object_dtype = ['object']
num_fea = df.select_dtypes(exclude=object_dtype)   # non-object columns
obj_fea = df.select_dtypes(include=object_dtype)   # object columns
obj_fea

In [None]:
# Mean of the 0/1 Attrition flag for every level of each object-typed feature,
# one panel per feature on a 3x2 grid.
plt.figure(figsize=(20, 20))
for panel, column in enumerate(obj_fea.columns, start=1):
    plt.subplot(3, 2, panel)
    sns.barplot(data=df, x=column, y='Attrition')
    plt.xticks(rotation=45)
plt.show()

Observations : - 

1. Employees who travel frequently for business tend to leave the company more than those who travel rarely, and employees who do not travel at all are the most likely to stay.

2. Research & Development employees tend to stay with the company longer than employees in other departments.

3. Employees whose education field is Human Resources, Technical Degree, or Marketing tend to leave the company more than those from other educational backgrounds.

4. Sales Representatives leave the company much earlier than other job roles, while Research Directors rarely leave.

5. Single employees tend to leave the company more than married or divorced employees.


In [None]:
# Preview the first rows of the non-object columns collected in num_fea.
num_fea.head()

In [None]:
# Numeric features, one at a time.

# Age: mean of the 0/1 Attrition flag for each distinct age.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='Age', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observations:- 

1. Teenagers are much more likely to leave the company than middle-aged employees.

2. Employees older than 55 are also more likely to leave the company than middle-aged employees.

In [None]:
# DistanceFromHome: mean of the 0/1 Attrition flag for each distinct distance value.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='DistanceFromHome', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observation:-

1. Employees who travel a long distance to the workplace tend to leave the company more than others.

In [None]:
# Education: mean of the 0/1 Attrition flag for each education level.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='Education', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observations:-

1. Less-educated employees tend to leave the company sooner than others.

In [None]:
# EnvironmentSatisfaction: mean of the 0/1 Attrition flag for each satisfaction level.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='EnvironmentSatisfaction', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observations:-

1. Employees who are least satisfied with the work environment tend to leave the company sooner than others.

In [None]:
# JobInvolvement: mean of the 0/1 Attrition flag for each involvement level.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='JobInvolvement', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observations:-

1. Employees with the lowest job involvement tend to leave the company sooner than others.

In [None]:
# Gender (encoded earlier as 1 = Male, 0 = Female): mean of the 0/1 Attrition flag per group.
plt.figure(figsize=(10, 8))
sns.barplot(data=df, x='Gender', y='Attrition')
plt.xticks(rotation=45)
plt.show()

Observations:-

1. Men are more likely to leave the company than women.

In [None]:
# Mean of the 0/1 Attrition flag against six further features, one panel each on a 3x2 grid.
other_fea = [
    'NumCompaniesWorked',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
]
plt.figure(figsize=(20, 20))
for panel, column in enumerate(other_fea, start=1):
    plt.subplot(3, 2, panel)
    sns.barplot(data=df, x=column, y='Attrition')
    plt.xticks(rotation=45)
plt.show()

Observations:-

1. Employees who have worked for more companies tend to switch again!

2. Employees who work overtime tend to leave early.

3. Surprisingly, employees who received a higher percentage salary hike also tend to leave the company!

4. Employees with performance ratings 3 and 4 have an almost equal probability of leaving or staying (we might need to remove this feature, because it does not contribute much to modeling).

5. Employees with lower relationship satisfaction tend to leave earlier than others.

6. Employees with no stock options tend to leave more than those who hold options; among option holders, those with the most stock options tend to leave more than those with comparatively fewer.


In [None]:
# List the names of the non-object columns held in num_fea.
num_fea.columns

In [None]:
# Mean of the 0/1 Attrition flag against the tenure-related features, one panel each on a 3x2 grid.
other_fea = [
    'TotalWorkingYears',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
plt.figure(figsize=(20, 20))
for panel, column in enumerate(other_fea, start=1):
    plt.subplot(3, 2, panel)
    sns.barplot(data=df, x=column, y='Attrition')
    plt.xticks(rotation=45)
plt.show()

Observations:-

1. Employees with 40 years of experience tend to leave the company; after them, recent joiners (with fewer than 2 years of experience) are the next most likely to leave.

2. Employees with the poorest work-life balance tend to resign faster than others.

3. Seniors in the company tend to leave sooner than others.

4. Employees who have stayed in their current role for a long time, and those who have only just moved into a new role, tend to leave the company sooner than others.

5. Employees who have not been promoted for a long time tend to leave the company.

6. Employees who have been with their manager for either a very long or a very short time tend to leave the company more than others.

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# At this point the frame still contains text columns (they are only one-hot encoded
# further down), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so the correlation is restricted to numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,20))
sns.heatmap(numeric_corr,annot=True,cmap='RdYlGn')
plt.show()

Observations:-

1. JobLevel is highly correlated with MonthlyIncome (correlation 0.95).

2. JobLevel is correlated with TotalWorkingYears (0.78).

3. PercentSalaryHike is correlated with PerformanceRating (0.77).

4. MonthlyIncome is correlated with TotalWorkingYears (0.77).

5. YearsAtCompany is correlated with YearsWithCurrManager (0.77).

6. YearsAtCompany is correlated with YearsInCurrentRole (0.76).

7. Compared with the other features, Attrition is most strongly correlated with OverTime, although the correlation is still weak (0.25).

In [None]:
# Preview the frame before the categorical columns are one-hot encoded.
df.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of each feature so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)
df.head()

In [None]:
# Print a summary of the encoded frame.
df.info()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Constant features: columns whose standard deviation is exactly zero.
uni_val_feat = []
for column in df.columns:
    if df[column].std() == 0:
        uni_val_feat.append(column)
print(uni_val_feat)

# Quasi-constant features: columns rejected by a variance threshold of 0.01.
sel = VarianceThreshold(threshold=0.01)
sel.fit(df)

retained_columns = df.columns[sel.get_support()]
quasi_const_feat = df.columns.difference(retained_columns)
print(quasi_const_feat)

The goal is to predict whether attrition occurs based on the given features, so we need to build a **classification model**.

In [None]:
from sklearn.model_selection import train_test_split

# Target is the Attrition column; every other column is a predictor.
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# Hold out one third of the rows for testing; the seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=1/3, random_state=1)

In [None]:
# Logistic regression baseline: fit on the training split, score on the test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lo = LogisticRegression(solver='liblinear')
lo_model = lo.fit(Xtrain, ytrain)
lo_predict = lo_model.predict(Xtest)

lo_accuracy_pct = round(accuracy_score(ytest, lo_predict), 2) * 100
print('Accuracy with LogisticRegression : ' + str(lo_accuracy_pct))

In [None]:
# DecisionTreeClassifier: fit on the training split, score on the test split.

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Seeded (same seed as the train/test split) so the reported score is repeatable.
dt = DecisionTreeClassifier(random_state=1)
dt_model = dt.fit(Xtrain,ytrain)
dt_predict = dt_model.predict(Xtest)

# Score the tree's own predictions; the previous version scored the logistic
# regression predictions (lo_predict) by mistake, so it just repeated that number.
print('Accuracy with DecisionTreeClassifier : ' + str(round(accuracy_score(ytest,dt_predict),2)*100))

In [None]:
# Ensemble Techniques

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# One fixed seed for every estimator (same value as the train/test split) so the
# model comparison gives the same numbers on every Restart & Run All.
ENSEMBLE_SEED = 1


def _fit_and_report(label, estimator):
    """Fit `estimator` on the training split and print its test-set accuracy.

    Parameters
    ----------
    label : str
        Model name inserted into the printed line.
    estimator : object
        Unfitted classifier exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions)``.
    """
    fitted = estimator.fit(Xtrain,ytrain)
    # predicting values for test data
    predictions = fitted.predict(Xtest)
    print('Accuracy Score with ' + label + ' : ' + str(round(accuracy_score(ytest,predictions),2)*100))
    print('#####################################################')
    return fitted, predictions


# RandomForestClassifier
rf = RandomForestClassifier(random_state=ENSEMBLE_SEED)
rf_model, rf_ypredict = _fit_and_report('RandomForestClassifier', rf)

# AdaBoostClassifier
ada = AdaBoostClassifier(random_state=ENSEMBLE_SEED)
ada_model, ada_ypredict = _fit_and_report('AdaBoostClassifier', ada)

# GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=ENSEMBLE_SEED)
gb_model, gb_ypredict = _fit_and_report('GradientBoostingClassifier', gb)

# XGBClassifier
xgb = XGBClassifier(random_state=ENSEMBLE_SEED)
xgb_model, xgb_ypredict = _fit_and_report('XGBClassifier', xgb)

**AdaBoostClassifier** performed well on this problem