In [None]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

from scipy import stats
from functools import reduce

# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Some matplotlib options
%matplotlib inline
matplotlib.style.use("ggplot")

# General pandas options
# None disables column-width truncation; negative values such as -1 were deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)  # Show the entire column 
pd.options.display.max_columns = 100 
pd.options.display.max_rows = 10000 

# Seaborn options
sns.set_style("whitegrid")


# model to implement
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve
from imblearn.over_sampling import SMOTE


# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the HR employee-attrition CSV (path is relative to the notebook's working directory)
df = pd.read_csv("../input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Sanity check: (number of rows, number of columns)
print(df.shape)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

# Looking at each column and generating hypotheses w.r.t. Attrition
1) Age = High Age, High Experience 
2) High Daily Rate , Less Attrition 
3) Less distance, less attrition
4) High Education , High Income
5) High Environment Satisfaction, Less attrition
6) High Job Satisfaction, Less attrition
7) High NumCompaniesWorked, High salary, High chance of leaving for more salary
8) High Overtime High Attrition,who are working overtime and who in many cases have a relatively low salary 
9) High percent salary hike less attrition
10) High performance rating less attrition 
11) stock option yes, less attrition
12) work life balance high, less attrition
13) high years at company, less attrition
14) high years since last promotion ,  high attrition

# encoding categorical variables into numerical

In [None]:
# Encode the target as integer codes with an explicit category order: 'No' -> 0, 'Yes' -> 1
attrition_categorical = df.Attrition.astype("category")
df.Attrition = attrition_categorical.cat.reorder_categories(['No','Yes']).cat.codes

In [None]:
df.Attrition.dtype

In [None]:
df.BusinessTravel.value_counts() # I am considering them in order.

In [None]:
# Ordinal encoding by travel frequency: Non-Travel -> 0, Travel_Rarely -> 1, Travel_Frequently -> 2
travel_order = ['Non-Travel','Travel_Rarely','Travel_Frequently']
travel_categorical = df.BusinessTravel.astype("category")
df.BusinessTravel = travel_categorical.cat.reorder_categories(travel_order).cat.codes

In [None]:
df.Department.value_counts() # This is nominal data here label encoding and just assigning nos. won't work so I create dummy variables.

In [None]:
df.EducationField.value_counts()  # This is nominal data here label encoding and just assigning nos. won't work so I create dummy variables.

In [None]:
df.Gender.value_counts()  # This is nominal data here label encoding and just assigning nos. won't work so I create dummy variables.

In [None]:
df.JobRole.value_counts() # This is nominal data here label encoding and just assigning nos. won't work so I create dummy variables.

In [None]:
df.MaritalStatus.value_counts() # This is nominal data here label encoding and just assigning nos. won't work so I create dummy variables.

In [None]:
df.Over18.value_counts() # constant so delete

In [None]:
df.OverTime.value_counts()

In [None]:
# Binary encoding with an explicit category order: 'No' -> 0, 'Yes' -> 1
overtime_categorical = df.OverTime.astype("category")
df.OverTime = overtime_categorical.cat.reorder_categories(['No','Yes']).cat.codes

# EDA Univariate analysis # to check distribution 

# Distribution plot for numerical variables
# Frequency count plot for categorical variables

In [None]:
# Overview: one histogram per column that DataFrame.hist can plot,
# to eyeball each distribution (value counts for the encoded columns)
hist_options = dict(bins=50, figsize=(20,15), color='deepskyblue')
df.hist(**hist_options)
plt.show()

In [None]:
# Separate the numerical columns from the dataframe.
# NOTE(review): 'int8' is presumably listed so the columns encoded with `.cat.codes`
# above are picked up as numeric — TODO confirm.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
    'int8',
]

newdf = df.select_dtypes(include=numerics)

# Distribution plot for numerical variables
# Frequency count plot for categorical variables

In [None]:
newdf.columns # numerical variable

In [None]:
# Create a figure grid of 5 rows x 3 columns: one histogram panel per numeric feature
fig, ax = plt.subplots(figsize=(20,15), ncols=3, nrows=5)

# Columns to plot, in row-major panel order (left to right, then top to bottom).
# No explicit titles are set on the panels.
hist_columns = [
    "Age", "BusinessTravel", "DailyRate",
    "DistanceFromHome", "EnvironmentSatisfaction", "JobSatisfaction",
    "MonthlyRate", "OverTime", "PerformanceRating",
    "RelationshipSatisfaction", "TotalWorkingYears", "WorkLifeBalance",
    "YearsAtCompany", "YearsSinceLastPromotion", "YearsWithCurrManager",
]

# ax.flat walks the 5x3 axes array in row-major order, matching the list above
for panel, column in zip(ax.flat, hist_columns):
    sns.distplot(df[column], kde=False, color="b", ax=panel)



# insights from univariate analysis of numerical data
1) Age = most employees fall in the age ranges 27-29, 35-37 and 39-40
2) Business_travel = Travel_Rarely 1043, Travel_Frequently 277 ,Non-Travel  150
3) Daily rate = mostly between range of 115- 130 for all employees
4) Distance from home = Good no. of  people live near office range of distance = 0-3
5) Environment satisfaction = most people are satisfied with the company environment, as they have rated it 3 or 4; the same holds for job satisfaction and relationship satisfaction. This means that few employees are dissatisfied, and thus only a small group has a higher chance of attrition.
6) overtime = less people do overtime and thus they might have higher chance of attrition
7) worklife balance =  most people have balanced life b/w work and personal life but some have rated it 1 and 2 thus they have high chance of attrition.
8) YearsSinceLastPromotion = most people were promoted recently, as most employees fall in the 0-1 year range; the hypothesis is that people who have not been promoted for 2-5 years are more likely to churn.

In [None]:
#separting categorical columns 

In [None]:
# dtypes treated as categorical here: columns still stored with the 'object' dtype
cat = ['object']

# Keep only the object-dtype columns and display their names
newdf1 = df.select_dtypes(include=cat)
newdf1.columns

In [None]:
# Create a figure grid of 2 rows x 3 columns: one frequency-count panel per categorical column
fig, ax = plt.subplots(figsize=(20,15), ncols=3, nrows=2)

# Columns to plot, in row-major panel order.
# Over18 is plotted only to show that it is constant; it is a candidate to drop.
count_columns = [
    "Department", "EducationField", "Gender",
    "JobRole", "MaritalStatus", "Over18",
]

# ax.flat walks the 2x3 axes array in row-major order, matching the list above
for panel, column in zip(ax.flat, count_columns):
    sns.countplot(x=column, data=df, palette="Greens_d", ax=panel)

# Bivariate analysis with respect to target variable

In [None]:
df.columns

In [None]:
# Box plot (categorical vs numerical): Age by BusinessTravel level, split by Attrition
sns.boxplot(x="BusinessTravel",y="Age",hue="Attrition",data=df) 
# Author's reading of the plot: most employees who leave the company are in the 27-38 age range; this may be due to a career switch or wanting a salary hike.

In [None]:
sns.countplot(x="Department",data=df,hue='Attrition')  # no specific relation

In [None]:
sns.boxplot(x="Attrition",y='DistanceFromHome',hue="Attrition",data=df) # no specific relation

In [None]:
sns.countplot(x="Education",data=df,hue='Attrition') # no specific relation

In [None]:
sns.countplot(x="EducationField",data=df,hue='Attrition') # no specific relation

In [None]:
sns.countplot(x="EnvironmentSatisfaction",data=df,hue='Attrition') # we can see that % of attrition for environment satisfaction = 1,2 will be more that of 3 and 4.

In [None]:
sns.countplot(x="Gender",data=df,hue='Attrition') #no specific relation

In [None]:
sns.countplot(x="JobSatisfaction",data=df,hue='Attrition') # we can see that % of attrition for job satisfaction = 1,2 will be more that of 3 and 4.

In [None]:
sns.boxplot(x="Attrition",y="YearsSinceLastPromotion",hue="Attrition",data=df)

In [None]:
# Checking class imbalance: share of the positive class (Attrition == 1).
# The counts are stored first and printed afterwards, so True_Class and Total_length
# hold the actual numbers (print() returns None), and the percentage is derived
# from the data instead of hard-coded counts.
True_Class = int((df['Attrition'] == 1).sum())
Total_length = len(df['Attrition'])
print(True_Class)
print(Total_length)
# percentage of class 1 # it is not highly imbalanced but still we will use SMOTE for 1 model and do it without SMOTE for other model
print((True_Class/Total_length)*100)

In [None]:
#Checking class imbalance
sns.countplot(x ='Attrition',data = df)

In [None]:
pandas_profiling.ProfileReport(df)

# Encoding nominal data and removing constant and highly correlated data
Generally when making a predictive model, it would be preferable to train a model with features that are not too correlated with one another so that we do not need to deal with redundant features. In the case that we have quite a lot of correlated features one could perhaps apply a technique such as Principal Component Analysis (PCA) to reduce the feature space.

In [None]:
# Columns removed before modelling (per the section heading: constant or highly correlated data)
columns_to_drop = ['EmployeeCount','MonthlyIncome','Over18','StandardHours']

# Drop them, then one-hot encode the remaining non-numeric (nominal) columns
df = pd.get_dummies(df.drop(columns=columns_to_drop))

In [None]:
df.head()

# ML Models
Splitting Data into Train and Test sets
But before we even start training a model, we will have to partition our dataset into a training set and a test set (unlike Kaggle competitions where the train and test data are already segregated for you). To split our data we will utilise sklearn's `train_test_split` function.

# Decision Tree

In [None]:
# Splitting and cross-validation helpers
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

# Target: the encoded Attrition column; features: every other column
target = df["Attrition"]
features = df.drop(columns=["Attrition"])

# Hold out 25% of the observations as the test set.
# train_test_split returns a (train, test) pair per input array, in input order,
# so passing (target, features) yields the target pair first, then the feature pair.
(target_train, target_test,
 features_train, features_test) = train_test_split(
    target, features, test_size=0.25, random_state=42
)

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state=1)

In [None]:
# Classifier used for the baseline model
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default depth; class_weight='balanced' reweights the classes,
# random_state fixes the tree construction for reproducibility
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Fit on the training split (the fitted estimator is the cell's displayed value)
model.fit(features_train, target_train)

In [None]:
# Do k-fold cross-validation of the baseline decision tree on the training split
cv_results = cross_val_score(model, # Estimator (the decision tree)
                             features_train, # Feature matrix
                             target_train, # Target vector
                             cv=kf, # Cross-validation splitter (the KFold object)
                             scoring="accuracy", # Scoring metric
                             n_jobs=-1) # Use all CPU cores

In [None]:
# Calculate mean # cross validated score
CV_mean = cv_results.mean()
print(CV_mean*(100))

In [None]:
# Overfitted model (author's conclusion): compare training accuracy with test accuracy
# Check the accuracy score (in percent) of the prediction for the training set
print(model.score(features_train,target_train)*100)

# Check the accuracy score (in percent) of the prediction for the test set
print(model.score(features_test,target_test)*100)

# Parameter tuning

In [None]:
# Candidate values for the grid search:
#   max_depth        -> 5..20 inclusive
#   min_samples_leaf -> 50..449 inclusive
depth = list(range(5, 21))
samples = list(range(50, 450))
Parameters = {"max_depth": depth, "min_samples_leaf": samples}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over the parameter grid (16 depths x 400 leaf sizes = 6,400 candidates,
# each cross-validated). n_jobs=-1 runs the fits in parallel on all CPU cores, matching the
# cross_val_score calls in this notebook; it does not change which parameters are selected.
Param_search = GridSearchCV(model, Parameters, n_jobs=-1)

In [None]:
Param_search.fit(features_train,target_train)

In [None]:
print(Param_search.best_params_)

# Tuned DT Model

In [None]:
model1 = DecisionTreeClassifier(random_state=42,class_weight='balanced',max_depth = 5, min_samples_leaf = 368)

In [None]:
model1.fit(features_train,target_train)

In [None]:
# Do k-fold cross-validation of the tuned decision tree on the training split
cv_results1 = cross_val_score(model1, # Estimator (the tuned decision tree)
                             features_train, # Feature matrix
                             target_train, # Target vector
                             cv=kf, # Cross-validation splitter (the KFold object)
                             scoring="accuracy", # Scoring metric
                             n_jobs=-1) # Use all CPU cores
# Mean cross-validated accuracy, printed as a percentage
CV_mean1 = cv_results1.mean()
print(CV_mean1*(100))

In [None]:
# No overfitting (author's conclusion): compare training accuracy with test accuracy
# Check the accuracy score (in percent) of the prediction for the training set
print(model1.score(features_train,target_train)*100)

# Check the accuracy score (in percent) of the prediction for the test set
print(model1.score(features_test,target_test)*100)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Predict the test set once and reuse the labels for both reports
tuned_tree_predictions = model1.predict(features_test)
print (confusion_matrix(target_test, tuned_tree_predictions))
print (classification_report(target_test, tuned_tree_predictions))

# Important features for DT model

In [None]:
# NOTE(review): the importances are read from `model` (the untuned baseline tree),
# not from the tuned `model1` — confirm this is intended.
important_features = model.feature_importances_
# Column names of the feature frame
feature_list = list(features)
# One row per feature, with a single column 'Important' holding its importance score
relative_importances = pd.DataFrame(index = feature_list, data = important_features, columns = ['Important'])

In [None]:
relative_importances.sort_values(by='Important', ascending = False)

In [None]:
selected_features = relative_importances[relative_importances.Important> 0.02]

In [None]:
selected_list = selected_features.index

In [None]:
feature_train_selected = features_train[selected_list]
feature_test_selected = features_test[selected_list]

# Random forest

In [None]:
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

seed = 0   # We set our random seed to zero for reproducibility

# Random Forest parameters.
# 'max_features' is listed exactly once: a repeated key in a dict literal silently keeps
# only the last value, so 'sqrt' is the single, effective setting.
# With 'warm_start': True, calling fit() again on this same object reuses the trees that
# are already built; re-create `rf` to retrain from scratch.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True, 
    'max_depth': 9,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

# Instantiate rf
rf = RandomForestClassifier(**rf_params)
            
# Fit rf to the training set (the fitted estimator is the cell's displayed value)
rf.fit(features_train, target_train)

In [None]:
# Do k-fold cross-validation of the random forest on the training split
cv_results2 = cross_val_score(rf, # Estimator (the random forest)
                             features_train, # Feature matrix
                             target_train, # Target vector
                             cv=kf, # Cross-validation splitter (the KFold object)
                             scoring="accuracy", # Scoring metric
                             n_jobs=-1) # Use all CPU cores
# Mean cross-validated accuracy, printed as a percentage
CV_mean2 = cv_results2.mean()
print(CV_mean2*(100))

In [None]:
# Slight overfitting (author's conclusion): compare training accuracy with test accuracy
# Check the accuracy score (in percent) of the prediction for the training set
print(rf.score(features_train,target_train)*100)

# Check the accuracy score (in percent) of the prediction for the test set
print(rf.score(features_test,target_test)*100)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Predict the test set once and reuse the labels for both reports
forest_predictions = rf.predict(features_test)
print (confusion_matrix(target_test, forest_predictions))
print (classification_report(target_test, forest_predictions))

In [None]:
import matplotlib.pyplot as plt

# Feature importances of the fitted random forest, labelled with the training column names
importances = pd.Series(rf.feature_importances_, index=features_train.columns)

# Ascending order, so the largest importance is drawn last in the horizontal bar chart
importances_sorted = importances.sort_values()

# Horizontal bar chart of the sorted importances
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Features Importances')
plt.show()

"""Apparently, Overtime, Total Working Years, Age are the most important features according to rf. The importances of these two features add up to 14% roughly"""

# Most RF important features : Overtime, Marital Status

As observed in the plot of feature importances, it seems that our Random Forest Classifier has decided to rank the features of OverTime highest, which is followed by marital status.

I don't know about you, but working overtime to me does indeed affect my satisfaction derived from any job (and I have worked many an overtime). Maybe then it should come as no surprise that our classifier has caught on to this and thus ranked overtime the highest


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(features_train,target_train)

In [None]:
# Do k-fold cross-validation of the logistic regression on the training split
# NOTE(review): this reuses the names cv_results2 / CV_mean2 from the random-forest
# cross-validation cell, so the forest's results are overwritten here — consider distinct names.
cv_results2 = cross_val_score(lr, # Estimator (the logistic regression)
                             features_train, # Feature matrix
                             target_train, # Target vector
                             cv=kf, # Cross-validation splitter (the KFold object)
                             scoring="accuracy", # Scoring metric
                             n_jobs=-1) # Use all CPU cores
# Mean cross-validated accuracy, printed as a percentage
CV_mean2 = cv_results2.mean()
print(CV_mean2*(100))

In [None]:
# No overfitting (author's conclusion): compare training accuracy with test accuracy
# Check the accuracy score (in percent) of the prediction for the training set
print(lr.score(features_train,target_train)*100)

# Check the accuracy score (in percent) of the prediction for the test set
print(lr.score(features_test,target_test)*100)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Predict the test set once and reuse the labels for both reports
logistic_predictions = lr.predict(features_test)
print (confusion_matrix(target_test, logistic_predictions))
print (classification_report(target_test, logistic_predictions))

In [None]:
print(lr.coef_)
print(lr.intercept_)

In [None]:
probability = lr.predict_proba(features_test)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(target_test,probability[:,1])

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# A ROC curve is traced by sweeping a threshold over continuous scores. Hard 0/1 labels from
# lr.predict() give only a single operating point, and an AUC that disagrees with the
# roc_auc_score computed from probabilities. Use the predicted probability of the positive
# class (column 1 of predict_proba, i.e. Attrition == 1) instead.
positive_class_scores = lr.predict_proba(features_test)[:, 1]
fpr, tpr, thresholds = roc_curve(target_test, positive_class_scores)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: performance of a random classifier
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show();