In [None]:
# Load the IBM HR employee-attrition data set from the Kaggle input directory.
df <- read.csv(
  file = "../input/ibm-attrition-analysis/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

# Separate copy that will be used for training and testing.
original_df <- df

# Preview the first rows of the raw data.
head(df)

In [None]:
library(dplyr)

# Compact overview of the data frame: one line per column showing its type
# and first few values.
glimpse(df)

In [None]:
library(ggcorrplot)
options(repr.plot.width = 10, repr.plot.height = 7)

# Keep the numeric columns only (base R replacement for the superseded
# dplyr::select_if()).
nums <- Filter(is.numeric, df)

# cor() yields NA (plus a "standard deviation is zero" warning) for any column
# that never varies, which would show up as uninformative NA rows/columns in
# the correlogram. Drop such columns before correlating; columns that do vary
# are unaffected.
has_variance <- vapply(
  nums,
  function(col) isTRUE(stats::var(col, na.rm = TRUE) > 0),
  logical(1)
)
nums <- nums[has_variance]

# Pairwise correlations, rounded to one decimal so the tile labels stay short.
corr <- round(cor(nums), 1)

# Lower-triangle correlogram with the coefficient printed on every tile.
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "square",
  colors = c("tomato2", "white", "#01A9DB"),
  title = "Correlogram Employee Attritions",
  ggtheme = theme_minimal()
)

In [None]:
# Fix the RNG seed so the shuffle and the train/test partition are reproducible.
set.seed(142)
library(caret)

# Shuffle the rows before splitting.
original_df <- original_df[sample.int(nrow(original_df)), ]

# Recode BusinessTravel as a factor whose levels are labelled 1, 2, 3
# (frequent travel, rare travel, no travel).
original_df$BusinessTravel <- factor(
  original_df$BusinessTravel,
  levels = c("Travel_Frequently", "Travel_Rarely", "Non-Travel"),
  labels = c(1, 2, 3)
)

# Integer-coded rating/level columns are converted to factors so they are
# treated as categories rather than as numbers.
cols <- c(
  "Education", "EnvironmentSatisfaction", "JobInvolvement", "JobLevel",
  "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction",
  "StockOptionLevel", "TrainingTimesLastYear", "WorkLifeBalance"
)
original_df[cols] <- lapply(original_df[cols], factor)

# Remove unnecessary columns.
cols <- c("Over18", "EmployeeNumber", "EmployeeCount")
original_df <- original_df[setdiff(names(original_df), cols)]

# 80/20 train/test split, partitioned on the Attrition label.
trainIndex <- createDataPartition(
  y = original_df$Attrition,
  times = 1,
  p = 0.8,
  list = FALSE
)
train <- original_df[trainIndex, ]
test <- original_df[-trainIndex, ]

# Count of rows per Attrition label and each label's share of the total
# (rounded to two decimals).
label_proportions <- function(data) {
  data %>%
    select(Attrition) %>%
    group_by(Attrition) %>%
    summarise(n = n()) %>%
    mutate(pct = round(prop.table(n), 2))
}

# Check that the training and testing sets have the same label proportions.
prop_train <- label_proportions(train)
prop_test <- label_proportions(test)

prop_train
prop_test

In [None]:
library(rpart)
options(repr.plot.width = 10, repr.plot.height = 8)

# Fit a tree for Attrition using every other column of the training set as a
# candidate predictor, with rpart's default settings.
rpart.tree <- rpart(formula = Attrition ~ ., data = train)

# Base-graphics rendering of the fitted tree: draw the branches first, then
# label the nodes (all = TRUE labels every node, use.n = TRUE adds the
# observation counts), and finally add the title.
plot(rpart.tree, uniform = TRUE, branch = 0.6, margin = 0.05)
text(rpart.tree, all = TRUE, use.n = TRUE)
title(main = "Training Set's Classification Tree")

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)
library(ggplot2)
library(ggthemes)

# Predicted class for every row of the held-out test set.
predictions <- predict(rpart.tree, test, type = "class")

# Long-format cross-tabulation for the heatmap below:
# Var1 = observed label, predictions = predicted label, Freq = row count.
conf_df <- data.frame(table(test$Attrition, predictions))

summary(predictions)

# caret interprets a bare table as predicted-in-rows / observed-in-columns, so
# passing table(observed, predicted) swaps sensitivity/specificity with the
# positive/negative predictive values. Pass the two vectors explicitly instead.
# The observed labels are coerced to a factor with the same levels as the
# predictions, which also covers the case where read.csv() left them as
# character strings.
confusionMatrix(
  data = predictions,
  reference = factor(test$Attrition, levels = levels(predictions))
)

# Heatmap of the confusion matrix: predictions on the x axis, observed
# attrition status on the y axis, cell count printed on each tile.
ggplot(data = conf_df, mapping = aes(x = predictions, y = Var1)) +
  geom_tile(aes(fill = Freq), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
  scale_fill_gradient(low = "#F3F781", high = "#58FA82") +
  theme_economist() +
  theme(
    legend.position = "none",
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.title = element_text(hjust = 0.5, color = "white"),
    plot.subtitle = element_text(color = "white"),
    plot.background = element_rect(fill = "#0D7680"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    axis.title = element_text(colour = "white"),
    legend.background = element_rect(
      fill = "#FFF9F5",
      size = 0.5,
      linetype = "solid",
      colour = "black"
    )
  ) +
  labs(title = "Confusion Matrix", y = "Attrition Status", x = "Predictions")

In [None]:
library(partykit)

# Convert the fitted rpart tree into partykit's `party` representation and
# display the resulting object.
rparty.tree <- as.party(obj = rpart.tree)
rparty.tree

In [None]:
# Pruning shrinks a decision tree by removing the parts that contribute little
# power to classify instances. Here the tree is trimmed back to a complexity
# parameter of 0.02.
prune.rpart.tree <- prune(tree = rpart.tree, cp = 0.02)

# Draw the pruned tree, then label every node together with its observation
# counts.
plot(prune.rpart.tree, uniform = TRUE, branch = 0.6)
text(prune.rpart.tree, all = TRUE, use.n = TRUE)

In [None]:
library(rpart.plot)
library(RColorBrewer)
# fancyRpartPlot() is exported by rattle, not by rpart.plot or RColorBrewer;
# without attaching rattle the call below fails with "could not find function".
library(rattle)

options(repr.plot.width = 12, repr.plot.height = 12)

# Styled rendering of the full (unpruned) tree fitted on the training set.
fancyRpartPlot(rpart.tree)