In [3]:
# Start from an empty global environment so that every object referenced below
# is created in this file. Note that this only removes objects: attached
# packages and options() are left untouched.
rm(list = ls())

# Load the data from mort_data.csv. The columns used in this file are the
# outcome `death` (No/Yes) and the predictors `attage`, `sex` and `smoker`.
mort <- read.csv("mort_data.csv")

# Class balance of the target, expressed as proportions of all rows.
prop.table(table(mort$death))

# Pre-processing ----

# Convert the categorical variables into factors. The column names are listed
# once and converted in a single step, so the same code scales to any number of
# variables: just extend the vector.
categorical.vars <- c("sex", "smoker", "death")
mort[categorical.vars] <- lapply(mort[categorical.vars], factor)


       No       Yes 
0.6930502 0.3069498 

In [4]:
# Hold out 30% of the rows for testing and train on the remaining 70%.

set.seed(23) # fixed seed so the split is reproducible
library(caret)

# caret's createDataPartition() performs stratified sampling on the target
# variable (label), so both subsets keep a similar share of each `death` class.
# With list = FALSE the selected row numbers come back as a matrix, not a list.
index <- createDataPartition(mort$death, p = 0.7, list = FALSE)

# Rows not picked by the partition form the test set; the picked rows form the
# training set.
mort.test <- mort[-index, ]
mort.train <- mort[index, ]

In [6]:
# Decision-tree packages. Nothing in the cells shown here calls them directly;
# presumably they are used elsewhere in the notebook (TODO confirm).
library(rpart)
library(rpart.plot)
# Re-seed the random number generator so later random steps are reproducible.
set.seed(23)

In [7]:
# Build the model formula from the target and feature names instead of typing
# it out by hand; the result is death ~ attage + sex + smoker.
target <- "death"
features <- c("attage", "sex", "smoker")

# reformulate() joins the feature names with "+" and puts the target on the
# left-hand side of the formula.
rforest.f <- reformulate(features, response = target)

In [13]:
# Install randomForest only when it is missing. requireNamespace() checks for
# the package without attaching it, so the check itself has no side effects;
# the package is then attached explicitly with library(), which stops with an
# error if loading fails.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

In [14]:
set.seed(23) # the forest is grown from random samples; fix the seed

# Fit a random forest of 50 trees on the training set.
model.rf <- randomForest(
  formula = rforest.f,
  data = mort.train,
  ntree = 50,
  # Number of features tried at each split. rforest.f has three predictors, so
  # mtry = 3 lets every split consider all of them.
  mtry = 3,
  importance = TRUE
)

# Confusion matrix on the training data (rows: predicted, columns: actual).
predictions <- predict(model.rf, mort.train)
cm.train <- table(predictions, mort.train$death)
cm.train

# Training accuracy = correctly classified (the diagonal) / all observations.
# It is computed from the confusion matrix rather than typed in by hand, so it
# stays correct if the data, seed or model settings change. The recorded run
# gave (250 + 111) / 364, i.e. about 99.2%. Accuracy on the rows the model was
# trained on is optimistic; the test-set figure below is the fairer estimate.
# With classification problems, check whether the classes are imbalanced: for
# imbalanced data, the overall accuracy alone may not be informative.
sum(diag(cm.train)) / sum(cm.train)

# Confusion matrix and accuracy on the held-out test data. The recorded run
# gave (99 + 36) / 154, i.e. about 87.7%.
predictions_test <- predict(model.rf, mort.test)
cm.test <- table(predictions_test, mort.test$death)
cm.test

sum(diag(cm.test)) / sum(cm.test)

           
predictions  No Yes
        No  250   1
        Yes   2 111

                
predictions_test No Yes
             No  99  11
             Yes  8  36