In [4]:
library(lattice)
library(ggplot2)
library(caret)
library(rpart)
library(rattle)
library(mlbench)
library(ranger)
adult = read.csv("adult.csv", header = FALSE)
names(adult) = c("age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
  "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
  "hours_per_week", "native_country", "target")



#2-3. replace NA's
for(i in 1:ncol(adult)){
    adult[,i][adult[,i] == "?"] = NA
    mode = levels(adult[,i])[which.max(table(adult[,i]))]
    if(is.factor(adult[,i])){
      adult[,i][is.na(adult[,i])] = mode
    }
}

adultNAfix = preProcess(adult, method='medianImpute')
adultFixed = predict(adultNAfix, newdata = adult)

#4. Group factors greater than 10 categories into 5 categories
#str(adultFixed)
#education, occupation, & native_country are over 10 categories
levels(adultFixed$education) = c("Gradeshool","Gradeschool","Gradeschool", "Gradeschool",
                                 "Gradeshool","Gradeshool","Gradeshool","Associates",
                                 "Associates", "Bachelors", "Post_Grad", "HS+", "Post_Grad",
                                 "Gradeschool","HS+", "HS+")
levels(adultFixed$occupation) = c("Misc", "Business", "Protection", "Labor", "Business",
                                  "Labor", "Labor", "Tech", "Misc", "Labor", "Misc",
                                  "Protection", "Business", "Tech", "Labor")
levels(adultFixed$native_country) = c("NA", "AFR", "NA", "AS", "SA", "NA", "NA", "SA", "NA",
                                      "EU", "EU", "EU", "EU", "NA", "NA", "EU", "NA", "AS",
                                      "EU", "AS", "AS", "EU", "EU", "NA", "AS", "AS", "NA",
                                      "NA", "NA", "SA", "AS", "EU", "EU", "NA", "EU",
                                      "AS", "AS", "AS", "NA", "NA", "AS", "EU")


#5. Dummy encoding for categorical vars
dummies_model <- dummyVars(target ~ ., data=adultFixed)
adultFixed_mat <- predict(dummies_model, newdata = adultFixed)
adultFixed2 <- data.frame(adultFixed_mat)
adultFixed2$target <- adult$target

#6. Scale and center data
ScaleAndCenter <- preProcess(adultFixed2, method= c("center", "scale"))
FinalAdult <- predict(ScaleAndCenter, newdata = adultFixed2)

##7. split data train:test 70:30, seeding = 2018, build decision tree and report accuracy
set.seed(2018)
splitIndex <- createDataPartition(FinalAdult$target, p = .70, list = FALSE, times = 1)
train <- FinalAdult[ splitIndex,]
test <- FinalAdult[-splitIndex,]
model = rpart(target ~., data = train, method = "class")

pred = predict(model, test, type = "class")

cm = confusionMatrix(data = pred, reference = test$target, positive = ">50K")
cm$overall['Accuracy']
cm$byClass['Balanced Accuracy']
#8. Build random forest with ranger
set.seed(2018)
model2 = ranger(target ~., data = train)
pred2  = predict(model2, data = test)$predictions
cm2 = confusionMatrix(pred2, test$target, positive=">50K")
cm2$overall['Accuracy']
cm2$byClass['Balanced Accuracy']

#9.& 10. not needed as there are no missing numeric vars

#11. Redo 6 with scaling only on non encoded vars, knnimpute is scaling and centering
adult = read.csv("adult.csv", header = FALSE)
names(adult) = c("age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
  "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
  "hours_per_week", "native_country", "target")

for(i in 1:ncol(adult)){
    adult[,i][adult[,i] == "?"] = NA
    mode = levels(adult[,i])[which.max(table(adult[,i]))]
    if(is.factor(adult[,i])){
      adult[,i][is.na(adult[,i])] = mode
    }
    }

adultNAfix = preProcess(adult, method='knnImpute')
adultFixed = predict(adultNAfix, newdata = adult)

levels(adultFixed$education) = c("Gradeshool","Gradeschool","Gradeschool", "Gradeschool",
                                 "Gradeshool","Gradeshool","Gradeshool","Associates",
                                 "Associates", "Bachelors", "Post_Grad", "HS+", "Post_Grad",
                                 "Gradeschool","HS+", "HS+")
levels(adultFixed$occupation) = c("Misc", "Business", "Protection", "Labor", "Business",
                                  "Labor", "Labor", "Tech", "Misc", "Labor", "Misc",
                                  "Protection", "Business", "Tech", "Labor")
levels(adultFixed$native_country) = c("NA", "AFR", "NA", "AS", "SA", "NA", "NA", "SA", "NA",
                                      "EU", "EU", "EU", "EU", "NA", "NA", "EU", "NA", "AS",
                                      "EU", "AS", "AS", "EU", "EU", "NA", "AS", "AS", "NA",
                                      "NA", "NA", "SA", "AS", "EU", "EU", "NA", "EU",
                                      "AS", "AS", "AS", "NA", "NA", "AS", "EU")

dummies_model <- dummyVars(target ~ ., data=adultFixed)
adultFixed_mat <- predict(dummies_model, newdata = adultFixed)
FinalAdult <- data.frame(adultFixed_mat)
FinalAdult$target <- adult$target

set.seed(2018)
splitIndex <- createDataPartition(FinalAdult$target, p = .70, list = FALSE, times = 1)
train <- FinalAdult[ splitIndex,]
test <- FinalAdult[-splitIndex,]
model = rpart(target ~., data = train, method = "class")

pred = predict(model, test, type = "class")

cm = confusionMatrix(data = pred, reference = test$target, positive = ">50K")
cm$overall['Accuracy']
cm$byClass['Balanced Accuracy']

model2 = ranger(target ~., data = train)
pred2  = predict(model2, data = test)$predictions
cm2 = confusionMatrix(pred2, test$target, positive=">50K")
cm2$overall['Accuracy']
cm2$byClass['Balanced Accuracy']
#12 encode in step five differently, read factors as.numeric
adult = read.csv("adult.csv", header = FALSE)
names(adult) = c("age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
  "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
  "hours_per_week", "native_country", "target")

for(i in 1:ncol(adult)){
    adult[,i][adult[,i] == "?"] = NA
    mode = levels(adult[,i])[which.max(table(adult[,i]))]
    if(is.factor(adult[,i])){
      adult[,i][is.na(adult[,i])] = mode
      
    }
}

adultNAfix = preProcess(adult, method='medianImpute')
adultFixed = predict(adultNAfix, newdata = adult)

levels(adultFixed$education) = c("Gradeshool","Gradeschool","Gradeschool", "Gradeschool",
                                 "Gradeshool","Gradeshool","Gradeshool","Associates",
                                 "Associates", "Bachelors", "Post_Grad", "HS+", "Post_Grad",
                                 "Gradeschool","HS+", "HS+")
levels(adultFixed$occupation) = c("Misc", "Business", "Protection", "Labor", "Business",
                                  "Labor", "Labor", "Tech", "Misc", "Labor", "Misc",
                                  "Protection", "Business", "Tech", "Labor")
levels(adultFixed$native_country) = c("NA", "AFR", "NA", "AS", "SA", "NA", "NA", "SA", "NA",
                                      "EU", "EU", "EU", "EU", "NA", "NA", "EU", "NA", "AS",
                                      "EU", "AS", "AS", "EU", "EU", "NA", "AS", "AS", "NA",
                                      "NA", "NA", "SA", "AS", "EU", "EU", "NA", "EU",
                                      "AS", "AS", "AS", "NA", "NA", "AS", "EU")

for(i in 1:(ncol(adultFixed)-1)){
    adultFixed[,i]<-as.numeric(adultFixed[,i])
}

ScaleAndCenter <- preProcess(adultFixed, method= c("center", "scale"))
FinalAdult <- predict(ScaleAndCenter, newdata = adultFixed)

set.seed(2018)
splitIndex <- createDataPartition(FinalAdult$target, p = .70, list = FALSE, times = 1)
train <- FinalAdult[ splitIndex,]
test <- FinalAdult[-splitIndex,]
model = rpart(target ~., data = train, method = "class")

pred = predict(model, test, type = "class")

cm = confusionMatrix(data = pred, reference = test$target, positive = ">50K")
cm$overall['Accuracy']
cm$byClass['Balanced Accuracy']

model2 = ranger(target ~., data = train)
pred2  = predict(model2, data = test)$predictions
cm2 = confusionMatrix(pred2, test$target, positive=">50K")
cm2$overall['Accuracy']
cm2$byClass['Balanced Accuracy']


#13. Skip step 4
adult = read.csv("adult.csv", header = FALSE)
names(adult) = c("age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
  "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
  "hours_per_week", "native_country", "target")

for(i in 1:ncol(adult)){
    adult[,i][adult[,i] == "?"] = NA
    mode = levels(adult[,i])[which.max(table(adult[,i]))]
    if(is.factor(adult[,i])){
      adult[,i][is.na(adult[,i])] = mode
    }
    }

adultNAfix = preProcess(adult, method='medianImpute')
adultFixed = predict(adultNAfix, newdata = adult)


dummies_model <- dummyVars(target ~ ., data=adultFixed)
adultFixed_mat <- predict(dummies_model, newdata = adultFixed)
adultFixed2 <- data.frame(adultFixed_mat)
adultFixed2$target <- adult$target

ScaleAndCenter <- preProcess(adultFixed2, method= c("center", "scale"))
FinalAdult <- predict(ScaleAndCenter, newdata = adultFixed2)

set.seed(2018)
splitIndex <- createDataPartition(FinalAdult$target, p = .70, list = FALSE, times = 1)
train <- FinalAdult[ splitIndex,]
test <- FinalAdult[-splitIndex,]
model = rpart(target ~., data = train, method = "class")

pred = predict(model, test, type = "class")

cm = confusionMatrix(data = pred, reference = test$target, positive = ">50K")
cm$overall['Accuracy']
cm$byClass['Balanced Accuracy']

model2 = ranger(target ~., data = train)
pred2  = predict(model2, data = test)$predictions
cm2 = confusionMatrix(pred2, test$target, positive=">50K")
cm2$overall['Accuracy']
cm2$byClass['Balanced Accuracy']

Loading required package: lattice
Rattle: A free graphical interface for data science with R.
Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
Type 'rattle()' to shake, rattle, and roll your data.

Attaching package: 'ranger'

The following object is masked from 'package:rattle':

    importance

"These variables have zero variances: workclass.."

"variable 'target' is not a factor"

"These variables have zero variances: workclass.., occupation.., native_country.."