<a href="https://colab.research.google.com/github/stephenfrein/csc8491/blob/main/DecisionTreeCensus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the 'tree' package only when it is not already available, so that
# re-running this cell does not download and rebuild it again.
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}

In [None]:
# Attach the tree-modelling package used for the first classifier.
library(tree)

# Read the census income CSV straight from its URL; string columns are
# converted to factors on load.
income <- read.csv(
  file = "https://csc8491.s3.amazonaws.com/census_income.csv",
  stringsAsFactors = TRUE
)

# Preview the first rows of the raw data.
head(income)

In [None]:
# Build the modelling data set from the raw data with the seventh column left
# out (the variable considered questionable), then summarise what remains.
income_clean <- income[, -7]
summary(income_clean)

In [None]:
# Draw the row indices used for training; the rows not drawn become the test
# set when the data is split.
set.seed(101) # fixed seed so the split is reproducible
sample_pct <- .80 # fraction of rows assigned to the training set
n_rows <- nrow(income_clean)
# seq_len() stays empty for a zero-row frame (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train <- sample(seq_len(n_rows), floor(sample_pct * n_rows))
train

In [None]:
# Training rows are the sampled indices; test rows are everything else.
income.train <- income_clean[train, ]
income.test <- income_clean[-train, ]

# Report the size of each partition (train first, then test).
nrow(income.train)
nrow(income.test)

In [None]:
# Fit a tree predicting over_50k from every other column of the training
# set, then show the model summary.
tree.income <- tree(formula = over_50k ~ ., data = income.train)
summary(tree.income)

In [None]:
# Print the fitted tree object.
tree.income
# Draw the tree first, then add the text labels on top of that plot
# (text() annotates the plot produced by the preceding plot() call).
plot(tree.income)
text(tree.income, pretty=0) # pretty uses factor names for plot

In [None]:
# Score the held-out rows with the fitted tree, asking for class labels.
tree.pred <- predict(tree.income, income.test, type = "class")

# confusion matrix and accuracy calc
# caret supplies confusionMatrix(); install it only when it is missing so
# re-running this cell does not download the package again.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)

# Compare predictions with the true labels, reporting precision/recall and
# treating "Yes" as the positive class.
confusionMatrix(
  tree.pred,
  income.test$over_50k,
  mode = "prec_recall",
  positive = "Yes"
)


Go back to slide 19 for confusion matrix interpretation.

In [None]:
# Install randomForest only when it is missing, so re-running this cell does
# not download the package again.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

set.seed(54321) # fixed seed so the forest is reproducible
# build random forest and track variable importance
rf.income <- randomForest(over_50k ~ ., data = income.train, importance = TRUE)
# Print the fitted forest.
rf.income

Go back to slide 21 to discuss variable importance.

In [None]:
# show importance
# importance() prints the variable-importance measures of the fitted forest
# (the forest was built with importance tracking enabled); varImpPlot() draws
# the same information as a plot.
importance(rf.income)
varImpPlot(rf.income)

In [None]:
# Class predictions from the random forest for the held-out rows.
rf.income.pred <- predict(rf.income, newdata = income.test, type = "class")

# Evaluate the predictions against the true labels: confusion matrix with
# precision/recall statistics, "Yes" being the positive class.
library(caret)
confusionMatrix(
  rf.income.pred,
  income.test$over_50k,
  mode = "prec_recall",
  positive = "Yes"
)


Go back to slide 22 for boosted decision trees.

In [None]:
# Prepare copies of the train and test sets for boosting, where the target
# has to be numeric: 1 when over_50k is "Yes", 0 otherwise.
income.train.forboost <- income.train
income.test.forboost <- income.test

income.train.forboost$over_50k <-
  as.numeric(income.train.forboost$over_50k == "Yes")
income.test.forboost$over_50k <-
  as.numeric(income.test.forboost$over_50k == "Yes")

In [None]:
# Install the 'gbm' package only when it is not already available, so that
# re-running this cell does not download it again.
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm")
}

In [None]:
library(gbm)
set.seed(123) # fixed seed so the boosted fit is reproducible

# Fit a boosted model made of 1000 trees built one after another; verbose
# prints interim results while fitting.
# The Bernoulli distribution fits the 0/1 values of the target.
boosted.income <- gbm(
  formula = over_50k ~ .,
  distribution = "bernoulli",
  data = income.train.forboost,
  n.trees = 1000,
  verbose = TRUE
)

In [None]:
# Print the fitted boosted model object.
boosted.income

In [None]:
# Summarise the fitted boosted model.
summary(boosted.income)

In [None]:
# make predictions
# type = "response" returns probabilities; all 1000 trees are used.
boosted.pred <- predict(
  boosted.income,
  income.test.forboost,
  n.trees = 1000,
  type = "response"
)
# turn probabilities to text values
boosted.pred <- ifelse(boosted.pred > 0.5, "Yes", "No")

# confusion matrix and accuracy calc
# Build the prediction factor with the reference's levels: as.factor() would
# drop a level whenever every prediction falls on the same side of 0.5, and
# the prediction and reference factors would then no longer share levels.
confusionMatrix(
  factor(boosted.pred, levels = levels(income.test$over_50k)),
  income.test$over_50k,
  mode = "prec_recall",
  positive = "Yes"
)


Go back to slide 23.

In [None]:
# create ROC curve for model
# Install pROC only when it is missing, so re-running this cell does not
# download the package again.
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}

In [None]:
library(pROC)
# The plot layers below (xlab, ylab, annotate, ggtitle) are ggplot2
# functions. Attach ggplot2 explicitly instead of relying on some other
# package having attached it earlier in the session.
library(ggplot2)

# Predicted probabilities for the test rows from the boosted model.
boosted.pred.roc <- predict(
  boosted.income,
  income.test.forboost,
  n.trees = 1000,
  type = "response"
)

# State the class order (control "No", case "Yes") and the direction
# (higher probability means case) explicitly rather than letting roc()
# guess them.
# NOTE(review): assumes the levels of over_50k are exactly "No" and "Yes",
# as the other evaluation steps do.
roc_obj <- roc(
  income.test$over_50k,
  boosted.pred.roc,
  levels = c("No", "Yes"),
  direction = "<"
)
# see AUC - area under curve
roc_obj$auc

# draw ROC curve with title and AUC
roc_plot <- ggroc(roc_obj, legacy.axes = TRUE)
# annotate() draws the diagonal reference line once; constants mapped inside
# aes() would be repeated for every row of the ROC data.
roc_plot + xlab("FPR") + ylab("TPR") +
  annotate(
    "segment",
    x = 0, xend = 1, y = 0, yend = 1,
    color = "darkgrey", linetype = "dashed"
  ) +
  ggtitle(paste("ROC Curve with AUC", roc_obj$auc))
