# week9_assi5_solExtra.R

# # -*- coding: utf-8 -*-
# """Week9 Assi5 Sol1.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1_Et_v9mmzBrmbE8jR1XTGHR_m1d5DieY
# """

###########################################################################
## Week-9, Homework-5, Sol-4 (Extra)
## Sreya Dhar 
## Created: Nov 04, 2020
## Edited: Nov 16, 2020
###########################################################################

# Remove every object that ls() lists in the current environment so the
# script starts from a clean state.
# NOTE(review): this also deletes anything the user had defined before
# running the script -- confirm that is intended.
rm(list=ls())
## installing all the libraries in R kernel


# install.packages("corrplot")
# install.packages("forecast")
# install.packages("zoo")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("caret")
# install.packages("ROCR")
# install.packages("PerformanceAnalytics")
# install.packages("funModeling")
# install.packages("hrbrthemes")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("kableExtra")
# install.packages("broom")
# install.packages("knitr")
# install.packages("psych")
# install.packages("aod")
# install.packages("epiDisplay")
# install.packages("e1071")
# install.packages("class")
# install.packages("rpart.plot")
# install.packages("party")
# install.packages("partykit")
# install.packages("rattle")

## importing the libraries in R kernel

# NOTE: the attach order is kept as-is on purpose; several of these packages
# mask each other's functions, so reordering could change which one is called.
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(repr)
library(reshape2)
library(forecast)
library(zoo)
library(rsample)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(leaps)
library(car)
library(PerformanceAnalytics)
library(funModeling)
library(gridExtra)
library(caret)
library(MASS)
library(Hmisc)
library(hrbrthemes)
library(GGally)
library(glmnet)
library(pROC)
library(psych)
library(aod)
library(epiDisplay)
library(e1071)
library(ggthemes)
library(kableExtra)
library(broom)
library(knitr)
library(devtools)
library(rpart)  #for trees
library(rattle)    # Fancy tree plot This is a difficult library to install (https://gist.github.com/zhiyzuo/a489ffdcc5da87f28f8589a55aa206dd) 
library(rpart.plot)             # Enhanced tree plots
library(RColorBrewer)       # Color selection for fancy tree plot
library(party)                  # Alternative decision tree algorithm
library(partykit)               # Convert rpart object to BinaryTree
library(randomForest)
library(gbm)                    # Gradient boosting (gbm(), predict.gbm()) used in the Boosting section
library(viridis)
library(tree)
library(factoextra)

## Working directory ----
# Relative paths used below (e.g. "covertype.csv") are resolved against it.
setwd("C:/File E/EAS 506 Statistical Mining I/Week 9/Assignment-5")

## One-off conversion of the .RData file into a .csv file (kept for reference)
# load("C:/File E/EAS 506 Statistical Mining I/Week 9/Assignment-5/covertype.RData")
# write.csv(covertype,'covertype.csv')

# Read the exported table and discard its first column in one step
# (presumably the row-name column that write.csv() added -- confirm).
covertype <- read.csv("covertype.csv", header = TRUE)[, -1]

## Collapse the one-hot indicator columns into single integer codes ----
# Columns 15:54 are treated as the soil-type indicators and 11:14 as the
# wilderness-area indicators; every other column is kept unchanged.
soil_type <- covertype[, 15:54]
wild_area <- covertype[, 11:14]
forest <- covertype[, -c(11:14, 15:54)]

# The decoding only makes sense when each row flags exactly one column per
# group, so stop with a clear error rather than decoding malformed rows.
stopifnot(
  all(rowSums(soil_type == 1) == 1),
  all(rowSums(wild_area == 1) == 1)
)

# max.col() gives, for every row, the position of the largest entry, i.e.
# the position of the single 1. It is vectorised (no row-wise apply) and
# the code is the true column index even when a category never occurs.
soil_encode <- factor(
  max.col(as.matrix(soil_type), ties.method = "first"),
  levels = seq_len(ncol(soil_type))
)
forest$soil_type <- as.integer(soil_encode)

wild_encode <- factor(
  max.col(as.matrix(wild_area), ties.method = "first"),
  levels = seq_len(ncol(wild_area))
)
forest$wild_area <- as.integer(wild_encode)

# Put the two new code columns (12, 13) before the former column 11 so that
# column 13 ends up being the one named "Cover_Type" below.
forest <- forest[, c(1:10, 12, 13, 11)]
head(forest)

## After rearranging the columns and renaming them ##
names(forest) <- c(
  "Elevation", "Aspect", "Slope", "Hor_Dist_To_Hydro", "Virt_Dist_To_Hydro",
  "Hor_Dist_To_Roads", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
  "Hor_Dist_To_Fire_Points", "Soil_Type", "Wilderness_Area", "Cover_Type"
)

## Pairwise correlations of all columns, drawn as a lower-triangle plot ##
options(
  repr.plot.width = 6,
  repr.plot.height = 6,
  repr.plot.res = 200
)
L <- cor(forest)
corrplot(L, type = "lower", method = "circle")

# Quick look at the first rows and the column names
head(forest)

names(forest)

# Per-column summaries (status() and profiling_num() are presumably the
# funModeling helpers -- confirm against the attached packages)
status(forest)

profiling_num(forest)

#### distinct values found in 'Soil_Type' and in 'Wilderness_Area'
as.data.frame(unique(forest["Soil_Type"]))
as.data.frame(unique(forest["Wilderness_Area"]))
############# EDA on the dataset based on covertype ########
############################################################
# Distribution plot for columns 1:6 and 10 of `forest`.
# NOTE(review): par(mar = ...) is passed as an unnamed argument to violin();
# the call is left exactly as written -- confirm it does what was intended.
violin(forest[,c(1:6,10)], las=3, par(mar = c(4, 4, 2, 2)), col="pink", main="Boxplot of different variables")

theme_set(theme_bw(base_size = 10))
options(
  repr.plot.width = 7,
  repr.plot.height = 5,
  repr.plot.res = 230
)

# One boxplot per selected variable, grouped and coloured by cover type.
ggplot(forest, aes(x = factor(Cover_Type), y = Elevation)) +
  geom_boxplot(aes(color = factor(Cover_Type)), alpha = 0.1)
ggplot(forest, aes(x = factor(Cover_Type), y = Aspect)) +
  geom_boxplot(aes(color = factor(Cover_Type)), alpha = 0.1)
ggplot(forest, aes(x = factor(Cover_Type), y = Hor_Dist_To_Roads)) +
  geom_boxplot(aes(color = factor(Cover_Type)), alpha = 0.1)
ggplot(forest, aes(x = factor(Cover_Type), y = Hor_Dist_To_Fire_Points)) +
  geom_boxplot(aes(color = factor(Cover_Type)), alpha = 0.1)

################## Preparing the data for Analysis ########################

## Train / test partition ----
# Fix the RNG state first so the partition is reproducible.
set.seed(4444)

# initial_split() (loaded with 'rsample') assigns a proportion of 0.75 of
# the rows to the training part; training() / testing() extract the parts.
data_split <- initial_split(forest, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)

## Growing classification tree
# Cover_Type is wrapped in factor() so that a classification tree is grown;
# all remaining columns of data_train are used as predictors.
tree_mod <- tree(factor(Cover_Type) ~ ., data = data_train, split = "deviance")
plot(tree_mod)
text(tree_mod, cex = 1.5)

# caret::confusionMatrix() expects predictions in the rows and the true
# classes in the columns of a table; with the axes swapped, the per-class
# sensitivity / specificity figures are attributed to the wrong side.
# The reference is given the same levels as the prediction so the table
# stays square even if a class is absent from one partition.

## Training-set performance (column 13, Cover_Type, is dropped from newdata)
tree_pred_tr <- predict(tree_mod, data_train[, -13], type = "class")
tree_tab_tr <- table(
  Prediction = tree_pred_tr,
  Original = factor(data_train$Cover_Type, levels = levels(tree_pred_tr))
)
confusionMatrix(tree_tab_tr)

## Test-set performance
tree_pred_te <- predict(tree_mod, data_test[, -13], type = "class")
tree_tab_te <- table(
  Prediction = tree_pred_te,
  Original = factor(data_test$Cover_Type, levels = levels(tree_pred_te))
)
confusionMatrix(tree_tab_te)

###################################################################
###################### Bagging ####################################
###################################################################

# Bagging is a random forest that considers every predictor at each split,
# hence mtry = 12 (data_train has 12 predictor columns plus Cover_Type).
# The argument is `ntree`; the misspelt `ntrees` was silently swallowed by
# `...`, so the requested number of trees was never actually passed on.
bag <- randomForest(
  factor(Cover_Type) ~ .,
  data = data_train,
  mtry = 12,
  ntree = 500,
  importance = TRUE
)

bag_pred_tr <- predict(bag, data_train[, -13], type = "class")
bag_pred_te <- predict(bag, data_test[, -13], type = "class")

# caret::confusionMatrix() expects predictions in rows and true classes in
# columns; the reference shares the prediction's levels so the table is
# always square.
bag_tab_tr <- table(
  Prediction = bag_pred_tr,
  Original = factor(data_train$Cover_Type, levels = levels(bag_pred_tr))
)
bag_tab_te <- table(
  Prediction = bag_pred_te,
  Original = factor(data_test$Cover_Type, levels = levels(bag_pred_te))
)
confusionMatrix(bag_tab_tr)
confusionMatrix(bag_tab_te)

# Variable importance; type = 1 selects the permutation-based (mean
# decrease in accuracy) measure, available because importance = TRUE.
importance(bag, type = 1)
varImpPlot(bag, type = 1)

###################################################################
################## Random Forest for Classification ###############
###################################################################

# mtry = 4 is close to the usual sqrt(p) suggestion for p = 12 predictors.
# The argument is `ntree`; the misspelt `ntrees` was silently ignored.
rf <- randomForest(
  factor(Cover_Type) ~ .,
  data = data_train,
  mtry = 4,
  ntree = 500,
  importance = TRUE
)

rf_pred_tr <- predict(rf, data_train[, -13], type = "class")
# Test-set predictions must come from data_test; predicting on data_train
# here produced a vector whose length did not match data_test$Cover_Type.
rf_pred_te <- predict(rf, data_test[, -13], type = "class")

# caret::confusionMatrix() expects predictions in rows and true classes in
# columns; the reference shares the prediction's levels so the table is
# always square.
rf_tab_tr <- table(
  Prediction = rf_pred_tr,
  Original = factor(data_train$Cover_Type, levels = levels(rf_pred_tr))
)
rf_tab_te <- table(
  Prediction = rf_pred_te,
  Original = factor(data_test$Cover_Type, levels = levels(rf_pred_te))
)
confusionMatrix(rf_tab_tr)
confusionMatrix(rf_tab_te)

# Permutation-based variable importance (type = 1)
importance(rf, type = 1)
varImpPlot(rf, type = 1)

################## Iteration on mtry for RF #############
#########################################################

# Fit one forest per candidate mtry (1..12) and record its test-set
# misclassification rate.
mtry_grid <- seq_len(12)
rf.c <- vector("list", length(mtry_grid))
yhat.rf <- vector("list", length(mtry_grid))
# Numeric vector rather than a list, so it can be plotted directly.
misclass_rf <- numeric(length(mtry_grid))

for (i in mtry_grid) {
  # Same seed for every fit so the forests differ only through mtry.
  set.seed(4444)
  rf.c[[i]] <- randomForest(
    factor(Cover_Type) ~ .,
    data = data_train,
    mtry = i,
    importance = TRUE
  )
  # Predict on the test rows: the error below is measured against
  # data_test$Cover_Type, so predictions on data_train would be compared
  # against labels of a different set (and of a different length).
  yhat.rf[[i]] <- predict(rf.c[[i]], newdata = data_test[, -13])
  misclass_rf[i] <- mean(yhat.rf[[i]] != data_test$Cover_Type)
}

par(bg = "white")
matplot(
  mtry_grid, misclass_rf,
  xlab = "No. of Variables (mtry)",
  ylab = "Misclassification error",
  main = "Subset of predictors for Misclassification error"
)
lines(mtry_grid, misclass_rf, type = "o")

###############################################
# Boosting ##
###############################################
# Gradient boosting with a multinomial loss, 200 trees, shrinkage 0.1 and
# 10-fold cross-validation. Requires the 'gbm' package to be attached.
# NOTE(review): recent gbm releases reportedly warn that
# distribution = "multinomial" is unmaintained -- confirm for the installed
# version.
mod_boost <- gbm(
  factor(Cover_Type) ~ .,
  data = data_train,
  distribution = "multinomial",
  cv.folds = 10,
  shrinkage = 0.1,
  n.minobsinnode = 10,
  n.trees = 200
)

print(mod_boost)

## Training set ----
pred_boost_tr <- predict.gbm(
  object = mod_boost,
  newdata = data_train,
  n.trees = 200,
  type = "response"
)

# The column names of the prediction carry the class labels; the predicted
# class of a row is the column with the largest predicted value.
boost_classes <- colnames(pred_boost_tr)
labels_tr <- boost_classes[apply(pred_boost_tr, 1, which.max)]
result_tr <- data.frame(data_train$Cover_Type, labels_tr)

# Both sides are turned into factors over the same class set so the table
# is square even when some class is never predicted, and predictions go in
# the rows as caret::confusionMatrix() expects.
confusionMatrix(table(
  Prediction = factor(labels_tr, levels = boost_classes),
  Original = factor(data_train$Cover_Type, levels = boost_classes)
))

## Test set ----
pred_boost_te <- predict.gbm(
  object = mod_boost,
  newdata = data_test,
  n.trees = 200,
  type = "response"
)

labels <- colnames(pred_boost_te)[apply(pred_boost_te, 1, which.max)]
result <- data.frame(data_test$Cover_Type, labels)
# print(result)

confusionMatrix(table(
  Prediction = factor(labels, levels = colnames(pred_boost_te)),
  Original = factor(data_test$Cover_Type, levels = colnames(pred_boost_te))
))

#######################################
## Linear Discriminant Analysis (LDA)
#######################################

# Fit on the training rows, then score both partitions with the same model.
lda_mod <- lda(factor(Cover_Type) ~ ., data = data_train)
lda.pred.train <- predict(lda_mod, newdata = data_train)
lda.pred.test <- predict(lda_mod, newdata = data_test)

# Predicted classes next to the observed ones
y_lda_train <- lda.pred.train$class
y_lda_test <- lda.pred.test$class
y_true_train <- data_train$Cover_Type
y_true_test <- data_test$Cover_Type

## Misclassification error for LDA: share of rows whose predicted class
## differs from the observed one
lda_train_error <- mean(y_true_train != y_lda_train)
lda_test_error <- mean(y_true_test != y_lda_test)

lda_train_error
lda_test_error

# Fitted model and the distribution of predicted training classes
lda_mod

summary(lda.pred.train$class)

## AUC for LDA, computed from the posterior class probabilities

roc_lda_tr <- multiclass.roc(data_train$Cover_Type, lda.pred.train$posterior)
roc_lda_te <- multiclass.roc(data_test$Cover_Type, lda.pred.test$posterior)

print("LDA train set AUC")
auc(roc_lda_tr)
print("LDA test set AUC")
auc(roc_lda_te)

## confusion matrix from LDA on train set
# Predictions go in the rows and the reference in the columns, as
# caret::confusionMatrix() expects. The reference is given the same levels
# as the prediction so the table is square even if a class is missing.

tab_tr_lda <- table(
  Predicted = y_lda_train,
  Reference = factor(data_train$Cover_Type, levels = levels(y_lda_train))
)
caret::confusionMatrix(tab_tr_lda)

## probability table: each cell as a share of all training rows

round(prop.table(tab_tr_lda), 2)

## confusion matrix from LDA on test set
# `positive = "Barbera"` was removed: it is a label from a different data
# set, and `positive` only applies to two-class problems anyway.

tab_te_lda <- table(
  Predicted = y_lda_test,
  Reference = factor(data_test$Cover_Type, levels = levels(y_lda_test))
)
caret::confusionMatrix(tab_te_lda)

## probability table: each cell as a share of all test rows
## (printed once; the original printed the identical table twice)

conf_mat_lda <- round(prop.table(tab_te_lda), 2)
conf_mat_lda

##############################################################
###################### PCA ###################################
############# applying pca on train data #####################

# `scale.` is spelt out in full; the shorter `scale` only worked through
# partial argument matching.
pca_tr <- prcomp(data_train[, 1:12], scale. = TRUE)
sum_pca_tr <- summary(pca_tr)
sum_pca_tr

############# applying pca on test data (descriptive only) ###
# This separate fit is kept for its printed summary. Its scores are NOT
# used for prediction: components from an independent fit need not line up
# with the training components.
pca_te <- prcomp(data_test[, 1:12], scale. = TRUE)
sum_pca_te <- summary(pca_te)
sum_pca_te

options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1))
fviz_eig(pca_tr, title = "Explained variance in train set", xlab = "Components")

pca_tr

### eigen values from PCA
eig_val_tr <- get_eigenvalue(pca_tr)
eig_val_tr

### extracting the first six components for the train and test set #########
pred_pca_tr <- pca_tr$x[, 1:6]
resp_pca_tr <- data_train[, 13]
dat_pca_tr <- cbind(pred_pca_tr, resp_pca_tr)
data_pca_tr <- data.frame(dat_pca_tr)

# Project the test rows with the centring, scaling and rotation learned on
# the training set, so train and test scores live in the same coordinate
# system and no test information enters the transformation.
pred_pca_te <- predict(pca_tr, newdata = data_test[, 1:12])[, 1:6]
resp_pca_te <- data_test[, 13]
dat_pca_te <- cbind(pred_pca_te, resp_pca_te)
data_pca_te <- data.frame(dat_pca_te)

############# applying LDA on 1st 6 components of PCA from train data #####################
# NOTE: this overwrites lda_mod / lda.pred.* / y_lda_* / tab_*_lda from the
# earlier LDA on the raw predictors.

lda_mod <- lda(resp_pca_tr ~ ., data = data_pca_tr)
lda.pred.train <- predict(lda_mod, newdata = data_pca_tr)
y_lda_train <- lda.pred.train$class
lda.pred.test <- predict(lda_mod, newdata = data_pca_te)
y_lda_test <- lda.pred.test$class

## confusion matrix from LDA on train set from PCA (comp=6) ###################
tab_tr_lda <- table(
  Predicted = y_lda_train,
  Reference = factor(data_train$Cover_Type, levels = levels(y_lda_train))
)
caret::confusionMatrix(tab_tr_lda)

## confusion matrix from LDA on test set from PCA (comp=6) ###################
# `positive = "Barbera"` was removed: it is a label from a different data
# set, and `positive` only applies to two-class problems anyway.
tab_te_lda <- table(
  Predicted = y_lda_test,
  Reference = factor(data_test$Cover_Type, levels = levels(y_lda_test))
)
caret::confusionMatrix(tab_te_lda)


###################################################################
############ Knn prediction on the dataset ########################
###################################################################

# Share of positions at which `actual` and `predicted` are equal.
# Both arguments are compared element-wise with `==`.
accuracy <- function(actual, predicted) {
  is_hit <- actual == predicted
  mean(is_hit)
}

# Share of positions at which `actual` and `predicted` differ.
# Both arguments are compared element-wise with `!=`.
error <- function(actual, predicted) {
  is_miss <- actual != predicted
  mean(is_miss)
}

# Candidate neighbourhood sizes for kNN. The models below use the 12
# original predictor columns of data_train / data_test (the `_pca_` part of
# the variable names is historical; no PCA scores are involved here).
iter_k <- c(1, 3, 5, 7, 9, 13, 15) ## giving k-values
accu_pca_te <- rep(x = 0, times = length(iter_k))
accu_pca_tr <- rep(x = 0, times = length(iter_k))

error_pca_te <- rep(x = 0, times = length(iter_k))
error_pca_tr <- rep(x = 0, times = length(iter_k))

# Training-set accuracy / error for every k (train rows scored against
# themselves).
for (i in seq_along(iter_k)) {
  pred_pca_tr <- knn(
    train = data_train[, 1:12],
    test = data_train[, 1:12],
    cl = data_train$Cover_Type,
    k = iter_k[i]
  )
  accu_pca_tr[i] <- accuracy(data_train$Cover_Type, pred_pca_tr)
  error_pca_tr[i] <- error(data_train$Cover_Type, pred_pca_tr)
}

# Test-set accuracy / error for every k.
for (i in seq_along(iter_k)) {
  pred_pca_te <- knn(
    train = data_train[, 1:12],
    test = data_test[, 1:12],
    cl = data_train$Cover_Type,
    k = iter_k[i]
  )
  accu_pca_te[i] <- accuracy(data_test$Cover_Type, pred_pca_te)
  error_pca_te[i] <- error(data_test$Cover_Type, pred_pca_te)
}

# After the loops, pred_pca_tr / pred_pca_te hold the predictions of the
# LAST k in iter_k, so the figures below describe that model only.
last_k <- iter_k[length(iter_k)]
error_pca_train <- mean(data_train$Cover_Type != pred_pca_tr)
error_pca_test <- mean(data_test$Cover_Type != pred_pca_te)

# The messages previously said "linear regression", although the numbers
# come from the kNN classifier.
print(paste0(
  "Accuracy of train set from kNN (k = ", last_k, "): ",
  (1 - error_pca_train) * 100, " %"
))
print(paste0(
  "Accuracy of test set from kNN (k = ", last_k, "): ",
  (1 - error_pca_test) * 100, " %"
))

# Test errors followed by train errors, one value per k
c(error_pca_te, error_pca_tr)

####################### Comparing classification accuracy and error in kNN for each k #########################
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
par(mfrow = c(2, 2))

# The x axis shows k itself, so the vertical marker must sit at the k value
# (iter_k[...]), not at the position returned by which.max() / which.min().
# abline() has no x / y / type arguments; only v or h is supplied.
best_k_acc_tr <- iter_k[which.max(accu_pca_tr)]
best_k_acc_te <- iter_k[which.max(accu_pca_te)]
best_k_err_tr <- iter_k[which.min(error_pca_tr)]
best_k_err_te <- iter_k[which.min(error_pca_te)]

# plot accuracy vs choice of k on Training set
plot(iter_k, accu_pca_tr * 100, type = "b", col = "blue", cex = 1, pch = 20,
     lwd = 2, lty = 2,
     xlab = "k, number of neighbors", ylim = c(95, 100),
     ylab = "classification accuracy, %", main = "Accuracy of Training set")

abline(v = best_k_acc_tr, col = "red", lwd = 2, lty = 2)
abline(h = max(accu_pca_tr) * 100, col = "black", lty = 2)

# plot accuracy vs choice of k on Test set
plot(iter_k, accu_pca_te * 100, type = "b", col = "blue", cex = 1, pch = 20,
     lwd = 2,
     ylim = c(95, 100),
     xlab = "k, number of neighbors", ylab = "classification accuracy, %",
     main = "Accuracy of Test set")

# Marker derived from the data instead of a hard-coded k = 3
abline(v = best_k_acc_te, col = "red", lwd = 2, lty = 2)
abline(h = max(accu_pca_te) * 100, col = "black", lty = 2)

# plot error vs choice of k on Training set
plot(iter_k, error_pca_tr * 100, type = "b", col = "blue", cex = 1, pch = 20,
     lwd = 2,
     xlab = "k, number of neighbors", ylim = c(0, 5),
     ylab = "classification error, %",
     main = "classification error of Training set")

abline(v = best_k_err_tr, col = "red", lwd = 2, lty = 2)
abline(h = min(error_pca_tr) * 100, col = "black", lty = 2)

# plot error vs choice of k on Test set
plot(iter_k, error_pca_te * 100, type = "b", col = "blue", cex = 1, pch = 20,
     lwd = 2, lty = 2,
     ylim = c(0, 5),
     xlab = "k, number of neighbors", ylab = "classification error, %",
     main = "classification error of Test set")

# Marker derived from the data instead of a hard-coded k = 3
abline(v = best_k_err_te, col = "red", lwd = 2, lty = 2)
abline(h = min(error_pca_te) * 100, col = "black", lty = 2)

## Final kNN model with k = 3 ##
# NOTE: accu_pca_* and error_pca_* are overwritten with single values here.

# Training rows scored against themselves
pred_pca_tr <- knn(
  train = data_train[, 1:12],
  test = data_train[, 1:12],
  cl = data_train$Cover_Type,
  k = 3
)
accu_pca_tr <- accuracy(pred_pca_tr, data_train$Cover_Type) ## train accuracy
error_pca_tr <- error(pred_pca_tr, data_train$Cover_Type) ## train error
confusionMatrix(table(pred_pca_tr, data_train$Cover_Type))

# Test rows scored with the training rows as neighbours
pred_pca_te <- knn(
  train = data_train[, 1:12],
  test = data_test[, 1:12],
  cl = data_train$Cover_Type,
  k = 3
)

accu_pca_te <- accuracy(pred_pca_te, data_test$Cover_Type) ## test accuracy
error_pca_te <- error(pred_pca_te, data_test$Cover_Type) ## test error
confusionMatrix(table(pred_pca_te, data_test$Cover_Type))


## end ##