# week3_assi2_sol3_Dhar.r

# # -*- coding: utf-8 -*-
# """Week3 Assi2 Sol3.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1_aj8zTek2uUBfhhm967FKELtU3sssEMR
# """
###########################################################################
## Week-4, Homework-2, Sol-3
## Sreya Dhar 
## Created: Sept 20, 2020
## Edited: Sept 28, 2020
###########################################################################

# Remove every object listed by ls() from the current environment so the
# script starts from a clean workspace.
# NOTE(review): when this file is source()d into an interactive session this
# also deletes the caller's objects; running in a fresh R session is safer.
rm(list = ls())

## installing all the libraries in R kernel

# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("hrbrthemes")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("pls")
# install.packages("caret")
# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("pcr")
# install.packages("pls")

## importing the libraries in R kernel
library(ISLR)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(ggplot2)
library(reshape2)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(rsample) 
library(leaps)
library(car)
library(pls)
library(PerformanceAnalytics)
library(funModeling)
library(caret)
library(glmnet)
library (pls)


# Set working directory to where the assignment files are located.
# The path is specific to the original author's machine; an unconditional
# setwd() aborts the whole script wherever that folder does not exist, so the
# directory is only changed when it is actually present. No statement in this
# script reads a file (College is used directly after library(ISLR)), so
# skipping the change does not affect the analysis.
project_dir <- "C:/File E/EAS 506 Statistical Mining I/Week 3/Assignment-2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Working directory not changed; folder not found: ", project_dir)
}

# First look at the raw data
head(College)

names(College)

glimpse(College)

status(College)

# Work on a copy so the packaged dataset itself is never modified
College_C <- College

glimpse(College_C)

status(College_C)

profiling_num(College_C)

# Replace every factor column by its integer level codes so that all columns
# are numeric for the correlation / regression steps below
College_n1 <- College_C %>% mutate_if(is.factor, as.numeric)

## Correlation values on a chart matrix, combined with histograms and scatter
## plots of the different features.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(College_n1, histogram = TRUE, pch = 15)

# Pearson correlation matrix, drawn as a colour map ordered by hierarchical
# clustering (text labels suppressed)
res <- cor(College_n1, method = "pearson")
corrplot::corrplot(res, method = "color", order = "hclust", tl.pos = "n")

profiling_num(College_n1)

options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
plot_num(College_n1)

describe(College_n1)

summary(College_n1)

head(data.matrix(College_n1))

options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
pairs(College_n1, main = "Pairwise plot")

###############################################################################################
################################ Test and training data #######################################

## Min-max scaling of the College dataset prior to regression: each column is
## transformed as (x - min) / (max - min).
## The bounds are stored as col_min / col_max rather than `min` / `max` so the
## base functions of those names are not masked in the global environment.
## NOTE(review): the bounds are computed on the full data set before the
## train/test split, so test rows influence the scaling - confirm this is
## acceptable for the assignment.
col_max <- vapply(College_n1, max, numeric(1))
col_min <- vapply(College_n1, min, numeric(1))
College_n2 <- as.data.frame(
  scale(College_n1, center = col_min, scale = col_max - col_min)
)

## Splitting the dataset into train and test sets
set.seed(100) ## fix the RNG seed before sampling the split
clg_split1 <- initial_split(College_n2, prop = 0.75) ## split done by library 'rsample'
clg_train1 <- training(clg_split1)
clg_test1  <- testing(clg_split1)

################################################################################################
################################ Linear Regression #############################################
################################################################################################

# Baseline fit on the first split; its diagnostic plots are drawn below.
clg_lm1 <- lm(Apps ~ ., data = clg_train1)
summary(clg_lm1)

options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(2, 2))
plot(clg_lm1)

# Drop row 484 of the scaled data before refitting.
# NOTE(review): presumably the observation flagged by the diagnostics above;
# the index is hard-coded, so re-check it if the data or its order changes.
College_n <- College_n2[-484, ]

## Splitting the reduced dataset into train and test sets
set.seed(100) ## same seed as the first split
clg_split <- initial_split(College_n, prop = 0.75) ## split done by library 'rsample'
clg_train <- training(clg_split)
clg_test  <- testing(clg_split)

# Fit the model on train data
clg_lm <- lm(Apps ~ ., data = clg_train)
summary(clg_lm)

options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(2, 2))
plot(clg_lm)

# # Other useful functions
# coefficients(clg_lm) # model coefficients
# confint(clg_lm, level=0.95) # CIs for model parameters
# fitted(clg_lm) # predicted values
# residuals(clg_lm) # residuals
# anova(clg_lm) # anova table
# vcov(clg_lm) # covariance matrix for model parameters
# influence(clg_lm) # regression diagnostics

anova(clg_lm)["Residuals", "Mean Sq"] # residual mean square from the anova table
sigma(clg_lm) # residual standard deviation

# Predict on the train data
clg_pred_train <- predict(clg_lm, newdata = clg_train)

# Calculate MSE on train data
clg_mse_err_tr <- mean((clg_pred_train - clg_train$Apps)^2)
c(MSE = clg_mse_err_tr)

# Calculate "accuracy" on train data: mean of |row-wise min / row-wise max| of
# (actual, predicted). Reported in percent, like the test value below; the
# original omitted the factor of 100 here.
comb_lm_tr <- cbind(actual = clg_train$Apps, clg_pred_train)  # combine
100 * mean(abs(apply(comb_lm_tr, 1, min) / apply(comb_lm_tr, 1, max))) # accuracy, %

# Predict on the test data
clg_pred_test <- predict(clg_lm, newdata = clg_test)

# Calculate MSE on test data (adjusted R^2 is that of the training fit)
clg_mse_error <- mean((clg_pred_test - clg_test$Apps)^2)
c(MSE = clg_mse_error, Adj_R2 = summary(clg_lm)$adj.r.squared)

# Calculate "accuracy" on test data, in percent
comb_lm <- cbind(actual = clg_test$Apps, clg_pred_test)  # combine
100 * mean(abs(apply(comb_lm, 1, min) / apply(comb_lm, 1, max))) # accuracy, %

options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 2))

# Observed vs predicted, with the identity line as reference
plot(clg_train$Apps, clg_pred_train, col = "blue", xlab = "Original", ylab = "Predicted",
     xlim = c(0, 0.5), ylim = c(0, 0.5), main = "Prediction on train set")
abline(a = 0, b = 1, lty = 2)

plot(clg_test$Apps, clg_pred_test, col = "red", xlab = "Original", ylab = "Predicted",
     xlim = c(0, 0.5), ylim = c(0, 0.5), main = "Prediction on test set")
abline(a = 0, b = 1, lty = 2)

###############################################################################################
################################ Ridge Regression #############################################
###############################################################################################

## Converting the data frames to matrices for glmnet. Column 2 is treated as
## the response and every other column as a predictor.
## NOTE(review): column 2 is assumed to be Apps - confirm if columns are ever
## reordered.
X_train <- as.matrix(clg_train[, -2])
Y_train <- as.matrix(clg_train[, 2])
X_test <- as.matrix(clg_test[, -2])
Y_test <- as.matrix(clg_test[, 2])

## Defining a range of lambda: 10^2 down to 10^-3 in steps of 0.1 in the exponent
lam_ridge <- 10^seq(2, -3, by = -.1)
# alpha = 0 selects the ridge penalty. `nlambda` is no longer passed: glmnet
# ignores it whenever an explicit `lambda` sequence is supplied.
ridge_mod <- glmnet(X_train, Y_train, alpha = 0, family = "gaussian", lambda = lam_ridge)

summary(ridge_mod)

ridge_mod$dev.ratio

ridge_mod$lambda

# Coefficient paths against log(lambda) and against the norm of the coefficients
options(repr.plot.width = 12, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(1, 2))
plot(ridge_mod, xvar = "lambda", ylab = "Standardised coefficients", label = TRUE)
plot(ridge_mod, ylab = "Standardised coefficients", xlab = "L2 norm", label = TRUE)
#plot(ridge_mod$lambda, ridge_mod$coefficients , ylab="Standardised coefficients", xlab="lambda")

# Finding the optimal lambda value by cross-validation over the same grid
cvglm_ridge <- cv.glmnet(X_train, Y_train, alpha = 0, lambda = lam_ridge)

options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)

# plot(cvglm_ridge, xvar="lambda",  ylab="Standardised coefficients", label=TRUE)
opt_lam <- cvglm_ridge$lambda.min
opt_lam

par(mfrow = c(1, 2))
plot(cvglm_ridge, ylab = "MSE from CV in Ridge")
# Mark the selected lambda. `lwd` (line width) replaces the misspelt `ldw`,
# which is not a graphical parameter and only produced a warning.
abline(v = log(opt_lam), col = "green", lty = 2, lwd = 3)

# Refit on the training data using only the selected lambda
ridge_best <- glmnet(X_train, Y_train, alpha = 0, lambda = opt_lam)
# Printing out the ridge slope coefficients
ridge_best$beta

# Compute MSE and R^2 (as a percentage) from original and predicted values.
#
# Args:
#   original:  numeric vector or one-column matrix of observed responses.
#   predicted: numeric vector or one-column matrix of predictions, with the
#              same number of elements as `original`.
#
# Returns:
#   A one-row data.frame with columns `MSE` and `Rsquare_percent`.
eval_results <- function(original, predicted) {
  SSE <- sum((predicted - original)^2)
  SST <- sum((original - mean(original))^2)
  R_square <- (1 - SSE / SST) * 100 ## in percentage
  # length() rather than nrow(): nrow() is NULL for a plain vector, which made
  # MSE zero-length and the data.frame() call below fail. For a one-column
  # matrix both give the same count.
  MSE <- SSE / length(original)

  # Model performance metrics
  data.frame(
    MSE = MSE,
    Rsquare_percent = R_square
  )
}

# Retrieve the coefficients of the tuned ridge model. Rows 1..(p + 1) of the
# coefficient matrix are kept, p being the number of entries in
# ridge_best$beta, so the first row is included as well.
ridge_coef <- predict(
  ridge_best,
  type = "coefficients",
  s = opt_lam
)[seq_len(length(ridge_best$beta) + 1), ]
# Show only the coefficients that are not exactly zero
as.data.frame(ridge_coef[ridge_coef != 0])

# ---- Train set ----
# Predictions on the training predictors, then MSE and R^2 on the train data
ridge_pred_train <- predict(ridge_best, newx = X_train, s = opt_lam)
eval_results(Y_train, ridge_pred_train)

# Train "accuracy" in percent: mean |min / max| over (actual, predicted) pairs
comb_ridge_acc_tr <- cbind(actual = clg_train$Apps, ridge_pred_train)
100 * mean(abs(apply(comb_ridge_acc_tr, 1, function(pair) min(pair) / max(pair))))

# ---- Test set ----
# Predictions on the test predictors, then MSE and R^2 on the test data
ridge_pred_test <- predict(ridge_best, newx = X_test, s = opt_lam)
eval_results(Y_test, ridge_pred_test)

# Test MSE computed directly from the squared residuals
mse_ridge_test <- sum((ridge_pred_test - clg_test$Apps)^2) / length(clg_test$Apps)
mse_ridge_test

# Test "accuracy" in percent
comb_ridge_acc <- cbind(actual = clg_test$Apps, ridge_pred_test)
100 * mean(abs(apply(comb_ridge_acc, 1, function(pair) min(pair) / max(pair))))

###################################################################################################
################################ Lasso Regression #############################################
######################################################################################################

## Candidate lambdas: 10^2 down to 10^-4 in steps of 0.1 in the exponent
lam_lasso <- 10^seq(2, -4, by = -.1)

# Setting alpha = 1 implements lasso regression
lasso_mod <- glmnet(X_train, Y_train, alpha = 1, lambda = lam_lasso)
sum_lasso <- summary(lasso_mod)

sum_lasso

# Coefficient paths against log(lambda) and against the norm of the coefficients
options(repr.plot.width = 12, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(1, 2))
plot(lasso_mod, xvar = "lambda", label = TRUE, cex = 5)
plot(lasso_mod, label = TRUE)

# Cross-validate over the same lambda grid
cvglm_lasso <- cv.glmnet(X_train, Y_train, alpha = 1, lambda = lam_lasso)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)

# Best lambda selection
opt_las <- cvglm_lasso$lambda.min
opt_las

par(mfrow = c(1, 1))
plot(cvglm_lasso, ylab = "MSE from CV in Lasso")
# Mark the selected lambda. `lwd` (line width) replaces the misspelt `ldw`,
# which is not a graphical parameter and only produced a warning.
abline(v = log(opt_las), col = "green", lty = 2, lwd = 3)

# Refit on the training data with the best lambda
lasso_best <- glmnet(X_train, Y_train, alpha = 1, lambda = opt_las)
# Printing out the lasso slope coefficients
lasso_best$beta

# Retrieving the lasso coefficients: rows 2..(p + 1) of the coefficient
# matrix, i.e. the first row is skipped (p = number of entries in
# lasso_best$beta).
lasso_coef <- predict(
  lasso_best,
  type = "coefficients",
  s = opt_las
)[seq_len(length(lasso_best$beta)) + 1, ]
# Printing non zero coefficients
as.data.frame(lasso_coef[lasso_coef != 0])

# Prediction and evaluation (MSE, R^2) on train data
lasso_pred_train <- predict(lasso_best, s = opt_las, newx = X_train)
eval_results(Y_train, lasso_pred_train)

# Calculate train set "accuracy", in percent
comb_lasso_acc_tr <- cbind(actual = clg_train$Apps, lasso_pred_train)  # combine
mean(abs(apply(comb_lasso_acc_tr, 1, min) / apply(comb_lasso_acc_tr, 1, max))) * 100

# Prediction and evaluation (MSE, R^2) on test data
lasso_pred_test <- predict(lasso_best, s = opt_las, newx = X_test)
eval_results(Y_test, lasso_pred_test)

# Calculate test set "accuracy", in percent
comb_lasso_acc <- cbind(actual = clg_test$Apps, lasso_pred_test)  # combine
mean(abs(apply(comb_lasso_acc, 1, min) / apply(comb_lasso_acc, 1, max))) * 100

#############################################################################################
################################ PCR Regression #############################################
#############################################################################################

# Principal component regression on the training data with scaled predictors,
# cross-validation and jackknife enabled
pcr_fit <- pcr(
  Apps ~ .,
  data = clg_train,
  scale = TRUE,
  validation = "CV",
  jackknife = TRUE
)

#validationplot(pcr_fit, val.type ="R2")
summary(pcr_fit)

# Validation error against the number of components (RMSEP, then MSEP)
options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 2))
validationplot(pcr_fit, val.type = "RMSEP")
#axis(side = 1, at = c(5), cex.axis=0.7)
#abline(v = 5, col = "blue", lty = 3)

validationplot(pcr_fit, val.type = "MSEP")
#axis(side = 1, at = c(5), cex.axis=0.7)
#abline(v = 5, col = "blue", lty = 3)

# Initial set of plots (2 x 2 panel): prediction plot with a fitted reference
# line, two validation curves (default measure and R2), and the score plot
par(mfrow = c(2, 2))
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 250)
obsfit_1 <- predplot(pcr_fit, labels = rownames(clg_train), which = "validation")
abline(lm(obsfit_1[, 2] ~ obsfit_1[, 1]), col = "red", lty = 2, lwd = 2)
plot(pcr_fit, "validation",
     estimate = c("train", "CV"),
     legendpos = "topright")
plot(pcr_fit, "validation",
     estimate = c("train", "CV"),
     val.type = "R2",
     legendpos = "bottomright")
scoreplot(pcr_fit, labels = rownames(clg_train))


# Tune the number of components with caret using 10-fold cross-validation
clg_pcr <- train(
  Apps ~ .,
  data = clg_train,
  method = "pcr",
  scale = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 18
)

summary(clg_pcr)

# caret's resampling profile across the candidate numbers of components
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1))
plot(clg_pcr)
# axis(side = 1, at = c(clg_pcr$bestTune), cex.axis=0.7)
# abline(v = clg_pcr$bestTune, col = "blue", lty = 3)

# The selected tuning parameter (ncomp)
clg_pcr$bestTune

# Refit with the number of components fixed at 16
options(repr.plot.width = 12, repr.plot.height = 8, repr.plot.res = 200)
pcr_fit3 <- pcr(
  Apps ~ .,
  data = clg_train,
  ncomp = 16,
  scale = TRUE,
  validation = "CV",
  jackknife = TRUE
)
summary(pcr_fit3)

# Plots used to interpret the 16-component fit (2 x 3 panel): prediction plot
# with fitted line, R2 validation curve, coefficient plot with whiskers,
# biplot and score plot
par(mfrow = c(2, 3))
obsfit <- predplot(pcr_fit3, labels = rownames(clg_train), which = "validation")
abline(lm(obsfit[, 2] ~ obsfit[, 1]), col = "red", lty = 2, lwd = 2)
plot(pcr_fit3, "validation",
     estimate = c("train", "CV"),
     val.type = "R2",
     legendpos = "bottomright")
coefplot(pcr_fit3,
         se.whiskers = TRUE,
         labels = prednames(pcr_fit3),
         cex.axis = 0.5)
biplot(pcr_fit3)
scoreplot(pcr_fit3, labels = rownames(clg_train))

# ---- Test set ----
# Predictions with 16 components, stored next to the observed response
pcr_pred_te <- predict(pcr_fit3, X_test, ncomp = 16)
pcr_pred_fr <- data.frame(pcr_pred_te, Y_test)
par(mfrow = c(1, 1))
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
plot(Y_test, pcr_pred_te,
     xlab = "original", ylab = "predicted ",
     main = "Prediction on test set from PCR",
     xlim = c(0, 0.4), ylim = c(0, 0.4))
abline(a = 0, b = 1, col = "red", lty = 2, lwd = 2)

# MSE on the test set
with(pcr_pred_fr, mean((Apps.16.comps - Y_test)^2))

# Test "accuracy" in percent: mean |min / max| over (actual, predicted) pairs
comb_pcr1 <- cbind(actual = pcr_pred_fr[["Y_test"]], pcr_pred_fr[["Apps.16.comps"]])
100 * mean(abs(apply(comb_pcr1, 1, function(pair) min(pair) / max(pair))))

# ---- Train set ----
# Predictions with 16 components on the training predictors
pcr_pred_tr <- predict(pcr_fit3, X_train, ncomp = 16)
pcr_pred_fr_tr <- data.frame(pcr_pred_tr, Y_train)
par(mfrow = c(1, 1))
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
plot(Y_train, pcr_pred_tr,
     xlab = "original", ylab = "predicted",
     main = "Prediction on train set from PCR",
     xlim = c(0, 0.4), ylim = c(0, 0.4))
abline(a = 0, b = 1, col = "red", lty = 2, lwd = 2)

# MSE on the train set
with(pcr_pred_fr_tr, mean((Apps.16.comps - Y_train)^2))

# Train "accuracy" in percent
comb_pcr1_tr <- cbind(actual = pcr_pred_fr_tr[["Y_train"]], pcr_pred_fr_tr[["Apps.16.comps"]])
100 * mean(abs(apply(comb_pcr1_tr, 1, function(pair) min(pair) / max(pair))))


############################################################################################
################################ PLS Regression ############################################
############################################################################################


# Partial least squares regression on the training data with scaled
# predictors and cross-validation
pls_fit <- plsr(Apps ~ ., data = clg_train, scale = TRUE, validation = "CV")
summary(pls_fit)

options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 2))

# Validation error against the number of components (RMSEP, then MSEP)
validationplot(pls_fit, val.type = "RMSEP")
# axis(side = 1, at = c(9), cex.axis=0.7)
# abline(v = 9, col = "blue", lty = 3)
validationplot(pls_fit, val.type = "MSEP")
# axis(side = 1, at = c(9), cex.axis=0.7)
# abline(v = 9, col = "blue", lty = 3)

# Initial set of plots:
par(mfrow = c(2, 2))
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 250)
plot(pls_fit, labels = rownames(clg_train), which = "validation")
abline(a = 0, b = 1, col = "red", lty = 2, lwd = 2)
plot(pls_fit, "validation", estimate = c("train", "CV"), legendpos = "topright")
plot(pls_fit, "validation", estimate = c("train", "CV"), val.type = "R2",
     legendpos = "bottomright")
scoreplot(pls_fit, labels = rownames(clg_train))

# Tune the number of components with caret using 10-fold cross-validation
clg_pls <- train(Apps ~ ., data = clg_train, method = "pls", scale = TRUE,
  trControl = trainControl("cv", number = 10), tuneLength = 17)

# Plot model RMSE vs different values of components
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1))
plot(clg_pls)

# Print the best tuning parameter ncomp (minimises RMSE)
clg_pls$bestTune

# Choosing 8 components and fitting pls again
pls_fit1 <- plsr(Apps ~ ., data = clg_train, ncomp = 8, scale = TRUE,
                 validation = "CV", jackknife = TRUE)
summary(pls_fit1)

# The results to interpret
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 250)

par(mfrow = c(2, 2))
# Prediction plot with fitted line, R2 validation curve, coefficients with
# uncertainty whiskers from the jackknife, and biplot
obs_pls <- predplot(pls_fit1, labels = rownames(clg_train), which = "validation")
abline(lm(obs_pls[, 2] ~ obs_pls[, 1]), col = "red", lty = 2, lwd = 2)
plot(pls_fit1, "validation", estimate = c("train", "CV"), val.type = "R2",
     legendpos = "bottomright")
coefplot(pls_fit1, se.whiskers = TRUE, labels = prednames(pls_fit1), cex.axis = 0.5)
biplot(pls_fit1)

# Predictions on the test predictors with 8 components.
# The combined frame is stored as pls_pred_fr; the original reused the name
# pcr_pred_fr here, which silently overwrote the PCR results.
pls_pred_te <- predict(pls_fit1, X_test, ncomp = 8)
pls_pred_fr <- data.frame(pls_pred_te, Y_test)
par(mfrow = c(1, 1))
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
plot(Y_test, pls_pred_te, xlab = "original", ylab = "predicted",
     main = "Prediction on test set from PLS", xlim = c(0, 0.4), ylim = c(0, 0.4))
abline(a = 0, b = 1, col = "red", lty = 2, lwd = 2)

mean((clg_test[, 2] - pls_pred_te)^2) ## calculate mse on test set

# Calculate "accuracy" on test set, in percent
comb_pls1 <- cbind(actual = clg_test[, 2], pls_pred_te)  # combine
mean(abs(apply(comb_pls1, 1, min) / apply(comb_pls1, 1, max))) * 100 # test accuracy

# Predictions on the train predictors with 8 components (stored as
# pls_pred_fr_tr instead of overwriting pcr_pred_fr_tr)
pls_pred_tr <- predict(pls_fit1, X_train, ncomp = 8)
pls_pred_fr_tr <- data.frame(pls_pred_tr, Y_train)


par(mfrow = c(1, 1))
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
plot(Y_train, pls_pred_tr, xlab = "original", ylab = "predicted",
     main = "Prediction on train set from PLS", xlim = c(0, 0.4), ylim = c(0, 0.4))
abline(a = 0, b = 1, col = "red", lty = 2, lwd = 2)

mean((clg_train[, 2] - pls_pred_tr)^2) ## calculate mse on train set

# Calculate "accuracy" on train set, in percent
comb_pls1_tr <- cbind(actual = clg_train[, 2], pls_pred_tr)  # combine
mean(abs(apply(comb_pls1_tr, 1, min) / apply(comb_pls1_tr, 1, max))) * 100 # train accuracy

## end ##