# week5_assi3_sol3.r

# -*- coding: utf-8 -*-
# """Week5 Assi3 Sol3.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1v5eRqFi8d_TEfAjG8v-jE5PDRrp7Qyvc
# """

###########################################################################
## Week-6, Homework-3, Sol-3
## Sreya Dhar 
## Created: Oct 02, 2020
## Edited: Oct 14, 2020
###########################################################################



## Installing all the libraries in the R kernel (uncomment on first use)

# install.packages("corrplot")
# install.packages("forecast")
# install.packages("zoo")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("caret")
# install.packages("ROCR")
# install.packages("PerformanceAnalytics")
# install.packages("funModeling")
# install.packages("hrbrthemes")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("ggfortify")
# install.packages("cvms")
# install.packages("broom")
# install.packages("tibble")
# install.packages("rsvg")
# install.packages("ggimage")
# install.packages("factoextra")
# install.packages("Publish")
# install.packages("psych")

## importing the libraries in R kernel
library(psych)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(repr)
library(reshape2)
library(forecast)
library(zoo)
library(rsample)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(leaps)
library(car)
library(PerformanceAnalytics)
library(funModeling)
library(caret)
library(MASS)
library(Hmisc)
library(hrbrthemes)
library(GGally)
library(cvms)
library(tibble) 
library("factoextra")
library(ggfortify)
library(data.table)
library(Publish)
library(psych)


# No working directory is set: the analysis only uses the built-in `iris`
# data set and reads or writes no files, so a machine-specific setwd() would
# merely make the script fail on other computers.

## First rows, dimensions and column names of the raw data
head(iris)

dim(iris)

names(iris)

## Column types with a preview of the values
glimpse(iris)

## Per-column status table (funModeling)
status(iris)

## Numeric profiling of the measurement columns (funModeling)
profiling_num(iris)

summary(iris)

## Replace the Species factor by its integer codes so that every column is
## numeric (setosa = 1, versicolor = 2, virginica = 3).
iris_1 <- iris %>% mutate(across(where(is.factor), as.numeric))

## Min-max scaling of the iris data: each column is mapped onto [0, 1].
## The column bounds get their own names so that base::max() and base::min()
## are not shadowed by data objects.
col_max <- vapply(iris_1, max, numeric(1))
col_min <- vapply(iris_1, min, numeric(1))
iris_s <- as.data.frame(
  scale(iris_1, center = col_min, scale = col_max - col_min)
)

## Heatmap of the scaled data after an additional column-wise standardisation
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
iris_h <- as.data.frame(scale(iris_s, center = TRUE, scale = TRUE))
heatmap.2(
  as.matrix(iris_h),
  scale = "none",
  col = bluered(100),
  trace = "none",
  density.info = "none"
)

## Numeric profile of the min-max scaled data
profiling_num(iris_s)

## Histograms of every scaled column
options(repr.plot.width = 6, repr.plot.height = 4, repr.plot.res = 200)
plot_num(iris_s)

describe(iris_s)

## Standard deviation of the four raw measurement columns
vapply(iris[, 1:4], sd, numeric(1))

## Scaled species codes (0, 0.5, 1 after min-max scaling of the codes 1, 2, 3)
Spec <- iris_s[, 5]

## Readable species labels; the sorted codes map to the labels in order
Species_orig_full <- factor(Spec,
  labels = c("setosa", "versicolor", "virginica"))

iris_r <- cbind(iris_s, Species_orig_full)

options(repr.plot.width = 6, repr.plot.height = 4, repr.plot.res = 200)
## No par(mfrow = ...) here: it only arranges base-graphics plots and has no
## effect on the ggplot2 figures below, which are drawn one after another.

## Sepal length vs. sepal width, coloured by species
ggplot(data = iris_r,
       aes(x = Sepal.Length, y = Sepal.Width, col = Species_orig_full)) +
  geom_point(size = 2) +
  theme_bw()

## Petal length vs. petal width, coloured by species
ggplot(data = iris_r,
       aes(x = Petal.Length, y = Petal.Width, col = Species_orig_full)) +
  geom_point(size = 2) +
  theme_bw()

## Correlation chart (with histograms) of all scaled columns
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 150)
chart.Correlation(iris_s, histogram = TRUE, pch = 9)

## Pairwise panel plot of the four raw iris measurements; the point fill
## colour encodes the species
species_fill <- c("red", "green", "blue")[unclass(iris$Species)]
pairs.panels(
  iris[, 1:4],
  main = "Correlation plot on variables of Iris dataset",
  pch = 21,
  bg = species_fill,
  hist.col = "red"
)

## Train/test partition of the scaled data (75 % of the rows for training)
set.seed(123) # fixed seed for the sampling
iris_split <- initial_split(iris_s, prop = 0.75) # split object from 'rsample'
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

dim(iris_s)

dim(iris_train)

### Predictors (all columns but the 5th) and response (5th column) for knn ###
Y_train <- iris_train[, 5]
Y_test <- iris_test[, 5]
X_train <- iris_train[, -5]
X_test <- iris_test[, -5]

head(X_train)

Y_test

############ Knn prediction ########################

## Share of positions at which `actual` and `predicted` agree
accuracy <- function(actual, predicted) {
  agree <- actual == predicted
  mean(agree)
}

## Share of positions at which `actual` and `predicted` differ
error <- function(actual, predicted) {
  differ <- actual != predicted
  mean(differ)
}

# set.seed(100)
iter_k <- c(1, 3, 5, 7, 9, 11, 13, 15) ## candidate k-values
accu_k_te <- numeric(length(iter_k))
accu_k_tr <- numeric(length(iter_k))

error_k_te <- numeric(length(iter_k))
error_k_tr <- numeric(length(iter_k))

## kNN fitted on the training set and evaluated on the test set for every k
for (i in seq_along(iter_k)) {
  pred_k_te <- knn(train = X_train,
                   test = X_test,
                   cl = Y_train,
                   k = iter_k[i])
  accu_k_te[i] <- accuracy(pred_k_te, Y_test) ## accuracy from knn on test set
  error_k_te[i] <- error(pred_k_te, Y_test) ## error from knn on test set
}

error_k_te

## kNN fitted on the training set and evaluated on the training set itself
for (i in seq_along(iter_k)) {
  pred_k_tr <- knn(train = X_train,
                   test = X_train,
                   cl = Y_train,
                   k = iter_k[i])
  accu_k_tr[i] <- accuracy(pred_k_tr, Y_train) ## accuracy from knn on train set
  error_k_tr[i] <- error(Y_train, pred_k_tr) ## error from knn on train set
}

error_k_tr

## pred_k_tr and pred_k_te still hold the predictions of the last loop pass,
## so the two values below belong to the last (largest) k in iter_k only.
error_tr <- mean(Y_train != pred_k_tr)
error_te <- mean(Y_test != pred_k_te)

k_last <- iter_k[length(iter_k)]
print(paste("Accuracy of train set from kNN with k =", k_last, ":",
            (1 - error_tr) * 100, "%"))
print(paste("Accuracy of test set from kNN with k =", k_last, ":",
            (1 - error_te) * 100, "%"))

c(error_te, error_tr)

####################### Classification accuracy and error of kNN versus k #########################
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 180)
par(mfrow = c(2, 2)) # 2 x 2 grid: accuracy (train, test), then error (train, test)

# Accuracy vs. choice of k on the training set.
# ylim keeps the 95-100 % window but widens it if a value falls outside.
plot(iter_k, accu_k_tr * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     xlab = "k, number of neighbors", ylim = range(95, 100, accu_k_tr * 100),
     ylab = "classification accuracy, %", main = "Accuracy of Training set")

# which.max() returns a position in iter_k, so it is mapped back to the actual
# k before the vertical marker is drawn; the dashed line marks the best value.
abline(v = iter_k[which.max(accu_k_tr)], col = "red", lwd = 2)
abline(h = max(accu_k_tr) * 100, col = "black", lty = 2)

# Accuracy vs. choice of k on the test set
plot(iter_k, accu_k_te * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     ylim = range(95, 100, accu_k_te * 100),
     xlab = "k, number of neighbors", ylab = "classification accuracy, %",
     main = "Accuracy of Test set")

abline(v = iter_k[which.max(accu_k_te)], col = "red", lwd = 2)
abline(h = max(accu_k_te) * 100, col = "black", lty = 2)

######################### Classification error of kNN versus k #######################

# Error vs. choice of k on the training set
plot(iter_k, error_k_tr * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     xlab = "k, number of neighbors", ylim = range(0, 5, error_k_tr * 100),
     ylab = "classification error, %", main = "classification error of Training set")

abline(v = iter_k[which.min(error_k_tr)], col = "red", lwd = 2)
abline(h = min(error_k_tr) * 100, col = "black", lty = 2)

# Error vs. choice of k on the test set
plot(iter_k, error_k_te * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     ylim = range(0, 5, error_k_te * 100),
     xlab = "k, number of neighbors", ylab = "classification error, %",
     main = "classification error of Test set")

abline(v = iter_k[which.min(error_k_te)], col = "red", lwd = 2)
abline(h = min(error_k_te) * 100, col = "black", lty = 2)

## Species names used to relabel the rows and columns of both tables
species_labels <- c("setosa", "versicolor", "virginica")

### Confusion matrix for the train set, k=1 ###
k_pred_tr <- knn(train = X_train, test = X_train, cl = Y_train, k = 1)
iris_tab_tr <- table(Y_train, k_pred_tr, dnn = c("Original", "Predicted"))
dimnames(iris_tab_tr) <- list(Original = species_labels,
                              Predicted = species_labels)
iris_tab_tr

### Confusion matrix for the test set, k=5 ###
k_pred_te <- knn(train = X_train, test = X_test, cl = Y_train, k = 5)
iris_tab_te <- table(Y_test, k_pred_te, dnn = c("Original", "Predicted"))
dimnames(iris_tab_te) <- list(Original = species_labels,
                              Predicted = species_labels)
iris_tab_te

# lvs <- c("setosa", "versicolor", "virginica") 
# 
# confu_mat <- confusion_matrix(targets = Y_test,
#                              predictions = k_pred_te)
# cm <- plot_confusion_matrix(confu_mat$`Confusion Matrix`[[1]],
#                           font_counts = font(size = 5,
#                                          angle = 0,
#                                          color = "cos2"), palette = "blue")
#                       
############# applying pca on train data ################

## PCA on the training predictors. The argument is spelled `scale.` in full
## instead of relying on partial matching of `scale`.
iris_pca_tr <- prcomp(X_train, scale. = TRUE)
sum_pca_tr <- summary(iris_pca_tr)
sum_pca_tr

## Scree plot of the explained variance
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1))
fviz_eig(iris_pca_tr, title = "Explained variance in train set", xlab = "Components")

## Standard deviations and loadings
iris_pca_tr

## Pairwise plot of the PCA scores of the train set, filled by species.
## The species codes come from Y_train: iris_train has no Species_orig column
## at this point of the script, so indexing the palette with it yields no colours.
species_code_tr <- as.integer(factor(Y_train))
pairs.panels(iris_pca_tr$x,
             main = "Panel plot on PCA scores on train data",
             pch = 21,
             bg = c("red", "green", "blue")[species_code_tr],
             hist.col = "red")

#### plotting of biplot and contribution from PCA on train set ####
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
fviz_pca_biplot(iris_pca_tr, repel = TRUE,
                col.var = "contrib", # Variables color
                gradient.cols = c("red", "green", "blue"),
                col.ind = "darkgreen", # Individuals color
                title = "Biplot and contributions from first two components in train set",
                xlab = "Component 1", ylab = "Component 2")

### eigen values from PCA
eig_val_tr <- get_eigenvalue(iris_pca_tr)
eig_val_tr

#### plotting of data ellipse on first 2 components of PCA scores of train set ####
## Species membership is derived from a labelled factor instead of comparing
## the scaled floating-point codes with `==`.
species_tr <- factor(Y_train, labels = c("setosa", "versicolor", "virginica"))
n_scores_tr <- nrow(iris_pca_tr$x)

col_choice <- rep("blue", n_scores_tr) # virginica unless overwritten below
id_sent <- which(species_tr == "setosa")
id_vers <- which(species_tr == "versicolor")
col_choice[id_sent] <- "red"
col_choice[id_vers] <- "green"

options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
dataEllipse(iris_pca_tr$x[, 1:2], levels = c(0.80, 0.95), pch = 0, cex = 0,
            col = col_choice, main = "Mapping from PCA on train",
            xlim = c(-4.5, 4.5), ylim = c(-3, 3))
## Observations are drawn as their row position within the score matrix
text(iris_pca_tr$x[, 1:2], col = col_choice, labels = seq_len(n_scores_tr))
legend("topleft", c("Setosa", "Versicolor", "Virginica"), pch = 0.5,
       col = c("red", "green", "blue"))


############# applying pca on test data ################

## Separate PCA of the test predictors (used for the descriptive plots below)
iris_pca_te <- prcomp(X_test, scale. = TRUE)
sum_pca_te <- summary(iris_pca_te)
sum_pca_te

## Species labels of the test rows. They are built from Y_test, which already
## exists here; Y_test_pca is only created further down in the script and holds
## the same values.
Species_orig_te <- factor(Y_test,
                          labels = c("setosa", "versicolor", "virginica"))
iris_test["Species_orig"] <- Species_orig_te

## pairwise plot of the PCA scores of the test set, filled by species ###
pairs.panels(iris_pca_te$x,
             main = "Panel plot on PCA scores on test data",
             pch = 21,
             bg = c("red", "green", "blue")[unclass(iris_test$Species_orig)],
             hist.col = "red")

## variance and eigenvalue plot on test set of PCA scores
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1))
fviz_eig(iris_pca_te, title = "Explained variance in test set", xlab = "Components")

#### plotting of biplot and contribution from PCA on test set ####
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
fviz_pca_biplot(iris_pca_te, repel = TRUE,
                col.var = "contrib", # Variables color
                gradient.cols = c("red", "green", "blue"),
                col.ind = "darkgreen", # Individuals color
                title = "Biplot and contributions from first two components in test set",
                xlab = "Component 1", ylab = "Component 2")

#### plotting of data ellipse on first 2 components of PCA scores of test set ####
## Both index vectors are recomputed from the test responses; reusing the
## versicolor positions of the training set would colour the wrong test rows.
species_te <- factor(Y_test, labels = c("setosa", "versicolor", "virginica"))
n_scores_te <- nrow(iris_pca_te$x)

col_choice <- rep("blue", n_scores_te) # virginica unless overwritten below
id_sent <- which(species_te == "setosa")
id_vers <- which(species_te == "versicolor")
col_choice[id_sent] <- "red"
col_choice[id_vers] <- "green"

options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
dataEllipse(iris_pca_te$x[, 1:2], levels = c(0.80, 0.95), pch = 0, cex = 0,
            col = col_choice, main = "Mapping from PCA on test",
            xlim = c(-4.5, 4.5), ylim = c(-3, 3))
## Observations are drawn as their row position within the score matrix
text(iris_pca_te$x[, 1:2], col = col_choice, labels = seq_len(n_scores_te))
legend("topleft", c("Setosa", "Versicolor", "Virginica"), pch = 0.5,
       col = c("red", "green", "blue"))



### extracting 1st two components from PCA on train set #########

pred_pca_tr <- iris_pca_tr$x[, 1:2]
resp_pca_tr <- Y_train
dat_pca_tr <- cbind(pred_pca_tr, resp_pca_tr)
data_pca_tr <- data.frame(dat_pca_tr)

### first two component scores of the test set #########
## The test rows are projected onto the PCA fitted on the training set
## (same centring, scaling and rotation). Scores from a PCA fitted separately
## on the test set live in a different coordinate system - axes can be rotated
## or sign-flipped - so kNN distances between them and the train scores would
## not be meaningful.
pred_pca_te <- predict(iris_pca_tr, newdata = X_test)[, 1:2]
resp_pca_te <- Y_test
dat_pca_te <- cbind(pred_pca_te, resp_pca_te)
data_pca_te <- data.frame(dat_pca_te)

### splitting the data frames into predictors (PC1, PC2) and response ####
X_train_pca <- data_pca_tr[, -3]
X_test_pca <- data_pca_te[, -3]
Y_train_pca <- data_pca_tr[, 3]
Y_test_pca <- data_pca_te[, 3]

Y_train_pca

## Readable species labels for the training rows, stored alongside the data
Species_orig <- factor(Y_train_pca,
  labels = c("setosa", "versicolor", "virginica"))
iris_train["Species_orig"] <- Species_orig

############ Knn prediction on the scores of first two principle componenents ########################

## Share of positions at which `actual` and `predicted` agree
accuracy <- function(actual, predicted) {
  agree <- actual == predicted
  mean(agree)
}

## Share of positions at which `actual` and `predicted` differ
error <- function(actual, predicted) {
  differ <- actual != predicted
  mean(differ)
}

iter_k <- c(1, 3, 5, 7, 9, 11, 13, 15) ## candidate k-values
accu_pca_te <- numeric(length(iter_k))
accu_pca_tr <- numeric(length(iter_k))

error_pca_te <- numeric(length(iter_k))
error_pca_tr <- numeric(length(iter_k))

## kNN on the first two PC scores, evaluated on the training set itself
for (i in seq_along(iter_k)) {
  pred_pca_tr <- knn(train = X_train_pca,
                     test = X_train_pca,
                     cl = Y_train_pca,
                     k = iter_k[i])
  accu_pca_tr[i] <- accuracy(pred_pca_tr, Y_train_pca) ## accuracy from knn on train set
  error_pca_tr[i] <- error(pred_pca_tr, Y_train_pca) ## error from knn on train set
}

## kNN on the first two PC scores, evaluated on the test set
for (i in seq_along(iter_k)) {
  pred_pca_te <- knn(train = X_train_pca,
                     test = X_test_pca,
                     cl = Y_train_pca,
                     k = iter_k[i])
  accu_pca_te[i] <- accuracy(pred_pca_te, Y_test_pca) ## accuracy from knn on test set
  error_pca_te[i] <- error(pred_pca_te, Y_test_pca) ## error from knn on test set
}

## pred_pca_tr and pred_pca_te still hold the predictions of the last loop
## pass, so the two values below belong to the last (largest) k in iter_k only.
error_pca_train <- mean(Y_train_pca != pred_pca_tr)
error_pca_test <- mean(Y_test_pca != pred_pca_te)

k_last <- iter_k[length(iter_k)]
print(paste("Accuracy of train set from kNN on PCA scores with k =", k_last, ":",
            (1 - error_pca_train) * 100, "%"))
print(paste("Accuracy of test set from kNN on PCA scores with k =", k_last, ":",
            (1 - error_pca_test) * 100, "%"))

c(error_pca_te, error_pca_tr)

####################### Classification accuracy and error of kNN on PCA components versus k #########################
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
par(mfrow = c(2, 2)) # 2 x 2 grid: accuracy (train, test), then error (train, test)

# Accuracy vs. choice of k on the training set.
# Each ylim keeps its original window but widens if a value falls outside it.
plot(iter_k, accu_pca_tr * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     xlab = "k, number of neighbors", ylim = range(90, 100, accu_pca_tr * 100),
     ylab = "classification accuracy, %", main = "Accuracy of Training set")

# which.max()/which.min() return a position in iter_k, so it is mapped back to
# the actual k before the vertical marker is drawn.
abline(v = iter_k[which.max(accu_pca_tr)], col = "red", lwd = 2)
abline(h = max(accu_pca_tr) * 100, col = "black", lty = 2)

# Accuracy vs. choice of k on the test set
plot(iter_k, accu_pca_te * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     ylim = range(85, 100, accu_pca_te * 100),
     xlab = "k, number of neighbors", ylab = "classification accuracy, %",
     main = "Accuracy of Test set")

abline(v = iter_k[which.max(accu_pca_te)], col = "red", lwd = 2)
abline(h = max(accu_pca_te) * 100, col = "black", lty = 2)

# Error vs. choice of k on the training set
plot(iter_k, error_pca_tr * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     xlab = "k, number of neighbors", ylim = range(0, 10, error_pca_tr * 100),
     ylab = "classification error, %", main = "classification error of Training set")

abline(v = iter_k[which.min(error_pca_tr)], col = "red", lwd = 2)
abline(h = min(error_pca_tr) * 100, col = "black", lty = 2)

# Error vs. choice of k on the test set
plot(iter_k, error_pca_te * 100, type = "b", col = "blue", cex = 1, pch = 20, lwd = 2,
     ylim = range(0, 15, error_pca_te * 100),
     xlab = "k, number of neighbors", ylab = "classification error, %",
     main = "classification error of Test set")

abline(v = iter_k[which.min(error_pca_te)], col = "red", lwd = 2)
abline(h = min(error_pca_te) * 100, col = "black", lty = 2)


## Species names used to relabel the rows and columns of both tables
pca_species_labels <- c("setosa", "versicolor", "virginica")

#### Confusion matrix for the train set, k=1 ####
k_pred_tr_pca <- knn(train = X_train_pca, test = X_train_pca,
                     cl = Y_train_pca, k = 1)
iris_tab_tr_pca <- table(Y_train_pca, k_pred_tr_pca,
                         dnn = c("Original", "Predicted"))
dimnames(iris_tab_tr_pca) <- list(Original = pca_species_labels,
                                  Predicted = pca_species_labels)
iris_tab_tr_pca

#### Confusion matrix for the test set, k=7 ####
k_pred_te_pca <- knn(train = X_train_pca, test = X_test_pca,
                     cl = Y_train_pca, k = 7)
iris_tab_te_pca <- table(Y_test_pca, k_pred_te_pca,
                         dnn = c("Original", "Predicted"))
dimnames(iris_tab_te_pca) <- list(Original = pca_species_labels,
                                  Predicted = pca_species_labels)
iris_tab_te_pca


#### plotting the original and predicted Species for train set ####
## Predicted classes (kNN with k=1 on the PC scores) and true classes of the
## training rows, both relabelled with the species names
Species_predicted <- factor(k_pred_tr_pca,
  labels = c("setosa", "versicolor", "virginica"))

Species_original <- factor(Y_train_pca,
  labels = c("setosa", "versicolor", "virginica"))

options(repr.plot.width = 6, repr.plot.height = 3, repr.plot.res = 200)
## No par(mfrow = ...) here: it only arranges base-graphics plots and has no
## effect on ggplot2 figures, which are drawn one after another.

ggplot(X_train_pca, aes(x = PC1, y = PC2, color = Species_original)) +
  geom_point(size = 2) +
  theme_bw()

ggplot(X_train_pca, aes(x = PC1, y = PC2, color = Species_predicted)) +
  geom_point(size = 2) +
  theme_bw()


#### plotting the original and predicted Species for test set ####
## Same pair of figures for the test rows (kNN with k=7), shown in the same
## order as for the train set: original first, predicted second
Species_predicted <- factor(k_pred_te_pca,
  labels = c("setosa", "versicolor", "virginica"))

Species_original <- factor(Y_test_pca,
  labels = c("setosa", "versicolor", "virginica"))

options(repr.plot.width = 6, repr.plot.height = 3, repr.plot.res = 200)

ggplot(X_test_pca, aes(x = PC1, y = PC2, color = Species_original)) +
  geom_point(size = 2) +
  theme_bw()

ggplot(X_test_pca, aes(x = PC1, y = PC2, color = Species_predicted)) +
  geom_point(size = 2) +
  theme_bw()


## end ##