# week7_assi4_sol1.r

# -*- coding: utf-8 -*-
# """Week7 Assi4 Sol1.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1uOvEEnsIREmI-mF0k5zw2IDoKww05kZV
# """

# NOTE: the workspace is deliberately not wiped here. `rm(list = ls())`
# deletes every object the caller holds in the global environment when this
# script is sourced, yet leaves attached packages and options untouched, so it
# does not give a clean session either. Restart R for a fresh run instead.

## Install the required libraries in the R kernel (uncomment as needed)

# install.packages("corrplot")
# install.packages("forecast")
# install.packages("zoo")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("caret")
# install.packages("ROCR")
# install.packages("PerformanceAnalytics")
# install.packages("funModeling")
# install.packages("hrbrthemes")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("glmnet")
# install.packages("psych")
# install.packages("e1071")

## importing the libraries in R kernel

library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(repr)
library(reshape2)
library(forecast)
library(zoo)
library(rsample)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(leaps)
library(car)
library(PerformanceAnalytics)
library(funModeling)
library(caret)
library(MASS)
library(Hmisc)
library(hrbrthemes)
library(GGally)
library(glmnet)
library(pROC)
library(psych)
library(e1071)

# ---- Load the raw data ----
# The file is addressed through an explicit path instead of setwd(), so the
# caller's working directory is left untouched. Point `data_dir` at the folder
# that holds Diabetis.csv.
data_dir <- "C:/File E/EAS 506 Statistical Mining I/Week 7/Assignment-4"

# Text columns are read as factors; later steps treat `group` as a factor.
Diabetis <- read.csv(
  file.path(data_dir, "Diabetis.csv"),
  stringsAsFactors = TRUE
)

# Keep every column except the first as the analysis data set.
# NOTE(review): the first column is presumably a row identifier -- confirm.
data <- Diabetis[, -1]

# ---- First look at the data ----
# Each inspection runs once; the repeated head()/status() calls only
# duplicated output that had already been printed.

# First rows and column names
head(data)

names(data)

# Column types together with a preview of the values
glimpse(data)

# funModeling summaries: per-column status and numeric profiling
status(data)

profiling_num(data)

# Histograms of the columns.
# NOTE(review): hist() on a data frame presumably dispatches to the
# data-frame method supplied by Hmisc -- confirm.
hist(data, col = "red")

# Descriptive statistics. psych is attached after Hmisc, so the bare name
# `describe` resolved to psych's function; the call is qualified to make that
# choice explicit and independent of the attach order.
psych::describe(data)

summary(data)


# ---- Pairwise plots of the five measurement columns ----

# Scatter-plot matrix (psych::pairs.panels); the point fill colour is picked
# from the integer code of each row's `group` level.
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 250)
pairs.panels(
  data[, 1:5],
  main = "Pairs plot on Diabetis dataset, unclassed on group",
  pch = 21,
  bg = c("yellow", "green", "blue")[unclass(data$group)],
  hist.col = "red"
)

# GGally pairs plot of the same columns, coloured by `group`
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 150)
ggpairs(data, mapping = ggplot2::aes(colour = group), columns = 1:5) +
  theme_bw()

# ---- Covariance and correlation structure ----

# Variance-covariance matrix of the five measurement columns, all rows
var(data[, 1:5])

### Covariance matrices per class ###

# One subset of rows for each value of `group`
f_CD <- data %>% filter(group == "Chemical_Diabetic")
f_N <- data %>% filter(group == "Normal")
f_OD <- data %>% filter(group == "Overt_Diabetic")

# Chemical_Diabetic class
cov(f_CD[, 1:5])

# Normal class
cov(f_N[, 1:5])

# Overt_Diabetic class
cov(f_OD[, 1:5])

# Whole data set
cov(data[, 1:5])

# Pearson correlation matrix: store it, then print it
Cl <- cor(data[, 1:5])
Cl

# Symbolic rendering of the correlation matrix
symnum(Cl) # highly correlated

# Rank-based alternatives: Spearman's rho and Kendall's tau
clS <- cor(data[, 1:5], method = "spearman")
symnum(clS)
clK <- cor(data[, 1:5], method = "kendall")
symnum(clK)

# How much do the three coefficients differ? Correlate their lower-triangle
# entries with one another.
i <- lower.tri(Cl)
cor(cbind(P = Cl[i], S = clS[i], K = clK[i]))

# ---- Numeric encoding and overview plots ----

# Replace every factor column by its numeric level codes
data_1 <- mutate_if(data, is.factor, as.numeric)
status(data_1)

profiling_num(data_1)

# Correlation chart: coefficients combined with histograms and scatter plots
# of the columns
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
chart.Correlation(data_1, histogram = TRUE, pch = 15)

# Heat map of the centred and scaled columns (no further scaling in the plot)
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
data_h <- as.data.frame(scale(data_1, center = TRUE, scale = TRUE))
heatmap.2(
  as.matrix(data_h),
  scale = "none",
  col = bluered(100),
  trace = "none",
  density.info = "none"
)

# funModeling plot of the numeric columns
options(repr.plot.width = 6, repr.plot.height = 4, repr.plot.res = 200)
plot_num(data_1)

## Split the data set into training and test sets
set.seed(1205) # fix the RNG seed so the split is reproducible
data_split <- initial_split(data_1, prop = 0.75) # rsample split, prop = 0.75
data_train <- training(data_split)
data_test <- testing(data_split)

# True class codes of both sets. The column is selected by name rather than
# by position, so a change in column order cannot silently pick up a
# different variable.
y_true_train <- data_train[["group"]]
y_true_test <- data_test[["group"]]

dim(data_train)

####################################
## Linear Discriminant Analysis (LDA)
##
####################################

# Fit on the training set, with every other column as a predictor of `group`
lda_mod <- lda(group ~ ., data = data_train)

# Predictions for both sets; the `class` component holds the predicted labels
lda.pred.train <- predict(lda_mod, newdata = data_train)
lda.pred.test <- predict(lda_mod, newdata = data_test)
y_lda_train <- lda.pred.train$class
y_lda_test <- lda.pred.test$class

# Misclassification error: share of predictions that differ from the truth
lda_train_error <- mean(y_true_train != y_lda_train)
lda_test_error <- mean(y_true_test != y_lda_test)

## Misclassification error for LDA (train, then test)
lda_train_error
lda_test_error

# Fitted model
lda_mod

# Number of training observations assigned to each class
summary(y_lda_train)

####################################
#   Quadratic Discriminant Analysis (QDA)
#
####################################

# Fit on the training set, with every other column as a predictor of `group`
qda_mod <- qda(group ~ ., data = data_train)

# Predictions for both sets; the `class` component holds the predicted labels
qda.pred.train <- predict(qda_mod, newdata = data_train)
qda.pred.test <- predict(qda_mod, newdata = data_test)
y_qda_train <- qda.pred.train$class
y_qda_test <- qda.pred.test$class

# Misclassification error: share of predictions that differ from the truth
qda_train_error <- mean(y_true_train != y_qda_train)
qda_test_error <- mean(y_true_test != y_qda_test)

## Misclassification error for QDA (train, then test)
qda_train_error
qda_test_error

# Fitted model
qda_mod

# ---- Classify a single new observation ----
# The values are supplied directly as data-frame columns, so no stray global
# variables (relwt, glufast, glutest, instest, sspg, group) are left behind
# in the workspace.
# NOTE(review): `group` is the response of both models; predict() presumably
# ignores it in `newdata` -- confirm. It is kept so `df` prints as before.
df <- data.frame(
  relwt = 1.86,
  glufast = 184,
  glutest = 68,
  instest = 122,
  sspg = 544,
  group = 2
)

df

## Prediction from the LDA model
predict(lda_mod, newdata = df)

## Prediction from the QDA model
predict(qda_mod, newdata = df)


## Multi-class AUC for LDA, computed from the posterior probabilities

roc_lda_tr <- multiclass.roc(data_train$group, lda.pred.train$posterior)
roc_lda_te <- multiclass.roc(data_test$group, lda.pred.test$posterior)

print("LDA train set AUC")
pROC::auc(roc_lda_tr)
print("LDA test set AUC")
pROC::auc(roc_lda_te)

## Multi-class AUC for QDA, computed from the posterior probabilities

roc_qda_tr <- multiclass.roc(data_train$group, qda.pred.train$posterior)
roc_qda_te <- multiclass.roc(data_test$group, qda.pred.test$posterior)

print("QDA train set AUC")
pROC::auc(roc_qda_tr)
print("QDA test set AUC")
pROC::auc(roc_qda_te)

# ---- Confusion matrices for LDA and QDA ----

# Class names attached to the numeric codes 1, 2, 3, in that order.
# NOTE(review): this assumes the codes follow the alphabetical factor levels
# of `group` from when it was converted to numbers -- confirm.
class_labels <- c("Chemical_Diabetic", "Normal", "Overt_Diabetic")

# Cross-tabulate predicted against true class codes as a labelled table.
#
# predicted  Predicted class codes (factor or numeric).
# reference  True class codes (factor or numeric).
# labels     Class names; position k names code k.
#
# Both inputs are turned into factors carrying the full set of levels, so a
# class that is absent from a subset still gets its (all-zero) row and column
# instead of making the labelling step fail with a length mismatch.
labelled_confusion_table <- function(predicted, reference,
                                     labels = class_labels) {
  codes <- seq_along(labels)
  table(
    Predicted = factor(predicted, levels = codes, labels = labels),
    Reference = factor(reference, levels = codes, labels = labels)
  )
}

# Print the caret summary for a confusion table, then the table of cell
# proportions (each count divided by the grand total, rounded to 2 decimals).
# Returns the proportion table invisibly.
report_confusion <- function(tab) {
  cm <- caret::confusionMatrix(tab, positive = "Overt_Diabetic")
  print(cm)
  proportions <- round(prop.table(cm$table), 2)
  print(proportions)
  invisible(proportions)
}

## LDA on the training set
tab_tr_lda <- labelled_confusion_table(y_lda_train, data_train$group)
report_confusion(tab_tr_lda)

## LDA on the test set (the proportion table is kept)
tab_te_lda <- labelled_confusion_table(y_lda_test, data_test$group)
conf_mat_lda <- report_confusion(tab_te_lda)

## QDA on the training set
tab_tr_qda <- labelled_confusion_table(y_qda_train, data_train$group)
report_confusion(tab_tr_qda)

## QDA on the test set
tab_te_qda <- labelled_confusion_table(y_qda_test, data_test$group)
report_confusion(tab_te_qda)

# ---- Histograms of the first discriminant score, grouped by predicted class

# Training set
options(repr.plot.width = 3, repr.plot.height = 6, repr.plot.res = 250)
par(mfrow = c(1, 1))

ldahist(
  lda.pred.train$x[, 1],
  g = y_lda_train,
  col = "red",
  main = "Group vs. coeff. plot from LDA on train set"
)

# Test set
options(repr.plot.width = 3, repr.plot.height = 6, repr.plot.res = 250)
par(mfrow = c(1, 1))
ldahist(
  lda.pred.test$x[, 1],
  g = y_lda_test,
  col = "green"
)

# ---- Scatter plots of the observations in the LD1/LD2 plane ----

# Discriminant scores and true class labels for the test set.
# `levels` is given explicitly: without it factor() derives the levels from
# the values that happen to be present, and attaching three labels fails
# whenever a subset contains fewer than three classes.
LD_func_test <- as.data.frame(lda.pred.test$x)
group_orig_test <- factor(
  data_test$group,
  levels = 1:3,
  labels = c("Chemical_Diabetic", "Normal", "Overt_Diabetic")
)

# Same for the training set
LD_func_train <- as.data.frame(lda.pred.train$x)
group_orig_train <- factor(
  data_train$group,
  levels = 1:3,
  labels = c("Chemical_Diabetic", "Normal", "Overt_Diabetic")
)

# Test set, coloured by true class
options(repr.plot.width = 7, repr.plot.height = 5, repr.plot.res = 250)
ggplot(LD_func_test, aes(x = LD1, y = LD2, color = group_orig_test)) +
  geom_point(size = 2) +
  theme_bw()

# Training set, coloured by true class
options(repr.plot.width = 7, repr.plot.height = 5, repr.plot.res = 250)
ggplot(LD_func_train, aes(x = LD1, y = LD2, color = group_orig_train)) +
  geom_point(size = 2) +
  theme_bw()


### end ###