-
Notifications
You must be signed in to change notification settings - Fork 23
/
Wine_PCA.r
82 lines (56 loc) · 3.11 KB
/
Wine_PCA.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Perform Principal Component Analysis (PCA) on the wine dataset and then
# cluster the observations using the first 3 principal component scores,
# with both hierarchical and k-means clustering (choosing k via a scree
# plot / elbow curve). Finally, check whether we obtain the same number of
# clusters as the original data: the class column (ignored during PCA)
# shows the data has 3 classes.
# Import the data (interactive file picker; select the wine CSV)
wine <- read.csv(file.choose())
summary(wine)
# Perform PCA on the wine dataset. The first column (the wine class) is
# excluded so the clustering can later be compared against it.
wine1 <- wine[, 2:14]
# cor = TRUE uses the correlation matrix, i.e. standardises the variables —
# important here because the wine features are on very different scales.
wine_pca <- princomp(wine1, cor = TRUE, scores = TRUE, covmat = NULL)
summary(wine_pca)
wine_pca$loadings
wine_pca$scores
scores <- wine_pca$scores
# Plot PC1 vs PC2; points are drawn white so the observation numbers added
# by text() below stay readable.
plot(wine_pca$scores[, 1:2], col = "white", pch = 19, cex = 1)
# Label each observation with its row number (derived from the data rather
# than hard-coding 178, so the script works for any number of rows)
text(wine_pca$scores[, 1:2], labels = seq_len(nrow(wine1)), cex = 1)
# A few more plots with different pairs of score columns
# (the original line here was missing the leading '#', which made the whole
# script fail to parse)
plot(wine_pca$scores[, 2:3], col = "blue", cex = 1)
plot(wine_pca$scores[, 5:6], col = "purple", cex = 1)
# Scatter-plot matrix of all principal component scores
pairs(wine_pca$scores)
# Now that we have the PCA scores, perform clustering and check whether the
# predicted clusters match the actual class column.
# Hierarchical clustering is done on the first 3 principal components.
# Guard the install so packages are not reinstalled on every run.
if (!requireNamespace("FactoMineR", quietly = TRUE) ||
    !requireNamespace("factoextra", quietly = TRUE)) {
  install.packages(c("FactoMineR", "factoextra"))
}
library(factoextra)
library(FactoMineR)
# ncp = 3 keeps only the first 3 principal components
wine_3_pca <- PCA(wine1, ncp = 3, graph = FALSE)
# Hierarchical clustering on the PCA result.
# nb.clust: an integer specifying the number of clusters. Possible values:
#    0 : the tree is cut at the level the user clicks on
#   -1 : the tree is automatically cut at the suggested level
#   k>0: the tree is cut into k clusters
wine_3_pca_heirarchical_clust <- HCPC(wine_3_pca, nb.clust = 3, graph = TRUE)
# Visualise the clusters; repel = TRUE keeps the labels from overlapping.
# NOTE: the argument is spelled "palette" — the original "pallete" was
# silently ignored by fviz_cluster(), so the jco palette never applied.
fviz_cluster(wine_3_pca_heirarchical_clust, repel = TRUE,
             show.clust.cent = TRUE, cex = 0.8, palette = "jco",
             rect = TRUE, rect_fill = TRUE, rect_border = "jco",
             labels_track_size = 0.8)
# Check whether the clusters found match the original class column
wine_heirarchical_final <- data.frame(wine[, 1],
                                      wine_3_pca_heirarchical_clust$data.clust$clust)
# Confusion matrix: actual class vs hierarchical cluster
conf <- table(wine_heirarchical_final)
conf
# K-means clustering on the first 3 principal components.
# First choose k using an elbow plot of the within-cluster sum of squares.
scores <- as.data.frame(scores)
# kmeans() uses random starting centers; fix the seed for reproducibility
set.seed(42)
k_range <- 1:15
# Preallocate instead of growing a vector; the original loop started at 2,
# leaving wss[1] as NA and a missing first point on the elbow plot.
wss <- numeric(length(k_range))
for (i in k_range) {
  wss[i] <- sum(kmeans(scores[, 1:3], centers = i)$withinss)
}
plot(k_range, wss, type = "b",
     xlab = "No of Clusters", ylab = "Total within-cluster SS")
# The elbow plot suggests k = 3
wine_kmeans <- kmeans(scores[, 1:3], 3)
kmeans_cluster <- wine_kmeans$cluster
kmeans_cluster
wine_kmeans_final_data <- data.frame(wine[, 1], kmeans_cluster)
# Confusion matrix: actual class vs k-means cluster
kmean_table <- table(wine_kmeans_final_data)
kmean_table
# The clusters that most closely matched the original classes were the
# hierarchical ones.