In [None]:
# Location of the WisconsinCancer.csv data set
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"

# Read the CSV into a data frame: wisc.df
wisc.df <- read.csv(file = url)

# Keep columns 3 through 32 (the features) as a matrix: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])

# Label every row of wisc.data with the matching value of the id column
rownames(wisc.data) <- wisc.df[["id"]]

# Encode the diagnosis numerically: 1 where it equals "M", 0 otherwise
diagnosis <- as.numeric(wisc.df[["diagnosis"]] == "M")

In [None]:
# Inspect the per-column means and standard deviations to judge whether
# the features need to be brought onto a common scale before PCA
colMeans(wisc.data)
apply(wisc.data, MARGIN = 2, FUN = sd)

# Execute PCA on features scaled to unit variance: wisc.pr
# The argument is written out as `scale.` instead of relying on partial
# matching of `scale`, and TRUE is used because T is an ordinary variable
# that can be reassigned.
wisc.pr <- prcomp(wisc.data, scale. = TRUE)

# Look at summary of results
summary(wisc.pr)

In [None]:
# Biplot of the fitted PCA object
biplot(wisc.pr)

# Observations projected onto PC1 and PC2; the colour index is
# diagnosis + 1, so the two diagnosis codes get different colours
plot(wisc.pr$x[, c("PC1", "PC2")],
     col = diagnosis + 1,
     xlab = "PC1",
     ylab = "PC2")

# Same view with PC3 on the vertical axis
plot(wisc.pr$x[, c("PC1", "PC3")],
     col = diagnosis + 1,
     xlab = "PC1",
     ylab = "PC3")

# Do additional data exploration of your choosing below (optional)


In [None]:
# Set up a 1 x 2 plotting grid, keeping the previous setting so that the
# layout can be restored once both panels are drawn
old.par <- par(mfrow = c(1, 2))

# Variance of each component is the square of its standard deviation
pr.var <- wisc.pr$sdev^2

# Proportion of variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)

# Left panel: variance explained per principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Right panel: running total of the variance explained
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Restore the previous layout so later plots are not forced into the grid
par(old.par)

In [None]:
# Centre and scale every column of wisc.data: data.scaled
data.scaled <- scale(wisc.data, center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between the scaled rows: data.dist
data.dist <- dist(data.scaled, method = "euclidean")

# Complete-linkage hierarchical clustering on those distances: wisc.hclust
wisc.hclust <- hclust(d = data.dist, method = "complete")

In [None]:
# Split the dendrogram into four groups: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(tree = wisc.hclust, k = 4)

# Cross-tabulate the diagnosis codes against cluster membership
table(diagnosis, wisc.hclust.clusters,
      dnn = c("diagnosis", "wisc.hclust.clusters"))

In [None]:
# Fix the RNG seed: kmeans() picks its starting centres at random, so
# without a seed the cluster labels (and the tables below) can differ
# from one run to the next
set.seed(1)

# Create a k-means model on the scaled wisc.data: wisc.km
# Two centres, keeping the best of 20 random starts
wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20)

# Compare k-means to actual diagnoses
table(diagnosis, wisc.km$cluster)

# Compare k-means to hierarchical clustering
table(wisc.hclust.clusters, wisc.km$cluster)

In [None]:
# Complete-linkage clustering on the distances between observations in
# the space of the first seven principal components: wisc.pr.hclust
wisc.pr.hclust <- hclust(
  d = dist(wisc.pr$x[, seq_len(7)]),
  method = "complete"
)

# Split that tree into four groups: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(tree = wisc.pr.hclust, k = 4)

# Cross-tabulate the diagnosis codes against the PCA-based clusters
table(diagnosis, wisc.pr.hclust.clusters,
      dnn = c("diagnosis", "wisc.pr.hclust.clusters"))

# For comparison, tabulate the earlier hierarchical and k-means
# assignments against the diagnosis codes as well
table(wisc.hclust.clusters, diagnosis)
table(wisc.km$cluster, diagnosis)