# Clustering Analysis

The following code performs a cluster analysis. This unsupervised modelling technique is used to detect whether distinct clusters exist in the data, based on the various input fields.

In [None]:
# Load packages
library(bigrquery)
library(cluster)
library(dplyr)
library(ggplot2)
library(readr)
library(Rtsne)

In [None]:
# Load project information to pull data from the database.
# `projectid` is the Google Cloud project the query job is run under.
projectid <- "astute-veld-253418"

# Query selecting every column of the Masters table
sql <- "SELECT * FROM `astute-veld-253418.Masters.Masters`"

# Run the query and download the complete result into a data frame.
# query_exec() is part of bigrquery's old low-level API, deprecated since
# bigrquery 1.0.0; bq_project_query() followed by bq_table_download() is its
# replacement. use_legacy_sql = FALSE is kept explicit, and
# bq_table_download() fetches every row by default (the old max_pages = Inf).
# as.data.frame() keeps the result a plain data frame for the later cells.
query_result <- bq_project_query(projectid, sql, use_legacy_sql = FALSE)
df_insurance <- as.data.frame(bq_table_download(query_result))

# Preview the first rows of the query result
head(df_insurance)

In [None]:
# We first attempt this with a sample of at most 10k rows.
# head() caps the sample at the number of rows actually available, whereas
# indexing 1:10000 on a shorter data frame would pad it with all-NA rows.
# The first column is dropped; drop = FALSE keeps the result a data frame
# even when only a single column remains.
sample_size <- 10000
df_insurance2 <- head(df_insurance, sample_size)[, -1, drop = FALSE]
head(df_insurance2)
dim(df_insurance2)

In [None]:
# The categorical features must be factors before the Gower distance can be
# calculated, so convert each of them in one pass
factor_cols <- c("Gender", "Occupation_Grouping", "Habit", "TypeName")
df_insurance2[factor_cols] <- lapply(df_insurance2[factor_cols], as.factor)

In [None]:
# Build the pairwise Gower dissimilarities between the sampled rows and keep
# a plain matrix copy of them as well
gower_dist <- cluster::daisy(x = df_insurance2, metric = "gower")
gower_mat <- as.matrix(gower_dist)

In [None]:
# Calculate the average silhouette width for each candidate number of
# clusters to find the ideal number of clusters.
k_max <- 8
# Preallocate one slot per candidate k; slot 1 stays NA because the loop
# only fits PAM for k >= 2.
sil_width <- rep(NA_real_, k_max)
for (n_clusters in 2:k_max) {
  # A dedicated name is used for the candidate fit so this search does not
  # overwrite `pam_fit`, which holds the final model used by later cells.
  candidate_fit <- pam(gower_dist, diss = TRUE, k = n_clusters)
  sil_width[n_clusters] <- candidate_fit$silinfo$avg.width
}
# Plot silhouette width against the number of clusters, with connecting lines
plot(seq_len(k_max), sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(seq_len(k_max), sil_width)


In [None]:
# Assign the number of clusters, run the PAM clustering algorithm on the
# Gower dissimilarities, and print a summary for each cluster.
k <- 3
pam_fit <- pam(gower_dist, diss = TRUE, k = k)
# Summarise df_insurance2, the data the dissimilarities were computed from.
# A bare `df` is not a data frame here: it resolves to the stats::df()
# function, which makes mutate() fail.
pam_results <- df_insurance2 %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
# One summary table per cluster
pam_results$the_summary

In [None]:
# Embed the Gower dissimilarities in a lower-dimensional space with t-SNE and
# plot the embedded points coloured by their PAM cluster
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- setNames(data.frame(tsne_obj$Y), c("X", "Y"))
tsne_data <- mutate(tsne_data, cluster = factor(pam_fit$clustering))
ggplot(tsne_data, aes(x = X, y = Y)) +
  geom_point(aes(color = cluster))