# Part 2. Plotting, clusterization and extracting centers

This is the second part of the methodology described in "NAME OF ARTICLE" for clustering mixed-solvent simulations.

If the R kernel is not installed in your Jupyter Notebook, please run the following command in the Anaconda Prompt:

In [None]:
conda install -c r r-irkernel

Alternatively, you can run the following code in other software such as RStudio.
In either case, if any of the libraries listed below is not installed, please install it with:

In [None]:
# Replace "name_of_library" with the name of the package you need to install.
install.packages("name_of_library")

# 1. Libraries and loading data

## 1.1 Load libraries

In [None]:
library(ggplot2)
library(gridExtra)
library(grid)
library(wesanderson)
library(dplyr)
library(ggpointdensity)
library(ggsci)
library(cowplot)

## 1.2 Set variables and working directory

In [None]:
## INPUT YOUR VARIABLES HERE
# Collecting the run-specific settings in one place keeps the edits needed
# when switching between solvents and proteins to a minimum.
SOL <- "IMI"        # Solvent
FEAT <- "torsions"  # Feature
PROT <- "IL13Ra1"   # Protein

In [None]:
# Move to the directory that holds the input data, then print the resulting
# working directory so the change can be checked.
setwd("TYPE_YOUR_WORKING_DIRECTORY_HERE")
getwd()

## 1.3 Load data

In [None]:
# Read the .csv file obtained from part 1; its name is built from the FEAT
# variable ("tica_<FEAT>.csv").
tica_file <- paste0("tica_", FEAT, ".csv")
mydata <- read.csv(tica_file)

# Label the four columns as the TICA components used in the rest of the notebook.
names(mydata) <- c("TIC1", "TIC2", "TIC3", "TIC4")
mydata

# 2. Plots and clusters

## 2.1 Density plot of TIC1 and TIC2 

In [None]:
# Scatter plot of TIC1 against TIC2 in which each point is coloured by the
# local density of points (geom_pointdensity) on the viridis colour scale.
plot_TICA <- ggplot(mydata, aes(TIC1, TIC2)) +
  geom_pointdensity() +
  scale_color_viridis_c() +
  theme_bw() +
  theme(
    # Text sizes
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    # No grid lines and no legend
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )
plot_TICA

## 2.2 Clusterize via kmeans and plot

In [None]:
# Number of k-means clusters. Our default is 15, but you can optimize this
# value to your liking.
n_clusters <- 15

# kmeans() picks its starting centres at random, so without a fixed seed the
# cluster numbering changes on every run. The cluster IDs shown in this plot
# are used later to define the states, so make them reproducible.
set.seed(42)
clusters <- kmeans(mydata[, 1:2], centers = n_clusters)

# Cluster centres as a data frame, plus an ID column used to label each
# centre on the plot.
centers <- as.data.frame(clusters$centers)
centers$ID <- seq_len(nrow(centers))

# TIC1/TIC2 scatter plot coloured by cluster membership, with the cluster ID
# drawn as a label at each cluster centre.
plot_TICA_km <- ggplot(mydata, aes(TIC1, TIC2)) +
  geom_point(aes(colour = as.factor(clusters$cluster))) +
  scale_color_simpsons() +
  geom_label(data = centers, aes(label = ID), fontface = "bold") +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    panel.grid.minor = element_blank(),
    legend.position = "none",
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20)
  )
plot_TICA_km

### Attention!
We will use this plot to help with the state discretization. It is now up to the user to decide into how many states the ensemble should be divided and which clusters belong to each state. We will need this information later.

## 2.3 Extract the frames for every cluster + its centers

In [None]:
# Directory in which the output files of this notebook are written.
setwd("YOUR_OUTPUTS_DIRECTORY_HERE")

# Take the number of clusters from the k-means result instead of hard-coding
# it, so this cell stays correct when a different number of clusters is used.
n_clusters <- nrow(clusters$centers)

# One row per frame: cluster assignment, 0-based frame index and the
# corresponding file name "protein<Frame>.pdb", followed by TIC1 and TIC2.
cluster_frame_ID <- data.frame(clusters$cluster)
number <- cluster_frame_ID$Frame <- seq_len(nrow(cluster_frame_ID)) - 1
TICA_only <- mydata[, 1:2]
prot <- cluster_frame_ID$protein <- paste0("protein", cluster_frame_ID$Frame, ".pdb")
TICA_frame <- cbind(cluster_frame_ID, TICA_only)

# Same table sorted by cluster, written out as a single text file.
TICA_frame_arranged <- TICA_frame %>% arrange(clusters.cluster)
write.table(TICA_frame_arranged, file = "cluster_frame_ID.txt", quote = FALSE)

# Extract a txt for each cluster, with the list of corresponding frames.
# split() names its elements after the cluster IDs, so look them up by name.
out <- split(TICA_frame_arranged, f = TICA_frame_arranged$clusters.cluster)
for (a in seq_len(n_clusters)) {
  write(out[[as.character(a)]]$protein, file = paste0("cluster_", a, ".txt"))
}

# For every cluster, find the frame closest to that cluster's own centroid.
# The Euclidean distance is computed in the TIC1/TIC2 plane against centroid i
# (row i of clusters$centers) and stored in the column `dist`.
tic_cols <- names(TICA_only)
nearest_frames <- lapply(seq_len(n_clusters), function(i) {
  members <- TICA_frame[clusters$cluster == i, ]
  members$dist <- sqrt(
    (members[[tic_cols[1]]] - clusters$centers[i, 1])^2 +
      (members[[tic_cols[2]]] - clusters$centers[i, 2])^2
  )
  # Nearest point to the centre of cluster i
  members[which.min(members$dist), ]
})
center_added <- do.call(rbind, nearest_frames)
center_added

# Keep only the cluster ID and the frame index of each centre frame.
center_added <- center_added[1:2]
write.table(center_added, "cluster_centers.txt", row.names = FALSE)

# 3. Join the plots and save

In [None]:
# Place the density plot and the k-means plot side by side in a single row.
figure <- grid.arrange(plot_TICA, plot_TICA_km, nrow = 1)

# Wrap the combined figure with cowplot and give it a white, borderless
# plot background.
white_background <- element_rect(fill = "white", color = NA)
figure2 <- cowplot::ggdraw(figure) +
  theme(plot.background = white_background)

# Write the joined figure to an 800 x 400 PNG using the cairo device type.
png(filename = "TICA_joined.png", type = "cairo", width = 800, height = 400)
plot(figure2)
dev.off()

In [None]:
# Save each plot on its own as a 450 x 450 PNG using the cairo device type.

# k-means cluster plot
png(filename = "TICA_km.png", type = "cairo", width = 450, height = 450)
plot(plot_TICA_km)
dev.off()

# Point-density plot
png(filename = "TICA_dens.png", type = "cairo", width = 450, height = 450)
plot(plot_TICA)
dev.off()
