In [None]:
library(tidyverse)
library(caret)
library(glmnet)
library(dplyr)
#library(psych) 
library(lme4)
library(e1071)
library(corrplot)
library(xtable)
# Silence all warnings for the rest of the session (warn = -1), including any
# raised by the model fits and table calls further down.
options(warn=-1)

In [None]:
# Values spanned by the experiment grid.
seed_list <- c(1001, 1111, 1221, 1331, 1441, 1551, 1661, 1771, 1881, 1991)
cut_off <- c(10, 20, 50, 100)
time_off <- c(10, 0, 20, 50, 100)
context <- c('CompoundAware', 'CompoundAgnostic')

# 100 log-spaced penalty strengths from 1e-3 to 1e3.
lambda <- 10^seq(-3, 3, length.out = 100)

# Pre-allocated results matrix: one row per grid combination, 8 columns.
df_total <- matrix(
  nrow = length(context) * length(seed_list) * length(cut_off) * length(time_off),
  ncol = 8
)


In [None]:
# Grid search: for every (context, seed, time_off, cut_off) combination, read
# the matching feature table, fit ridge / elastic-net / lasso models with
# 10-fold cross-validation, and store one result row in df_total.

# Formula shared by the three models: compound_mean against every remaining
# column except the ones subtracted here.
model_formula <- compound_mean ~ . - modifier - head - modifier_mean -
  modifier_std - head_mean - head_std - compound_std

k <- 1  # next free row of df_total
for (ctx in context) {
  for (s in seed_list) {
    print(s)  # progress indicator
    for (i in time_off) {
      for (j in cut_off) {
        # Seeds for caret: one integer vector per CV fold (10 folds, 54
        # integers each) plus a single integer for the final model fit.
        set.seed(s)
        seeds <- vector(mode = "list", length = 11)
        for (z in seq_len(10)) seeds[[z]] <- sample.int(n = 1000, 54)
        seeds[[11]] <- sample.int(1000, 1)

        # The file is read as tab-separated text despite its .pkl extension.
        path <- paste0("./coha_compounds/features_", ctx, "_", i, "_", j, "_300.pkl")
        df <- read.csv(path, sep = '\t')
        # Keep only columns without any missing value.
        df <- df[, colSums(is.na(df)) == 0]

        # Identical resampling setup for all three models.
        cv_control <- trainControl(
          method = "cv", number = 10, search = "grid", seeds = seeds
        )

        # alpha = 0: ridge penalty, tuned over the lambda grid.
        ridge <- train(
          model_formula,
          data = df, method = "glmnet", na.action = na.pass,
          trControl = cv_control,
          tuneGrid = expand.grid(alpha = 0, lambda = lambda)
        )

        # Elastic net: caret picks 10 candidate values per tuning parameter.
        elastic <- train(
          model_formula,
          data = df, method = "glmnet", na.action = na.pass,
          trControl = cv_control,
          tuneLength = 10
        )

        # alpha = 1: lasso penalty, tuned over the lambda grid.
        lasso <- train(
          model_formula,
          data = df, method = "glmnet", na.action = na.pass,
          trControl = cv_control,
          tuneGrid = expand.grid(alpha = 1, lambda = lambda)
        )

        # The scores are written in the order ridge, elastic, lasso so that
        # they line up with the column labels assigned to this matrix later in
        # the file. Each score is the second column of getTrainPerf()
        # (presumably TrainRsquared for these regression fits; confirm against
        # the installed caret version).
        df_total[k, ] <- c(
          ctx, s, i, j,
          as.numeric(getTrainPerf(ridge)[2]),
          as.numeric(getTrainPerf(elastic)[2]),
          as.numeric(getTrainPerf(lasso)[2]),
          nrow(df)
        )
        k <- k + 1
      }
    }
  }
}

In [None]:
# Work on a copy so the raw results matrix stays available unchanged.
new_df_total <- df_total
# Display the raw matrix as notebook output.
df_total

In [None]:
# Label the result columns and turn the matrix into a data frame.
# NOTE(review): these labels must be in the same order in which the grid-search
# loop writes the values of each row of df_total; confirm before trusting the
# per-model columns.
colnames(new_df_total) <- c('context', 'seed', 'time', 'cutoff', 'ridge', 'elastic', 'lasso', 'size')
new_df_total <- as.data.frame(new_df_total)

# Convert the score and size columns to numeric. Going through as.character()
# first makes factor columns convert by label rather than by level code.
numeric_cols <- c("ridge", "elastic", "lasso", "size")
new_df_total[numeric_cols] <- lapply(
  new_df_total[numeric_cols],
  function(values) as.numeric(as.character(values))
)

# Rows with time == 0: mean and standard deviation of the `ridge` column and
# mean size per (context, time, cutoff), highest mean first within each group.
temp0 <- new_df_total %>%
  filter(time == 0) %>%
  group_by(context, time, cutoff) %>%
  summarise(
    mean_rsquared = round(mean(ridge, na.rm = TRUE), 3),
    sd_rsquared = round(sd(ridge, na.rm = TRUE), 3),
    size = mean(size)
  ) %>%
  arrange(desc(mean_rsquared), .by_group = TRUE)

temp0

In [None]:
# Rows with time == 10: mean and standard deviation of the `ridge` column and
# mean size per (context, time, cutoff).
temp10 <- new_df_total %>%
  filter(time == 10) %>%
  group_by(context, time, cutoff) %>%
  summarise(
    mean_rsquared = round(mean(ridge, na.rm = TRUE), 3),
    sd_rsquared = round(sd(ridge, na.rm = TRUE), 3),
    size = mean(size)
  ) %>%
  # time is constant after the filter, so this effectively orders by group only.
  arrange(desc(time), .by_group = TRUE)

temp10

In [None]:
# Rows with time == 20: mean and standard deviation of the `ridge` column and
# mean size per (context, time, cutoff).
temp20 <- new_df_total %>%
  filter(time == 20) %>%
  group_by(context, time, cutoff) %>%
  summarise(
    mean_rsquared = round(mean(ridge, na.rm = TRUE), 3),
    sd_rsquared = round(sd(ridge, na.rm = TRUE), 3),
    size = mean(size)
  ) %>%
  # time is constant after the filter, so this effectively orders by group only.
  arrange(desc(time), .by_group = TRUE)

temp20

In [None]:
# Rows with time == 50: mean and standard deviation of the `ridge` column and
# mean size per (context, time, cutoff).
temp50 <- new_df_total %>%
  filter(time == 50) %>%
  group_by(context, time, cutoff) %>%
  summarise(
    mean_rsquared = round(mean(ridge, na.rm = TRUE), 3),
    sd_rsquared = round(sd(ridge, na.rm = TRUE), 3),
    size = mean(size)
  ) %>%
  # time is constant after the filter, so this effectively orders by group only.
  arrange(desc(time), .by_group = TRUE)

temp50

In [None]:
# Rows with time == 100: mean and standard deviation of the `ridge` column and
# mean size per (context, time, cutoff).
temp100 <- new_df_total %>%
  filter(time == 100) %>%
  group_by(context, time, cutoff) %>%
  summarise(
    mean_rsquared = round(mean(ridge, na.rm = TRUE), 3),
    sd_rsquared = round(sd(ridge, na.rm = TRUE), 3),
    size = mean(size)
  ) %>%
  # time is constant after the filter, so this effectively orders by group only.
  arrange(desc(time), .by_group = TRUE)

temp100

In [None]:
# Stack the five per-time summaries into a single table.
temp_all <- rbind(temp0, temp10, temp20, temp50, temp100)

# Make time and cutoff numeric so the sort below is numeric, not textual.
temp_all$time <- as.numeric(as.character(temp_all$time))
temp_all$cutoff <- as.numeric(as.character(temp_all$cutoff))

# Sort rows by context, then time, then cutoff.
row_order <- order(temp_all$context, temp_all$time, temp_all$cutoff)
temp_all <- temp_all[row_order, ]
temp_all

In [None]:
# Render the combined summary as a LaTeX table.
# align, digits and display each carry one entry for the row names followed by
# one entry per column of temp_all. The display spec belongs to xtable(): when
# handed to print() it is swallowed by `...` and has no effect, and xtable()
# expects it as a vector with one element per entry rather than one string.
# NOTE(review): the "S" column type is presumably siunitx's; confirm the LaTeX
# preamble loads it.
regression_table <- xtable(
  temp_all,
  caption = "Regression",
  align = "llSSSSS",
  digits = c(1, 1, 0, 0, 2, 2, 0),
  display = c("d", "s", "d", "d", "f", "f", "d"),
  label = "tab:regression-results"
)
print(regression_table)

In [None]:
# Compare the cross-validated RMSE of the three fitted models currently in the
# workspace (when cells run top to bottom, those from the last grid iteration).
models <- list(ridge = ridge, lasso = lasso, elastic = elastic)
model_resamples <- resamples(models)
summary(model_resamples, metric = "RMSE")

In [None]:
            # Refit the ridge model on the CompoundAware features (time 0, cutoff 20).
            # `seeds` and `lambda` are not defined in this cell; they are taken from
            # the workspace as left by earlier cells.
            path <- paste0(
              "./coha_compounds/features_CompoundAware",
              "_", 0, "_", 20, "_300.pkl"
            )
            print(path)
            df <- read.csv(path, sep = '\t')
            # Keep only columns without any missing value.
            complete_cols <- colSums(is.na(df)) == 0
            df <- df[, complete_cols]
            # 10-fold CV over the lambda grid with alpha fixed at 0.
            ridge_control <- trainControl("cv", number = 10, search = "grid", seeds = seeds)
            ridge_grid <- expand.grid(alpha = 0, lambda = lambda)
            ridge <- train(
              compound_mean ~ . - modifier - head - modifier_mean - modifier_std -
                head_mean - head_std - compound_std,
              data = df,
              method = "glmnet",
              na.action = na.pass,
              trControl = ridge_control,
              tuneGrid = ridge_grid
            )

In [None]:
# Variable importance of the current `ridge` model; when cells run top to
# bottom this is the CompoundAware fit (time 0, cutoff 20).
varImp(ridge)

In [None]:
            # Refit the ridge model on the CompoundAgnostic features (time 0, cutoff 20).
            # `seeds` and `lambda` are not defined in this cell; they are taken from
            # the workspace as left by earlier cells.
            path <- paste0(
              "./coha_compounds/features_CompoundAgnostic",
              "_", 0, "_", 20, "_300.pkl"
            )
            print(path)
            df <- read.csv(path, sep = '\t')
            # Keep only columns without any missing value.
            complete_cols <- colSums(is.na(df)) == 0
            df <- df[, complete_cols]
            # 10-fold CV over the lambda grid with alpha fixed at 0.
            ridge_control <- trainControl("cv", number = 10, search = "grid", seeds = seeds)
            ridge_grid <- expand.grid(alpha = 0, lambda = lambda)
            ridge <- train(
              compound_mean ~ . - modifier - head - modifier_mean - modifier_std -
                head_mean - head_std - compound_std,
              data = df,
              method = "glmnet",
              na.action = na.pass,
              trControl = ridge_control,
              tuneGrid = ridge_grid
            )

In [None]:
# Variable importance of the current `ridge` model; when cells run top to
# bottom this is the CompoundAgnostic fit (time 0, cutoff 20).
varImp(ridge)

In [None]:
# Single-predictor linear model: compound_mean explained by ppmi, fitted on
# whichever feature table is currently held in `df`.
ppmi_fit <- lm(compound_mean ~ ppmi, data = df)
summary(ppmi_fit)

In [None]:
# Single-predictor linear model: compound_mean explained by mod_prod, fitted on
# whichever feature table is currently held in `df`.
mod_prod_fit <- lm(compound_mean ~ mod_prod, data = df)
summary(mod_prod_fit)

In [None]:
# Single-predictor linear model: compound_mean explained by sim_with_head,
# fitted on whichever feature table is currently held in `df`.
sim_with_head_fit <- lm(compound_mean ~ sim_with_head, data = df)
summary(sim_with_head_fit)

In [None]:
# Unpenalised linear model on the same predictor set as the glmnet fits:
# every column of `df` except the ones subtracted in the formula.
all_features_fit <- lm(
  compound_mean ~ . - modifier - head - modifier_mean - modifier_std -
    head_mean - head_std - compound_std,
  data = df
)
summary(all_features_fit)

In [None]:
# Pairwise correlations between the predictor columns of the CompoundAgnostic
# table (time 0, cutoff 10).
path <- paste0(
  "./coha_compounds/features_CompoundAgnostic",
  "_", 0, "_", 10, "_300.pkl"
)
df <- read.csv(path, sep = '\t')
# Keep only columns without any missing value.
complete_cols <- colSums(is.na(df)) == 0
df <- df[, complete_cols]

# Drop the target and the columns that are also excluded from the models.
excluded_cols <- c(
  "compound_mean", "modifier", "head", "modifier_mean",
  "modifier_std", "head_mean", "head_std", "compound_std"
)
corr <- df[, !(names(df) %in% excluded_cols)]
corr <- cor(corr, corr)
corr

In [None]:
# Colour-coded plot of the upper triangle of `corr`, diagonal omitted, with the
# coefficients printed in black.
corrplot(
  corr,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  cl.ratio = 0.3,
  cl.align = "r"
)

In [None]:
# Pairwise correlations between the predictor columns of the CompoundAware
# table (time 0, cutoff 10).
path <- paste0(
  "./coha_compounds/features_CompoundAware",
  "_", 0, "_", 10, "_300.pkl"
)
df <- read.csv(path, sep = '\t')
# Keep only columns without any missing value.
complete_cols <- colSums(is.na(df)) == 0
df <- df[, complete_cols]

# Drop the target and the columns that are also excluded from the models.
excluded_cols <- c(
  "compound_mean", "modifier", "head", "modifier_mean",
  "modifier_std", "head_mean", "head_std", "compound_std"
)
corr <- df[, !(names(df) %in% excluded_cols)]
corr <- cor(corr, corr)
corr

In [None]:
# Colour-coded plot of the upper triangle of `corr`, diagonal omitted, with the
# coefficients printed in black.
corrplot(
  corr,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  cl.ratio = 0.3,
  cl.align = "r"
)