In [2]:
library(tidyverse)
library(caret)
library(glmnet)
library(dplyr)
#library(psych) 
library(lme4)
library(e1071)
library(GGally)
# Silence all warnings for the rest of the session (a negative `warn` value
# makes R ignore warnings). NOTE(review): this also hides caret/glmnet warnings
# raised during model fitting further down.
options(warn=-1)
# Heavily penalise scientific notation so numbers are printed and converted to
# text in fixed notation.
options(scipen=999)
library(crosstable)

In [12]:
# Random seeds, one full set of model fits per seed: 1001, 1111, ..., 1991.
seed_list <- seq(from = 1001, to = 1991, by = 110)

# Values swept by the main loop; they are used to build the feature-file names
# and are stored in the result tables as `cutoff` and `timespan`.
cut_off <- c(0, 10, 50, 100, 500, 1000)
time_off <- c(10, 20, 50, 100, 0)

# Elastic-net tuning grid: 100 log-spaced penalties from 1e-3 to 1e3 and
# 10 evenly spaced mixing values from 0 to 1.
lambda <- 10^seq(from = -3, to = 3, length.out = 100)
alpha <- seq(from = 0, to = 1, length.out = 10)

In [13]:
library(doParallel)

# Start a 20-worker PSOCK cluster and register it as the foreach backend so
# that later foreach-based code can run in parallel.
# NOTE(review): no matching stopCluster(cl) is visible in this notebook.
cl <- makePSOCKcluster(names = 20)
registerDoParallel(cl = cl)

In [14]:
# Result accumulators filled by the main loop; `m` is the next free slot in
# both lists.
m <- 1
list_of_vi <- list()
list_of_rsqr <- list()

# Ratings table; the first column (V1) holds compound names whose words carry
# a "-n" marker.
orig_comp_ratings_df <- read.table(
  "../data/MeanAndDeviations.clean.txt",
  sep = "\t"
)

# Three spellings of the rated compounds: marker stripped, marker replaced by
# "_NOUN", and marker replaced by "_PROPN".
orig_compound_list <- gsub("-n", "", orig_comp_ratings_df$V1, fixed = TRUE)
noun_compound_list <- gsub("-n", "_NOUN", orig_comp_ratings_df$V1, fixed = TRUE)
pnoun_compound_list <- gsub("-n", "_PROPN", orig_comp_ratings_df$V1, fixed = TRUE)

# Dimensions swept by the main loop (besides the time span and cut-off values).
compound_list <- c("reddy", "all")
tagged_list <- c("Tagged")
ppmi_setting_list <- c("PPMI")
comp_setting_list <- c("Agnostic")

# Summary function in the (data, lev, model) form accepted by
# trainControl(summaryFunction = ).
#
# data:  data frame with columns `pred` and `obs`.
# lev, model: accepted for interface compatibility; not used.
#
# Returns a length-one numeric vector named "Spearman": the Spearman rank
# correlation between `pred` and `obs`, computed on complete pairs only.
caret_spearman <- function(data, lev = NULL, model = NULL) {
  predicted <- data$pred
  observed <- data$obs
  rho <- cor(predicted, observed, use = "complete.obs", method = "spearman")
  stats::setNames(rho, "Spearman")
}


In [15]:
# Elastic-net evaluation sweep.
#
# For every (tagging, PPMI setting, setting, time span, cut-off) combination
# two feature tables are loaded: the compound features ("norm") and the
# compound features merged with the setting features ("plus"). For each
# dataset ("reddy" = rated compounds only, "all" = every noun-noun compound in
# the feature table) and each seed, four glmnet models are tuned with 10-fold
# CV: norm/plus x selection by Rsquared/Spearman. One performance row and one
# variable-importance row per (model, seed) are appended to `list_of_rsqr` and
# `list_of_vi`.
#
# Changes relative to the previous version (results are unchanged):
#   * the (time span 10, cut-off 1000) pair is skipped with `next` at the
#     cut-off level instead of `break` inside the dataset loop;
#   * scalar `&&` replaces elementwise `&` in the `if` condition;
#   * the feature files are read and merged once per (time span, cut-off)
#     instead of once per dataset;
#   * the unused `propns` vector is gone;
#   * `preProcess` is spelled out instead of relying on partial matching;
#   * the repeated train()/data.frame() calls live in small helpers.

# Tuning grid shared by every model; it does not depend on any loop variable.
tune_grid <- expand.grid(alpha = alpha, lambda = lambda)

# Columns shared by the two feature files; used as the merge key.
rating_keys <- c(
  "modifier", "head", "modifier_mean", "modifier_std",
  "head_mean", "head_std", "compound_mean", "compound_std"
)

# Builds the seed list passed to trainControl(seeds = ): `n_slots` integer
# vectors of length `n_per_slot` followed by a single integer for the final
# model. Draws come from the current RNG stream, in that order.
# NOTE(review): 13 + 1 entries are generated, as before; 10-fold CV should
# need only 11 - confirm against the trainControl documentation.
make_resample_seeds <- function(n_slots = 13, n_per_slot = 10) {
  seeds <- vector(mode = "list", length = n_slots + 1)
  for (slot in seq_len(n_slots)) {
    seeds[[slot]] <- sample.int(n = 1000, size = n_per_slot)
  }
  seeds[[n_slots + 1]] <- sample.int(n = 1000, size = 1)
  seeds
}

# Splits a feature table into predictors (`x`) and response (`y`).
# Identifier, rating and bookkeeping columns are dropped from the predictors;
# `comp_freq_bins` is dropped too when it is present.
split_predictors_response <- function(feature_df) {
  predictors <- feature_df %>%
    select(-c(
      compound_mean, modifier, head, modifier_mean, modifier_std,
      head_mean, head_std, compound_std, compound, corpus
    )) %>%
    select(-one_of("comp_freq_bins"))
  list(x = predictors, y = feature_df$compound_mean)
}

# Tunes a glmnet model over `tune_grid` with 10-fold CV, selecting by `metric`.
# `summary_fn` must report a column named like `metric`; the default is the
# same summary function trainControl() uses when none is given.
fit_elastic_net <- function(x, y, metric, seeds, tune_grid,
                            summary_fn = defaultSummary) {
  train(
    x, y,
    method = "glmnet",
    metric = metric,
    trControl = trainControl(
      method = "cv", number = 10, search = "grid",
      seeds = seeds, summaryFunction = summary_fn
    ),
    tuneGrid = tune_grid,
    preProcess = c("nzv", "center", "scale")
  )
}

# One-row performance record. `fit_rsq` supplies `method` and `TrainRsquared`,
# `fit_rho` supplies `TrainSpearman`; `run` is a list describing the current
# loop position (tag, setting, dataset, seed, timespan, cutoff, ppmi).
perf_row <- function(model_label, n_obs, fit_rsq, fit_rho, run) {
  perf_rsq <- getTrainPerf(fit_rsq)
  perf_rho <- getTrainPerf(fit_rho)
  data.frame(
    model = model_label, impute = "python", tag = run$tag,
    setting = run$setting, dataset = run$dataset, corpus = "coha",
    ml_algo = "elastic", n = n_obs, seed = run$seed,
    timespan = run$timespan, cutoff = run$cutoff, ppmi = run$ppmi,
    method = perf_rsq[, "method"],
    TrainRsquared = perf_rsq[, "TrainRsquared"],
    TrainSpearman = perf_rho[, "TrainSpearman"]
  )
}

# One-row variable-importance record: run description followed by one column
# per predictor reported by varImp() for `fit`.
varimp_row <- function(model_label, fit, run) {
  data.frame(
    model = model_label, impute = "python", tag = run$tag,
    setting = run$setting, dataset = run$dataset, corpus = "coha",
    ml_algo = "elastic", seed = run$seed, timespan = run$timespan,
    cutoff = run$cutoff,
    t(varImp(fit)$importance),
    ppmi = run$ppmi
  )
}

for (t in tagged_list) {
  for (p in ppmi_setting_list) {
    for (a in comp_setting_list) {
      for (i in time_off) {
        for (j in cut_off) {
          # This (time span, cut-off) pair is deliberately not evaluated.
          if (i == 10 && j == 1000) {
            next
          }

          # Load datasets: both files depend only on (a, p, t, i, j), so they
          # are read and merged once and then filtered per dataset below.
          feature_dir <- "../../Compounding/datasets/features_new/"
          file_suffix <- paste0(p, "_", t, "_", i, "_", j, "_", "med.csv")
          features_df <- read.csv(
            paste0(feature_dir, "features_Compound", a, "_", file_suffix),
            sep = "\t"
          )
          coha_setting_med_df <- read.csv(
            paste0(feature_dir, "features_Setting_", file_suffix),
            sep = "\t"
          )

          # Keep only compounds present in both tables for the "plus" model.
          features_plus_df <- merge(features_df, coha_setting_med_df, by = rating_keys)

          features_df$compound <- paste(features_df$modifier, features_df$head)
          features_plus_df$compound <- paste(
            features_plus_df$modifier, features_plus_df$head
          )

          for (k in compound_list) {
            print(paste0(p, " ", a, " ", t, " ", i, " ", j, " ", k))

            # Which compounds to keep for this dataset.
            if (k == "reddy") {
              if (t == "Tagged") {
                compound_compounds <- noun_compound_list
              } else {
                compound_compounds <- orig_compound_list
              }
            } else if (t == "Tagged") {
              # "all": every compound whose two words are both tagged _NOUN.
              compound_compounds <- grep(
                "^.*_NOUN .*_NOUN$", features_df$compound,
                value = TRUE
              )
            } else {
              compound_compounds <- features_df$compound
            }

            coha_nona_df <- features_df %>%
              filter(compound %in% compound_compounds)
            print(dim(coha_nona_df))

            coha_nona_plus_df <- features_plus_df %>%
              filter(compound %in% compound_compounds)
            print(dim(coha_nona_plus_df))

            coha_nona_df$corpus <- "coha"
            coha_nona_plus_df$corpus <- "coha"

            norm_xy <- split_predictors_response(coha_nona_df)
            trainX_coha_nona <- norm_xy$x
            trainY_coha_nona <- norm_xy$y

            plus_xy <- split_predictors_response(coha_nona_plus_df)
            trainX_plus_coha_nona <- plus_xy$x
            trainY_plus_coha_nona <- plus_xy$y

            for (s in seed_list) {
              set.seed(s)
              seeds <- make_resample_seeds()
              run <- list(
                tag = t, setting = a, dataset = k, seed = s,
                timespan = i, cutoff = j, ppmi = p
              )

              # The four fits stay in their original order: any random draws
              # made inside train() continue the stream started by set.seed(s).
              elastic_nona_coha <- fit_elastic_net(
                trainX_coha_nona, trainY_coha_nona,
                metric = "Rsquared", seeds = seeds, tune_grid = tune_grid
              )
              elastic_nona_coha_spearman <- fit_elastic_net(
                trainX_coha_nona, trainY_coha_nona,
                metric = "Spearman", seeds = seeds, tune_grid = tune_grid,
                summary_fn = caret_spearman
              )
              elastic_nona_plus_coha <- fit_elastic_net(
                trainX_plus_coha_nona, trainY_plus_coha_nona,
                metric = "Rsquared", seeds = seeds, tune_grid = tune_grid
              )
              elastic_nona_plus_coha_spearman <- fit_elastic_net(
                trainX_plus_coha_nona, trainY_plus_coha_nona,
                metric = "Spearman", seeds = seeds, tune_grid = tune_grid,
                summary_fn = caret_spearman
              )

              perf_elastic_nona_plus_coha <- perf_row(
                "plus", nrow(trainX_plus_coha_nona),
                elastic_nona_plus_coha, elastic_nona_plus_coha_spearman, run
              )
              varimp_elastic_nona_plus_coha <- varimp_row(
                "plus", elastic_nona_plus_coha, run
              )

              perf_elastic_nona_coha <- perf_row(
                "norm", nrow(trainX_coha_nona),
                elastic_nona_coha, elastic_nona_coha_spearman, run
              )
              varimp_elastic_nona_coha <- varimp_row(
                "norm", elastic_nona_coha, run
              )

              # Store "norm" first, then "plus", at consecutive slots.
              list_of_rsqr[[m]] <- perf_elastic_nona_coha
              list_of_vi[[m]] <- varimp_elastic_nona_coha
              m <- m + 1

              list_of_rsqr[[m]] <- perf_elastic_nona_plus_coha
              list_of_vi[[m]] <- varimp_elastic_nona_plus_coha
              m <- m + 1
            }
          }
        }
      }
    }
  }
}

[1] "PPMI Agnostic Tagged 10 0 reddy"
[1]  89 629
[1]  89 689
[1] "PPMI Agnostic Tagged 10 0 all"
[1] 211 629
[1] 210 689
[1] "PPMI Agnostic Tagged 10 10 reddy"
[1]  89 629
[1]  89 689
[1] "PPMI Agnostic Tagged 10 10 all"
[1] 211 629
[1] 210 689
[1] "PPMI Agnostic Tagged 10 50 reddy"
[1]  89 629
[1]  89 689
[1] "PPMI Agnostic Tagged 10 50 all"
[1] 211 629
[1] 210 689
[1] "PPMI Agnostic Tagged 10 100 reddy"
[1]  89 629
[1]  89 689
[1] "PPMI Agnostic Tagged 10 100 all"
[1] 211 629
[1] 210 689
[1] "PPMI Agnostic Tagged 10 500 reddy"
[1]  89 629
[1]  89 689
[1] "PPMI Agnostic Tagged 10 500 all"
[1] 211 629
[1] 210 689
[1] "PPMI Agnostic Tagged 20 0 reddy"
[1]  89 350
[1]  89 383
[1] "PPMI Agnostic Tagged 20 0 all"
[1] 211 350
[1] 210 383
[1] "PPMI Agnostic Tagged 20 10 reddy"
[1]  89 350
[1]  89 383
[1] "PPMI Agnostic Tagged 20 10 all"
[1] 211 350
[1] 210 383
[1] "PPMI Agnostic Tagged 20 50 reddy"
[1]  89 350
[1]  89 383
[1] "PPMI Agnostic Tagged 20 50 all"
[1] 211 350
[1] 210 383
[1] "PPM

In [16]:
# Stack the per-run variable-importance rows into one table and treat the
# cut-off as a categorical variable.
varimp_df <- list_of_vi %>%
  bind_rows() %>%
  mutate(cutoff = as.factor(cutoff))

# Any cell that is NA after stacking is set to 0.
varimp_df[is.na(varimp_df)] <- 0

In [17]:
# Stack the per-run performance rows into one table and treat the cut-off as
# a categorical variable.
rsquared_df <- list_of_rsqr %>%
  bind_rows() %>%
  mutate(cutoff = as.factor(cutoff))

In [18]:
# Save the performance table to the working directory, without row names.
write.csv(
  x = rsquared_df,
  file = "rsquared_Tagged_PPMI_Agnostic.csv",
  row.names = FALSE
)

In [19]:
# Save the variable-importance table to the working directory, without row
# names.
write.csv(
  x = varimp_df,
  file = "varimp_Tagged_PPMI_Agnostic.csv",
  row.names = FALSE
)