# Ridge Regression on ContextAware and DecadeAware system

In [None]:
library(tidyverse)
library(caret)
library(glmnet)
library(dplyr)
#library(psych) 
library(lme4)
library(e1071)
options(warn=-1)

In [6]:
seed_list<-c(1001,1111,1221,1331,1441,1551,1661,1771,1881,1991)
cut_off<-c(20,50,100)
time_off<-c(0)
lambda <- 10^seq(-3, 3, length = 100)
df_total <- matrix(ncol=6,nrow = length(seed_list)*length(cut_off)*length(time_off))


In [7]:
k<-1
for (s in seed_list) {
    for (i in time_off) {
        for (j in cut_off)  {
            set.seed(s)
            seeds <- vector(mode = "list", length = 11)
            for(z in 1:10) seeds[[z]] <- sample.int(n=1000, 54)
            #for the last model
            seeds[[11]]<-sample.int(1000, 1)
            df<-read.csv(paste0("../../datasets/features_CompoundAware_",i,"_",j,"_300.csv"),sep = '\t')
            ridge <- train(
              compound_mean ~. -modifier -head -modifier_mean -modifier_std -head_mean -head_std -compound_std, 
                data = df, method = "glmnet",
                trControl = trainControl("cv", number = 10,search="grid",seeds=seeds),
                tuneGrid = expand.grid(alpha = 0, lambda = lambda)
            )

            elastic <- train(
              compound_mean ~. -modifier -head -modifier_mean -modifier_std -head_mean -head_std -compound_std,
                data = df, method = "glmnet",
              trControl = trainControl("cv", number = 10,search="grid",seeds=seeds),
              tuneLength = 10
              )

            lasso <- train(
              compound_mean ~. -modifier -head -modifier_mean -modifier_std -head_mean -head_std -compound_std,
                data = df, method = "glmnet",
                trControl = trainControl("cv", number = 10,search="grid",seeds=seeds),
              tuneGrid = expand.grid(alpha = 1, lambda = lambda)
              )

            #print(round(getTrainPerf(elastic)[2],3))
            print(k)
            df_total[k,] <- c(s,i,j,as.numeric(getTrainPerf(elastic)[2]),as.numeric(getTrainPerf(lasso)[2]),as.numeric(getTrainPerf(ridge)[2]))
            k<-k+1
    
        }
    }
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30


In [8]:
df_total<-as.data.frame(df_total)
colnames(df_total)<-c('seed','time','cutoff','elastic','lasso','ridge')
#df_total

In [9]:
df_total %>% group_by(time,cutoff) %>% summarise(mean_e=round(mean(elastic),3), sd_e=round(sd(elastic),3))

time,cutoff,mean_e,sd_e
0,20,0.343,0.028
0,50,0.344,0.026
0,100,0.337,0.035


We use Ridge Regression to find a linear relationship between the features (PPMI, Local Mutual Information, Log Likelihood and the three similarity features) and compound_mean from the Reddy dataset for the 80 compounds.

In [10]:
df_total %>% group_by(cutoff) %>% summarise(mean_e=round(mean(elastic),3), sd_e=round(sd(elastic),3))

cutoff,mean_e,sd_e
20,0.343,0.028
50,0.344,0.026
100,0.337,0.035


In [11]:
df_total %>% group_by(time) %>% summarise(mean_e=round(mean(elastic),3), sd_e=round(sd(elastic),3))

time,mean_e,sd_e
0,0.341,0.029


In [22]:

# Model coefficients
#coef(ridge$finalModel, ridge$bestTune$lambda)
# Make predictions
#predictions <- ridge %>% predict(test.data)
# Model prediction performance
#data.frame(
#  RMSE = RMSE(predictions, test.data$medv),
#  Rsquare = R2(predictions, test.data$medv)
#)

"There were missing values in resampled performance measures."

In [25]:
round(getTrainPerf(lasso)[2],3)

TrainRsquared
0.193


In [118]:
#1.324194	0.3610691	1.1601	glmnet

In [24]:
models <- list(ridge = ridge, lasso = lasso, elastic = elastic)
resamples(models) %>% summary( metric = "RMSE")


Call:
summary.resamples(object = ., metric = "RMSE")

Models: ridge, lasso, elastic 
Number of resamples: 10 

RMSE 
             Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
ridge   1.1327733 1.381881 1.486881 1.498544 1.560698 1.965494    0
lasso   0.9580396 1.325377 1.467451 1.459236 1.601759 1.830630    0
elastic 1.0045829 1.322360 1.451215 1.449042 1.599522 1.781900    0


In [120]:
dev.new(width = 550, height = 900, unit = "px")

In [68]:
df<-read.csv(paste0("../../datasets/features_CompoundAware_",20,"_",100,"_300.csv"),sep = '\t')
df$compound_rating<-''
#colnames(df)

In [69]:
df$compound_rating[df$compound_mean<2]<-'low'
df$compound_rating[df$compound_mean>=4]<-'high'
df$compound_rating[df$compound_mean>=2 & df$compound_mean<4]<-'med'

summary(as.factor(df$compound_rating))

In [70]:
            df<-read.csv(paste0("../../datasets/features_CompoundAware_",20,"_",100,"_300.csv"),sep = '\t')

            elastic <- train(
              compound_mean ~. -modifier -head -modifier_mean -modifier_std -head_mean -head_std -compound_std,
                data = df, method = "glmnet",
              trControl = trainControl("cv", number = 10,search="grid",seeds=seeds),
              tuneLength = 10
              )


            print(round(getTrainPerf(elastic)[2],3))

  TrainRsquared
1         0.373


In [71]:
varImp(elastic)

glmnet variable importance

  only 20 most important variables shown (out of 66)

                          Overall
X1980_ppmi                    100
X1920_sim_bw_constituents       0
X1860_log_ratio                 0
X1940_local_mi                  0
X1980_sim_with_head             0
X1960_sim_with_modifier         0
X1920_sim_with_modifier         0
X1960_sim_with_head             0
X1800_local_mi                  0
X1820_sim_bw_constituents       0
X2000_sim_with_head             0
X1940_log_ratio                 0
X2000_local_mi                  0
X1920_log_ratio                 0
X1860_sim_bw_constituents       0
X1880_sim_with_modifier         0
X1940_sim_with_head             0
X1800_ppmi                      0
X1940_ppmi                      0
X1920_local_mi                  0

Variable Importance is used to look at the importance of a feature in predicting compound_mean. We show the top 20 features out of the possible 126 (21 decades * 6 features).

When we look at the features irrespective of the decade,

log ratio is the most important and occupies 11/20 positions with local mutual information in the other 9. 

We do not see any of the similarity features being as important as the information theory based features. 

(This is good because previous studies only looked at similarity and frequencies, but never information theory features)

Given below is the distribution of the variables that were deemed important based on the decade

1800 : 1

1830 : 2

1840 : 2

1850 : 2

1860 : 2

1870 : 2

1880 : 2

1890 : 2

1900 : 2

1910 : 2

1920 : 1

( We need to figure out how and if the decade information makes any sense)

When we compare this model with the non temporal model, we get better R squared values, so we can say that temporal information is helping us predict the compositionality score.