In [1]:
# Loading
library(dplyr)


# fit and test linear mixed models
library(lme4)
library(lmerTest)

# contrasts
# install.packages("pbkrtest")
library(emmeans)

# function pvalue is here
library(scales)

# code to add *, ** and *** for significances
makeStars <- function(x){
  stars <- c("***", "**", "*", "")
  vec <- c(0, 0.001, 0.01, 0.05, 1.1)
  i <- findInterval(x, vec)
  stars[i]
}

# xlsx files
df <- read.csv("data/main.csv")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Matrix


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step




# Factorization

In [2]:
# factorize topics
df$region <- as.factor(df$region)
df$browser <- as.factor(df$browser)
df$engine <- as.factor(df$engine)
df$id <- as.factor(df$id)

df$topic <- as.factor(df$topic)
df$query_type <- factor(df$query_type, levels=c('health','technology')) 
df$trt <- factor(df$trt, levels = c('risks', 'benefits'))


# Model selection

In [3]:
fit1 <- lmer(valence ~   trt * query_type * engine + region + browser  + (1|topic) + (day|id) ,  data = df, REML=F)
fit2 <- lmer(valence ~   trt * query_type * engine + region + browser  + (1|topic) + (1|day) + (1|id) ,  data = df, REML=F)
fit3 <- lmer(valence ~   trt * query_type * engine + region + browser  + (day|id) ,  data = df, REML=F)

AIC(fit1, fit2, fit3)

boundary (singular) fit: see help('isSingular')

boundary (singular) fit: see help('isSingular')

boundary (singular) fit: see help('isSingular')



Unnamed: 0_level_0,df,AIC
Unnamed: 0_level_1,<dbl>,<dbl>
fit1,15,42376.61
fit2,14,42374.61
fit3,14,42785.52


# Full Analisys

In [4]:
options(width = 10000, scipen=4)

fit <- lmer(valence ~   trt * query_type * engine + region + browser + (day|id) ,  data = df)


print(anova(fit, type='II'))
print(summary(fit))



boundary (singular) fit: see help('isSingular')



Type II Analysis of Variance Table with Satterthwaite's method
                       Sum Sq Mean Sq NumDF DenDF   F value    Pr(>F)    
trt                    885.44  885.44     1 27667 3225.1350 < 2.2e-16 ***
query_type            1100.43 1100.43     1 27667 4008.2110 < 2.2e-16 ***
engine                  33.63   33.63     1 27667  122.5057 < 2.2e-16 ***
region                   0.85    0.85     1 27667    3.0856  0.078999 .  
browser                  0.00    0.00     1 27667    0.0126  0.910568    
trt:query_type         566.56  566.56     1 27667 2063.6471 < 2.2e-16 ***
trt:engine               0.08    0.08     1 27667    0.3081  0.578824    
query_type:engine        1.85    1.85     1 27667    6.7495  0.009383 ** 
trt:query_type:engine   99.43   99.43     1 27667  362.1559 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Linear mixed model fit by REML. t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: valence ~ trt * query_type * engin

# Contrasts for Treatment: Risks vs Benefits

In [5]:
# asymptotic is used for approximations because other methods ("kenward-roger", "satterthwaite") are 
# computationally too expensive
# https://link.springer.com/article/10.3758/s13428-016-0809-y


the_means <- emmeans(fit, ~  trt |(engine:query_type), lmer.df = "asymptotic")
contrast_trt <- pairs(the_means)
# print(contrast_trt)


# create dataframe of contrasts
em <- pairs(the_means, interaction = "pairwise", infer = c(TRUE, TRUE)) %>%  rbind() 

# bonferroni method for 16 tests 
em

 engine   query_type trt_pairwise     estimate     SE  df asymp.LCL asymp.UCL z.ratio p.value
 google   health     risks - benefits  -0.1942 0.0129 Inf   -0.2264   -0.1620 -15.065  <.0001
 semantic health     risks - benefits   0.0579 0.0126 Inf    0.0265    0.0894   4.596  <.0001
 google   technology risks - benefits  -0.5185 0.0128 Inf   -0.5504   -0.4865 -40.556  <.0001
 semantic technology risks - benefits  -0.7465 0.0122 Inf   -0.7769   -0.7161 -61.303  <.0001

Results are averaged over some or all of the levels of: region, browser 
Degrees-of-freedom method: asymptotic 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 4 estimates 
P value adjustment: bonferroni method for 4 tests 

In [6]:
print (the_means %>%  rbind() %>% as.data.frame() 
       %>% arrange(desc(engine))  %>% mutate(across(where(is.numeric), round, 4)))
     

[1m[22m[36mℹ[39m In argument: `across(where(is.numeric), round, 4)`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


    engine query_type      trt  emmean     SE  df asymp.LCL asymp.UCL
1 semantic     health    risks  0.2980 0.0092 Inf    0.2729    0.3230
2 semantic     health benefits  0.2400 0.0087 Inf    0.2163    0.2637
3 semantic technology    risks -0.5007 0.0088 Inf   -0.5246   -0.4767
4 semantic technology benefits  0.2458 0.0085 Inf    0.2227    0.2690
5   google     health    risks  0.2542 0.0092 Inf    0.2290    0.2793
6   google     health benefits  0.4484 0.0091 Inf    0.4236    0.4731
7   google technology    risks -0.3308 0.0091 Inf   -0.3555   -0.3060
8   google technology benefits  0.1877 0.0091 Inf    0.1629    0.2124


In [7]:


# convert to data frame and sort
emdf <-  as.data.frame(em) %>% arrange(engine, desc(query_type))

# add significative starts
emdf['sig'] <- makeStars(emdf$p.value)

# restrict p-values decimals and add <0.0001 where correspond
emdf[,"p.value"] <- pvalue(emdf[,"p.value"], accuracy = 0.001)

# round all values to 4 decimals
embf <- emdf %>% mutate(across(where(is.numeric), round, 4))

print(embf)
     


    engine query_type     trt_pairwise estimate     SE  df asymp.LCL asymp.UCL  z.ratio p.value sig
1   google technology risks - benefits  -0.5185 0.0128 Inf   -0.5504   -0.4865 -40.5562  <0.001 ***
2   google     health risks - benefits  -0.1942 0.0129 Inf   -0.2264   -0.1620 -15.0646  <0.001 ***
3 semantic technology risks - benefits  -0.7465 0.0122 Inf   -0.7769   -0.7161 -61.3031  <0.001 ***
4 semantic     health risks - benefits   0.0579 0.0126 Inf    0.0265    0.0894   4.5964  <0.001 ***


# Contrasts for Treatment: Health vs Technology

In [8]:
# asymptotic is used for approximations because other methods ("kenward-roger", "satterthwaite") are 
# computationally too expensive
# https://link.springer.com/article/10.3758/s13428-016-0809-y


the_means <- emmeans(fit, ~  query_type |(engine:trt), lmer.df = "asymptotic")
contrast_trt <- pairs(the_means)
# print(contrast_trt)


# create dataframe of contrasts
em <- pairs(the_means, interaction = "pairwise", infer = c(TRUE, TRUE)) %>%  rbind() 

# bonferroni method for 16 tests 
em

 engine   trt      query_type_pairwise estimate     SE  df asymp.LCL asymp.UCL z.ratio p.value
 google   risks    health - technology  0.58493 0.0129 Inf     0.553    0.6171  45.367  <.0001
 semantic risks    health - technology  0.79864 0.0127 Inf     0.767    0.8303  63.020  <.0001
 google   benefits health - technology  0.26071 0.0128 Inf     0.229    0.2926  20.394  <.0001
 semantic benefits health - technology -0.00581 0.0121 Inf    -0.036    0.0244  -0.480  1.0000

Results are averaged over some or all of the levels of: region, browser 
Degrees-of-freedom method: asymptotic 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 4 estimates 
P value adjustment: bonferroni method for 4 tests 

In [9]:
print (the_means %>%  rbind() %>% as.data.frame() 
       %>% arrange(desc(engine))  %>% mutate(across(where(is.numeric), round, 4)))

    engine      trt query_type  emmean     SE  df asymp.LCL asymp.UCL
1 semantic    risks     health  0.2980 0.0092 Inf    0.2729    0.3230
2 semantic    risks technology -0.5007 0.0088 Inf   -0.5246   -0.4767
3 semantic benefits     health  0.2400 0.0087 Inf    0.2163    0.2637
4 semantic benefits technology  0.2458 0.0085 Inf    0.2227    0.2690
5   google    risks     health  0.2542 0.0092 Inf    0.2290    0.2793
6   google    risks technology -0.3308 0.0091 Inf   -0.3555   -0.3060
7   google benefits     health  0.4484 0.0091 Inf    0.4236    0.4731
8   google benefits technology  0.1877 0.0091 Inf    0.1629    0.2124


In [10]:
# convert to data frame and sort
emdf <-  as.data.frame(em) %>% arrange(trt, engine)

# add significative starts
emdf['sig'] <- makeStars(emdf$p.value)

# restrict p-values decimals and add <0.0001 where correspond
emdf[,"p.value"] <- pvalue(emdf[,"p.value"], accuracy = 0.001)

# round all values to 4 decimals
embf <- emdf %>% mutate(across(where(is.numeric), round, 4))

print(embf)

    engine      trt query_type_pairwise estimate     SE  df asymp.LCL asymp.UCL z.ratio p.value sig
1   google benefits health - technology   0.2607 0.0128 Inf    0.2288    0.2926 20.3945  <0.001 ***
2 semantic benefits health - technology  -0.0058 0.0121 Inf   -0.0360    0.0244 -0.4799  >0.999    
3   google    risks health - technology   0.5849 0.0129 Inf    0.5527    0.6171 45.3673  <0.001 ***
4 semantic    risks health - technology   0.7986 0.0127 Inf    0.7670    0.8303 63.0199  <0.001 ***


# Contrasts for Treatment: Google vs Semantic Scholar

In [11]:
# asymptotic is used for approximations because other methods ("kenward-roger", "satterthwaite") are 
# computationally too expensive
# https://link.springer.com/article/10.3758/s13428-016-0809-y


the_means <- emmeans(fit, ~  engine |(trt:query_type), lmer.df = "asymptotic")
contrast_trt <- pairs(the_means)
# print(contrast_trt)


# create dataframe of contrasts
em <- pairs(the_means, interaction = "pairwise", infer = c(TRUE, TRUE)) %>%  rbind() 

# bonferroni method for 16 tests 
em

 trt      query_type engine_pairwise   estimate     SE  df asymp.LCL asymp.UCL z.ratio p.value
 risks    health     google - semantic  -0.0438 0.0130 Inf   -0.0762   -0.0114  -3.373  0.0030
 benefits health     google - semantic   0.2084 0.0125 Inf    0.1771    0.2397  16.628  <.0001
 risks    technology google - semantic   0.1699 0.0126 Inf    0.1384    0.2014  13.485  <.0001
 benefits technology google - semantic  -0.0582 0.0124 Inf   -0.0891   -0.0272  -4.695  <.0001

Results are averaged over some or all of the levels of: region, browser 
Degrees-of-freedom method: asymptotic 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 4 estimates 
P value adjustment: bonferroni method for 4 tests 

In [12]:
print (the_means %>%  rbind() %>% as.data.frame() 
       %>% arrange(desc(engine))  %>% mutate(across(where(is.numeric), round, 4)))
     

       trt query_type   engine  emmean     SE  df asymp.LCL asymp.UCL
1    risks     health semantic  0.2980 0.0092 Inf    0.2729    0.3230
2 benefits     health semantic  0.2400 0.0087 Inf    0.2163    0.2637
3    risks technology semantic -0.5007 0.0088 Inf   -0.5246   -0.4767
4 benefits technology semantic  0.2458 0.0085 Inf    0.2227    0.2690
5    risks     health   google  0.2542 0.0092 Inf    0.2290    0.2793
6 benefits     health   google  0.4484 0.0091 Inf    0.4236    0.4731
7    risks technology   google -0.3308 0.0091 Inf   -0.3555   -0.3060
8 benefits technology   google  0.1877 0.0091 Inf    0.1629    0.2124


In [13]:
# convert to data frame and sort
emdf <-  as.data.frame(em) %>% arrange(trt, query_type)

# add significative starts
emdf['sig'] <- makeStars(emdf$p.value)

# restrict p-values decimals and add <0.0001 where correspond
emdf[,"p.value"] <- pvalue(emdf[,"p.value"], accuracy = 0.001)

# round all values to 4 decimals
embf <- emdf %>% mutate(across(where(is.numeric), round, 4))

print(embf)

       trt query_type   engine_pairwise estimate     SE  df asymp.LCL asymp.UCL z.ratio p.value sig
1 benefits     health google - semantic   0.2084 0.0125 Inf    0.1771    0.2397 16.6275  <0.001 ***
2 benefits technology google - semantic  -0.0582 0.0124 Inf   -0.0891   -0.0272 -4.6947  <0.001 ***
3    risks     health google - semantic  -0.0438 0.0130 Inf   -0.0762   -0.0114 -3.3734   0.003  **
4    risks technology google - semantic   0.1699 0.0126 Inf    0.1384    0.2014 13.4849  <0.001 ***
