# Natural Statistics Cross-linguistic: 

#### Proportion of single-word utterances analysis

----

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "data_proc")
import contingent_extraction
import warnings
warnings.filterwarnings('ignore')

In [2]:
TSE_rand_dat_inc = pd.read_csv("../data/TSE_cont_dat.csv")
TSE_rand_dat_inc.rename(columns={'Langauge_name': 'Language_name'}, inplace=True)
TSE_rand_dat_inc['target_child_id'] = TSE_rand_dat_inc['transcript_id']

In [3]:
rand_dat_inc = pd.read_csv("../data/rand_dat_inc_master.csv",index_col=0,low_memory=False)

In [4]:
# load and clean data
rand_dat_inc=rand_dat_inc[rand_dat_inc["language"]!="ara"]
rand_dat_inc=rand_dat_inc[(rand_dat_inc["target_child_age"]>=5) & (rand_dat_inc["target_child_age"]<=30)]

In [5]:
# this way we include older and younger Tseltal kids
rand_dat_inc = pd.concat([TSE_rand_dat_inc, rand_dat_inc])

In [6]:
rand_dat_inc_cg = rand_dat_inc[rand_dat_inc["caregiver"]=="caregiver"]

rand_dat_inc_cg["contingent"] = np.where(rand_dat_inc_cg["contingent"]==1, "contingent", "non-contingent")

rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"].notna()]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="xxx"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="yyy"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="www"]

rand_dat_inc_cg["swu"]=np.where(rand_dat_inc_cg["num_tokens"]==1,1,0)

In [7]:
# add play context and year of study

play_context = pd.read_csv("../data/context_data.csv")
play_context = play_context.rename(columns={"Corpus": "corpus_name"})

# print(play_context.to_markdown())

rand_dat_inc_cg = rand_dat_inc_cg.merge(play_context,on='corpus_name')

rand_dat_inc_cg["context"] = rand_dat_inc_cg["Location"] + rand_dat_inc_cg["Activity"]

rand_dat_inc_cg["context"] = rand_dat_inc_cg["context"].replace({"HomeBook-reading":"Home: book reading",
                                                                 "HomeInterview/Unstructured":"Home: interview/unstructured",
                                                                 "HomeNaN":"Home: unreported",
                                                                 "HomeOther":"Home: other",
                                                                 "HomeUnstructured":"Home: unstructured",
                                                                 "LabOther":"Lab: other",
                                                                 "LabTabletop play":"Lab: tabletop play",
                                                                 "LabInterview/Unstructured":"Lab: interview/unstructured",
                                                                 "LabUnstructured":"Lab: unstructured",
                                                                 np.nan:"Unreported",
                                                                 "OtherUnstructured":"Other: unstructured"})

# year of study
# corpora_year = pd.read_csv("../data/corpora_year.csv")
# corpora_year = corpora_year.rename(columns={"Corpora": "corpus_name"})
# corpora_year = corpora_year[["corpus_name", "Year collected"]]

# rand_dat_inc_cg = rand_dat_inc_cg.merge(corpora_year,on='corpus_name')

In [8]:
%load_ext rpy2.ipython

In [9]:
%%R -i rand_dat_inc_cg

library("lme4")
library("repr")
library("knitr")
library("broom")
library("emmeans")
library("tidyverse")
library("kableExtra")

options(repr.plot.width=6, repr.plot.height=12, scipen=999)

R[write to console]: Loading required package: Matrix

R[write to console]: Welcome to emmeans.
Caution: You lose important information if you filter this package's results.
See '? untidy'



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ tidyr::pack()   masks Matrix::pack()
✖ tidyr::unpack() masks Matrix::unpack()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


R[write to console]: 
Attaching package: ‘kableExtra’


R[write to console]: The following object is masked from ‘package:dplyr’:

    group_rows




In [10]:
%%R -o rand_dat_inc_cg

# ---- create caregiver type categories

caregiver_type <- rand_dat_inc_cg %>%
  group_by(transcript_id) %>%
  summarise(
    caregiver_type = case_when(
      all(speaker_role == "Mother") ~ "Mother only",
      all(speaker_role == "Father") ~ "Father only",
      any(speaker_role %in% c("Mother", "Father")) ~ "Mother & Father",
      TRUE ~ "Unknown"
    )
  )

# inspect data:
# caregiver_type %>%
#     kbl(format="pipe")
    
# ggplot(caregiver_type, aes(x = 1, y = caregiver_type, fill = factor(caregiver_type))) + 
#   geom_col() +
#   coord_polar(theta = "y") +
#   theme_void()

rand_dat_inc_cg <- rand_dat_inc_cg %>%
  left_join(caregiver_type)

Joining with `by = join_by(transcript_id)`


In [11]:
rand_swu_stats = (rand_dat_inc_cg.groupby(["Language_name","target_child_id","transcript_id","contingent"])
                                  .swu
                                  .agg(["mean"])
                                  .reset_index())
rand_swu_sumstats =  rand_swu_stats.rename({'mean': 'means'}, axis=1)

In [12]:
%%R -i rand_swu_sumstats

# ^import rand_swu_sumstats into R

NULL


In [13]:
%%R -o rand_swu_sumstats

rand_swu_sumstats <- rand_swu_sumstats %>%
    left_join(caregiver_type) %>%
    filter(Language_name != "Mandarin") %>%
    filter(Language_name != "Polish")

Joining with `by = join_by(transcript_id)`


In [14]:
# save data to file:
rand_swu_sumstats.to_csv("../data/rand_swu_sumstats.csv")

Simple plot

In [15]:
%%R

xlabs <- c("C", "NC")

# # ara_label <- data.frame(means=c(.9),contingent = c(1.5),language="ara")
deu_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="German")
# deu_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
# est_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Korean")
nor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Norwegian")
# pol_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Swedish")
tse_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Tseltal")
# zho_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_swu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .5) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=8,color="black") +
     geom_text(data = eng_label,label = "***",size=8,color="black") +  
     geom_text(data = est_label,label = "ns", size=4,color="black",fontface = "italic") +  
     geom_text(data = fas_label,label = "ns", size=4,color="black",fontface = "italic") + 
     geom_text(data = fra_label,label = "***",size=8,color="black") +  
     geom_text(data = hrv_label,label = "***",size=8,color="black") + 
     geom_text(data = jpn_label,label = "***",size=8,color="black") + 
     geom_text(data = kor_label,label = "***",size=8,color="black") +  
     geom_text(data = nor_label,label = "**",size=8,color="black") +  
     # geom_text(data = pol_label,label = "ns", size=4,color="black",fontface = "italic") +    
     geom_text(data = por_label,label = "***",size=8,color="black") +  
     geom_text(data = spa_label,label = "***",size=8,color="black") + 
     geom_text(data = swe_label,label = "***",size=8,color="black") + 
     # geom_text(data = zho_label,label = "ns", size=4, color="black",fontface = "italic") +
     ylim(0, .5) +
     labs(tag="C",
          y = "Proportion of Single Word Utterances",
          x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=16),
           axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
           legend.title = element_blank(),
           legend.background = element_rect(fill=alpha("white",0.90),
                                                            size=0, linetype="dotted",
                                                            colour = "white"),
           legend.text=element_text(size=16))
      ggsave("../figures/token_rand_swu.pdf", width = 11.7, height = 6.2)


for manuscript

In [16]:
%%R -i rand_swu_sumstats

library('ggplot2')
library('repr')
options(repr.plot.width=6, repr.plot.height=12)

xlabs <- c("C", "NC")

# # ara_label <- data.frame(means=c(.9),contingent = c(1.5),language="ara")
deu_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="German")
# deu_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
# est_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Korean")
nor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Norwegian")
# pol_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Swedish")
tse_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Tseltal")
# zho_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_swu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .5) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=8,color="black") +
     geom_text(data = eng_label,label = "***",size=8,color="black") +  
     geom_text(data = est_label,label = "ns", size=4,color="black",fontface = "italic") +  
     geom_text(data = fas_label,label = "ns", size=4,color="black",fontface = "italic") + 
     geom_text(data = fra_label,label = "***",size=8,color="black") +  
     geom_text(data = hrv_label,label = "***",size=8,color="black") + 
     geom_text(data = jpn_label,label = "***",size=8,color="black") + 
     geom_text(data = kor_label,label = "***",size=8,color="black") +  
     geom_text(data = nor_label,label = "**",size=8,color="black") +  
     # geom_text(data = pol_label,label = "ns", size=4,color="black",fontface = "italic") +    
     geom_text(data = por_label,label = "***",size=8,color="black") +  
     geom_text(data = spa_label,label = "***",size=8,color="black") + 
     geom_text(data = swe_label,label = "***",size=8,color="black") + 
     geom_text(data = tse_label,label = "", size=8, color="black") +
     # geom_text(data = zho_label,label = "ns", size=4, color="black",fontface = "italic") +
     ylim(0, .5) +
     labs(tag="C",
          y = "Proportion of Single Word Utterances",
          x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=11.5),
           axis.text.x = element_text(vjust = 0.5, hjust=0.5),
           legend.position="none")

      ggsave("../figures/figure_2_C.pdf", width = 11.5, height = 4.2)

Descriptives

In [17]:
%%R -i rand_swu_sumstats

rand_swu_sumstats %>%
    group_by(Language_name, contingent) %>%
    summarize(mean = mean(means),
              sd = sd(means)) %>%
    pivot_wider(names_from = contingent, values_from = c(mean, sd)) %>%
    select(Language_name, mean_contingent, sd_contingent, `mean_non-contingent`, `sd_non-contingent`) %>%
    `colnames<-`(c("Language", "Contingent Mean", "Contingent SD", "Non-Contingent Mean", "Non-Contingent SD")) %>%
    mutate(across(where(is.numeric), ~ round(., 2))) %>%
    unite("Contingent M (SD)", "Contingent Mean", "Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Contingent M (SD)` = paste0(`Contingent M (SD)`, ")")) %>%
    unite("Non-Contingent M (SD)", "Non-Contingent Mean", "Non-Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Non-Contingent M (SD)` = paste0(`Non-Contingent M (SD)`, ")")) %>%
    kbl("pipe")

`summarise()` has grouped output by 'Language_name'. You can override using the
`.groups` argument.


|Language   |Contingent M (SD) |Non-Contingent M (SD) |
|:----------|:-----------------|:---------------------|
|Croatian   |0.25 (0.1)        |0.19 (0.08)           |
|English    |0.29 (0.21)       |0.19 (0.1)            |
|Estonian   |0.18 (0.08)       |0.12 (0.05)           |
|French     |0.24 (0.19)       |0.2 (0.13)            |
|German     |0.26 (0.1)        |0.18 (0.1)            |
|Japanese   |0.49 (0.13)       |0.31 (0.08)           |
|Korean     |0.18 (0.08)       |0.1 (0.05)            |
|Norwegian  |0.31 (0.24)       |0.23 (0.27)           |
|Persian    |0.44 (0.14)       |0.32 (0.09)           |
|Portuguese |0.17 (0.11)       |0.15 (0.1)            |
|Spanish    |0.26 (0.06)       |0.16 (0.07)           |
|Swedish    |0.33 (0.1)        |0.2 (0.08)            |
|Tseltal    |0.42 (0.14)       |0.29 (0.08)           |


#### Statistical analyses

By language

In [18]:
SWU_dat = rand_dat_inc_cg[['Language_name','swu','contingent','transcript_id','target_child_id','caregiver_type']]

In [19]:
%%R -i SWU_dat

SWU_dat <- SWU_dat %>%
    filter(Language_name != "Mandarin") %>%
    filter(Language_name != "Polish")

# vectors for rows to remove from lmer
case_study <- c("Persian") # only 1 target child analyzed

case_study_cgtype_compare <- c("Korean") # only 1 target child analyzed, varies in CG type

no_cgtype_compare <- c("Portuguese", "Tseltal") # only `Mother only`

# single_tran <- c("Polish") # only 1 transcript

# nests of models
swu_nest1 <- SWU_dat %>%
    filter(!Language_name %in% case_study) %>%
    filter(!Language_name %in% no_cgtype_compare) %>%
    filter(!Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(swu ~ contingent + caregiver_type +
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`,`t.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`,`t.ratio`))

swu_nest2 <- SWU_dat %>%
    filter(Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(swu ~ contingent + caregiver_type +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`))

swu_nest3 <- SWU_dat %>%
    filter(Language_name %in% no_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(swu ~ contingent +
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

swu_nest4 <- SWU_dat %>%
    filter(Language_name %in% case_study) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(swu ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`))
 
# number of transcripts per language
sample_size <- SWU_dat %>%
    group_by(Language_name) %>%
    summarize(n = n_distinct(transcript_id))
    
# combine lmer summaries and correct p-values for multiple comparisons
emms_all <- list(swu_nest1, swu_nest2, swu_nest3, swu_nest4) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 13)) %>%
    left_join(sample_size)

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'pbkrtest.limit = 6241' (or larger)
[or, globally, 'set emm_options(pbkrtest.limit = 6241)' or larger];
but be warned that this may result in large computation time and memory use.

R[write to console]: Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'lmerTest.limit = 6241' (or larger)
[or, globally, 'set emm_options(lmerTest.limit = 6241)' or larger];
but be warned that this may result in large computation time and memory use.

R[write to console]: Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'pbkrtest.limit = 100059' (or larger)

Error in `mutate()`:
ℹ In argument: `statistic = coalesce(z.ratio)`.
ℹ In group 1: `Language_name = "Persian"`.
Caused by error:
! object 'z.ratio' not found
Run `rlang::last_trace()` to see where the error occurred.


R[write to console]: 




Error in mutate(., statistic = coalesce(z.ratio), .before = p.value) : 
ℹ In group 1: `Language_name = "Persian"`.
Caused by error:
! object 'z.ratio' not found


RInterpreterError: Failed to parse and evaluate line '\nSWU_dat <- SWU_dat %>%\n    filter(Language_name != "Mandarin") %>%\n    filter(Language_name != "Polish")\n\n# vectors for rows to remove from lmer\ncase_study <- c("Persian") # only 1 target child analyzed\n\ncase_study_cgtype_compare <- c("Korean") # only 1 target child analyzed, varies in CG type\n\nno_cgtype_compare <- c("Portuguese", "Tseltal") # only `Mother only`\n\n# single_tran <- c("Polish") # only 1 transcript\n\n# nests of models\nswu_nest1 <- SWU_dat %>%\n    filter(!Language_name %in% case_study) %>%\n    filter(!Language_name %in% no_cgtype_compare) %>%\n    filter(!Language_name %in% case_study_cgtype_compare) %>%\n    group_by(Language_name) %>%\n    nest() %>%\n    mutate(fit = map(data, ~ lmer(swu ~ contingent + caregiver_type +\n                                (1|target_child_id) +\n                                (1|transcript_id),\n                                data = .,\n                                REML= FALSE)),\n           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),\n           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),\n           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%\n    select(Language_name, contrasts, effect_size) %>%\n    unnest(cols = c(contrasts)) %>%\n    mutate(effect_size = map(effect_size, ~ summary(.))) %>%\n    unnest() %>%\n    mutate(statistic = coalesce(`z.ratio`,`t.ratio`), .before = p.value) %>%\n    select (-c(`z.ratio`,`t.ratio`))\n\nswu_nest2 <- SWU_dat %>%\n    filter(Language_name %in% case_study_cgtype_compare) %>%\n    group_by(Language_name) %>%\n    nest() %>%\n    mutate(fit = map(data, ~ lmer(swu ~ contingent + caregiver_type +\n                                (1|transcript_id),\n                                data = .,\n                                REML= FALSE)),\n           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),\n           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),\n           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%\n    select(Language_name, contrasts, effect_size) %>%\n    unnest(cols = c(contrasts)) %>%\n    mutate(effect_size = map(effect_size, ~ summary(.))) %>%\n    unnest() %>%\n    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%\n    select (-c(`z.ratio`))\n\nswu_nest3 <- SWU_dat %>%\n    filter(Language_name %in% no_cgtype_compare) %>%\n    group_by(Language_name) %>%\n    nest() %>%\n    mutate(fit = map(data, ~ lmer(swu ~ contingent +\n                                (1|target_child_id) +\n                                (1|transcript_id),\n                                data = .,\n                                REML= FALSE)),\n           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),\n           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),\n           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%\n    select(Language_name, contrasts, effect_size) %>%\n    unnest(cols = c(contrasts)) %>%\n    mutate(effect_size = map(effect_size, ~ summary(.))) %>%\n    unnest() %>%\n    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%\n    select (-c(`t.ratio`))\n\nswu_nest4 <- SWU_dat %>%\n    filter(Language_name %in% case_study) %>%\n    group_by(Language_name) %>%\n    nest() %>%\n    mutate(fit = map(data, ~ lmer(swu ~ contingent +\n                                (1|transcript_id),\n                                data = .,\n                                REML= FALSE)),\n           summary = map(fit, ~ emmeans(., "contingent"), lmer.df = "asymp"),\n           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),\n           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%\n    select(Language_name, contrasts, effect_size) %>%\n    unnest(cols = c(contrasts)) %>%\n    mutate(effect_size = map(effect_size, ~ summary(.))) %>%\n    unnest() %>%\n    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%\n    select (-c(`z.ratio`))\n \n# number of transcripts per language\nsample_size <- SWU_dat %>%\n    group_by(Language_name) %>%\n    summarize(n = n_distinct(transcript_id))\n    \n# combine lmer summaries and correct p-values for multiple comparisons\nemms_all <- list(swu_nest1, swu_nest2, swu_nest3, swu_nest4) %>% \n    reduce(bind_rows) %>%\n    mutate(p.value = p.adjust(p.value, "holm", 13)) %>%\n    left_join(sample_size)\n'.
R error message: 'Error in mutate(., statistic = coalesce(z.ratio), .before = p.value) : \nℹ In group 1: `Language_name = "Persian"`.\nCaused by error:\n! object \'z.ratio\' not found'

In [None]:
%%R

table_maker = function(data) { data %>%
    select(Language_name, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Language", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,Language)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Language (n)", c('Language','n'), sep=" (") %>%
    mutate(`Language (n)` = paste0(`Language (n)`,")")) %>%
    arrange(`Language (n)`)
    }

SWU_stats_table <- table_maker(emms_all)

kable(SWU_stats_table, "pipe")



|Language (n)    |Estimate (SE) | Test statistic| Effect size|Adjusted p-value |
|:---------------|:-------------|--------------:|-----------:|:----------------|
|Croatian (58)   |0.07 (0.01)   |           6.50|        0.17|<.0001           |
|English (882)   |0.11 (0)      |          30.11|        0.27|<.0001           |
|Estonian (22)   |0.04 (0.01)   |           2.88|        0.13|0.0525           |
|French (279)    |0.05 (0.01)   |           7.75|        0.13|<.0001           |
|German (38)     |0.08 (0.01)   |           6.49|        0.20|<.0001           |
|Japanese (160)  |0.18 (0.01)   |          27.63|        0.37|<.0001           |
|Korean (28)     |0.08 (0.01)   |           8.04|        0.26|<.0001           |
|Norwegian (28)  |0.1 (0.02)    |           4.79|        0.25|<.0001           |
|Persian (12)    |0.07 (0.02)   |           3.57|        0.15|0.0047           |
|Portuguese (23) |0.07 (0.01)   |           4.91|        0.20|<.0001           |
|Spanish (30)    |0.09 (0.