# Natural Statistics Cross-linguistic: 

#### MLUw analysis - random sample

----

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "data_proc")
import contingent_extraction
import warnings
warnings.filterwarnings('ignore')

In [2]:
TSE_rand_dat_inc = pd.read_csv("../data/TSE_cont_dat.csv")
TSE_rand_dat_inc.rename(columns={'Langauge_name': 'Language_name'}, inplace=True)
TSE_rand_dat_inc['target_child_id'] = TSE_rand_dat_inc['transcript_id']

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R -i TSE_rand_dat_inc

library(tidyverse)

# ---- count number of tokens per row

TSE_rand_dat_inc <- TSE_rand_dat_inc %>% 
    mutate(num_tokens = str_count(gloss, "\\w+"))

# save to file

write_csv(TSE_rand_dat_inc, file="../data/TSE_cont_dat.csv")


── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [5]:
%%R -o TSE_rand_dat_inc

TSE_rand_dat_inc <- TSE_rand_dat_inc

In [6]:
rand_dat_inc = pd.read_csv("../data/rand_dat_inc_master.csv", index_col=0, low_memory=False)

In [7]:
# load and clean data
rand_dat_inc=rand_dat_inc[rand_dat_inc["language"]!="ara"]
rand_dat_inc=rand_dat_inc[(rand_dat_inc["target_child_age"]>=5) & (rand_dat_inc["target_child_age"]<=30)]

In [8]:
# this way we include older and younger Tseltal kids
rand_dat_inc = pd.concat([TSE_rand_dat_inc, rand_dat_inc])

In [9]:
rand_dat_inc_cg = rand_dat_inc[rand_dat_inc["caregiver"]=="caregiver"]

rand_dat_inc_cg["contingent"] = np.where(rand_dat_inc_cg["contingent"]==1, "contingent", "non-contingent")

rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"].notna()]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="xxx"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="yyy"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="www"]

In [10]:
# add play context and year of study

play_context = pd.read_csv("../data/context_data.csv")
play_context = play_context.rename(columns={"Corpus": "corpus_name"})

# print(play_context.to_markdown())

rand_dat_inc_cg = rand_dat_inc_cg.merge(play_context,on='corpus_name')

rand_dat_inc_cg["Activity"] = rand_dat_inc_cg["Activity"].fillna("NaN")

rand_dat_inc_cg["context"] = rand_dat_inc_cg["Location"] + rand_dat_inc_cg["Activity"]

rand_dat_inc_cg["context"] = rand_dat_inc_cg["context"].replace({"HomeBook-reading":"Home: book reading",
                                                                 "HomeInterview/Unstructured":"Home: interview/unstructured",
                                                                 "HomeNaN":"Home: unreported",
                                                                 "HomeOther":"Home: other",
                                                                 "HomeUnstructured":"Home: unstructured",
                                                                 "LabOther":"Lab: other",
                                                                 "LabTabletop play":"Lab: tabletop play",
                                                                 "LabInterview/Unstructured":"Lab: interview/unstructured",
                                                                 "LabUnstructured":"Lab: unstructured",
                                                                 np.nan:"Unreported",
                                                                 "OtherUnstructured":"Other: unstructured"})

# # year of study
# corpora_year = pd.read_csv("../data/corpora_year.csv")
# corpora_year = corpora_year.rename(columns={"Corpora": "corpus_name"})
# corpora_year = corpora_year[["corpus_name", "Year collected"]]

# rand_dat_inc_cg = rand_dat_inc_cg.merge(corpora_year,on='corpus_name')

In [11]:
%%R -i rand_dat_inc_cg

library("lme4")
library("knitr")
library("broom")
library("emmeans")
library("lmerTest")
library("kableExtra")

options(scipen = 999)

Loading required package: Matrix

Attaching package: ‘Matrix’

The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Attaching package: ‘lmerTest’

The following object is masked from ‘package:lme4’:

    lmer

The following object is masked from ‘package:stats’:

    step


Attaching package: ‘kableExtra’

The following object is masked from ‘package:dplyr’:

    group_rows



In [12]:
%%R -o rand_dat_inc_cg

# ---- create caregiver type categories

caregiver_type <- rand_dat_inc_cg %>%
  group_by(transcript_id) %>%
  summarise(
    caregiver_type = case_when(
      all(speaker_role == "Mother") ~ "Mother only",
      all(speaker_role == "Father") ~ "Father only",
      any(speaker_role %in% c("Mother", "Father")) ~ "Mother & Father",
      TRUE ~ "Unknown"
    )
  )

rand_dat_inc_cg <- rand_dat_inc_cg %>%
  left_join(caregiver_type)

Joining with `by = join_by(transcript_id)`


In [13]:
rand_mlu_stats = (rand_dat_inc_cg.groupby(["Language_name","target_child_id","transcript_id","contingent"])
                                  .num_tokens
                                  .agg(["mean"])
                                  .reset_index())
rand_mlu_sumstats =  rand_mlu_stats.rename({'mean': 'means'}, axis=1)

In [14]:
%%R -i rand_mlu_sumstats

# ^import rand_mlu_sumstats into R

NULL


In [15]:
%%R -o rand_mlu_sumstats

rand_mlu_sumstats <- rand_mlu_sumstats %>%
    left_join(caregiver_type) %>%
    filter(Language_name != "Mandarin") %>%
    filter(Language_name != "Polish")

Joining with `by = join_by(transcript_id)`


In [16]:
# save data to file
rand_mlu_sumstats.to_csv("../data/rand_mlu_sumstats.csv")

----
#### MLUw plot

Simple plot

In [17]:
%%R -i rand_mlu_sumstats

library('ggplot2')
library('repr')
options(repr.plot.width=6, repr.plot.height=12)

xlabs <- c("C", "NC")

# ara_label <- data.frame(means=c(0),contingent = c(1.5),language="ara") # no adult speech transcribed
deu_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Estonian")
# fas_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Persian")
fas_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Korean")
# nor_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Norwegian")
nor_ns_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Norwegian")
# pol_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Swedish")
tse_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Tseltal")
# zho_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Mandarin")
# zho_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_mlu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .5) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=8,color="black") + 
     geom_text(data = eng_label,label = "***",size=8,color="black") +  
     geom_text(data = est_label,label = "**",size=8,color="black") +  
     geom_text(data = fas_ns_label,label = "ns",size=4,color="black",fontface = "italic") +
#      geom_text(data = fas_label,label = "*",size=8, color="black") +
     geom_text(data = fra_label,label = "***",size=8,color="black") +  
     geom_text(data = hrv_label,label = "***",size=8,color="black") + 
     geom_text(data = jpn_label,label = "***",size=8,color="black") + 
     geom_text(data = kor_label,label = "***",size=8,color="black") +  
     geom_text(data = nor_ns_label,label = "**",size=8,color="black") +  
     geom_text(data = por_label,label = "***",size=8,color="black") +  
     geom_text(data = spa_label,label = "***",size=8,color="black") + 
     geom_text(data = swe_label,label = "***",size=8,color="black") + 
     geom_text(data = tse_label,label = "",size=8,color="black") + 
     ylim(0, 6) +
     labs(tag="B",
          y = "Mean Length of Utterances in Words",
          x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=16),
           axis.text.x = element_text(vjust = 0.5, hjust=0.5),
           legend.title = element_blank(),
           legend.background = element_rect(fill=alpha("white",0.90),
                                                            size=0, linetype="dotted",
                                                            colour = "white"),
           legend.text=element_text(size=16))
      ggsave("../figures/token_mlu_rand.pdf", width = 11.5, height = 4.2)

1: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
ℹ Please use the `fun` argument instead.
generated. 
2: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
generated. 
3: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
generated. 
4: Removed 177 rows containing non-finite values (`stat_summary()`). 
5: Removed 177 rows containing non-finite values (`stat_summary()`). 


for manuscript

In [18]:
%%R -i rand_mlu_sumstats

library('ggplot2')
library('repr')
options(repr.plot.width=6, repr.plot.height=12)

xlabs <- c("C", "NC")

# ara_label <- data.frame(means=c(0),contingent = c(1.5),language="ara") # no adult speech transcribed
deu_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Estonian")
# fas_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Persian")
fas_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Korean")
# nor_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Norwegian")
nor_ns_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Norwegian")
# pol_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(5.65),contingent = c(1.5),Language_name="Swedish")
# zho_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Mandarin")
# zho_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_mlu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .4) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=6,color="black") + 
     geom_text(data = eng_label,label = "***",size=6,color="black") +  
     geom_text(data = est_label,label = "**",size=6,color="black") +  
     geom_text(data = fas_ns_label,label = "ns",size=3,color="black",fontface = "italic") +
#      geom_text(data = fas_label,label = "*",size=8, color="black") +
     geom_text(data = fra_label,label = "***",size=6,color="black") +  
     geom_text(data = hrv_label,label = "***",size=6,color="black") + 
     geom_text(data = jpn_label,label = "***",size=6,color="black") + 
     geom_text(data = kor_label,label = "***",size=6,color="black") +  
     geom_text(data = nor_ns_label,label = "**",size=6,color="black") +  
     geom_text(data = por_label,label = "***",size=6,color="black") +  
     geom_text(data = spa_label,label = "***",size=6,color="black") + 
     geom_text(data = swe_label,label = "***",size=6,color="black") + 
     geom_text(data = tse_label,label = "",size=6,color="black") + 
#      geom_text(data = zho_label,label = "*",size=8, color="black") +
     ylim(0, 6) +
     labs(tag="B",
          y = "Mean Length of Utterances in Words", x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=11.5),
           axis.text.x = element_text(vjust = 0.5, hjust=0.5),
           legend.position="none")
      ggsave("../figures/figure_2_B.pdf", width = 11.5, height = 4.2)

1: Removed 177 rows containing non-finite values (`stat_summary()`). 
2: Removed 177 rows containing non-finite values (`stat_summary()`). 


In [19]:
MLUw_dat = rand_dat_inc_cg[['Language_name','num_tokens','contingent','transcript_id','target_child_id','caregiver_type']]

Descriptives

In [20]:
%%R -i rand_mlu_sumstats

rand_mlu_sumstats %>%
    group_by(Language_name, contingent) %>%
    summarize(mean = mean(means),
              sd = sd(means)) %>%
    pivot_wider(names_from = contingent, values_from = c(mean, sd)) %>%
    select(Language_name, mean_contingent, sd_contingent, `mean_non-contingent`, `sd_non-contingent`) %>%
    `colnames<-`(c("Language", "Contingent Mean", "Contingent SD", "Non-Contingent Mean", "Non-Contingent SD")) %>%
    mutate(across(where(is.numeric), ~ round(., 2))) %>%
    unite("Contingent M (SD)", "Contingent Mean", "Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Contingent M (SD)` = paste0(`Contingent M (SD)`, ")")) %>%
    unite("Non-Contingent M (SD)", "Non-Contingent Mean", "Non-Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Non-Contingent M (SD)` = paste0(`Non-Contingent M (SD)`, ")")) %>%
    kbl("pipe")

`summarise()` has grouped output by 'Language_name'. You can override using the
`.groups` argument.


|Language   |Contingent M (SD) |Non-Contingent M (SD) |
|:----------|:-----------------|:---------------------|
|Croatian   |3.72 (0.8)        |4.07 (0.82)           |
|English    |3.7 (1.71)        |4.34 (1.32)           |
|Estonian   |3.91 (0.42)       |4.53 (0.65)           |
|French     |3.97 (1.28)       |4.45 (1.22)           |
|German     |4.49 (1.03)       |5.34 (1.02)           |
|Japanese   |2.3 (0.45)        |2.93 (0.55)           |
|Korean     |3.78 (0.7)        |4.25 (0.66)           |
|Norwegian  |3.38 (1.38)       |4.08 (1.49)           |
|Persian    |2.76 (0.37)       |3.19 (0.32)           |
|Portuguese |3.94 (0.89)       |4.45 (0.88)           |
|Spanish    |3.34 (0.61)       |3.82 (0.63)           |
|Swedish    |3.27 (0.66)       |3.95 (0.77)           |
|Tseltal    |2.83 (0.48)       |3.58 (0.48)           |


#### Statistical analyses

By language

In [21]:
%%R -i MLUw_dat

MLUw_dat <- MLUw_dat %>%
    filter(Language_name != "Mandarin") %>%
    filter(Language_name != "Polish")

# vectors for rows to remove from lmer
case_study <- c("Persian") # only 1 target child analyzed

case_study_cgtype_compare <- c("Korean") # only 1 target child analyzed, varies in CG type

no_cgtype_compare <- c("Portuguese", "Tseltal") # only `Mother only`

# single_tran <- c("Polish") # only 1 transcript

# nests of models
mlu_nest1 <- MLUw_dat %>%
    filter(!Language_name %in% case_study) %>%
    filter(!Language_name %in% no_cgtype_compare) %>%
    filter(!Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(num_tokens ~ contingent + caregiver_type +
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`,`t.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`,`t.ratio`))

mlu_nest2 <- MLUw_dat %>%
    filter(Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(num_tokens ~ contingent + caregiver_type +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`))

mlu_nest3 <- MLUw_dat %>%
    filter(Language_name %in% no_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(num_tokens ~ contingent + 
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

mlu_nest4 <- MLUw_dat %>%
    filter(Language_name %in% case_study) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(num_tokens ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`z.ratio`), .before = p.value) %>%
    select (-c(`z.ratio`))
    
# number of transcripts per language
sample_size <- MLUw_dat %>%
    group_by(Language_name) %>%
    summarize(n = n_distinct(transcript_id))
    
# combine lmer summaries and correct p-values for multiple comparisons
emms_all <- list(mlu_nest1, mlu_nest2, mlu_nest3, mlu_nest4) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 13)) %>%
    left_join(sample_size)

Joining with `by = join_by(Language_name)`


boundary (singular) fit: see help('isSingular')
boundary (singular) fit: see help('isSingular')
Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'pbkrtest.limit = 6241' (or larger)
[or, globally, 'set emm_options(pbkrtest.limit = 6241)' or larger];
but be warned that this may result in large computation time and memory use.
Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'lmerTest.limit = 6241' (or larger)
[or, globally, 'set emm_options(lmerTest.limit = 6241)' or larger];
but be warned that this may result in large computation time and memory use.
Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
To enable adjustments, add the argument 'pbkrtest.limit = 100059' (or larger)
[or, globally, 'set emm_options(pbkrtest.limit = 100059)' or larger];
but be warned that this may result in 

Format statistics table

In [22]:
%%R

table_maker = function(data) { data %>%
    select(Language_name, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Language", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,Language)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Language (n)", c('Language','n'), sep=" (") %>%
    mutate(`Language (n)` = paste0(`Language (n)`,")")) %>%
    arrange(`Language (n)`)
    }

MLU_stats_table <- table_maker(emms_all)

kable(MLU_stats_table, "pipe")



|Language (n)    |Estimate (SE) | Test statistic| Effect size|Adjusted p-value |
|:---------------|:-------------|--------------:|-----------:|:----------------|
|Croatian (58)   |-0.37 (0.07)  |          -5.00|       -0.13|<.0001           |
|English (882)   |-0.69 (0.03)  |         -26.23|       -0.23|<.0001           |
|Estonian (22)   |-0.61 (0.12)  |          -4.88|       -0.21|<.0001           |
|French (279)    |-0.53 (0.05)  |          -9.77|       -0.16|<.0001           |
|German (38)     |-0.89 (0.11)  |          -8.38|       -0.26|<.0001           |
|Japanese (160)  |-0.65 (0.03)  |         -23.54|       -0.32|<.0001           |
|Korean (28)     |-0.46 (0.07)  |          -6.66|       -0.22|<.0001           |
|Norwegian (28)  |-0.66 (0.14)  |          -4.59|       -0.24|0.0001           |
|Persian (12)    |-0.41 (0.1)   |          -4.30|       -0.18|0.0002           |
|Portuguese (23) |-0.69 (0.13)  |          -5.32|       -0.22|<.0001           |
|Spanish (30)    |-0.53 (0

In [23]:
%%R 

# create clean language column

MLU_stats_table <- MLU_stats_table %>%
  mutate(Clean_Language = str_extract(`Language (n)`, "^[^\\(]+"))

# extract effect size CI

CIs <- emms_all %>% 
    select(`lower.CL`,`upper.CL`) %>%
    `colnames<-`(c("Clean_Language", "CI+", "CI-")) %>%
    arrange(Clean_Language)

# add columns sample and measure and save

MLU_stats_table %>%
    mutate(`CI+` = CIs$`CI+`,
           `CI-` = CIs$`CI-`,
           sample = "rand",
           measure = "mlu") %>%
    write.csv(file = "../data/rand_mlu_stats.csv")

Adding missing grouping variables: `Language_name`


In [24]:
%%R 

# ---- Sign test per language

rand_mlu_stats_wide <- rand_mlu_sumstats %>%
  spread(key = contingent, value = means) %>%
  mutate(diff = `non-contingent` - contingent,
         sign_cat = case_when(
           diff > 0 ~ "+",
           diff < 0 ~ "-",
           TRUE ~ "0")) %>%
  group_by(Language_name) %>%
  count(sign_cat) %>%
  mutate(prop = n / sum(n)) %>%
  filter(sign_cat == "+")

rand_mlu_stats_wide %>%
  mutate(measure = "Mean Length of Utterance in Words") %>%
  write.csv(file = "../data/rand_mlu_signtest.csv")
  
kbl(rand_mlu_stats_wide,"pipe")



|Language_name |sign_cat |   n|      prop|
|:-------------|:--------|---:|---------:|
|Croatian      |+        |  39| 0.6724138|
|English       |+        | 652| 0.7392290|
|Estonian      |+        |  21| 0.9545455|
|French        |+        | 194| 0.6953405|
|German        |+        |  33| 0.8684211|
|Japanese      |+        | 151| 0.9437500|
|Korean        |+        |  27| 0.9642857|
|Norwegian     |+        |  21| 0.7500000|
|Persian       |+        |  10| 0.8333333|
|Portuguese    |+        |  18| 0.7826087|
|Spanish       |+        |  26| 0.8666667|
|Swedish       |+        |  16| 1.0000000|
|Tseltal       |+        |   9| 0.9000000|


In [25]:
%%R 

sign_tests <- rand_mlu_sumstats %>%
  spread(key = contingent, value = means) %>%
  filter(!is.na(contingent) & !is.na(`non-contingent`)) %>%
  mutate(diff = `non-contingent` - contingent,
         success = if_else(diff > 0, 1, 0)) %>%
  group_by(Language_name) %>%
  summarize(successes = sum(success),
            trials = n(),
            .groups = 'drop')

# Nest the data and run binom.test
sign_tests %>%
  mutate(
    test_result = purrr::map2(successes, trials, ~binom.test(x = .x, n = .y)),
    `Adjusted p-value` = map_dbl(test_result, 'p.value'),
    `Adjusted p-value` = p.adjust(`Adjusted p-value`, "holm", 14),
    `Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4),
    `Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)
  )

# A tibble: 13 × 5
   Language_name successes trials test_result `Adjusted p-value`
   <chr>             <dbl>  <int> <list>      <chr>             
 1 Croatian             39     58 <htest>     0.0531            
 2 English             652    858 <htest>     <.0001            
 3 Estonian             21     22 <htest>     0.0001            
 4 French              194    275 <htest>     <.0001            
 5 German               33     38 <htest>     <.0001            
 6 Japanese            151    160 <htest>     <.0001            
 7 Korean               27     28 <htest>     <.0001            
 8 Norwegian            21     27 <htest>     0.0355            
 9 Persian              10     12 <htest>     0.0771            
10 Portuguese           18     23 <htest>     0.0531            
11 Spanish              26     30 <htest>     0.0004            
12 Swedish              16     16 <htest>     0.0002            
13 Tseltal               9     10 <htest>     0.0645            


By play context

In [26]:
rand_mlu_sumstats_contex = (rand_dat_inc_cg.groupby(["context","target_child_id","transcript_id","contingent"])
                                  .num_tokens
                                  .agg(["mean"])
                                  .reset_index())

rand_mlu_sumstats_contex =  rand_mlu_sumstats_contex.rename({'mean': 'means'}, axis=1)

In [31]:
%%R -i rand_mlu_sumstats_contex

emm_options(pbkrtest.limit = 96789)

# vectors for rows to remove from lmer
single_tran <- c("Home: interview/unstructured") # only 1 transcript

contex_sample_size <- rand_mlu_sumstats_contex %>%
    group_by(context) %>%
    summarize(n = n_distinct(transcript_id))

mlu_contex_nest_1 <- rand_dat_inc_cg %>%
    filter(!context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(num_tokens ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(context, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

mlu_contex_nest_2 <- rand_dat_inc_cg %>%
    filter(context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lm(num_tokens ~ contingent,
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise")))) %>%
    select(context, contrasts) %>%
    unnest(cols = c(contrasts))


# combine lmer summaries and correct p-values for multiple comparisons
context_emms_all <- list(mlu_contex_nest_1, mlu_contex_nest_2) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 7)) %>%
    left_join(contex_sample_size)

Joining with `by = join_by(context)`


`cols` is now required when using `unnest()`.
ℹ Please use `cols = c(effect_size)`. 


In [32]:
%%R

table_maker = function(data) { data %>%
    select(context, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Play context", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,`Play context`)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Play context (n)", c(`Play context`,'n'), sep=" (") %>%
    mutate(`Play context (n)` = paste0(`Play context (n)`,")")) %>%
    arrange(`Play context (n)`)
    }

mlu_context_stats_table <- table_maker(context_emms_all)

kable(mlu_context_stats_table, "pipe")



|Play context (n)                  |Estimate (SE) | Test statistic| Effect size|Adjusted p-value |
|:---------------------------------|:-------------|--------------:|-----------:|:----------------|
|Home: book reading (28)           |-0.46 (0.07)  |          -6.67|       -0.22|<.0001           |
|Home: other (20)                  |-0.54 (0.11)  |          -4.98|       -0.26|<.0001           |
|Home: unreported (12)             |-0.41 (0.1)   |          -4.27|       -0.18|0.0001           |
|Home: unstructured (900)          |-0.72 (0.02)  |         -32.14|       -0.24|<.0001           |
|Lab: interview/unstructured (368) |-0.41 (0.05)  |          -8.18|       -0.15|<.0001           |
|Lab: unstructured (26)            |-0.08 (0.08)  |          -1.01|       -0.04|1.0000           |
|Other: unstructured (138)         |-0.85 (0.05)  |         -17.04|       -0.31|<.0001           |
|Unreported (94)                   |-0.36 (0.04)  |          -9.26|       -0.21|<.0001           |
