# Natural Statistics Cross-linguistic: 

#### Lexical diversity analysis - random sample 

----

In [39]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "data_proc")
import analytic_proc
import warnings
warnings.filterwarnings('ignore')

In [40]:
TSE_rand_dat_inc = pd.read_csv("../data/TSE_cont_dat.csv")
TSE_rand_dat_inc.rename(columns={'Langauge_name': 'Language_name'}, inplace=True)
TSE_rand_dat_inc['target_child_id'] = TSE_rand_dat_inc['transcript_id']
# only include 'Mother' as speaker_role
TSE_rand_dat_inc = TSE_rand_dat_inc[TSE_rand_dat_inc['speaker_role'] == 'Mother']

In [41]:
rand_dat_inc = pd.read_csv("../data/rand_dat_inc_master.csv",index_col=0,low_memory=False)

In [42]:
# load and clean data
rand_dat_inc=rand_dat_inc[rand_dat_inc["language"]!="ara"]
rand_dat_inc=rand_dat_inc[(rand_dat_inc["target_child_age"]>=5) & (rand_dat_inc["target_child_age"]<=30)]

In [43]:
# this way we include older and younger Tseltal kids
rand_dat_inc = pd.concat([TSE_rand_dat_inc, rand_dat_inc])

In [44]:
rand_dat_inc_cg = rand_dat_inc[rand_dat_inc["caregiver"]=="caregiver"]

rand_dat_inc_cg["contingent"] = np.where(rand_dat_inc_cg["contingent"]==1, "contingent", "non-contingent")

rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"].notna()]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="xxx"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="yyy"]
rand_dat_inc_cg = rand_dat_inc_cg[rand_dat_inc_cg["gloss"]!="www"]

Would call the following if we could run lexical diversity analysis globally:

```python
analytic_proc.create_result(rand_dat_inc_cg)
```

However, we want to have seperate dictionaries for contingent and non-contingent words so we can compare them to one another.

The function will allow us to have a different dictionary for each transcript.

Finally, to compare, we can run mixed effects to understand whether contingent and non-contingent utterances differ in their lexical diversity.

----
#### Seperate contingent and non-contingent utterances into individual dataframes

In [45]:
rand_dat_inc_cg_cc = rand_dat_inc_cg[rand_dat_inc_cg["contingent"]=="contingent"].reset_index(drop=True)
rand_dat_inc_cg_nc = rand_dat_inc_cg[rand_dat_inc_cg["contingent"]=="non-contingent"].reset_index(drop=True)

----
#### Loop through each unique transcript to compute the lexical diversity counts across languages.

In [46]:
analytic_proc.create_c_result(rand_dat_inc_cg_cc,"rand")

In [47]:
analytic_proc.create_nc_result(rand_dat_inc_cg_nc,"rand")

----
#### Lexical Diversity plot

In [48]:
rand_dat_inc_cg_cc = pd.read_csv("../data/rand_dat_inc_master_cc_lexdiv.csv",index_col=0,low_memory=False)
rand_dat_inc_cg_nc = pd.read_csv("../data/rand_dat_inc_master_nc_lexdiv.csv",index_col=0,low_memory=False)

# combine dataframes into one

rand_dat_inc_cg = pd.concat([rand_dat_inc_cg_cc,rand_dat_inc_cg_nc])

In [49]:
# add play context and year of study

play_context = pd.read_csv("../data/context_data.csv")
play_context = play_context.rename(columns={"Corpus": "corpus_name"})

# print(play_context.to_markdown())

rand_dat_inc_cg = rand_dat_inc_cg.merge(play_context,on='corpus_name')

rand_dat_inc_cg["context"] = rand_dat_inc_cg["Location"] + rand_dat_inc_cg["Activity"]

rand_dat_inc_cg["context"] = rand_dat_inc_cg["context"].replace({"HomeBook-reading":"Home: book reading",
                                                                 "HomeInterview/Unstructured":"Home: interview/unstructured",
                                                                 "HomeNaN":"Home: unreported",
                                                                 "HomeOther":"Home: other",
                                                                 "HomeUnstructured":"Home: unstructured",
                                                                 "LabOther":"Lab: other",
                                                                 "LabTabletop play":"Lab: tabletop play",
                                                                 "LabInterview/Unstructured":"Lab: interview/unstructured",
                                                                 "LabUnstructured":"Lab: unstructured",
                                                                 np.nan:"Unreported",
                                                                 "OtherUnstructured":"Other: unstructured"})

# year of study
# corpora_year = pd.read_csv("../data/corpora_year.csv")
# corpora_year = corpora_year.rename(columns={"Corpora": "corpus_name"})
# corpora_year = corpora_year[["corpus_name", "Year collected"]]

# rand_dat_inc_cg = rand_dat_inc_cg.merge(corpora_year,on='corpus_name')

In [50]:
len(rand_dat_inc_cg["Language_name"].unique())

13

In [51]:
# age descriptives
ar = [['min', rand_dat_inc_cg["target_child_age"].min()],
      ['max', rand_dat_inc_cg["target_child_age"].max()],
      ['mean', rand_dat_inc_cg["target_child_age"].mean()],
      ['stdev', rand_dat_inc_cg["target_child_age"].std()]]

age_range = pd.DataFrame(ar, columns = ['stat', 'age'])

# inspect data:
print(age_range.to_markdown())

|    | stat   |      age |
|---:|:-------|---------:|
|  0 | min    |  2       |
|  1 | max    | 36       |
|  2 | mean   | 19.2619  |
|  3 | stdev  |  6.83642 |


In [52]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [53]:
%%R -i rand_dat_inc_cg

library("lme4")
library("repr")
library("knitr")
library("broom")
library("emmeans")
library("tidyverse")
library("kableExtra")

In [54]:
%%R -o rand_dat_inc_cg

# ---- create caregiver type categories

caregiver_type <- rand_dat_inc_cg %>%
  group_by(transcript_id) %>%
  summarise(
    caregiver_type = case_when(
      all(speaker_role == "Mother") ~ "Mother only",
      all(speaker_role == "Father") ~ "Father only",
      any(speaker_role %in% c("Mother", "Father")) ~ "Mother & Father",
      TRUE ~ "Unknown"
    )
  )

# inspect data:
# caregiver_type %>%
#     kbl(format="pipe")
    
# ggplot(caregiver_type, aes(x = 1, y = caregiver_type, fill = factor(caregiver_type))) + 
#   geom_col() +
#   coord_polar(theta = "y") +
#   theme_void()

rand_dat_inc_cg <- rand_dat_inc_cg %>%
  left_join(caregiver_type)

Joining with `by = join_by(transcript_id)`


In [55]:
rand_lex_stats = (rand_dat_inc_cg.groupby(["Language_name","target_child_id","transcript_id","contingent"]) #,"context","Year collected"])
                                  .uniqueness
                                  .agg(["sum"])
                                  .reset_index())
rand_lex_sumstats = rand_lex_stats.rename({'sum': 'sums'}, axis=1)

In [56]:
%%R -i rand_lex_sumstats

# ^import rand_lex_sumstats into R

NULL


In [57]:
%%R -o rand_lex_sumstats

rand_lex_sumstats <- rand_lex_sumstats %>%
    left_join(caregiver_type) %>%
    filter(Language_name != "Mandarin") %>%
    filter(Language_name != "Polish")

Joining with `by = join_by(transcript_id)`


In [58]:
# inspect data:
len(rand_lex_sumstats["Language_name"].unique())

13

In [59]:
# inspect data:
len(rand_lex_sumstats["transcript_id"].unique())

1586

In [60]:
# save data to file:
rand_lex_sumstats.to_csv("../data/rand_lex_sumstats.csv")

Simple plot

In [61]:
%%R -i rand_lex_sumstats

options(repr.plot.width=6, repr.plot.height=12, scipen=999)

xlabs <- c("C", "NC")

# ara_label <- data.frame(means=c(0),contingent = c(1.5),language="ara") # no adult speech transcribed
deu_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="English")
est_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Korean")
# nor_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Norwegian")
nor_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Norwegian")
pol_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(sums=c(240),contingent = c(1.5),Language_name="Portuguese")
spa_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Swedish")
tse_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Tseltal")
# zho_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_lex_sumstats, aes(x = contingent, y = sums, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .5) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=8,color="black") + 
     geom_text(data = eng_label,label = "***",size=8,color="black") +  
     geom_text(data = est_label,label = "**",size=8,color="black") +  
#      geom_text(data = fas_ns_label,label = "*",size=4,color="black",fontface = "italic") +
     geom_text(data = fas_label,label = "*",size=8, color="black") +
     geom_text(data = fra_label,label = "***",size=8,color="black") +  
     geom_text(data = hrv_label,label = "***",size=8,color="black") + 
     geom_text(data = jpn_label,label = "***",size=8,color="black") + 
     geom_text(data = kor_label,label = "***",size=8,color="black") +  
     geom_text(data = nor_ns_label,label = "ns",size=4,color="black",fontface = "italic") +  
#      geom_text(data = pol_ns_label,label = "ns", size=4,color="black",fontface = "italic") +  
     geom_text(data = por_label,label = "**",size=8,color="black") +  
     geom_text(data = spa_ns_label,label = "ns",size=4,color="black",fontface = "italic") + 
     geom_text(data = swe_label,label = "ns",size=4,color="black",fontface = "italic") + 
     geom_text(data = tse_label,label = "", size=8, color="black") +
#      geom_text(data = zho_ns_label,label = "ns",size=4,color="black",fontface = "italic") +
     # geom_text(data = zho_label,label = "^",size=8, color="black") +
     ylim(0, 250) +
     labs(tag = "A",
          y = "Number of Unique Words", x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=16),
           axis.text.x = element_text(vjust = 0.5, hjust=.5),
           legend.title = element_blank(),
           legend.background = element_rect(fill=alpha("white",0.90),
                                                            size=0, linetype="dotted",
                                                            colour = "white"),
           legend.text=element_text(size=16))
     ggsave("../figures/lexical_diversity_rand.pdf", width = 11.7, height = 6.2)

for manuscript

In [62]:
%%R -i rand_lex_sumstats

library('ggplot2')
library('repr')
options(repr.plot.width=6, repr.plot.height=12)

xlabs <- c("C", "NC")

# ara_label <- data.frame(means=c(0),contingent = c(1.5),language="ara") # no adult speech transcribed
deu_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="English")
est_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Korean")
# nor_label <- data.frame(means=c(5.8),contingent = c(1.5),Language_name="Norwegian")
nor_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Norwegian")
# pol_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Portuguese")
spa_ns_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Swedish")
tse_label <- data.frame(sums=c(247),contingent = c(1.5),Language_name="Tseltal")
# zho_label <- data.frame(sums=c(235),contingent = c(1.5),Language_name="Mandarin")
# zho_ns_label <- data.frame(means=c(6),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_lex_sumstats, aes(x = contingent, y = sums, color = Language_name)) +
     stat_summary(fun.y=mean, geom="point", shape=19, size=1.75) + 
     stat_summary(fun.data = mean_se, geom = "errorbar", size=1.25, width = .4) +
     facet_wrap(. ~ Language_name,ncol = 7) + 
     geom_text(data = deu_label,label = "***",size=6,color="black") + 
     geom_text(data = eng_label,label = "***",size=6,color="black") +  
     geom_text(data = est_label,label = "**",size=6,color="black") +  
     geom_text(data = fas_label,label = "*",size=6, color="black") +
     geom_text(data = fra_label,label = "***",size=6,color="black") +  
     geom_text(data = hrv_label,label = "***",size=6,color="black") + 
     geom_text(data = jpn_label,label = "***",size=6,color="black") + 
     geom_text(data = kor_label,label = "***",size=6,color="black") +  
     geom_text(data = nor_ns_label,label = "ns",size=3,color="black",fontface = "italic") +  
     geom_text(data = por_label,label = "**",size=6,color="black") +  
     geom_text(data = spa_ns_label,label = "ns",size=3,color="black",fontface = "italic") + 
     geom_text(data = swe_label,label = "ns",size=3,color="black",fontface = "italic") + 
     geom_text(data = tse_label,label = "",size=6,color="black") + 
     # geom_text(data = zho_label,label = "^",size=3, color="black") +
     ylim(0, 250) +
     labs(tag = "A",
          y = "Number of Unique Words", x = "") +
     theme_classic() +
     scale_x_discrete(labels= xlabs) +
     theme(text = element_text(size=11.5),
           axis.text.x = element_text(vjust = 0.5, hjust=0.5),
           legend.position="none")
     ggsave("../figures/figure_2_A.pdf", width = 11.5, height = 4.2)

#### Descriptives

In [63]:
%%R -i rand_lex_sumstats

rand_lex_sumstats %>%
    group_by(Language_name, contingent) %>%
    summarize(mean = mean(sums),
              sd = sd(sums)) %>%
    pivot_wider(names_from = contingent, values_from = c(mean, sd)) %>%
    select(Language_name, mean_contingent, sd_contingent, `mean_non-contingent`, `sd_non-contingent`) %>%
    `colnames<-`(c("Language", "Contingent Mean", "Contingent SD", "Non-Contingent Mean", "Non-Contingent SD")) %>%
    mutate(across(where(is.numeric), ~ round(., 2))) %>%
    unite("Contingent M (SD)", "Contingent Mean", "Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Contingent M (SD)` = paste0(`Contingent M (SD)`, ")")) %>%
    unite("Non-Contingent M (SD)", "Non-Contingent Mean", "Non-Contingent SD", sep = " (", na.rm = TRUE) %>%
    mutate(`Non-Contingent M (SD)` = paste0(`Non-Contingent M (SD)`, ")")) %>%
    kbl("pipe")

`summarise()` has grouped output by 'Language_name'. You can override using the
`.groups` argument.


|Language   |Contingent M (SD) |Non-Contingent M (SD) |
|:----------|:-----------------|:---------------------|
|Croatian   |87.52 (37.27)     |124.66 (43.81)        |
|English    |45.79 (33.14)     |151.24 (57.08)        |
|Estonian   |94.91 (42.34)     |164.27 (57.91)        |
|French     |47.68 (32.34)     |115.8 (59.34)         |
|German     |101.13 (39.57)    |167.61 (60.51)        |
|Japanese   |70.81 (29.22)     |122.28 (39.83)        |
|Korean     |120.54 (39.34)    |227.54 (55.18)        |
|Norwegian  |43.93 (43.84)     |71.78 (68.68)         |
|Persian    |102.83 (50.34)    |276.83 (110.92)       |
|Portuguese |79.48 (39.38)     |118.87 (44.25)        |
|Spanish    |110.03 (30.66)    |112.6 (30.7)          |
|Swedish    |108 (33.93)       |146.75 (41.9)         |
|Tseltal    |119.9 (76.61)     |186 (98.35)           |


----
#### Statistical analyses

By language

In [64]:
rand_lex_sumstats = rand_lex_sumstats[['Language_name','sums','contingent','transcript_id','target_child_id','caregiver_type']] #, 'Year collected']]

rand_dat_inc_cg_count = rand_dat_inc_cg[['Language_name','transcript_id','target_child_id']]

In [65]:
rand_dat_inc_cg_count.to_csv("../data/rand_dat_inc_cg_count.csv")

In [66]:
%%R -i rand_lex_sumstats -i rand_dat_inc_cg_count

# vectors for rows to remove from lmer
case_study <- c("Persian") # only 1 target child analyzed

case_study_cgtype_compare <- c("Korean") # only 1 target child analyzed, varies in CG type

no_cgtype_compare <- c("Portuguese", "Tseltal") # only `Mother only`

# single_tran <- c("Polish") # only 1 transcript

# nests of models
lexdiv_nest1 <- rand_lex_sumstats %>%
    filter(!Language_name %in% case_study) %>%
    filter(!Language_name %in% no_cgtype_compare) %>%
    filter(!Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent + caregiver_type +
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

lexdiv_nest2 <- rand_lex_sumstats %>%
    filter(Language_name %in% case_study_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent + caregiver_type +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

lexdiv_nest3 <- rand_lex_sumstats %>%
    filter(Language_name %in% no_cgtype_compare) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent + 
                                (1|target_child_id) +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

lexdiv_nest4 <- rand_lex_sumstats %>%
    filter(Language_name %in% case_study) %>%
    group_by(Language_name) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(Language_name, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))
    
# number of transcripts per language
sample_size <- rand_dat_inc_cg_count %>%
    group_by(Language_name) %>%
    summarize(n = n_distinct(transcript_id))
    
# combine lmer summaries and correct p-values for multiple comparisons
emms_all <- list(lexdiv_nest1, lexdiv_nest2, lexdiv_nest3, lexdiv_nest4) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 13)) %>%
    left_join(sample_size)

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')



Joining with `by = join_by(Language_name)`


format statistics table

In [67]:
%%R

table_maker = function(data) { data %>%
    select(Language_name, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Language", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,Language)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Language (n)", c('Language','n'), sep=" (") %>%
    mutate(`Language (n)` = paste0(`Language (n)`,")")) %>%
    arrange(`Language (n)`)
    }

lexdiv_stats_table <- table_maker(emms_all)

kable(lexdiv_stats_table,"pipe")



|Language (n)    |Estimate (SE)  | Test statistic| Effect size|Adjusted p-value |
|:---------------|:--------------|--------------:|-----------:|:----------------|
|Croatian (58)   |-37.14 (3.93)  |          -9.44|       -1.77|<.0001           |
|English (882)   |-105.5 (2.04)  |         -51.74|       -2.49|<.0001           |
|Estonian (22)   |-69.36 (15.44) |          -4.49|       -1.40|0.0019           |
|French (279)    |-68.35 (3.3)   |         -20.71|       -1.76|<.0001           |
|German (38)     |-66.47 (7.94)  |          -8.38|       -1.95|<.0001           |
|Japanese (160)  |-51.46 (3.37)  |         -15.25|       -1.71|<.0001           |
|Korean (28)     |-107 (12.68)   |          -8.44|       -2.32|<.0001           |
|Norwegian (28)  |-27.07 (10.18) |          -2.66|       -0.73|0.1658           |
|Persian (12)    |-174 (26.31)   |          -6.61|       -2.82|0.0002           |
|Portuguese (23) |-39.39 (9.32)  |          -4.23|       -1.27|0.0038           |
|Spanish (30) 

In [68]:
%%R 

# add columns sample and measure and save

lexdiv_stats_table %>%
    mutate(sample = "rand",
           measure = "lexdiv") %>%
    write.csv(file = "../data/rand_lexdiv_stats.csv")

In [69]:
%%R 

# ---- Sign test per language

rand_lex_stats_wide <- rand_lex_sumstats %>%
  spread(key = contingent, value = sums) %>%
  mutate(diff = `non-contingent` - contingent,
         sign_cat = case_when(
           diff > 0 ~ "+",
           diff < 0 ~ "-",
           TRUE ~ "0")) %>%
  group_by(Language_name) %>%
  count(sign_cat) %>%
  mutate(prop = n / sum(n)) %>%
  filter(sign_cat == "+")

rand_lex_stats_wide %>%
  mutate(measure = "Number of Unique Words") %>%
  write.csv(file = "../data/rand_lexdiv_signtest.csv")

kbl(rand_lex_stats_wide,"pipe")



|Language_name |sign_cat |   n|      prop|
|:-------------|:--------|---:|---------:|
|Croatian      |+        |  53| 0.9137931|
|English       |+        | 839| 0.9512472|
|Estonian      |+        |  18| 0.8181818|
|French        |+        | 245| 0.8781362|
|German        |+        |  36| 0.9473684|
|Japanese      |+        | 145| 0.9062500|
|Korean        |+        |  25| 0.8928571|
|Norwegian     |+        |  16| 0.5714286|
|Persian       |+        |  12| 1.0000000|
|Portuguese    |+        |  20| 0.8695652|
|Spanish       |+        |  15| 0.5000000|
|Swedish       |+        |  13| 0.8125000|
|Tseltal       |+        |   8| 0.8000000|


In [70]:
%%R 

sign_tests <- rand_lex_sumstats %>%
  spread(key = contingent, value = sums) %>%
  filter(!is.na(contingent) & !is.na(`non-contingent`)) %>%
  mutate(diff = `non-contingent` - contingent,
         success = if_else(diff > 0, 1, 0)) %>%
  group_by(Language_name) %>%
  summarize(successes = sum(success),
            trials = n(),
            .groups = 'drop')

# Nest the data and run binom.test
sign_tests %>%
  mutate(
    test_result = purrr::map2(successes, trials, ~binom.test(x = .x, n = .y)),
    `Adjusted p-value` = map_dbl(test_result, 'p.value'),
    `Adjusted p-value` = p.adjust(`Adjusted p-value`, "holm", 14),
    `Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4),
    `Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)
  )

# A tibble: 13 × 5
   Language_name successes trials test_result `Adjusted p-value`
   <chr>             <dbl>  <int> <list>      <chr>             
 1 Croatian             53     58 <htest>     <.0001            
 2 English             839    858 <htest>     <.0001            
 3 Estonian             18     22 <htest>     0.0261            
 4 French              245    275 <htest>     <.0001            
 5 German               36     38 <htest>     <.0001            
 6 Japanese            145    160 <htest>     <.0001            
 7 Korean               25     28 <htest>     0.0002            
 8 Norwegian            16     27 <htest>     1.0000            
 9 Persian              12     12 <htest>     0.0039            
10 Portuguese           20     23 <htest>     0.0039            
11 Spanish              15     30 <htest>     1.0000            
12 Swedish              13     16 <htest>     0.1064            
13 Tseltal               8     10 <htest>     0.4375            


By play context

In [71]:
rand_lex_sumstats_contex = (rand_dat_inc_cg.groupby(["context","target_child_id","transcript_id","contingent"])
                                  .uniqueness
                                  .agg(["sum"])
                                  .reset_index())

rand_lex_sumstats_contex =  rand_lex_sumstats_contex.rename({'sum': 'sums'}, axis=1)

In [72]:
rand_lex_sumstats_contex["context"].unique()

array(['Home: book reading', 'Home: other', 'Home: unstructured',
       'Lab: interview/unstructured', 'Lab: unstructured',
       'Other: unstructured', 'Unreported'], dtype=object)

In [86]:
%%R -i rand_lex_sumstats_contex

# vectors for rows to remove from lmer
single_tran <- c("Home: interview/unstructured") # only 1 transcript

contex_sample_size <- rand_lex_sumstats_contex %>%
    group_by(context) %>%
    summarize(n = n_distinct(transcript_id))

lexdiv_contex_nest_1 <- rand_lex_sumstats_contex %>%
    filter(!context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(context, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))
    
lexdiv_contex_nest_2 <- rand_lex_sumstats_contex %>%
    filter(context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lm(sums ~ contingent,
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise")))) %>%
    select(context, contrasts) %>%
    unnest(cols = c(contrasts))

# combine lmer summaries and correct p-values for multiple comparisons
context_emms_all <- list(lexdiv_contex_nest_1, lexdiv_contex_nest_2) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 7)) %>%
    left_join(contex_sample_size)

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')



Joining with `by = join_by(context)`


In [88]:
%%R

table_maker = function(data) { data %>%
    select(context, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Play context", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,`Play context`)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Play context (n)", c(`Play context`,'n'), sep=" (") %>%
    mutate(`Play context (n)` = paste0(`Play context (n)`,")")) %>%
    arrange(`Play context (n)`)
    }
    
lexdiv_context_stats_table <- table_maker(context_emms_all)

kable(lexdiv_context_stats_table, "pipe")



|Play context (n)                  |Estimate (SE)  | Test statistic| Effect size|Adjusted p-value |
|:---------------------------------|:--------------|--------------:|-----------:|:----------------|
|Home: book reading (28)           |-107 (12.81)   |          -8.36|       -2.27|<.0001           |
|Home: other (20)                  |-46.8 (5.34)   |          -8.77|       -2.84|<.0001           |
|Home: unstructured (900)          |-72.45 (1.93)  |         -37.61|       -1.78|<.0001           |
|Lab: interview/unstructured (368) |-141.7 (2.85)  |         -49.76|       -3.74|<.0001           |
|Lab: unstructured (26)            |-62.58 (10.63) |          -5.88|       -1.66|<.0001           |
|Other: unstructured (138)         |-60.58 (3.67)  |         -16.50|       -1.99|<.0001           |
|Unreported (106)                  |-35.99 (6.23)  |          -5.78|       -0.80|<.0001           |


By context, dropping English

In [31]:
# drop English
rand_dat_inc_cg_no_eng = rand_dat_inc_cg[rand_dat_inc_cg["Language_name"] != "English"]

rand_lex_sumstats_contex_no_eng = (rand_dat_inc_cg_no_eng.groupby(["context","target_child_id","transcript_id","contingent"])
                                  .uniquenss
                                  .agg(["sum"])
                                  .reset_index())

rand_lex_sumstats_contex_no_eng =  rand_lex_sumstats_contex_no_eng.rename({'sum': 'sums'}, axis=1)

In [44]:
%%R -i rand_lex_sumstats_contex_no_eng

# # vectors for rows to remove from lmer
single_tran <- c("Home: interview/unstructured") # only 1 transcript

contex_sample_size_no_eng <- rand_lex_sumstats_contex_no_eng %>%
    group_by(context) %>%
    summarize(n = n_distinct(transcript_id))

lexdiv_contex_nest_1_no_eng <- rand_lex_sumstats_contex_no_eng %>%
    filter(!context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lmer(sums ~ contingent +
                                (1|transcript_id),
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise"))),
           effect_size = map2(summary, fit, ~ eff_size(.x, sigma = sigma(.y), edf = df.residual(.y)))) %>%
    select(context, contrasts, effect_size) %>%
    unnest(cols = c(contrasts)) %>%
    mutate(effect_size = map(effect_size, ~ summary(.))) %>%
    unnest() %>%
    mutate(statistic = coalesce(`t.ratio`), .before = p.value) %>%
    select (-c(`t.ratio`))

lexdiv_contex_nest_2_no_eng <- rand_lex_sumstats_contex_no_eng %>%
    filter(context %in% single_tran) %>%
    group_by(context) %>%
    nest() %>%
    mutate(fit = map(data, ~ lm(sums ~ contingent,
                                data = .,
                                REML= FALSE)),
           summary = map(fit, ~ emmeans(., "contingent")),
           contrasts = map(summary, ~ summary(contrast(., method = "pairwise")))) %>%
    select(context, contrasts) %>%
    unnest(cols = c(contrasts))  %>%
    rename(statistic = `t.ratio`)
    
# combine lmer summaries and correct p-values for multiple comparisons
context_no_eng_emms_all <- list(lexdiv_contex_nest_1_no_eng, lexdiv_contex_nest_2_no_eng) %>% 
    reduce(bind_rows) %>%
    mutate(p.value = p.adjust(p.value, "holm", 3)) %>%
    left_join(contex_sample_size_no_eng)

R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: boundary (singular) fit: see help('isSingular')



Joining with `by = join_by(context)`


In [45]:
%%R 

table_maker = function(data) { data %>%
    select(context, n, estimate, SE, statistic, effect.size, p.value) %>%
    `colnames<-`(c("Play context", "n", "Estimate", "SE", "Test statistic", "Effect size", "Adjusted p-value")) %>%
    mutate_at(vars(-c(`Adjusted p-value`,`Play context`)), round,2) %>%
    mutate(`Adjusted p-value` = format(round(`Adjusted p-value`,4),nsmall=4)) %>%
    mutate(`Adjusted p-value` = gsub("0.0000","<.0001",`Adjusted p-value`)) %>%
    unite("Estimate (SE)", c('Estimate','SE'), sep=" (") %>%
    mutate(`Estimate (SE)` = paste0(`Estimate (SE)`,")")) %>%
    unite("Play context (n)", c(`Play context`,'n'), sep=" (") %>%
    mutate(`Play context (n)` = paste0(`Play context (n)`,")")) %>%
    arrange(`Play context (n)`)
    }
    
lexdiv_context_stats_table_no_eng <- table_maker(context_no_eng_emms_all)

kable(lexdiv_context_stats_table_no_eng)



|Play context (n)                 |Estimate (SE)   | Test statistic| Effect size|Adjusted p-value |
|:--------------------------------|:---------------|--------------:|-----------:|:----------------|
|Home: book reading (28)          |-116.71 (10.89) |         -10.72|       -2.92|<.0001           |
|Home: interview/unstructured (1) |-13 (NaN)       |            NaN|          NA|NaN              |
|Home: unstructured (560)         |-64.37 (2.12)   |         -30.41|       -1.83|<.0001           |
