# Natural Statistics Cross-linguistic: 

#### Proportion of single-word utterances analysis

----

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "data_proc")
import contingent_extraction
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load and clean data
rand_dat_inc = pd.read_csv("../data/rand_dat_inc_master.csv",index_col=0,low_memory=False)

# Drop rows whose language code is "ara" and keep target-child ages in the
# closed interval [5, 30] (units are whatever `target_child_age` is stored in).
rand_dat_inc = rand_dat_inc[rand_dat_inc["language"] != "ara"]
rand_dat_inc = rand_dat_inc[rand_dat_inc["target_child_age"].between(5, 30)]

# Caregiver rows only. Take an explicit copy: new columns are assigned below,
# and assigning onto a boolean-filtered slice is the SettingWithCopy pattern
# (the warning is currently hidden by the global warnings filter).
rand_dat_inc_cg = rand_dat_inc[rand_dat_inc["caregiver"] == "caregiver"].copy()

# Recode the contingency flag (1 vs. anything else) into plot-ready labels.
rand_dat_inc_cg["contingent"] = np.where(
    rand_dat_inc_cg["contingent"] == 1, "contingent", "non-contingent"
)

# Remove rows with a missing gloss or a gloss that is exactly one of the
# placeholder codes "xxx", "yyy", "www".
has_gloss = rand_dat_inc_cg["gloss"].notna()
is_placeholder_gloss = rand_dat_inc_cg["gloss"].isin(["xxx", "yyy", "www"])
rand_dat_inc_cg = rand_dat_inc_cg[has_gloss & ~is_placeholder_gloss].copy()

# swu = 1 when the utterance has exactly one token, else 0.
rand_dat_inc_cg["swu"] = np.where(rand_dat_inc_cg["num_tokens"] == 1, 1, 0)

In [3]:
# add play context and year of study

play_context = pd.read_csv("../data/context_data.csv")
play_context = play_context.rename(columns={"Corpus": "corpus_name"})

# print(play_context.to_markdown())

# NOTE(review): `merge` defaults to an inner join, so rows whose corpus_name is
# absent from context_data.csv are dropped here -- confirm that is intended.
rand_dat_inc_cg = rand_dat_inc_cg.merge(play_context,on='corpus_name')

# Build the raw "<Location><Activity>" key that the mapping below recodes.
ctx_location = rand_dat_inc_cg["Location"]
ctx_activity = rand_dat_inc_cg["Activity"]
ctx_key = ctx_location + ctx_activity

# string + NaN yields NaN in pandas, so a Home row with a missing Activity
# would become NaN (-> "Unreported") and the "HomeNaN" entry below could never
# match. Restore that key explicitly so such rows map to "Home: unreported".
ctx_key = ctx_key.mask(ctx_location.eq("Home") & ctx_activity.isna(), "HomeNaN")

rand_dat_inc_cg["context"] = ctx_key.replace({"HomeBook-reading":"Home: book reading",
                                              "HomeInterview/Unstructured":"Home: interview/unstructured",
                                              "HomeNaN":"Home: unreported",
                                              "HomeOther":"Home: other",
                                              "HomeUnstructured":"Home: unstructured",
                                              "LabOther":"Lab: other",
                                              "LabTabletop play":"Lab: tabletop play",
                                              "LabInterview/Unstructured":"Lab: interview/unstructured",
                                              "LabUnstructured":"Lab: unstructured",
                                              np.nan:"Unreported",
                                              "OtherUnstructured":"Other: unstructured"})

# year of study
corpora_year = pd.read_csv("../data/corpora_year.csv")
corpora_year = corpora_year.rename(columns={"Corpora": "corpus_name"})
corpora_year = corpora_year[["corpus_name", "Year collected"]]

# NOTE(review): inner join again -- corpora missing from corpora_year.csv are
# dropped from the analysis data; confirm.
rand_dat_inc_cg = rand_dat_inc_cg.merge(corpora_year,on='corpus_name')

In [4]:
# Load the rpy2 IPython extension; it provides the `%%R` cell magic used by the
# R cells in this notebook.
%load_ext rpy2.ipython

In [5]:
%%R -i rand_dat_inc_cg

# `-i rand_dat_inc_cg` on the magic line passes the Python data frame of that
# name into the R session.
# Attach the R packages used by the R cells. Attach order matters for name
# masking (e.g. kableExtra's group_rows masks dplyr's), so keep it as is.
library("lme4")
library("repr")
library("knitr")
library("broom")
library("emmeans")
library("tidyverse")
library("kableExtra")

# Default plot size for repr output; scipen=999 makes R strongly prefer fixed
# notation over scientific notation when printing numbers.
options(repr.plot.width=6, repr.plot.height=12, scipen=999)

R[write to console]: Loading required package: Matrix

R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ tidyr::pack()   masks Matrix::pack()
✖ tidyr::unpack() masks Matrix::unpack()

R[write to console]: 
Attaching package: ‘kableExtra’


R[write to console]: The following object is masked from ‘package:dplyr’:

    group_rows




In [6]:
%%R -o rand_dat_inc_cg

# ---- create caregiver type categories
# One label per transcript, derived from the speaker_role values of its rows.
# Rules are tried in order and the first match wins:
#   every row is "Mother"                  -> "Mother only"
#   every row is "Father"                  -> "Father only"
#   at least one row is "Mother"/"Father"  -> "Mother & Father"
#   anything else                          -> "Unknown"
# NOTE(review): the third rule uses any(), so a transcript with one parent plus
# a non-parent role is also labelled "Mother & Father" -- confirm intended.

caregiver_type <- rand_dat_inc_cg %>%
  group_by(transcript_id) %>%
  summarise(
    only_mother = all(speaker_role == "Mother"),
    only_father = all(speaker_role == "Father"),
    any_parent = any(speaker_role %in% c("Mother", "Father"))
  ) %>%
  transmute(
    transcript_id,
    caregiver_type = case_when(
      only_mother ~ "Mother only",
      only_father ~ "Father only",
      any_parent ~ "Mother & Father",
      TRUE ~ "Unknown"
    )
  )

# inspect data:
# caregiver_type %>%
#     kbl(format="pipe")
    
# ggplot(caregiver_type, aes(x = 1, y = caregiver_type, fill = factor(caregiver_type))) + 
#   geom_col() +
#   coord_polar(theta = "y") +
#   theme_void()

# Attach the transcript-level label to every row (natural join on the shared
# column); `-o` on the magic line returns the result to Python.
rand_dat_inc_cg <- left_join(rand_dat_inc_cg, caregiver_type)

Joining with `by = join_by(transcript_id)`


In [7]:
# Mean of `swu` per language / child / transcript / contingency level.
swu_group_keys = ["Language_name", "target_child_id", "transcript_id", "contingent"]
rand_swu_stats = (
    rand_dat_inc_cg.groupby(swu_group_keys)["swu"]
    .mean()
    .rename("mean")
    .reset_index()
)
# Same table with the value column named "means", the name the R cells use.
rand_swu_sumstats = rand_swu_stats.rename(columns={"mean": "means"})

In [8]:
%%R -i rand_swu_sumstats

# The `-i` flag on the magic line above imports the Python data frame
# rand_swu_sumstats into R; this cell runs no R code of its own.

NULL


In [9]:
%%R -o rand_swu_sumstats

# Add the caregiver_type label to the summary table (natural join on the
# shared column); `-o` on the magic line returns the result to Python.
rand_swu_sumstats <- left_join(rand_swu_sumstats, caregiver_type)

Joining with `by = join_by(transcript_id)`


In [10]:
# save data to file (the frame's index is written as the first column):
swu_sumstats_csv = "../data/rand_swu_sumstats.csv"
rand_swu_sumstats.to_csv(swu_sumstats_csv)

Simple plot

In [11]:
%%R

# Mean (point) and standard error (error bar) of the `means` column per level
# of `contingent`, one facet per language. Saved to
# ../figures/token_rand_swu.pdf.

# x-axis tick labels; the two levels sort as "contingent", "non-contingent".
xlabs <- c("C", "NC")

# Hand-placed annotation positions, one single-row data frame per language:
# x = 1.5 sits between the two discrete x levels, `means` is the y position.
# The text drawn at each position is hard-coded in the geom_text() calls below
# (presumably significance markers from a separate analysis -- confirm).
# # ara_label <- data.frame(means=c(.9),contingent = c(1.5),language="ara")
deu_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="German")
# deu_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
# est_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Korean")
nor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Norwegian")
pol_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Swedish")
zho_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_swu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     # `fun` replaces the deprecated `fun.y`; line widths use `linewidth`
     # instead of the deprecated `size` (ggplot2 >= 3.4.0).
     stat_summary(fun = mean, geom = "point", shape = 19, size = 1.75) +
     stat_summary(fun.data = mean_se, geom = "errorbar", linewidth = 1.25, width = .5) +
     facet_wrap(. ~ Language_name, ncol = 7) +
     geom_text(data = deu_label, label = "***", size = 8, color = "black") +
     geom_text(data = eng_label, label = "***", size = 8, color = "black") +
     geom_text(data = est_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = fas_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = fra_label, label = "***", size = 8, color = "black") +
     geom_text(data = hrv_label, label = "***", size = 8, color = "black") +
     geom_text(data = jpn_label, label = "***", size = 8, color = "black") +
     geom_text(data = kor_label, label = "***", size = 8, color = "black") +
     geom_text(data = nor_label, label = "**", size = 8, color = "black") +
     geom_text(data = pol_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = por_label, label = "***", size = 8, color = "black") +
     geom_text(data = spa_label, label = "***", size = 8, color = "black") +
     geom_text(data = swe_label, label = "***", size = 8, color = "black") +
     geom_text(data = zho_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     # Zoom the view instead of using ylim(): scale limits discard rows with
     # means > .5 *before* stat_summary() runs, which would bias the plotted
     # means and standard errors. coord_cartesian() keeps all rows.
     coord_cartesian(ylim = c(0, .5)) +
     labs(tag = "C",
          y = "Proportion of Single Word Utterances",
          x = "") +
     theme_classic() +
     scale_x_discrete(labels = xlabs) +
     theme(text = element_text(size = 16),
           axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
           legend.title = element_blank(),
           legend.background = element_rect(fill = alpha("white", 0.90),
                                            linewidth = 0, linetype = "dotted",
                                            colour = "white"),
           legend.text = element_text(size = 16))

# Pass the plot explicitly rather than relying on last_plot().
ggsave("../figures/token_rand_swu.pdf", plot = p, width = 11.7, height = 6.2)


Plot for the manuscript (saved as `figure_2_C.pdf`)

In [12]:
%%R -i rand_swu_sumstats

# Manuscript version of the single-word-utterance figure: mean (point) and
# standard error (error bar) of the `means` column per level of `contingent`,
# one facet per language, no legend. Saved to ../figures/figure_2_C.pdf.
# `-i` on the magic line re-imports rand_swu_sumstats from Python.

library('ggplot2')
library('repr')
options(repr.plot.width=6, repr.plot.height=12)

# x-axis tick labels; the two levels sort as "contingent", "non-contingent".
xlabs <- c("C", "NC")

# Hand-placed annotation positions, one single-row data frame per language:
# x = 1.5 sits between the two discrete x levels, `means` is the y position.
# The text drawn at each position is hard-coded in the geom_text() calls below
# (presumably significance markers from a separate analysis -- confirm).
# # ara_label <- data.frame(means=c(.9),contingent = c(1.5),language="ara")
deu_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="German")
# deu_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="German")
eng_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="English")
est_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
# est_ns_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Estonian")
fas_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Persian")
fra_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="French")
hrv_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Croatian")
jpn_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Japanese")
kor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Korean")
nor_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Norwegian")
pol_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Polish")
por_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Portuguese")
spa_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Spanish")
swe_label <- data.frame(means=c(.47),contingent = c(1.5),Language_name="Swedish")
zho_label <- data.frame(means=c(.5),contingent = c(1.5),Language_name="Mandarin")


p <- ggplot(rand_swu_sumstats, aes(x = contingent, y = means, color = Language_name)) +
     # `fun` replaces the deprecated `fun.y`; line widths use `linewidth`
     # instead of the deprecated `size` (ggplot2 >= 3.4.0).
     stat_summary(fun = mean, geom = "point", shape = 19, size = 1.75) +
     stat_summary(fun.data = mean_se, geom = "errorbar", linewidth = 1.25, width = .5) +
     facet_wrap(. ~ Language_name, ncol = 7) +
     geom_text(data = deu_label, label = "***", size = 8, color = "black") +
     geom_text(data = eng_label, label = "***", size = 8, color = "black") +
     geom_text(data = est_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = fas_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = fra_label, label = "***", size = 8, color = "black") +
     geom_text(data = hrv_label, label = "***", size = 8, color = "black") +
     geom_text(data = jpn_label, label = "***", size = 8, color = "black") +
     geom_text(data = kor_label, label = "***", size = 8, color = "black") +
     geom_text(data = nor_label, label = "**", size = 8, color = "black") +
     geom_text(data = pol_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     geom_text(data = por_label, label = "***", size = 8, color = "black") +
     geom_text(data = spa_label, label = "***", size = 8, color = "black") +
     geom_text(data = swe_label, label = "***", size = 8, color = "black") +
     geom_text(data = zho_label, label = "ns", size = 4, color = "black", fontface = "italic") +
     # Zoom the view instead of using ylim(): scale limits discard rows with
     # means > .5 *before* stat_summary() runs, which would bias the plotted
     # means and standard errors. coord_cartesian() keeps all rows.
     coord_cartesian(ylim = c(0, .5)) +
     labs(tag = "C",
          y = "Proportion of Single Word Utterances",
          x = "") +
     theme_classic() +
     scale_x_discrete(labels = xlabs) +
     theme(text = element_text(size = 11.5),
           axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
           legend.position = "none")

# Pass the plot explicitly rather than relying on last_plot().
ggsave("../figures/figure_2_C.pdf", plot = p, width = 11.5, height = 4.2)