# Natural Statistics Cross-linguistic:

#### Simplification effect overview, Figure 3 in the manuscript, generated from random samples across languages.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "data_proc")
import contingent_extraction
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the random-sample summary statistics that are handed to the R cell
# below: lexical diversity (lex), mean length of utterance (mlu) and
# proportion of single-word utterances (swu).

def _read_sumstats(csv_path):
    """Read one summary-statistics CSV, using its first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


rand_lex_sumstats = _read_sumstats("../data/rand_lex_sumstats.csv")
rand_mlu_sumstats = _read_sumstats("../data/rand_mlu_sumstats.csv")
rand_swu_sumstats = _read_sumstats("../data/rand_swu_sumstats.csv")

In [3]:
%load_ext rpy2.ipython

Define `theme_black`, a ggplot2 theme for rendering the figures on a black background:

In [4]:
%%R

# Black-background ggplot2 theme.
#
# Starts from theme_grey() and swaps in white-on-black axis, legend, panel,
# facet-strip and plot elements via %+replace%.
#
# base_size:   base font size, forwarded to theme_grey(); legend and strip
#              text use 0.8 x this size, the plot title 1.2 x.
# base_family: base font family, forwarded to theme_grey().
#
# Returns the modified theme object.
theme_black <- function(base_size = 11, base_family = "") {
  small_size <- base_size * 0.8
  tight_margin <- margin(2, 2, 2, 2)

  # Small constructors so every text/rect element shares the same colours.
  white_text <- function(...) element_text(color = "white", ...)
  black_rect <- function(border) element_rect(color = border, fill = "black")

  overrides <- theme(
    # Axes
    axis.line = element_line(colour = "white"),
    axis.text.x = white_text(margin = tight_margin),
    axis.text.y = white_text(hjust = 1, margin = tight_margin),
    axis.ticks = element_line(color = "white"),
    axis.title.x = white_text(size = base_size),
    axis.title.y = white_text(
      size = base_size, angle = 90, margin = margin(0, 10, 0, 0)
    ),
    # Legend
    legend.background = black_rect(NA),
    legend.key = black_rect("white"),
    legend.key.size = unit(1.2, "lines"),
    legend.key.height = NULL,
    legend.key.width = NULL,
    legend.text = white_text(size = small_size),
    legend.title = white_text(size = small_size, face = "bold", hjust = 0),
    legend.position = "right",
    legend.text.align = NULL,
    legend.title.align = NULL,
    legend.direction = "vertical",
    legend.box = NULL,
    # Panels: no background, border or grid lines
    # (panel.margin = unit(0.5, "lines") was left disabled in the original)
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Facet strips
    strip.background = element_rect(fill = "black", color = "white"),
    strip.text.x = white_text(size = small_size),
    strip.text.y = white_text(size = small_size, angle = -90),
    # Whole plot
    plot.background = black_rect("black"),
    plot.title = white_text(size = base_size * 1.2),
    plot.margin = unit(rep(1, 4), "lines")
  )

  theme_grey(base_size = base_size, base_family = base_family) %+replace%
    overrides
}

Generate plot(s):

In [5]:
%%R -i rand_lex_sumstats -i rand_mlu_sumstats -i rand_swu_sumstats

library('tidyverse')
library('patchwork')
library('ggplot2')
library('repr')
options(repr.plot.width=6.47, repr.plot.height=7.3)

# Wrap text onto several lines and return it as one string.
#
# x:   character input handed to strwrap().
# ...: further arguments forwarded to strwrap() (e.g. width).
#
# Returns a single string with the wrapped lines joined by "\n".
wrapper <- function(x, ...) {
  wrapped_lines <- strwrap(x, ...)
  paste(wrapped_lines, collapse = "\n")
}

# Shared plot settings ----

xlabs <- c("C", "NC")
font_sz <- 11

# Set to TRUE to draw the panels on the black background from theme_black().
# The significance labels follow this switch: they used to be hard-coded
# white, which is invisible on theme_classic()'s white background.
use_black_theme <- FALSE
label_colour <- if (use_black_theme) "white" else "black"

# Base theme for every panel; built on demand so theme_black() is only
# looked up when the black variant is actually requested.
base_theme <- function() {
  if (use_black_theme) theme_black() else theme_classic()
}

# Layers shared by all three panels: the mean as a point, mean +/- SE as an
# error bar, and one facet per language.
# `fun` replaces `fun.y` (deprecated in ggplot2 3.3.0) and `linewidth`
# replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
summary_layers <- function() {
  list(
    stat_summary(fun = mean, geom = "point", shape = 19, size = 1.75),
    stat_summary(
      fun.data = mean_se, geom = "errorbar", linewidth = 1.25, width = 0.5
    ),
    facet_wrap(. ~ Language_name, ncol = 7)
  )
}

# Build one geom_text() layer per language carrying its significance mark.
#
# marks: named character vector, language name -> mark ("***", "**", "*",
#        "^" or "ns").
# y_var: name of the y column of the panel (the label data must carry the
#        same column because the layers inherit the plot's aes()).
# y_pos: named numeric vector with the label height per mark kind
#        ("star", "ns" and, where used, "caret").
#
# Returns a list of layers that can be added to a ggplot with `+`.
sig_layers <- function(marks, y_var, y_pos) {
  lapply(names(marks), function(language) {
    mark <- marks[[language]]
    kind <- if (mark == "ns") "ns" else if (mark == "^") "caret" else "star"

    # One-row data frame placing the label midway between the two x levels
    # (x = 1.5) inside this language's facet.
    label_df <- data.frame(contingent = 1.5, Language_name = language)
    label_df[[y_var]] <- y_pos[[kind]]

    switch(kind,
      ns = geom_text(
        data = label_df, label = mark, size = 2.5,
        color = label_colour, fontface = "italic"
      ),
      caret = geom_text(
        data = label_df, label = mark, size = 3, color = label_colour
      ),
      star = geom_text(
        data = label_df, label = mark, size = 6, color = label_colour
      )
    )
  })
}

# Panel A: lexical diversity ----

# NOTE(review): the original script defined an "ns" label for Polish in this
# panel but never added it to the plot; it is included here -- confirm
# against the reported statistics.
lex_marks <- c(
  German = "***", English = "***", Estonian = "**", Persian = "*",
  French = "***", Croatian = "***", Japanese = "***", Korean = "***",
  Norwegian = "ns", Polish = "ns", Portuguese = "**", Spanish = "ns",
  Swedish = "ns", Mandarin = "^"
)

p1 <- ggplot(
  rand_lex_sumstats,
  aes(x = contingent, y = sums, color = Language_name)
) +
  summary_layers() +
  sig_layers(
    lex_marks,
    y_var = "sums",
    y_pos = c(star = 216, ns = 247, caret = 235)
  ) +
  ylim(0, 250) +
  labs(tag = "A", y = "Number of Unique Words", x = "") +
  base_theme() +
  scale_x_discrete(labels = xlabs) +
  theme(
    text = element_text(size = font_sz),
    axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
    axis.ticks.length = unit(-2.5, "pt"),
    legend.position = "none"
  )

# ggsave("../figures/Figure_3A_black.pdf", plot = p1,
#        width = 6.47, height = 2.43, dpi = 1200)

# Panel B: mean length of utterance ----

mlu_marks <- c(
  German = "***", English = "***", Estonian = "**", Persian = "ns",
  French = "***", Croatian = "***", Japanese = "***", Korean = "***",
  Norwegian = "**", Polish = "ns", Portuguese = "***", Spanish = "***",
  Swedish = "***", Mandarin = "ns"
)

ylab <- "Mean Length of Utterance in Words"

p2 <- ggplot(
  rand_mlu_sumstats,
  aes(x = contingent, y = means, color = Language_name)
) +
  summary_layers() +
  sig_layers(mlu_marks, y_var = "means", y_pos = c(star = 5.2, ns = 5.9)) +
  ylim(0, 6) +
  labs(tag = "B", y = wrapper(ylab, width = 25), x = "") +
  base_theme() +
  scale_x_discrete(labels = xlabs) +
  theme(
    text = element_text(size = font_sz),
    axis.text.x = element_text(vjust = 0.5, hjust = 1),
    axis.title.y = element_text(vjust = -2.5),
    axis.ticks.length = unit(-2.5, "pt"),
    legend.position = "none"
  )

# ggsave("../figures/Figure_3B_black.pdf", plot = p2,
#        width = 6.47, height = 2.43, dpi = 1200)

# Panel C: proportion of single-word utterances ----

swu_marks <- c(
  German = "***", English = "***", Estonian = "ns", Persian = "ns",
  French = "***", Croatian = "***", Japanese = "***", Korean = "***",
  Norwegian = "**", Polish = "ns", Portuguese = "***", Spanish = "***",
  Swedish = "***", Mandarin = "ns"
)

p3 <- ggplot(
  rand_swu_sumstats,
  aes(x = contingent, y = means, color = Language_name)
) +
  summary_layers() +
  sig_layers(swu_marks, y_var = "means", y_pos = c(star = 0.43, ns = 0.49)) +
  ylim(0, 0.5) +
  labs(tag = "C", y = "Prop. Single Word Utterances", x = "") +
  base_theme() +
  scale_x_discrete(labels = xlabs) +
  theme(
    text = element_text(size = font_sz),
    axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
    axis.ticks.length = unit(-2.5, "pt"),
    legend.position = "none"
  )

# ggsave("../figures/Figure_3C_black.pdf", plot = p3,
#        width = 6.47, height = 2.43, dpi = 1200)

# Combined figure ----

# Stack the three panels vertically (patchwork) and save the composite.
# The plot is passed explicitly instead of relying on ggsave()'s implicit
# last_plot() default.
p <- p1 / p2 / p3
ggsave("../figures/Figure_3.pdf", plot = p, width = 6.47, height = 7.3, dpi = 1200)


R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

