# Replicate Piantadosi et al (2012) across five languages

**Sean Trott** and **Benjamin Bergen**

Here, we ask whether word length (measured by `#syllables`) predicts `#homophones` across five languages. We also ask about the predictive power of `surprisal`, a measure of the phonotactic plausibility of a wordform. Following Piantadosi et al. (2012), we normalize `surprisal` by the length (in `#phones`) of a wordform; for comparison, we also report a second model that uses raw (un-normalized) `surprisal`.

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as sm
import seaborn as sns
from tqdm import tqdm

import src.utils as utils
import src.config as config

from collections import Counter

In [2]:
from mpl_toolkits.mplot3d import Axes3D

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 

In [4]:
# Dependent variable shared by both model specifications
TARGET = 'num_homophones'

# Specification 1: length-normalized surprisal plus syllable count
COVARIATES = ['surprisal_normed', 'num_sylls_est']
FORMULA = TARGET + ' ~ ' + ' + '.join(COVARIATES)

# Specification 2: raw surprisal plus syllable count
COVARIATES2 = ['surprisal', 'num_sylls_est']
FORMULA2 = TARGET + ' ~ ' + ' + '.join(COVARIATES2)


## Helper functions

In [8]:
def load_lexicons_for_language(language, phon_column="PhonDISC", word_column="Word", n=5):
    """Read the three processed lexicon CSVs for one language.

    Paths are relative to the current working directory, under
    ``data/processed/<language>/``.

    Parameters
    ----------
    language : str
        Used both as the directory name and as the file-name prefix.
    phon_column, word_column : str
        Accepted for call-site compatibility; not used by this function.
    n : int
        Interpolated into the ``{n}phone`` part of each file name.

    Returns
    -------
    tuple of three DataFrames, in this order:
        the ``all_reals`` file, the ``with_mps`` file, and the
        ``artificial_10_matched_on_sylls_mps_no_restriction`` file.
    """
    # One template per returned frame; order here fixes the order of the result.
    path_templates = (
        "data/processed/{lan1}/reals/{lan2}_all_reals_{n}phone.csv",
        "data/processed/{lan1}/reals/{lan2}_with_mps_{n}phone.csv",
        "data/processed/{lan1}/minimal_pairs/{lan2}_artificial_10_matched_on_sylls_mps_no_restriction_{n}phone.csv",
    )
    frames = []
    for template in path_templates:
        csv_path = template.format(lan1=language, lan2=language, n=n)
        frames.append(pd.read_csv(csv_path))
    return tuple(frames)

## Replicate in English, German, and Dutch

### English

In [5]:
language = 'english'

In [11]:
# The artificial lexica (third return value) are not needed here
df_og, df_processed, _ = load_lexicons_for_language(language, n=5)

In [12]:
len(df_og)

41887

In [13]:
len(df_processed)

35107

In [26]:
# NOTE(review): this cell's execution count (In [26]) is later than the German
# load cell (In [21]), and the saved output is identical to the German section's
# stats, so it was likely computed on the German `df_processed`. Re-run the
# notebook in order (Restart & Run All) to confirm the English values.
utils.get_homophone_stats(df_processed)

{'homophone_percentage': 0.0229,
 'mean_homophones': 0.0246,
 'max_homophones': 4}

In [15]:
# Per-phone surprisal: each wordform's surprisal divided by its length in phones
df_processed['surprisal_normed'] = df_processed['surprisal'] / df_processed['num_phones']

In [16]:
# Tally of wordforms per syllable count.
# NOTE(review): this counts `df_og` (the `all_reals` file), whereas the models
# below are fit on `df_processed` — confirm that is the intended frame.
Counter(df_og['num_sylls_est'])

Counter({1: 7706,
         2: 15247,
         3: 11379,
         4: 5316,
         5: 1694,
         6: 439,
         7: 95,
         8: 10,
         10: 1})

#### Main analysis

In [17]:
# Poisson regression of homophone count on length-normalized surprisal and
# syllable count (FORMULA); disp=0 suppresses the optimizer's convergence printout.
result_real = sm.poisson(formula=FORMULA, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,35107.0
Model:,Poisson,Df Residuals:,35104.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.1427
Time:,13:22:34,Log-Likelihood:,-16316.0
converged:,True,LL-Null:,-19033.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.8380,0.060,-13.990,0.000,-0.955,-0.721
surprisal_normed,0.7815,0.030,26.118,0.000,0.723,0.840
num_sylls_est,-0.7171,0.018,-39.926,0.000,-0.752,-0.682


In [18]:
# Same Poisson model with raw (un-normalized) surprisal (FORMULA2).
# Note: this rebinds `result_real`, replacing the FORMULA fit from the previous cell.
result_real = sm.poisson(formula=FORMULA2, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,35107.0
Model:,Poisson,Df Residuals:,35104.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.1316
Time:,13:22:35,Log-Likelihood:,-16528.0
converged:,True,LL-Null:,-19033.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3085,0.122,10.736,0.000,1.070,1.547
surprisal,-0.2036,0.028,-7.327,0.000,-0.258,-0.149
num_sylls_est,-0.8713,0.019,-45.227,0.000,-0.909,-0.834


### German

In [21]:
# Keep `language` in sync with this section, as every other language section
# does; otherwise it still holds 'english' from the previous section.
language = 'german'
# Here, we ignore the artificial lexica
df_og, df_processed, _ = load_lexicons_for_language(language=language, n=5)

In [22]:
len(df_og)

51676

In [23]:
len(df_processed)

50435

In [25]:
utils.get_homophone_stats(df_processed)

{'homophone_percentage': 0.0229,
 'mean_homophones': 0.0246,
 'max_homophones': 4}

In [28]:
# Per-phone surprisal: each wordform's surprisal divided by its length in phones
df_processed['surprisal_normed'] = df_processed['surprisal'] / df_processed['num_phones']

#### Main analysis

In [29]:
# Poisson regression of homophone count on length-normalized surprisal and
# syllable count (FORMULA); disp=0 suppresses the optimizer's convergence printout.
result_real = sm.poisson(formula=FORMULA, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,50435.0
Model:,Poisson,Df Residuals:,50432.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.0862
Time:,13:23:51,Log-Likelihood:,-5395.9
converged:,True,LL-Null:,-5904.9
,,LLR p-value:,8.593000000000001e-222

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5962,0.125,-20.831,0.000,-2.840,-2.352
surprisal_normed,0.8602,0.060,14.410,0.000,0.743,0.977
num_sylls_est,-0.6858,0.035,-19.367,0.000,-0.755,-0.616


In [30]:
# Same Poisson model with raw (un-normalized) surprisal (FORMULA2).
# Note: this rebinds `result_real`, replacing the FORMULA fit from the previous cell.
result_real = sm.poisson(formula=FORMULA2, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,50435.0
Model:,Poisson,Df Residuals:,50432.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.1154
Time:,13:24:16,Log-Likelihood:,-5223.3
converged:,True,LL-Null:,-5904.9
,,LLR p-value:,9.843999999999999e-297

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.7822,0.219,12.694,0.000,2.353,3.212
surprisal,-0.9792,0.049,-20.126,0.000,-1.075,-0.884
num_sylls_est,-0.4050,0.038,-10.655,0.000,-0.480,-0.331


### Dutch

In [31]:
language = 'dutch'

In [33]:
# The artificial lexica (third return value) are not needed here
df_og, df_processed, _ = load_lexicons_for_language(language="dutch", n=5)

In [34]:
len(df_og)

67477

In [35]:
len(df_processed)

65260

In [36]:
# Per-phone surprisal: each wordform's surprisal divided by its length in phones
df_processed['surprisal_normed'] = df_processed['surprisal'] / df_processed['num_phones']

#### Main analysis

In [37]:
# Poisson regression of homophone count on length-normalized surprisal and
# syllable count (FORMULA); disp=0 suppresses the optimizer's convergence printout.
result_real = sm.poisson(formula=FORMULA, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,65260.0
Model:,Poisson,Df Residuals:,65257.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.1804
Time:,13:24:30,Log-Likelihood:,-8168.2
converged:,True,LL-Null:,-9965.9
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.7217,0.088,-19.486,0.000,-1.895,-1.549
surprisal_normed,0.9974,0.036,28.006,0.000,0.928,1.067
num_sylls_est,-1.1119,0.031,-35.566,0.000,-1.173,-1.051


In [38]:
# Same Poisson model with raw (un-normalized) surprisal (FORMULA2).
# Note: this rebinds `result_real`, replacing the FORMULA fit from the previous cell.
result_real = sm.poisson(formula=FORMULA2, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,65260.0
Model:,Poisson,Df Residuals:,65257.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.1871
Time:,13:24:31,Log-Likelihood:,-8101.8
converged:,True,LL-Null:,-9965.9
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.2040,0.149,21.566,0.000,2.913,3.495
surprisal,-0.7735,0.033,-23.270,0.000,-0.839,-0.708
num_sylls_est,-0.9785,0.034,-29.055,0.000,-1.045,-0.913


## Extend to French and Japanese

### French

In [39]:
language = "french"

In [40]:
# Only the real lexica are kept; the artificial lexica (third return value) are discarded.
# Note: load_lexicons_for_language accepts phon_column/word_column but does not use them.
df_og, df_processed, _ = load_lexicons_for_language(
    language,
    phon_column=config.PHON_COLUMN[language],
    word_column=config.WORD_COLUMN[language],
    n=4
)

In [41]:
len(df_og)

43782

In [42]:
len(df_processed)

37278

In [44]:
# Per-phone surprisal: each wordform's surprisal divided by its length in phones
df_processed['surprisal_normed'] = df_processed['surprisal'] / df_processed['num_phones']

#### Main analysis

In [45]:
# Poisson regression of homophone count on length-normalized surprisal and
# syllable count (FORMULA); disp=0 suppresses the optimizer's convergence printout.
result_real = sm.poisson(formula=FORMULA, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,37278.0
Model:,Poisson,Df Residuals:,37275.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.05054
Time:,13:25:08,Log-Likelihood:,-17919.0
converged:,True,LL-Null:,-18873.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.6160,0.069,-23.376,0.000,-1.751,-1.480
surprisal_normed,0.7328,0.036,20.535,0.000,0.663,0.803
num_sylls_est,-0.3467,0.015,-23.207,0.000,-0.376,-0.317


In [47]:
# Same Poisson model with raw (un-normalized) surprisal (FORMULA2).
# Note: this rebinds `result_real`, replacing the FORMULA fit from the previous cell.
result_real = sm.poisson(formula=FORMULA2, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,37278.0
Model:,Poisson,Df Residuals:,37275.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.05769
Time:,13:26:18,Log-Likelihood:,-17784.0
converged:,True,LL-Null:,-18873.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8233,0.064,12.856,0.000,0.698,0.949
surprisal,-0.3245,0.014,-23.044,0.000,-0.352,-0.297
num_sylls_est,-0.2144,0.017,-12.519,0.000,-0.248,-0.181


### Japanese

In [48]:
language = "japanese"

In [49]:
# Only the real lexica are kept; the artificial lexica (third return value) are discarded.
# Note: load_lexicons_for_language accepts phon_column/word_column but does not use them.
df_og, df_processed, _ = load_lexicons_for_language(
    language,
    phon_column=config.PHON_COLUMN[language],
    word_column=config.WORD_COLUMN[language],
    n=4
)

In [50]:
len(df_og)

51147

In [51]:
len(df_processed)

40449

In [52]:
# Per-phone surprisal: each wordform's surprisal divided by its length in phones
df_processed['surprisal_normed'] = df_processed['surprisal'] / df_processed['num_phones']

#### Main analysis

In [53]:
# Poisson regression of homophone count on length-normalized surprisal and
# syllable count (FORMULA); disp=0 suppresses the optimizer's convergence printout.
result_real = sm.poisson(formula=FORMULA, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,40449.0
Model:,Poisson,Df Residuals:,40446.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.2014
Time:,13:26:36,Log-Likelihood:,-24212.0
converged:,True,LL-Null:,-30318.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.0262,0.064,31.805,0.000,1.901,2.151
surprisal_normed,0.0004,0.031,0.013,0.989,-0.061,0.062
num_sylls_est,-1.0054,0.013,-80.383,0.000,-1.030,-0.981


In [54]:
# Same Poisson model with raw (un-normalized) surprisal (FORMULA2).
# Note: this rebinds `result_real`, replacing the FORMULA fit from the previous cell.
result_real = sm.poisson(formula=FORMULA2, 
                         data=df_processed).fit(disp=0)
result_real.summary()

0,1,2,3
Dep. Variable:,num_homophones,No. Observations:,40449.0
Model:,Poisson,Df Residuals:,40446.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 02 Apr 2020",Pseudo R-squ.:,0.2596
Time:,13:26:37,Log-Likelihood:,-22447.0
converged:,True,LL-Null:,-30318.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.3071,0.052,83.226,0.000,4.206,4.409
surprisal,-0.8036,0.014,-57.748,0.000,-0.831,-0.776
num_sylls_est,-0.2995,0.016,-18.662,0.000,-0.331,-0.268
