In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

import matplotlib.pyplot as plt  # To visualize
import json
from collections import Counter, defaultdict

from sklearn.linear_model import LinearRegression
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')

from itertools import chain
import scipy.stats

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Load the feature table analysed in the rest of the notebook. The path is
# relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/finals/colex_wn_pron_features.csv")

In [4]:
# Base feature names. Each one exists as three columns in `df`: un-prefixed,
# prefixed with "init_", and prefixed with "last_".
features = [
    "syl", "son", "cons", "cont", "delrel", "lat",
    "nas", "strid", "voi", "sg", "cg", "ant",
    "cor", "distr", "lab", "hi", "lo", "back",
    "round", "velaric", "tense", "long", "hitone", "hireg",
]

# Column names for the "init_" and "last_" variants of every base feature.
Initial_features = ["init_" + name for name in features]
last_features = ["last_" + name for name in features]

# All feature columns, in the order: init_*, un-prefixed, last_*.
all_features = [*Initial_features, *features, *last_features]

In [7]:
# Preview the loaded frame: as the last expression in the cell it is rendered
# as the cell output (pandas elides the middle rows of a large frame).
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,NORM_PRON,INIT_NORM_PRON,INIT_PRON,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg,SEG_LEN,Family,vowels,vowel_ratio,PRON_TTR,PRON_complex,NORM_PRON_TTR,NORM_PRON_complex
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,,,ə,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,Indo-European,1,1.000000,1.000000,0.000000,,
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,urruns,u,u,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,1,1,-1,1,1,-1,1,-1,0,0,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,2,0.333333,0.666667,0.333333,0.666667,0.333333
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,skima,s,s,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,-1,-1,-1,1,-1,0,0,6,Indo-European,2,0.400000,1.000000,0.000000,1.000000,0.000000
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,wenahs,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,7,Indo-European,3,0.500000,1.000000,0.000000,1.000000,0.000000
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,wothis,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,7,Indo-European,3,0.600000,0.857143,0.142857,1.000000,0.000000


In [39]:
def pearsonr(df, family, feature, aff):
    """Pearson correlation between two columns, restricted to one language family.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Family", "LANG_PRON", `aff` and `feature`.
    family : object
        Value compared against the "Family" column with ``==``. A NaN value
        never compares equal, so it selects no rows.
    feature : str
        Name of the feature column to correlate.
    aff : str
        Name of the rating column to correlate (e.g. "Conc.Mean").

    Returns
    -------
    tuple
        ``(statistic, pvalue, n, langs)`` where ``n`` is the number of rows
        with both values present and ``langs`` is the list of distinct
        "LANG_PRON" values of the family. ``(None, None, None, None)`` when
        the correlation cannot be computed (fewer than two paired
        observations, or input that scipy rejects).
    """
    family_rows = df[df["Family"] == family]
    # NOTE: languages are collected before rows with missing values are
    # dropped, so `langs` may include languages that contribute no pairs.
    langs = list(set(family_rows["LANG_PRON"].tolist()))
    pairs = family_rows[[aff, feature]].dropna()

    # Pearson's r needs at least two paired observations; handle the expected
    # "empty / tiny family" case explicitly instead of relying on an exception.
    if len(pairs) < 2:
        return None, None, None, None

    try:
        pr = scipy.stats.pearsonr(pairs[aff].values, pairs[feature].values)
    except (ValueError, TypeError):
        # Input scipy refuses to correlate (e.g. non-numeric data). Anything
        # else is a genuine error and is allowed to propagate.
        return None, None, None, None
    return pr.statistic, pr.pvalue, len(pairs), langs

In [20]:
# Distinct language families with missing values removed. `dropna()` filters
# every NaN, whereas an identity test against `np.nan` only filters that one
# singleton object and could let other NaN floats inflate the count.
fams = df["Family"].dropna().unique().tolist()
# Number of families; the loops below divide the 0.05 significance level by it.
num = len(fams)

In [32]:
# Correlate every "init_" feature column with "Conc.Mean", family by family,
# and print the pairs that pass the filters below.
for family in list(set(list(df["Family"]))):
    print(family)
    for feature in Initial_features:
        # pearsonr returns four values (statistic, p-value, sample size,
        # languages); unpacking only three raises ValueError on a fresh run.
        pr, p, LEN, langs = pearsonr(df, family, feature, "Conc.Mean")
        # p is None when no correlation could be computed for this pair.
        if p is not None:
            # Significance level 0.05 divided by the number of families, and
            # more than 100 paired observations.
            if p < 0.05/num and LEN>100:
                # Report only correlations with |r| > 0.1.
                if pr<-0.1 or pr>0.1:
                    print(feature, pr,p, LEN)
                    print()

Tai-Kadai
init_nas 0.11416414731458524 1.1877294936830248e-09 2822

init_ant 0.13452303650705277 7.195314186767634e-13 2822

init_hi -0.13337993462366962 1.1260981657526313e-12 2822

init_back -0.10404234152190808 3.031981130187732e-08 2822

Athabaskan-Eyak-Tlingit
Kra-Dai
Tungusic
Algic
Mongolic-Khitan
Nakh-Daghestanian
Austronesian
Kartvelian
init_syl -0.12320336102028043 0.0003834606986967139 827

init_son -0.11087808281904421 0.0014050214169832715 827

init_cons 0.12320336102028043 0.0003834606986967139 827

init_cont -0.2395041724300824 2.973522806262527e-12 827

init_sg 0.16902691834172978 1.0151735592883113e-06 827

init_cg 0.12537308562477797 0.0003012225907543039 827

init_lo -0.10634965603601236 0.0021955177461289634 827

init_tense -0.12320336102028043 0.0003834606986967139 827

Koreanic
Artificial Language
init_syl -0.24879120993242682 5.0960812743026475e-09 537

init_son -0.24952628635179516 4.577745765645934e-09 537

init_cons 0.24470160460635088 9.198330019328442e-09 537

In [33]:
# Correlate every "init_" feature column with "V.Mean", family by family,
# and print the pairs that pass the filters below.
for family in list(set(list(df["Family"]))):
    print(family)
    for feature in Initial_features:
        # pearsonr returns four values (statistic, p-value, sample size,
        # languages); unpacking only three raises ValueError on a fresh run.
        pr, p, LEN, langs = pearsonr(df, family, feature, "V.Mean")
        # p is None when no correlation could be computed for this pair.
        if p is not None:
            # Significance level 0.05 divided by the number of families, and
            # more than 100 paired observations.
            if p < 0.05/num and LEN>100:
                # Report only correlations with |r| > 0.1.
                if pr<-0.1 or pr>0.1:
                    print(feature, pr,p, LEN)
                    print()

Tai-Kadai
Athabaskan-Eyak-Tlingit
Kra-Dai
Tungusic
Algic
Mongolic-Khitan
Nakh-Daghestanian
Austronesian
Kartvelian
init_round -0.136835885146686 9.541976474373832e-05 808

Koreanic
init_son 0.10161245597906396 8.088107216212121e-25 10198

init_voi 0.10164924207635603 7.779738529736859e-25 10198

init_lab -0.10175930674978632 6.924963031716841e-25 10198

Artificial Language
init_nas -0.20217130695281804 3.292393536496663e-06 521

Austroasiatic
Indo-European
Abkhaz-Adyge
Uralic
Afro-Asiatic
nan
Japonic
init_syl -0.1324217777454545 3.713328263205136e-07 1463

init_cons 0.12598465350696392 1.3384133849089854e-06 1463

init_delrel 0.13109872554213917 4.857332200916458e-07 1463

init_voi -0.11459366562849546 1.1136128663666885e-05 1463

init_cor 0.11915679631583893 4.876198088247589e-06 1463

Sino-Tibetan
init_ant -0.1276676252725533 3.096594992409177e-29 7669

init_hi 0.12206920497963272 7.492699638607417e-27 7669

Dravidian
init_cons -0.17756815042126173 0.0010255165673371232 339

Turkic
i

In [40]:
# Correlate every un-prefixed feature column with "Conc.Mean", family by
# family. A pair is printed only when the p-value is below 0.05 / num, there
# are more than 100 paired observations, and |r| exceeds 0.1; the number of
# languages in the family is printed as the last field.
for family in set(df["Family"]):
    print(family)
    for feature in features:
        pr, p, LEN, langs = pearsonr(df, family, feature, "Conc.Mean")
        # No correlation could be computed for this (family, feature) pair.
        if p is None:
            continue
        if p < 0.05 / num and LEN > 100 and abs(pr) > 0.1:
            print(feature, pr, p, LEN, len(langs))
            print()

Tai-Kadai
syl -0.15702615589596627 4.860453213306544e-17 2822 3

son -0.15329965913337226 2.6450717533672074e-16 2822 3

cons -0.1734226886470754 1.7114645242739612e-20 2822 3

cont -0.1566767280647083 5.70736302088606e-17 2822 3

voi -0.152383763002299 3.985757227508406e-16 2822 3

ant -0.12173310927172201 8.695788846693623e-11 2822 3

cor -0.15744565983846015 4.0060660553339995e-17 2822 3

hi -0.11942729754497182 1.9624013688552775e-10 2822 3

lo -0.1424490839170279 2.9001798433279064e-14 2822 3

back -0.10091683949225871 7.773901917212826e-08 2822 3

tense -0.1631201003323591 2.7826941410240447e-18 2822 3

Athabaskan-Eyak-Tlingit
Kra-Dai
Tungusic
Algic
Mongolic-Khitan
Nakh-Daghestanian
Austronesian
cont -0.11829869566522051 5.900423248829896e-105 33591 8

Kartvelian
syl -0.25221372404636977 1.819608541517345e-13 827 1

son -0.2503546610701913 2.7652649157076686e-13 827 1

cons -0.2414996397127703 1.937756920268083e-12 827 1

cont -0.24169376400076006 1.8583096235602596e-12 827 1

de