In [82]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

import matplotlib.pyplot as plt  # To visualize
import json
from collections import Counter, defaultdict

from sklearn.linear_model import LinearRegression
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')

from itertools import chain
import scipy.stats

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [56]:
dflang= pd.read_csv("../data/aff+conc/languages_#phone.csv")

In [53]:
df_langs = pd.read_csv("../data/languages/wn_pron_languages_glotto.csv")

In [58]:
glotto2family = dict(zip(df_langs["Glottocode"], df_langs["Top-level family"]))

In [59]:
dflang["Family"]=dflang["glottocode"].apply(lambda x: glotto2family[x])

In [66]:
dflang[dflang["higher_lang"]=="ind"]

Unnamed: 0,glottocode,name,isocodes,level,macroarea,latitude,longitude,higher_lang,#Phonemes,#Concepts,Family


In [78]:
dflang["Family"]= dflang["higher_lang"].apply(lambda x: iso2family[x])

In [80]:
dflang.to_csv("../data/aff+conc/languages_#phone.csv", index=False)

In [81]:
dflang

Unnamed: 0,glottocode,name,isocodes,level,macroarea,latitude,longitude,higher_lang,#Phonemes,#Concepts,Family
0,abkh1244,Abkhaz,abk,language,Eurasia,43.056218,41.159115,abk,3,1,Abkhaz-Adyge
1,adyg1241,Adyghe,ady,language,Eurasia,44.000000,39.330000,ady,5,1,Abkhaz-Adyge
2,afri1274,Afrikaans,afr,language,Africa,-22.000000,30.000000,afr,53,1014,Indo-European
3,aima1241,Aimaq,aiq,language,Eurasia,35.283000,59.175600,fas,78,5946,Indo-European
4,alge1239,Algerian Arabic,arq,language,Africa,35.420800,3.230330,ara,49,10258,Afro-Asiatic
...,...,...,...,...,...,...,...,...,...,...,...
198,west2354,Western Frisian,fry,language,Eurasia,53.143000,5.860910,fry,32,26,Indo-European
199,west2361,Western Yiddish,yih,language,Eurasia,51.688900,12.260100,yid,23,16,Indo-European
200,xhos1239,Xhosa,xho,language,Africa,-31.038900,28.076900,xho,30,6,Atlantic-Congo
201,yoru1245,Yoruba,yor,language,Africa,7.153450,3.672250,yor,6,4,Atlantic-Congo


In [64]:
iso2family = dict(zip(dflang["higher_lang"], dflang["Family"]))

In [68]:
iso2family["ind"] = 'Austronesian'

In [72]:
iso2family["zha"] = "Kra-Dai"

In [75]:
iso2family["srd"] = "Indo-European"

In [144]:
df= pd.read_csv("../data/aff+conc/wn_pron_all.csv")

In [76]:
df["Family"]=df["LANG_PRON"].apply(lambda x: iso2family[x])

In [84]:
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,macroarae,Conc.CLASS,V.CLASS,A.CLASS,D.CLASS,Conc,V,A,D,Aff,Aff.all,Name,Family,Macroarea,Latitude,Longitude,ISO
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,Eurasia,0.0,1.0,0.0,2.0,1.0,4.0,2.0,5.0,3.0,-1.0,English,Indo-European,Eurasia,53.000000,-1.000000,eng
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,Eurasia,1.0,1.0,0.0,1.0,2.0,4.0,2.0,3.0,2.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,Eurasia,0.0,0.0,0.0,1.0,1.0,2.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,Eurasia,1.0,0.0,0.0,1.0,2.0,0.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,Eurasia,1.0,2.0,1.0,1.0,2.0,6.0,3.0,4.0,3.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,Eurasia,1.0,1.0,0.0,1.0,2.0,4.0,2.0,4.0,2.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,Eurasia,2.0,1.0,1.0,1.0,3.0,3.0,3.0,4.0,2.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,Eurasia,0.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,Eurasia,0.0,2.0,0.0,2.0,1.0,6.0,2.0,5.0,3.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got


In [85]:
import panphon
ft = panphon.FeatureTable()

In [86]:
len(df)

339692

In [87]:
len(set(df["LANG_PRON"].tolist()))

142

In [88]:
def get_phon_features(phone):
    r = ft.word_to_vector_list(phone, numeric=True)
    r_arr = np.array(r)
    seg_len = r_arr.shape[0]
    
    r_initials = r[0]
    r_last = r[-1]
    
    r_values = np.count_nonzero(r_arr==1, axis=0)
    cons = np.count_nonzero(r_arr== 1, axis=0)[2]
    vowels = seg_len - cons

    # /seg_len
    
    rs = np.concatenate(([seg_len], r_initials, r_values,[vowels], r_last), axis=0)
    return pd.Series(rs)

In [89]:
features = ["syl", "son", "cons", "cont", "delrel", "lat", "nas", "strid", "voi", "sg", "cg", "ant", 
            "cor", "distr", "lab", "hi", "lo", "back", "round", "velaric", "tense", "long", "hitone", "hireg"]

Initial_features = [f"init_{x}" for x in features]
last_features = [f"last_{x}" for x in features]

all_features =["SEG_LEN"]+ Initial_features+features+["vowels"]+last_features

In [90]:
len(features), len(all_features), 24*3

(24, 74, 72)

In [91]:
df[all_features]= df["PRON"].parallel_apply(get_phon_features)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=33970), Label(value='0 / 33970')))…

In [92]:
df["Family"].value_counts() # 21

Family
Indo-European              226649
Uralic                      26993
Austronesian                23054
Koreanic                    10814
Sino-Tibetan                 8777
Austroasiatic                3607
Tai-Kadai                    2858
Turkic                       2744
Japonic                      1842
Afro-Asiatic                  973
Kartvelian                    844
Artificial Language           552
Dravidian                     348
Mongolic-Khitan                67
Atlantic-Congo                 58
Athabaskan-Eyak-Tlingit        11
Nakh-Daghestanian               3
Abkhaz-Adyge                    3
Algic                           3
Tungusic                        2
Uto-Aztecan                     1
Name: count, dtype: int64

In [93]:
df["Macroarea"].value_counts()

Macroarea
Eurasia          288007
Papunesia         23054
Africa             1096
North America        20
South America         2
Name: count, dtype: int64

In [49]:
# df = pd.read_csv("../data/aff+conc/phone_aff_conc_features.csv")

In [94]:
df.head()

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,macroarae,Conc.CLASS,V.CLASS,A.CLASS,D.CLASS,Conc,V,A,D,Aff,Aff.all,Name,Family,Macroarea,Latitude,Longitude,ISO,SEG_LEN,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,vowels,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg
0,a,eng,ə,alpha~settling,2.035,1.23,5.5,2.24,3.67,2.63,6.32,2.48,5.163333,0.490352,Eurasia,0.0,1.0,0.0,2.0,1.0,4.0,2.0,5.0,3.0,-1.0,English,Indo-European,Eurasia,53.0,-1.0,eng,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.95,1.5,3.65,2.25,4.92,2.26,4.84,0.654389,Eurasia,1.0,1.0,0.0,1.0,2.0,4.0,2.0,3.0,2.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0
2,a,hun,ɒː,amplitude~legislature,2.73,1.36,3.55,1.57,3.22,2.18,4.48,1.99,3.75,0.481333,Eurasia,0.0,0.0,0.0,1.0,1.0,2.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.89,1.02,3.55,2.24,4.32,2.81,3.253333,0.486241,Eurasia,1.0,0.0,0.0,1.0,2.0,0.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.24,1.51,4.39,2.63,5.68,2.16,5.77,0.696056,Eurasia,1.0,2.0,1.0,1.0,2.0,6.0,3.0,4.0,3.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0


In [95]:
df=df.dropna(subset=["PRON"])

In [96]:
df["INIT_PRON"]=df["PRON"].str.split().str[0]
df["LAST_PRON"]=df["PRON"].str.split().str[-1]

In [5]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [125]:
data = sm.datasets.get_rdataset("dietox", "geepack").data

In [133]:
df["Aff"]=df["Aff.Mean"]

In [6]:
features = ["syl", "son", "cons", "cont", "delrel", "lat", "nas", "strid", "voi", "sg", "cg", "ant", 
            "cor", "distr", "lab", "hi", "lo", "back", "round", "velaric", "tense", "long", "hitone", "hireg"]
Initial_features = [f"init_{x}" for x in features]
last_features = [f"last_{x}" for x in features]

all_features = Initial_features+features+last_features

In [97]:
def get_corr(df, vfeat,thres, num):
    try:
        r = {}

        for feat in df.columns:
            corr, p_value =scipy.stats.pearsonr(df[feat], vfeat)
            if feat!=" ":
                if p_value < 0.05/num:
                    if corr < -thres or corr>thres:
                        r[feat]=(corr, p_value)
        return r
    except Exception:
        return None
        

# Corr with phones

## matrix of the rate of each phone in IPA

In [106]:
def get_pron_lang(pron, df, lang, vfeat, thres):
    df = df[df["LANG_PRON"]==lang].dropna()
    norm_prons = sorted(list(set(list(chain.from_iterable(df[pron].tolist())))))
    X_dim = len(df)
    Y_dim = len(norm_prons)
    
    langs = list(set(df["LANG_PRON"].tolist()))
    num = len(langs)

    
    
    pron_matrix = np.zeros((X_dim, Y_dim))  
    for index, pron in enumerate(df[pron]):
        ls = [norm_prons.index(x) for x in list(pron)]
        for l in ls:
            pron_matrix[index][l]+=1
    row_sums = pron_matrix.sum(axis=1)
    # print(pron_matrix[0])
    pron_m = pron_matrix/ row_sums[:, np.newaxis]
    # print(pron_m[0])
    
    df_pron = pd.DataFrame(pron_matrix, columns=norm_prons)
    r= get_corr(df_pron, df[vfeat],thres, num)
    return r , X_dim, Y_dim

In [121]:
def get_pron_fam(pron, df, family, vfeat, thres):
    fams = list(set(df["Family"].tolist()))
    num = len(fams)
    df = df[df["Family"]==family].dropna()
    langs = list(set(df["Name"].tolist()))
    print(len(langs),langs)
    norm_prons = sorted(list(set(list(chain.from_iterable(df[pron].tolist())))))
    X_dim = len(df)
    Y_dim = len(norm_prons)
    
    
    pron_matrix = np.zeros((X_dim, Y_dim))  
    for index, pron in enumerate(df[pron]):
        ls = [norm_prons.index(x) for x in list(pron)]
        for l in ls:
            pron_matrix[index][l]+=1
    row_sums = pron_matrix.sum(axis=1)
    # print(pron_matrix[0])
    pron_m = pron_matrix/ row_sums[:, np.newaxis]
    # print(pron_m[0])
    
    df_pron = pd.DataFrame(pron_matrix, columns=norm_prons)
    r= get_corr(df_pron, df[vfeat],thres,num)
    return r , X_dim, Y_dim

In [119]:
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,macroarae,Conc.CLASS,V.CLASS,A.CLASS,D.CLASS,Conc,V,A,D,Aff,Aff.all,Name,Family,Macroarea,Latitude,Longitude,ISO,SEG_LEN,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,vowels,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg,INIT_PRON,LAST_PRON
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,Eurasia,0.0,1.0,0.0,2.0,1.0,4.0,2.0,5.0,3.0,-1.0,English,Indo-European,Eurasia,53.000000,-1.000000,eng,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,ə,ə
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,Eurasia,1.0,1.0,0.0,1.0,2.0,4.0,2.0,3.0,2.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,ɒː,ɒː
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,Eurasia,0.0,0.0,0.0,1.0,1.0,2.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,ɒː,ɒː
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,Eurasia,1.0,0.0,0.0,1.0,2.0,0.0,2.0,3.0,1.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,ɒː,ɒː
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,Eurasia,1.0,2.0,1.0,1.0,2.0,6.0,3.0,4.0,3.0,-1.0,Hungarian,Uralic,Eurasia,46.906859,19.655527,hun,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,ɒː,ɒː
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,Eurasia,1.0,1.0,0.0,1.0,2.0,4.0,2.0,4.0,2.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got,6,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,1,1,-1,1,1,-1,1,-1,0,0,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0,2,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,u,s
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,Eurasia,2.0,1.0,1.0,1.0,3.0,3.0,3.0,4.0,2.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got,5,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0,2,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,-1,-1,-1,1,-1,0,0,s,a
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,Eurasia,0.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got,6,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0,3,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,w,s
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,Eurasia,0.0,2.0,0.0,2.0,1.0,6.0,2.0,5.0,3.0,-1.0,Gothic,Indo-European,Eurasia,46.930400,29.978600,got,5,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0,3,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,w,s


In [129]:
for lang in list(set(df["Family"].tolist())):
    d, x, y = get_pron_fam("LAST_PRON", df, lang, "V.Mean",0.1)
    if d is not None:
        if len(d)>0:
            print(lang, x,y)
            print(d)

1 ['Korean']
Koreanic 10043 20
{'o': (-0.1007358301060053, 4.517475486602534e-24), '̞': (-0.11028924973283585, 1.482418052366222e-28)}
1 ['Navajo']
7 ['Bashkir', 'Turkmen', 'Turkish', 'Tuvinian', 'Sakha', 'Kirghiz', 'Kazakh']
Turkic 2453 58
{'p': (-0.1412607883769486, 2.0946432974673825e-12), 'y': (-0.11577510679483485, 8.897647967074183e-09)}
3 ['Shan', 'Lao', 'Thai']
1 ['Japanese']
Japonic 1434 14
{'a': (-0.1598515233146667, 1.149085488559738e-09), '̠': (-0.1598515233146667, 1.149085488559738e-09)}
3 ['Abkhaz', 'Adyghe', 'Kabardian']
2 ['Central Khmer', 'Vietnamese']
1 ['Georgian']
Kartvelian 791 5
{'a': (0.13170118815363313, 0.00020369258796634017), 'i': (-0.122079462881854, 0.0005797848404404163)}
2 ['Esperanto', 'Interlingua (International Auxiliary Language Association)']
2 ['Dongxiang', 'Oirad-Kalmyk-Darkhat']
6 ['Kikuyu', 'Ewe', 'Zulu', 'Yoruba', 'Nyanja', 'Xhosa']
1 ['Classical Nahuatl']
7 ['Cebuano', 'Iloko', 'Standard Indonesian', 'Coastal-Naga Bikol', 'Hawaiian', 'Tonga (To

In [128]:
for lang in list(set(df["Family"].tolist())):
    d, x, y = get_pron_fam("INIT_PRON", df, lang, "V.Mean",0.1)
    if d is not None:
        if len(d)>0:
            print(lang, x,y)
            print(d)

1 ['Korean']
Koreanic 10043 34
{'p': (-0.1616802032632307, 8.640008827480696e-60)}
1 ['Navajo']
7 ['Bashkir', 'Turkmen', 'Turkish', 'Tuvinian', 'Sakha', 'Kirghiz', 'Kazakh']
Turkic 2453 53
{'c': (-0.11781561493562379, 4.840542344320657e-09), 'ɑ': (-0.12839072072662733, 1.7468230628284876e-10)}
3 ['Shan', 'Lao', 'Thai']
1 ['Japanese']
Japonic 1434 34
{'i': (-0.14799311227300196, 1.7981852801335418e-08), 's': (0.128309185073484, 1.0893982350040282e-06), '̠': (-0.10646806673315251, 5.351409816004188e-05)}
3 ['Abkhaz', 'Adyghe', 'Kabardian']
2 ['Central Khmer', 'Vietnamese']
1 ['Georgian']
Kartvelian 791 27
{'u': (-0.1385199703413988, 9.285875384192924e-05), 'z': (-0.12424217768434508, 0.0004612094316005783)}
2 ['Esperanto', 'Interlingua (International Auxiliary Language Association)']
Artificial Language 448 24
{'m': (-0.246401487151112, 1.2748286743927613e-07)}
2 ['Dongxiang', 'Oirad-Kalmyk-Darkhat']
6 ['Kikuyu', 'Ewe', 'Zulu', 'Yoruba', 'Nyanja', 'Xhosa']
1 ['Classical Nahuatl']
7 ['Ceb

In [124]:
for lang in list(set(df["Family"].tolist())):
    d, x, y = get_pron_fam("INIT_PRON", df, lang, "Conc.Mean",0.1)
    if d is not None:
        if len(d)>0:
            print(lang, x,y)
            print(d)

1 ['Korean']
1 ['Navajo']
7 ['Bashkir', 'Turkmen', 'Turkish', 'Tuvinian', 'Sakha', 'Kirghiz', 'Kazakh']
Turkic 2453 53
{'k': (0.11478934771227413, 1.1895535486825594e-08), 't': (0.10201046641852005, 4.127360136672461e-07)}
3 ['Shan', 'Lao', 'Thai']
Tai-Kadai 2701 20
{'k': (-0.11222478216317457, 4.9682184316619505e-09), 'n': (0.10657396785159487, 2.8237853779146013e-08)}
1 ['Japanese']
Japonic 1434 34
{'a': (-0.12627150850714813, 1.6128875366828345e-06), 'b': (0.1572382998402606, 2.1452026939359547e-09), 'i': (-0.16376164584311262, 4.4289690247960446e-10), 'j': (-0.14406631522300678, 4.2681756093150086e-08), 'p': (0.11026508154018919, 2.8558827980227573e-05), 'ɾ': (0.1304118187742555, 7.220794407090799e-07), '̠': (-0.13288310611506068, 4.416585096825992e-07)}
3 ['Abkhaz', 'Adyghe', 'Kabardian']
2 ['Central Khmer', 'Vietnamese']
Austroasiatic 3400 26
{'ʔ': (0.10281411393136894, 1.8694070907623223e-09)}
1 ['Georgian']
Kartvelian 791 27
{'b': (0.1331912069420454, 0.00017210556469751743), '

In [126]:
for lang in list(set(df["Family"].tolist())):
    d, x, y = get_pron_fam("LAST_PRON", df, lang, "Conc.Mean",0.1 )
    if d is not None:
        if len(d)>0:
            print(lang, x,y)
            print(d)

1 ['Korean']
Koreanic 10043 20
{'p': (-0.13128800037855348, 7.423250839679001e-40), '̚': (-0.11279465977535694, 8.459847846659965e-30)}
1 ['Navajo']
7 ['Bashkir', 'Turkmen', 'Turkish', 'Tuvinian', 'Sakha', 'Kirghiz', 'Kazakh']
3 ['Shan', 'Lao', 'Thai']
1 ['Japanese']
Japonic 1434 14
{'i': (-0.27915884279840375, 4.421713361527625e-27), 'ɯ': (0.1428989903462158, 5.4944236562315224e-08), 'ɴ': (0.12292476884546595, 3.0326636714726294e-06), '̟': (0.14107639082613732, 8.117411654358516e-08), 'ᵝ': (0.17245933434045232, 4.8867490091886e-11)}
3 ['Abkhaz', 'Adyghe', 'Kabardian']
2 ['Central Khmer', 'Vietnamese']
1 ['Georgian']
Kartvelian 791 5
{'a': (-0.2512344159733242, 7.497951450203571e-13), 'i': (0.2551097059519261, 3.2308946523316235e-13)}
2 ['Esperanto', 'Interlingua (International Auxiliary Language Association)']
Artificial Language 448 7
{'a': (-0.27009338557756973, 6.2794641516052145e-09), 'i': (-0.16342360988565108, 0.000515180747534691), 'o': (0.31539393057527587, 8.36641972005794e-1

In [107]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_PRON", df, lang, "Conc.Mean",0.2 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

hye 420 26
{'u': (-0.20387470873803765, 2.553313661873573e-05)}
ell 1691 30
{'a': (-0.2037996014529573, 2.597450194527588e-17)}
hbs 172 31
{'î': (-0.2800086159938189, 0.00019906576629708745)}
gla 188 29
{'k': (-0.24515605563204046, 0.0006966666676597365), 'r': (-0.22823273290362325, 0.0016316881387029478)}
glg 1984 28
{'k': (0.2028434774655048, 7.213728859038707e-20)}
heb 326 31
{'e': (-0.22087839788880426, 5.757472382158787e-05)}
lav 326 29
{'k': (0.2428522134546177, 9.227292509910128e-06)}
mya 111 20
{'z': (-0.2126498215097688, 0.02504312862565477)}
tam 292 11
{'p': (0.2208854683354931, 0.0001415686529013513)}
mkd 332 27
{'z': (-0.30947537601190506, 8.428916552294417e-09)}
epo 438 24
{'a': (-0.25390883762415006, 7.153574854959839e-08)}
khm 137 16
{'ɗ': (0.2612292189947081, 0.002047077556999267)}


In [116]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_PRON", df, lang, "Aff.all.Mean",0.2)
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

gla 188 22
{'k': (-0.25417440139320024, 0.00043186353251805233), 'p': (0.22143203432344336, 0.0022590585156815144)}
mya 111 16
{'z': (-0.2174143208891987, 0.021896217351834817)}
est 156 15
{'m': (0.22991954470601947, 0.003884598227926281), 'p': (0.2216846363044069, 0.005414764288871658), 'v': (-0.30241241837118027, 0.00012455194181262572), 'ɑ': (-0.22870090361462267, 0.004083219489639697)}
hbs 172 28
{'d': (0.2170185121078371, 0.004242052667610008), 'k': (0.20390689575778312, 0.007297335408212661), 'î': (-0.2274383191084942, 0.0026953734946819448)}
tam 292 10
{'m': (-0.2232541672110231, 0.00011943892460409669), 'p': (0.24515304605765992, 2.2750992306890182e-05)}
khm 137 15
{'ɗ': (0.24978997411288903, 0.003242722865829267)}
mkd 332 25
{'z': (-0.2407286723967914, 9.201514205489073e-06)}
lav 326 27
{'k': (0.23157672568235216, 2.4138635905345862e-05), 'm': (-0.23195065695813658, 2.339858322290456e-05)}
heb 326 27
{'ʔ': (0.21363348084670547, 0.00010132030750984986)}
glg 1984 24
{'d': (-0.20

In [117]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_PRON", df, lang, "Aff.Mean",0.2)
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

mya 111 16
{'t': (-0.22284557365553567, 0.018728592256568935)}
est 156 15
{'h': (-0.2661625553216753, 0.0007838843643076948), 'ɤ': (0.26011345275595854, 0.0010405648793621071)}
hbs 172 28
{'u': (-0.2577695323657471, 0.0006405389483125915)}
tam 292 10
{'m': (-0.2739064746930227, 2.0179239216828357e-06)}
epo 438 23
{'m': (-0.23686444071791266, 5.315067116767926e-07)}
khm 137 15
{'p': (0.23728075183463682, 0.005240936697182583)}
lav 326 27
{'d': (0.2003519849810857, 0.0002720727306344119), 'i': (0.21839515270170126, 7.00304091968487e-05), 'm': (-0.2800634044453072, 2.740274915769068e-07), 's': (-0.2538575721767571, 3.4475628129952125e-06)}
nno 153 27
{'r': (0.2726202910961771, 0.0006513340883959448)}


In [114]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_NORM_PRON", df, lang, "Conc.Mean",0.2 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

gla 188 20
{'k': (-0.24515605563204018, 0.0006966666676597076), 'r': (-0.2282327329036234, 0.0016316881387028557)}
mya 111 17
{'z': (-0.21264982150976877, 0.02504312862565479)}
est 156 14
{'p': (0.23545603180648167, 0.0030872996164283016), 'v': (-0.3383880522985208, 1.5558216724034958e-05)}
nor 469 23
{'e': (-0.24243327193494058, 1.0623238386063846e-07)}
hbs 172 23
{'i': (-0.29297112066391323, 9.611623590098432e-05)}
tam 292 19
{'U': (-0.2380495349735694, 3.963899517307805e-05), 'p': (0.22581047968851362, 9.921914383562696e-05)}
epo 438 23
{'a': (-0.25390883762415095, 7.153574854960822e-08)}
khm 137 17
{'d': (0.26122921899470836, 0.0020470775569992597)}
mkd 332 23
{'z': (-0.3094753760119071, 8.4289165522926e-09)}
lav 326 20
{'k': (0.24285221345461772, 9.227292509910579e-06)}
glg 1984 19
{'k': (0.2028434774655068, 7.213728859040781e-20)}
ell 1691 21
{'a': (-0.2061115208604269, 1.1142004337249228e-17)}
hye 420 23
{'u': (-0.20387470873803795, 2.55331366187298e-05)}


In [112]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_NORM_PRON", df, lang, "Aff.all.Mean",0.2)
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

gla 188 20
{'k': (-0.25417440139320024, 0.00043186353251805233), 'p': (0.22143203432344336, 0.0022590585156815144)}
mya 111 17
{'z': (-0.2174143208891987, 0.021896217351834817)}
est 156 14
{'a': (-0.22870090361462267, 0.004083219489639697), 'm': (0.22991954470601947, 0.003884598227926281), 'p': (0.2216846363044069, 0.005414764288871658), 'v': (-0.30241241837118027, 0.00012455194181262572)}
hbs 172 23
{'d': (0.2170185121078371, 0.004242052667610008), 'i': (-0.23323382791075445, 0.002076213997261117), 'k': (0.20390689575778312, 0.007297335408212661)}
tam 292 19
{'m': (-0.2232541672110231, 0.00011943892460409669), 'p': (0.2516071370574284, 1.3537932741410609e-05)}
khm 137 17
{'d': (0.24978997411288903, 0.003242722865829267)}
mkd 332 23
{'z': (-0.2407286723967914, 9.201514205489073e-06)}
lav 326 20
{'k': (0.23157672568235216, 2.4138635905345862e-05), 'm': (-0.23195065695813658, 2.339858322290456e-05)}
nno 153 22
{'u': (-0.20822513597934555, 0.009798637610928045)}
glg 1984 19
{'d': (-0.2005

In [113]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("INIT_NORM_PRON", df, lang, "Aff.Mean",0.2)
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

est 156 14
{'h': (-0.2661625553216753, 0.0007838843643076948), 'u': (0.2465842258164257, 0.0019148701934730116)}
hbs 172 23
{'u': (-0.29887701009282913, 6.817375009000432e-05)}
tam 292 19
{'m': (-0.2739064746930227, 2.0179239216828357e-06)}
epo 438 23
{'m': (-0.23686444071791266, 5.315067116767926e-07)}
khm 137 17
{'p': (0.23728075183463682, 0.005240936697182583)}
lav 326 20
{'d': (0.2003519849810857, 0.0002720727306344119), 'm': (-0.2800634044453072, 2.740274915769068e-07), 's': (-0.2538575721767571, 3.4475628129952125e-06)}
nno 153 22
{'r': (0.2790311669099881, 0.0004780639528741028)}
heb 326 23
{'k': (0.20212298043925875, 0.00023935162519885295)}


In [106]:

for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("NORM_PRON", df, lang, "Conc.Mean",0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

kat 791 23
{'e': (-0.309671370902214, 4.846400522203798e-19)}
est 156 22
{'v': (-0.30928821373239557, 8.540221435894473e-05)}
mkd 332 26
{'z': (-0.30131243745541836, 2.141685328879748e-08)}
nno 153 31
{'G': (-0.3885059408759388, 6.959461487395168e-07), 'N': (-0.3885059408759388, 6.959461487395168e-07), 'i': (-0.31904456478594645, 5.830707569487212e-05)}
aze 104 28
{'m': (-0.3248230871945797, 0.0007675655793152211)}


In [107]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("NORM_PRON", df, lang, "Aff.Mean",0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

lav 326 25
{'c': (-0.383473812581327, 7.321916902475724e-13), 'y': (-0.37233216956193904, 3.699307168798099e-12)}
aze 104 28
{'z': (-0.30809980831086275, 0.0014635539299992075)}


In [108]:
for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("NORM_PRON", df, lang, "Aff.all.Mean",0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)

est 156 22
{'v': (-0.3059730309844068, 0.00010256294694324198)}
lav 326 25
{'c': (-0.36058149532731465, 1.9116594178100968e-11), 'y': (-0.41669925152526377, 3.999048369103605e-15)}
nno 153 31
{'G': (-0.3317386505296389, 2.8021192668402772e-05), 'N': (-0.3317386505296389, 2.8021192668402772e-05)}
aze 104 28
{'m': (-0.33484036142930557, 0.0005121879848573282)}


In [None]:
def get_pron_lang_non_norm(df, lang, vfeat):
    df = df[df["LANG_PRON"]==lang].dropna()
    norm_prons = sorted(list(set(list(chain.from_iterable(df.PRON.tolist())))))
    X_dim = len(df)
    Y_dim = len(norm_prons)
    
    
    pron_matrix = np.zeros((X_dim, Y_dim))  
    for index, pron in enumerate(df["PRON"]):
        ls = [norm_prons.index(x) for x in list(pron)]
        for l in ls:
            pron_matrix[index][l]+=1
    row_sums = pron_matrix.sum(axis=1)
    # print(pron_matrix[0])
    pron_m = pron_matrix/ row_sums[:, np.newaxis]
    # print(pron_m[0])
    
    df_pron = pd.DataFrame(pron_matrix, columns=norm_prons)
    r= get_corr(df_pron, df[vfeat])
    return r , X_dim, Y_dim

In [95]:
# get_pron_lang_non_norm(df, "deu", "Conc.Mean" )

In [119]:

for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("PRON", df, lang, "Conc.Mean", 0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)


kat 791 28
{'e': (-0.309671370902214, 4.846400522203798e-19)}
est 156 29
{'v': (-0.30928821373239557, 8.540221435894473e-05)}
mkd 332 34
{'z': (-0.30131243745541836, 2.141685328879748e-08)}
nno 153 48
{'i': (-0.31904456478594645, 5.830707569487212e-05), 'ŋ': (-0.3885059408759388, 6.959461487395168e-07)}
aze 104 41
{'m': (-0.3248230871945797, 0.0007675655793152211)}


In [120]:

for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("PRON", df, lang, "Aff.Mean", 0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)


tam 292 36
{'ɭ': (-0.33990687091927835, 2.4937663766911764e-09), 'ː': (-0.315199003835468, 3.713692471745893e-08)}
lav 326 52
{'c': (-0.383473812581327, 7.321916902475724e-13), 'û': (-0.3672232551236219, 7.6174415557938e-12), 'ʎ': (-0.37233216956193904, 3.699307168798099e-12)}
aze 104 41
{'z': (-0.30809980831086275, 0.0014635539299992075)}


In [121]:

for lang in list(set(df["LANG_PRON"].tolist())):
    d, x, y = get_pron_lang("PRON", df, lang, "Aff.all.Mean", 0.3 )
    if d is not None:
        if len(d)>0 and x>100:
            print(lang, x,y)
            print(d)


est 156 29
{'v': (-0.3059730309844068, 0.00010256294694324198)}
tam 292 36
{'ɭ': (-0.3320580177930036, 6.037446255144444e-09)}
lav 326 52
{'c': (-0.36058149532731465, 1.9116594178100968e-11), 'û': (-0.40717270031496067, 1.891195309248821e-14), 'ʎ': (-0.41669925152526377, 3.999048369103605e-15)}
nno 153 48
{'ŋ': (-0.3317386505296389, 2.8021192668402772e-05)}
aze 104 41
{'m': (-0.33484036142930557, 0.0005121879848573282)}


# Corr with phonetic features

In [149]:
df= pd.read_csv("../data/aff+conc/phone_aff_conc_features_num.csv")

In [150]:
df["Family"]=df["LANG_PRON"].apply(lambda x: iso2family[x])

In [133]:
features = ["syl", "son", "cons", "cont", "delrel", "lat", "nas", "strid", "voi", "sg", "cg", "ant", 
            "cor", "distr", "lab", "hi", "lo", "back", "round", "velaric", "tense", "long", "hitone", "hireg"]
Initial_features = [f"init_{x}" for x in features]
last_features = [f"last_{x}" for x in features]

all_features = Initial_features+features+last_features

In [134]:
df_features = df[features]
df_init_features = df[Initial_features]
df_last_features = df[last_features]

In [153]:
df["vowels"] = df["SEG_LEN"]-df["cons"]
df["vowel_ratio"]= df["vowels"]/df["SEG_LEN"]

In [155]:
df.LANG_PRON.value_counts()

LANG_PRON
fra    33662
spa    26233
slv    24480
fin    22849
gle    21739
       ...  
krl        1
abk        1
bcl        1
lmo        1
nci        1
Name: count, Length: 142, dtype: int64

In [163]:
[x for x  in list("u r r u n s") if x!=" "]

['u', 'r', 'r', 'u', 'n', 's']

In [161]:
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,NORM_PRON,INIT_NORM_PRON,INIT_PRON,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg,SEG_LEN,Family,vowels,vowel_ratio,PRON_TTR
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,,,ə,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,Indo-European,1,1.000000,1.000000
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,urruns,u,u,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,1,1,-1,1,1,-1,1,-1,0,0,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,2,0.333333,0.454545
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,skima,s,s,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,-1,-1,-1,1,-1,0,0,5,Indo-European,2,0.400000,0.700000
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,wenahs,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,3,0.500000,0.666667
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,wothis,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,5,Indo-European,3,0.600000,0.636364


In [76]:
def get_corr_by_lang(df, lang, feature, aff, thres):
    df_lang = df[df["LANG_PRON"]==lang].dropna()
    r = get_corr(df_lang[[feature]], df_lang[aff], thres)
    if len(df_lang)>100:
        if r is not None:
            if len(r)>0:
                print(f"{lang}, {len(df_lang[[feature]])}, {feature} vs.{aff}, {thres}")
                print(r)

In [164]:
def ttr(x):
    try:
        l = [z for z  in list(x) if z!=" "]
        return len(list(set(l)))/len(list(l))
    except Exception:
        return np.NaN

In [222]:
def seg_len(x):
    try:
        l = [z for z  in list(x) if z!=" "]
        return len(l)
    except Exception:
        return np.NaN

In [223]:
df["SEG_LEN"] = df["PRON"].parallel_apply(seg_len)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=33970), Label(value='0 / 33970')))…

In [165]:
df["PRON_TTR"]= df["PRON"].parallel_apply(ttr) # type/token

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=33970), Label(value='0 / 33970')))…

In [167]:
df["PRON_complex"]=1-df["PRON_TTR"]

In [168]:
df["NORM_PRON_TTR"]= df["NORM_PRON"].parallel_apply(ttr) # type/token

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=33970), Label(value='0 / 33970')))…

In [170]:
df["NORM_PRON_complex"] = 1- df["NORM_PRON_TTR"]

In [172]:
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,NORM_PRON,INIT_NORM_PRON,INIT_PRON,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg,SEG_LEN,Family,vowels,vowel_ratio,PRON_TTR,PRON_complex,NORM_PRON_TTR,NORM_PRON_complex
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,,,ə,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,Indo-European,1,1.000000,1.000000,0.000000,,
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,urruns,u,u,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,1,1,-1,1,1,-1,1,-1,0,0,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,2,0.333333,0.666667,0.333333,0.666667,0.333333
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,skima,s,s,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,-1,-1,-1,1,-1,0,0,5,Indo-European,2,0.400000,1.000000,0.000000,1.000000,0.000000
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,wenahs,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,3,0.500000,1.000000,0.000000,1.000000,0.000000
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,wothis,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,5,Indo-European,3,0.600000,0.857143,0.142857,1.000000,0.000000


In [174]:
def get_corr(df, vfeat,thres, num):
    try:
        r = {}

        for feat in df.columns:
            corr, p_value =scipy.stats.pearsonr(df[feat], vfeat)
            if feat!=" ":
                if p_value < 0.05/num:
                    if corr < -thres or corr>thres:
                        r[feat]=(corr, p_value)
        return r
    except Exception:
        return None
        

In [211]:
def get_corr_by_family(df, family, feature, aff, thres):
    df_lang = df[df["Family"]==family].dropna()
    fam = list(set(df_lang["Family"]))
    langs = len(set(df_lang["LANG_PRON"]))
    num = len(fam)
    r = get_corr(df_lang[[feature]], df_lang[aff], thres,num)
    if r is not None:
        if len(r)>0 and langs>1:
            print(f"{family}, {len(df_lang[[feature]])}, {feature} vs.{aff}, {thres}")
            print(set(df_lang["LANG_PRON"]), langs)
            print(r)
            print("*"*40)

In [205]:
get_corr_by_family(df, "Koreanic", "PRON_TTR", "Conc.Mean", 0.1)

In [214]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "PRON_TTR", "Conc.Mean", 0.1)

Tai-Kadai, 2701, PRON_TTR vs.Conc.Mean, 0.1
{'tha', 'lao', 'shn'} 3
{'PRON_TTR': (0.15110212522721586, 2.910533359429517e-15)}
****************************************
Austroasiatic, 3398, PRON_TTR vs.Conc.Mean, 0.1
{'khm', 'vie'} 2
{'PRON_TTR': (0.17942561162458792, 5.593357956233101e-26)}
****************************************
Artificial Language, 506, PRON_TTR vs.Conc.Mean, 0.1
{'vol', 'ina', 'ido', 'epo'} 4
{'PRON_TTR': (0.1150521746354856, 0.009590889150008497)}
****************************************
Uralic, 23508, PRON_TTR vs.Conc.Mean, 0.1
{'est', 'hun', 'krl', 'fin', 'sme', 'mdf'} 6
{'PRON_TTR': (0.18761438610877243, 3.455859035389653e-185)}
****************************************
Sino-Tibetan, 7567, PRON_TTR vs.Conc.Mean, 0.1
{'mya', 'new', 'cmn', 'bod', 'dzo'} 5
{'PRON_TTR': (0.12571809348070603, 4.886764780767801e-28)}
****************************************


In [216]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "PRON_TTR", "V.Mean", 0.1)

In [217]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "PRON_TTR", "D.Mean", 0.1)

In [218]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "PRON_TTR", "A.Mean", 0.1)

Atlantic-Congo, 58, PRON_TTR vs.A.Mean, 0.1
{'kik', 'nya', 'zul', 'ewe', 'xho', 'yor'} 6
{'PRON_TTR': (0.2809859429156498, 0.0326311639330116)}
****************************************


In [230]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "PRON_TTR", "Aff.Mean", 0.1)

Abkhaz-Adyge, 3, PRON_TTR vs.Aff.Mean, 0.1
{'kbd', 'abk', 'ady'} 3
{'PRON_TTR': (-0.9995326997276376, 0.019463012048195363)}
****************************************


In [224]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "SEG_LEN", "Conc.Mean", 0.1)

Turkic, 2557, SEG_LEN vs.Conc.Mean, 0.1
{'kaz', 'bak', 'aze', 'tur', 'tyv', 'kir', 'tuk', 'sah'} 8
{'SEG_LEN': (-0.13727255901207328, 3.145575791269123e-12)}
****************************************
Tai-Kadai, 2701, SEG_LEN vs.Conc.Mean, 0.1
{'tha', 'lao', 'shn'} 3
{'SEG_LEN': (-0.18342119259857198, 7.266606125160074e-22)}
****************************************
Austroasiatic, 3398, SEG_LEN vs.Conc.Mean, 0.1
{'khm', 'vie'} 2
{'SEG_LEN': (-0.27145001840665084, 1.8085050432498253e-58)}
****************************************
Artificial Language, 506, SEG_LEN vs.Conc.Mean, 0.1
{'vol', 'ina', 'ido', 'epo'} 4
{'SEG_LEN': (-0.12776330130641794, 0.003993655914003288)}
****************************************
Austronesian, 31436, SEG_LEN vs.Conc.Mean, 0.1
{'ind', 'msa', 'ton', 'ilo', 'haw', 'tgl', 'bcl', 'ceb'} 8
{'SEG_LEN': (-0.11058488509269931, 4.177687387082302e-86)}
****************************************
Uralic, 23508, SEG_LEN vs.Conc.Mean, 0.1
{'est', 'hun', 'krl', 'fin', 'sme', 'mdf

In [225]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "SEG_LEN", "V.Mean", 0.1)

In [226]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "SEG_LEN", "A.Mean", 0.1)

Austroasiatic, 3398, SEG_LEN vs.A.Mean, 0.1
{'khm', 'vie'} 2
{'SEG_LEN': (0.11571652800752433, 1.3271508601048883e-11)}
****************************************
Mongolic-Khitan, 66, SEG_LEN vs.A.Mean, 0.1
{'xal', 'mon', 'sce'} 3
{'SEG_LEN': (0.32943559773568354, 0.006911899253502781)}
****************************************
Dravidian, 339, SEG_LEN vs.A.Mean, 0.1
{'mal', 'tel', 'tam'} 3
{'SEG_LEN': (0.11674962091682933, 0.03163416817928987)}
****************************************


In [227]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "SEG_LEN", "D.Mean", 0.1)

Artificial Language, 506, SEG_LEN vs.D.Mean, 0.1
{'vol', 'ina', 'ido', 'epo'} 4
{'SEG_LEN': (-0.13514788725152613, 0.0023148679565116593)}
****************************************


In [229]:
for family in list(set(df["Family"].tolist())):
    get_corr_by_family(df, family, "SEG_LEN", "Aff.Mean", 0.1)

Artificial Language, 506, SEG_LEN vs.Aff.Mean, 0.1
{'vol', 'ina', 'ido', 'epo'} 4
{'SEG_LEN': (-0.10501598095829924, 0.018129095362387347)}
****************************************


In [232]:
df.to_csv("../data/finals/colex_wn_pron_features.csv", index=False)

In [228]:
df

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,COLEX,Conc.Mean,Conc.SD,V.Mean,V.SD,A.Mean,A.SD,D.Mean,D.SD,Aff.Mean,Aff.all.Mean,NORM_PRON,INIT_NORM_PRON,INIT_PRON,init_syl,init_son,init_cons,init_cont,init_delrel,init_lat,init_nas,init_strid,init_voi,init_sg,init_cg,init_ant,init_cor,init_distr,init_lab,init_hi,init_lo,init_back,init_round,init_velaric,init_tense,init_long,init_hitone,init_hireg,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg,last_syl,last_son,last_cons,last_cont,last_delrel,last_lat,last_nas,last_strid,last_voi,last_sg,last_cg,last_ant,last_cor,last_distr,last_lab,last_hi,last_lo,last_back,last_round,last_velaric,last_tense,last_long,last_hitone,last_hireg,SEG_LEN,Family,vowels,vowel_ratio,PRON_TTR,PRON_complex,NORM_PRON_TTR,NORM_PRON_complex
0,a,eng,ə,alpha~settling,2.035,1.230,5.500,2.240,3.670,2.630,6.320,2.480,5.163333,0.490352,,,ə,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,-1,1,-1,-1,-1,-1,0,0,1,Indo-European,1,1.000000,1.000000,0.000000,,
1,a,hun,ɒː,amplitude~elevator,3.855,0.985,5.950,1.500,3.650,2.250,4.920,2.260,4.840000,0.654389,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
2,a,hun,ɒː,amplitude~legislature,2.730,1.360,3.550,1.570,3.220,2.180,4.480,1.990,3.750000,0.481333,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
3,a,hun,ɒː,amplitude~menopause,3.055,1.405,1.890,1.020,3.550,2.240,4.320,2.810,3.253333,0.486241,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
4,a,hun,ɒː,amplitude~movie,3.755,1.185,7.240,1.510,4.390,2.630,5.680,2.160,5.770000,0.696056,a,a,ɒ,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,1,1,-1,1,1,0,0,2,Uralic,1,1.000000,1.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,dawn~departure,3.765,1.170,5.395,1.750,3.375,2.535,5.035,2.615,4.601667,0.632148,urruns,u,u,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,1,1,-1,1,1,-1,1,-1,0,0,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,6,Indo-European,2,0.333333,0.666667,0.333333,0.666667,0.333333
339688,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,lantern~torch,4.880,0.320,4.935,1.700,4.165,2.195,5.550,2.160,4.883333,0.759296,skima,s,s,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0,1,1,-1,1,-1,-1,-1,0,1,-1,-1,0,-1,0,-1,-1,1,-1,-1,-1,1,-1,0,0,6,Indo-European,2,0.400000,1.000000,0.000000,1.000000,0.000000
339689,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,miserable~unhappy,2.005,1.320,2.220,1.485,5.080,2.720,3.785,2.170,3.695000,0.405778,wenahs,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,7,Indo-European,3,0.500000,1.000000,0.000000,1.000000,0.000000
339690,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,pleasant~sweet,2.775,1.020,7.505,1.445,3.525,2.720,6.410,1.965,5.813333,0.600463,wothis,w,w,-1,1,-1,1,-1,-1,-1,0,1,-1,-1,-1,-1,0,1,1,-1,1,1,-1,0,-1,0,0,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0,-1,-1,1,1,-1,-1,-1,0,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,0,7,Indo-European,3,0.600000,0.857143,0.142857,1.000000,0.000000


In [79]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_by_lang(df, lang, "SEG_LEN", "Aff.Mean", 0.3)

In [75]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_by_lang(df, lang, "vowel_ratio", "Aff.Mean", 0.2)

sqi, 1103, vowel_ratiovs.Aff.Mean, 0.2
{'vowel_ratio': (0.26353026967039994, 5.580415029179443e-19)}
tam, 292, vowel_ratiovs.Aff.Mean, 0.2
{'vowel_ratio': (0.2010594881932116, 0.0005475825060733501)}


In [10]:
def get_corr_features_by_lang(df, lang, aff, thres):
    df_lang = df[df["LANG_PRON"]==lang].dropna()
    df_lang_features = df_lang[features]
    df_lang_init_features = df_lang[Initial_features]
    df_lang_last_features = df_lang[last_features]
    
    if len(df_lang)>100:
        r_all = get_corr(df_lang_features, df_lang[aff], thres)
        if len(r_all)>0:
            print(lang ,  "---->", len(df_lang_features))
            print(f" all, {aff}")
            print(r_all)
            
        r = get_corr(df_lang_init_features, df_lang[aff], thres)
        if len(r)>0:
            print(lang ,  "---->", len(df_lang_init_features))
            print(f" init, {aff}")
            print(r)
        
        r_last = get_corr(df_lang_last_features, df_lang[aff],thres)
        if len(r_last)>0:
            print(lang ,  "---->", len(df_lang_last_features))
            print(f"last, {aff}")
            print(r_last)
        

In [13]:
df[features]

Unnamed: 0,syl,son,cons,cont,delrel,lat,nas,strid,voi,sg,cg,ant,cor,distr,lab,hi,lo,back,round,velaric,tense,long,hitone,hireg
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
2,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
3,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
4,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339687,2,5,4,5,0,0,1,0,5,0,0,4,4,0,2,2,0,2,2,0,2,0,0,0
339688,2,3,3,3,0,0,1,0,3,0,0,2,1,0,1,2,1,1,0,0,2,1,0,0
339689,2,5,3,5,0,0,1,0,4,0,0,2,2,0,1,1,1,1,1,0,1,1,0,0
339690,2,3,2,5,0,0,0,0,3,0,0,2,2,1,1,2,0,2,2,0,2,2,0,0


# 0.3 corr

In [11]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Conc.Mean", 0.3)

rus ----> 3263
 all, Conc.Mean
{'cons': (-0.3019164828940955, 9.611753699291419e-70), 'ant': (-0.3400068803444331, 4.166255177955472e-89), 'cor': (-0.342293529930169, 2.325633365938142e-90)}
nno ----> 153
 all, Conc.Mean
{'son': (-0.335917824695933, 2.1857514047098397e-05), 'nas': (-0.30124307066098394, 0.00015439222950941), 'voi': (-0.35961827517545913, 4.9846252340671565e-06), 'hi': (-0.3334495278968169, 2.5322629304059756e-05)}
lav ----> 326
last, Conc.Mean
{'last_syl': (0.30362266375719027, 2.2242345447310306e-08), 'last_son': (0.3001048911942613, 3.2835561519100574e-08), 'last_cons': (-0.30362266375719027, 2.2242345447310306e-08), 'last_cont': (0.3298868614196319, 1.0271769129562786e-09)}
epo ----> 438
last, Conc.Mean
{'last_lo': (-0.3218574618999202, 5.156009842202202e-12), 'last_back': (0.3721292746389011, 7.818386049615115e-16), 'last_round': (0.3721292746389011, 7.818386049615115e-16)}
ukr ----> 1170
 all, Conc.Mean
{'cont': (-0.30464938997906754, 1.500377883870889e-26), 'voi'

In [15]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.Mean", 0.3)

lav ----> 326
 all, Aff.Mean
{'lat': (-0.3620013898308955, 1.573097087811819e-11), 'distr': (-0.30758343171645147, 1.4256395223271561e-08), 'lab': (-0.3620415286891659, 1.564430734819978e-11)}
tam ----> 292
 all, Aff.Mean
{'tense': (-0.31541147915702267, 3.632133276262565e-08), 'long': (-0.315199003835468, 3.713692471746451e-08)}


In [16]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.all.Mean", 0.3)

rus ----> 3263
 all, Aff.all.Mean
{'ant': (-0.30832896337643967, 8.283949001076445e-73), 'cor': (-0.3061929134966654, 8.86443703254754e-72)}
nno ----> 153
 all, Aff.all.Mean
{'son': (-0.3348217095274524, 2.33371730524386e-05), 'voi': (-0.3514259952354828, 8.422108712022405e-06)}
lav ----> 326
 all, Aff.all.Mean
{'distr': (-0.3922057692946257, 1.9692231329649618e-13), 'hi': (-0.3117664986182273, 8.848346440352721e-09)}
mya ----> 111
last, Aff.all.Mean
{'last_cg': (-0.31189056905927676, 0.0008613057890516605)}
epo ----> 438
last, Aff.all.Mean
{'last_back': (0.311658722883006, 2.5449019348220134e-11), 'last_round': (0.311658722883006, 2.5449019348220134e-11)}
ukr ----> 1170
 all, Aff.all.Mean
{'voi': (-0.30746846381831544, 4.890730103184369e-27), 'ant': (-0.3103372879932277, 1.5433540922070669e-27)}
est ----> 156
 all, Aff.all.Mean
{'cont': (-0.3503800502593875, 7.324679704836385e-06), 'strid': (-0.3059730309844068, 0.00010256294694324162)}
est ----> 156
 init, Aff.all.Mean
{'init_cont': 

In [14]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "V.Mean", 0.3)

lav ----> 326
 all, V.Mean
{'lat': (-0.3654348368790005, 9.779499852650086e-12), 'lab': (-0.4303444168700593, 3.9575177607658234e-16), 'lo': (-0.30948735860105947, 1.1484900946757727e-08), 'tense': (-0.3085609934449395, 1.2761147576481154e-08)}
lav ----> 326
last, V.Mean
{'last_lo': (-0.33071043433105524, 9.282233821518901e-10), 'last_tense': (-0.30885901027410717, 1.233630127234442e-08)}
mya ----> 111
last, V.Mean
{'last_hi': (0.3103525867040398, 0.0009164103196002201)}
tam ----> 292
 all, V.Mean
{'tense': (-0.32627992237476283, 1.1394826311883255e-08), 'long': (-0.3913830474062304, 3.9770048792579244e-12)}
hbs ----> 172
 init, V.Mean
{'init_syl': (-0.3047814684118472, 4.799732429847628e-05), 'init_round': (-0.33862713777462317, 5.529256851359485e-06), 'init_tense': (-0.30478146841184733, 4.799732429847628e-05)}


In [18]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "D.Mean", 0.3)

lav ----> 326
 all, D.Mean
{'lat': (-0.4014100787465438, 4.727105793540637e-14), 'distr': (-0.3215083821102962, 2.8300748159386035e-09), 'lab': (-0.4175283220297767, 3.485075288350599e-15)}
lav ----> 326
last, D.Mean
{'last_cont': (-0.3389534177292671, 3.311857520535648e-10), 'last_lo': (-0.3037273424791016, 2.1984279146502385e-08)}
mya ----> 111
 init, D.Mean
{'init_lat': (-0.3316046339810489, 0.00037759181964904014)}
est ----> 156
last, D.Mean
{'last_lab': (-0.33459014092061734, 1.962430413200205e-05)}
tam ----> 292
 all, D.Mean
{'round': (-0.3354391519074809, 4.1376573631249355e-09), 'long': (-0.3211243262116672, 1.986179933637777e-08)}


In [19]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "A.Mean", 0.3)

# 0.3 corr with feature ratio

In [54]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Conc.Mean")

est ----> 156
 init, Conc.Mean
{'init_cont': (-0.35258466790781035, 6.355774710957686e-06), 'init_strid': (-0.3383880522985208, 1.5558216724034958e-05)}
epo ----> 438
last, Conc.Mean
{'last_lo': (-0.3218574618999202, 5.156009842203296e-12), 'last_back': (0.3721292746389011, 7.818386049616617e-16), 'last_round': (0.3721292746389011, 7.818386049616617e-16)}
lav ----> 326
last, Conc.Mean
{'last_syl': (0.30362266375719027, 2.2242345447309886e-08), 'last_son': (0.3001048911942613, 3.283556151910169e-08), 'last_cons': (-0.30362266375719027, 2.2242345447309886e-08), 'last_cont': (0.3298868614196319, 1.0271769129562685e-09)}
jpn ----> 1434
 all, Conc.Mean
{'cons': (0.3192102295200195, 2.5080780063234267e-35)}
glg ----> 1984
last, Conc.Mean
{'last_syl': (0.3017432748656591, 4.843318671266405e-43), 'last_cons': (-0.30250876403302634, 2.9173154214735808e-43), 'last_hi': (-0.3099353643256879, 1.969735077187435e-45), 'last_tense': (0.3040077276392968, 1.0763329466103928e-43)}


In [57]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "V.Mean")
    

mya ----> 111
last, V.Mean
{'last_hi': (0.3103525867040398, 0.000916410319600217)}
sqi ----> 1103
 all, V.Mean
{'syl': (0.3063686223718098, 2.1209897248586407e-25)}
hbs ----> 172
 init, V.Mean
{'init_syl': (-0.3047814684118472, 4.7997324298477213e-05), 'init_round': (-0.33862713777462317, 5.5292568513595325e-06), 'init_tense': (-0.30478146841184733, 4.7997324298477213e-05)}
tam ----> 292
 all, V.Mean
{'round': (-0.33525676874479976, 4.223349691803452e-09), 'long': (-0.37716194098249245, 2.645524594325767e-11)}
khm ----> 137
 all, V.Mean
{'nas': (-0.32992200939219235, 8.248093854010872e-05)}
lav ----> 326
 all, V.Mean
{'lat': (-0.30585123100308, 1.7331844863211272e-08), 'lab': (-0.36697443459595036, 7.8876186863329e-12)}
lav ----> 326
last, V.Mean
{'last_lo': (-0.33071043433105524, 9.282233821519306e-10), 'last_tense': (-0.30885901027410717, 1.2336301272344737e-08)}


In [59]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "A.Mean")

In [58]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "D.Mean")

mya ----> 111
 init, D.Mean
{'init_lat': (-0.3316046339810489, 0.000377591819649037)}
est ----> 156
last, D.Mean
{'last_lab': (-0.33459014092061734, 1.962430413200193e-05)}
tam ----> 292
 all, D.Mean
{'round': (-0.3571245698682493, 3.2821159878984106e-10), 'long': (-0.30417092518706856, 1.148394382388517e-07)}
lav ----> 326
 all, D.Mean
{'lat': (-0.36382892164397057, 1.2223060247002483e-11), 'distr': (-0.3053090132103879, 1.8419885239729262e-08), 'lab': (-0.36563616830006423, 9.509001936505161e-12)}
lav ----> 326
last, D.Mean
{'last_cont': (-0.3389534177292671, 3.3118575205356703e-10), 'last_lo': (-0.3037273424791016, 2.198427914650291e-08)}


In [55]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.Mean")

tam ----> 292
 all, Aff.Mean
{'round': (-0.30592563664017214, 9.625427762746871e-08), 'long': (-0.31548045102845135, 3.606031792035894e-08)}
lav ----> 326
 all, Aff.Mean
{'lat': (-0.3236262667814881, 2.1968663686653055e-09), 'lab': (-0.3242257122583718, 2.044165209322184e-09)}


In [56]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.all.Mean")

mya ----> 111
last, Aff.all.Mean
{'last_cg': (-0.31189056905927676, 0.0008613057890516652)}
est ----> 156
 init, Aff.all.Mean
{'init_cont': (-0.3352381330071859, 1.886616628409679e-05), 'init_strid': (-0.30241241837118027, 0.00012455194181262572)}
hbs ----> 172
 init, Aff.all.Mean
{'init_cont': (-0.3038906737744619, 5.063135598723015e-05)}
epo ----> 438
last, Aff.all.Mean
{'last_back': (0.311658722883006, 2.544901934822649e-11), 'last_round': (0.311658722883006, 2.544901934822649e-11)}
lav ----> 326
 all, Aff.all.Mean
{'distr': (-0.39654827514180957, 1.0100010258701049e-13)}
slk ----> 2144
last, Aff.all.Mean
{'last_long': (-0.322889557154704, 3.2643932297399235e-53)}


# 0.2 corr

In [39]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Conc.Mean")

gla ----> 188
 all, Conc.Mean
{'ant': (0.2619142633836412, 0.00028251499309589934)}
gla ----> 188
 init, Conc.Mean
{'init_delrel': (-0.24874455846853805, 0.0005771534502786629), 'init_ant': (0.2693535277226441, 0.00018557887884399987), 'init_lab': (0.24879799144134968, 0.0005755264404395278), 'init_hi': (-0.21133887793381456, 0.00359854893653898), 'init_lo': (-0.2217209847463268, 0.002228473666410493), 'init_back': (-0.23432055607295238, 0.0012096514294630083)}
mya ----> 111
 init, Conc.Mean
{'init_hi': (0.20053876584249694, 0.034825792767616055)}
tgl ----> 1483
 all, Conc.Mean
{'round': (0.2101144868971068, 2.9326118367608777e-16)}
kat ----> 791
 init, Conc.Mean
{'init_cont': (-0.23576844766142568, 1.8764578061255294e-11)}
est ----> 156
 all, Conc.Mean
{'cont': (-0.2587386988078081, 0.0011087357000114856), 'strid': (-0.2618841120960649, 0.0009584221676429321), 'voi': (-0.2487044323452575, 0.0017440445140673609), 'tense': (-0.20305931688112702, 0.011010781499310777)}
est ----> 156
 ini

In [41]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.Mean")

mya ----> 111
 all, Aff.Mean
{'nas': (0.23067066867786437, 0.014863068077227851), 'voi': (0.29530980351596725, 0.0016521018992835148)}
mya ----> 111
 init, Aff.Mean
{'init_nas': (0.21985318078582325, 0.0204210689119446)}
sqi ----> 1103
 all, Aff.Mean
{'syl': (0.2994248835500898, 2.777093522985666e-24), 'cons': (-0.26353026967040005, 5.580415029176263e-19)}
hbs ----> 172
 init, Aff.Mean
{'init_syl': (-0.22619208526894324, 0.0028486273764907768), 'init_round': (-0.2947252495911921, 8.686125804463092e-05), 'init_tense': (-0.22619208526894327, 0.0028486273764907768)}
tam ----> 292
 all, Aff.Mean
{'cons': (-0.2010594881932116, 0.0005475825060732852), 'lat': (-0.24075768173553103, 3.214089056091791e-05), 'lo': (-0.2536897586576691, 1.1415575176337624e-05), 'back': (-0.2354882364283391, 4.822574005765132e-05), 'round': (-0.30592563664017214, 9.625427762746871e-08), 'tense': (-0.2752761701792748, 1.7851372704837212e-06), 'long': (-0.31548045102845135, 3.606031792035894e-08)}
tam ----> 292
 ini

In [42]:
for lang in list(set(df["LANG_PRON"].tolist())):
    get_corr_features_by_lang(df, lang, "Aff.all.Mean")

gla ----> 188
 all, Aff.all.Mean
{'ant': (0.26300388468893937, 0.0002658518434790532)}
gla ----> 188
 init, Aff.all.Mean
{'init_ant': (0.26307234751596664, 0.0002648360057946244), 'init_lab': (0.2625673778512594, 0.00027241431271613414), 'init_hi': (-0.20982692393485416, 0.0038517168228235973), 'init_lo': (-0.20362918446916756, 0.005065652720368666), 'init_back': (-0.24412180317343685, 0.0007351252753210238)}
mya ----> 111
 all, Aff.all.Mean
{'cg': (-0.24911379599104835, 0.008374918646804144), 'hi': (0.2157174979817105, 0.02297567824348494)}
mya ----> 111
 init, Aff.all.Mean
{'init_hi': (0.20672896385562994, 0.029484823194524945)}
kat ----> 791
 all, Aff.all.Mean
{'lo': (0.21661781431247112, 7.464434890896715e-10)}
kat ----> 791
 init, Aff.all.Mean
{'init_cont': (-0.24353240535093124, 3.831860434584425e-12)}
est ----> 156
 all, Aff.all.Mean
{'cont': (-0.27701761012199155, 0.00046370532965655506), 'strid': (-0.28166508626677733, 0.0003679010498446594)}
est ----> 156
 init, Aff.all.Mean
