In [1]:
from collections import defaultdict, Counter
import pandas as pd

In [2]:
def load_cldf_dataset(path_to_values, path_to_languages):
    """Read a values table and attach the matching language columns.

    Both arguments are handed unchanged to ``pd.read_csv``. Every row of the
    values table is kept (left join); a language row is attached when the
    values table's ``Language_ID`` equals the languages table's ``ID``.
    Returns the merged ``DataFrame``.
    """
    values_table = pd.read_csv(path_to_values)
    languages_table = pd.read_csv(path_to_languages)
    return pd.merge(
        values_table,
        languages_table,
        how="left",
        left_on="Language_ID",
        right_on="ID",
    )

In [3]:
def get_frequencies_w_inventory_collapsing(dataset):
    """Count, for every segment, how many languages have it.

    All rows sharing a ``Language_ID`` are collapsed into one set of
    ``Value`` entries, so a segment counts at most once per language no
    matter how many rows list it.

    Rows with a missing ``Language_ID`` are ignored. A missing ``Value`` is
    never counted as a segment, but its language still counts towards the
    number of languages used as the denominator.

    Prints the number of languages found as a side effect.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide ``Language_ID`` and ``Value`` columns.

    Returns
    -------
    tuple
        ``(frequencies_absolute, frequencies_relative)``: a ``Counter``
        mapping segment -> number of languages containing it, and a dict
        mapping segment -> that number divided by the number of languages.
    """
    glottocode_to_inventory = defaultdict(set)
    for row in dataset.itertuples():
        if pd.isnull(row.Language_ID):
            continue
        # Touch the entry first so the language is registered even when the
        # row carries no usable segment.
        inventory = glottocode_to_inventory[row.Language_ID]
        # A null Value (None/NaN) is not a segment; adding it to the set
        # would make it show up in the frequency tables.
        if not pd.isnull(row.Value):
            inventory.add(row.Value)
    n_languages = len(glottocode_to_inventory)
    print(f'{n_languages} languages')
    frequencies_absolute = Counter(
        segment
        for segments in glottocode_to_inventory.values()
        for segment in segments
    )
    frequencies_relative = {
        segment: count / n_languages
        for segment, count in frequencies_absolute.items()
    }
    return frequencies_absolute, frequencies_relative

In [4]:
# Load both datasets through the same helper; each one is a values table
# joined with its languages table.
segbo = load_cldf_dataset(
    path_to_values='../data/segbo/cldf/values.csv',
    path_to_languages='../data/segbo/cldf/languages.csv',
)
phoible = load_cldf_dataset(
    path_to_values='../data/phoible/cldf/values.csv',
    path_to_languages='../data/phoible/cldf/languages.csv',
)

In [5]:
import re
# Expected shape of a Language_ID (presumably a Glottocode — TODO confirm):
# four lowercase ASCII letters followed by four digits; `{4,4}` is equivalent
# to `{4}`. The pattern carries no end anchor, so `gltc_pattern.match` only
# checks the start of a string and accepts trailing characters.
gltc_pattern = re.compile(r'[a-z]{4,4}\d{4,4}')

In [6]:
# List the Language_ID values that do not start with the shape described by
# `gltc_pattern`. Non-string IDs (e.g. NaN for a missing value) are reported
# as odd too, instead of raising a TypeError inside `Pattern.match`.
phoible.loc[
    phoible.Language_ID.map(
        lambda lid: not (isinstance(lid, str) and gltc_pattern.match(lid))
    )
].Language_ID.unique()

array(['l1'], dtype=object)

In [7]:
# Number of distinct Language_ID values in PHOIBLE (a missing ID, if any,
# counts as one value, exactly as with `unique()`).
n_phoible_inventories = phoible.Language_ID.nunique(dropna=False)
n_phoible_inventories

2177

In [8]:
# Number of distinct Language_ID values in SEGBO before any filtering.
segbo.Language_ID.nunique(dropna=False)

498

In [9]:
# How many SEGBO language IDs never occur in PHOIBLE?
len(set(segbo.Language_ID).difference(phoible.Language_ID))

199

In [10]:
# Per-language segment frequencies for PHOIBLE (absolute counts and the
# counts divided by the number of languages).
phoible_frequencies_absolute, phoible_frequencies_relative = (
    get_frequencies_w_inventory_collapsing(phoible)
)

# Show the ten segments found in the largest number of languages.
top_10_phoible = phoible_frequencies_absolute.most_common(10)
for entry in top_10_phoible:
    el, freq = entry
    print(f'{el}: {freq} -> {phoible_frequencies_relative[el]}')

2177 languages
m: 2112 -> 0.9701423977951309
i: 2076 -> 0.9536058796508957
k: 2004 -> 0.9205328433624254
j: 1993 -> 0.915480018373909
u: 1992 -> 0.9150206706476803
a: 1983 -> 0.9108865411116215
p: 1895 -> 0.870463941203491
w: 1883 -> 0.864951768488746
n: 1845 -> 0.8474965548920533
t: 1656 -> 0.7606798346348186


In [11]:
# Exclude SEGBO languages not found in PHOIBLE. `isin` performs the
# membership test for the whole column at once instead of calling a Python
# lambda per row.
phoible_langs = set(phoible.Language_ID)
segbo = segbo.loc[segbo.Language_ID.isin(phoible_langs)]

In [12]:
# Number of distinct SEGBO Language_ID values left after the filter above.
segbo.Language_ID.nunique(dropna=False)

299

In [13]:
# Sanity check: no SEGBO language ID should be missing from PHOIBLE now.
len(set(segbo.Language_ID).difference(phoible.Language_ID))

0

In [14]:
# Per-language segment frequencies for the filtered SEGBO data.
segbo_frequencies_absolute, segbo_frequencies_relative = (
    get_frequencies_w_inventory_collapsing(segbo)
)

# Show the ten segments found in the largest number of SEGBO languages.
top_10_segbo = segbo_frequencies_absolute.most_common(10)
for entry in top_10_segbo:
    el, freq = entry
    print(f'{el}: {freq} -> {segbo_frequencies_relative[el]}')

299 languages
f: 101 -> 0.3377926421404682
ɡ: 52 -> 0.17391304347826086
b: 41 -> 0.13712374581939799
z: 38 -> 0.12709030100334448
d: 38 -> 0.12709030100334448
h: 34 -> 0.11371237458193979
d̠ʒ: 32 -> 0.10702341137123746
ʃ: 32 -> 0.10702341137123746
v: 31 -> 0.10367892976588629
p: 31 -> 0.10367892976588629


In [15]:
# Replace the SEGBO relative frequencies: the denominator is the number of
# PHOIBLE languages rather than the number of SEGBO languages.
segbo_frequencies_relative = dict(
    (segment, count_segbo / n_phoible_inventories)
    for segment, count_segbo in segbo_frequencies_absolute.items()
)

In [16]:
# Absolute frequencies in SEGBO are sometimes higher than in PHOIBLE, which
# is problematic for statistics. Two adjusted versions of the absolute
# PHOIBLE frequencies are built for every SEGBO segment:
#   * phoible_greater_or_equal: the larger of the PHOIBLE and SEGBO counts,
#     so the value is greater than or equal to the SEGBO count;
#   * phoible_strictly_greater: that same value plus one (what the author
#     refers to as Laplace smoothing), so it is strictly greater.
# Segments whose SEGBO count reaches or exceeds the PHOIBLE count are printed.
phoible_greater_or_equal = {}
phoible_strictly_greater = {}
for segment, count_segbo in segbo_frequencies_absolute.items():
    # Counter lookup: a segment absent from PHOIBLE yields 0.
    count_phoible = phoible_frequencies_absolute[segment]
    if count_segbo >= count_phoible:
        print(segment, count_segbo, count_phoible)
    adjusted_count = max(count_segbo, count_phoible)
    phoible_greater_or_equal[segment] = adjusted_count
    phoible_strictly_greater[segment] = adjusted_count + 1

ʕ̞ 1 0
ʊai 1 0
ɸʷ 1 1
tsʲʰ 1 1
pʷʰ 1 1
d̠̤ʒ̤ 1 1
ɹ̤ 1 0
uə̯ 1 1
ɨə̯ 1 0
l̪ˤ 1 1
n̪ˤ 1 1
n̺d̺z̺ 1 0
ðˠ 1 1


In [17]:
# Turn both adjusted PHOIBLE count tables into relative frequencies, using
# the number of PHOIBLE languages as the denominator.
phoible_freqs_relative = dict(
    (segment, count / n_phoible_inventories)
    for segment, count in phoible_greater_or_equal.items()
)
phoible_freqs_relative_laplace = dict(
    (segment, count / n_phoible_inventories)
    for segment, count in phoible_strictly_greater.items()
)

# Print the ten highest plain frequencies next to their add-one counterparts.
most_frequent_first = sorted(
    phoible_freqs_relative.items(), key=lambda pair: pair[1], reverse=True
)
for segment, f_s in most_frequent_first[:10]:
    print(f'{segment}: {f_s}, {phoible_freqs_relative_laplace[segment]}')

m: 0.9701423977951309, 0.9706017455213597
k: 0.9205328433624254, 0.9209921910886542
j: 0.915480018373909, 0.9159393661001378
u: 0.9150206706476803, 0.915480018373909
a: 0.9108865411116215, 0.9113458888378503
p: 0.870463941203491, 0.8709232889297198
w: 0.864951768488746, 0.8654111162149747
n: 0.8474965548920533, 0.847955902618282
t: 0.7606798346348186, 0.7611391823610473
l: 0.7266881028938906, 0.7271474506201194


In [18]:
# Borrowability scores (the author refers to this as Eisen's formula with
# normalisation), computed once per version of the PHOIBLE frequencies.
def _normalised_borrowability(f_segbo, f_phoible):
    """Return f_segbo / (f_phoible - f_phoible**2) / 6.

    NOTE(review): the divisor 6 is presumably the normalisation constant
    mentioned above — TODO document where it comes from.
    """
    return f_segbo / (f_phoible - f_phoible**2) / 6


borrowability_scores = {}
borrowability_scores_laplace = {}
for segment, f_segbo in segbo_frequencies_relative.items():
    borrowability_scores[segment] = _normalised_borrowability(
        f_segbo, phoible_freqs_relative[segment])
    borrowability_scores_laplace[segment] = _normalised_borrowability(
        f_segbo, phoible_freqs_relative_laplace[segment])

In [19]:
# Keep only segments whose absolute SEGBO frequency is at least 10, then
# display them from the highest to the lowest score.
borrowability_scores = {
    segment: score
    for segment, score in borrowability_scores.items()
    if segbo_frequencies_absolute[segment] >= 10
}
sorted(borrowability_scores.items(), key=lambda pair: pair[1], reverse=True)

[('f', 0.03131315979556449),
 ('p', 0.02104798617738605),
 ('ɡ', 0.016305565878208066),
 ('b', 0.013561760809053231),
 ('z', 0.013522755878997114),
 ('ʒ', 0.013063819433761137),
 ('d̠ʒ', 0.011803296464975062),
 ('v', 0.01168581778387289),
 ('d', 0.011637518709837154),
 ('x', 0.010997585279851278),
 ('h', 0.010670627101530257),
 ('ʃ', 0.010643297368815065),
 ('l', 0.009251134081951362),
 ('r', 0.008881549145425395),
 ('t̠ʃ', 0.008795380852284467),
 ('o', 0.008581466617976598),
 ('ɾ', 0.008309477460971793),
 ('s', 0.007337184934133851),
 ('ɣ', 0.007197289032151416),
 ('ts', 0.006746449553090329),
 ('e', 0.006693107052819283),
 ('ŋ', 0.004060536878702419),
 ('ʔ', 0.0032222464963804856)]

In [20]:
# Same filter and ordering for the scores based on the add-one PHOIBLE
# frequencies: keep segments with an absolute SEGBO frequency of at least 10.
borrowability_scores_laplace = {
    segment: score
    for segment, score in borrowability_scores_laplace.items()
    if segbo_frequencies_absolute[segment] >= 10
}
sorted(borrowability_scores_laplace.items(),
       key=lambda pair: pair[1], reverse=True)

[('f', 0.03130673961230827),
 ('p', 0.021111749277995504),
 ('ɡ', 0.016310273841550427),
 ('b', 0.013569108701852064),
 ('z', 0.013511994945782806),
 ('ʒ', 0.013031529913692712),
 ('d̠ʒ', 0.011792555035534754),
 ('v', 0.011674392278401571),
 ('d', 0.011637695520768798),
 ('x', 0.010977107914592443),
 ('h', 0.010673766167544297),
 ('ʃ', 0.010637329722406982),
 ('l', 0.009260854491429348),
 ('r', 0.008881714076700149),
 ('t̠ʃ', 0.008792762227876732),
 ('o', 0.008587283404808561),
 ('ɾ', 0.008301302583813794),
 ('s', 0.00734376370419846),
 ('ɣ', 0.007179226740888285),
 ('ts', 0.0067375392662055306),
 ('e', 0.006698231574883583),
 ('ŋ', 0.004063083240014931),
 ('ʔ', 0.0032208620725367137)]