In [1]:
#%pip install pandas
#%pip install sparqlwrapper

### Setup the environment by creating a wrapper to query a public (limited) SPARQL using OFFSET/LIMIT windowing

In [2]:
from typing import NoReturn, Tuple, Union, List

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, Wrapper

class SPARQLEndpoint:
    """
    Class allowing to make a query on a remote SPARQL server, its main characteristics are :
     - Taking into account the big answers by concatenating them as they are received
     - Ability to access the size of the database
     - Ability to retrieve the response in `pandas` data frame format
    """

    def __init__(self, endpoint: str, verbose: bool = False) -> NoReturn:
        """
        :param endpoint: Url to the remote SPARQL service
        :param verbose: If the detail text will be displayed
        """
        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat("json")

        self.verbose: bool = verbose
        self.is_widget: bool = False

    def query_once(self, query: str) -> Tuple[pd.DataFrame, Union[int, None]]:
        """
        Helper function to convert SPARQL results into a Pandas data frame.

        returns the dataframe along with an indication of server limit hit.

        The second value of the tuple is either an int (we hit a server limit of this amount)

        Credit: Douglas Fils

        :param query: The query to perform
        """

        self.sparql.setQuery(query)

        processed_results: Wrapper.QueryResult = self.sparql.query()

        # We will check if the results are incomplete due to server limitations
        max_size: Union[int, None] = None
        if 'x-sparql-maxrows' in processed_results.info():
            max_size = int(processed_results.info()['x-sparql-maxrows'])

        if 'x-sql-state' in processed_results.info():
            max_size = 0

        processed_results: dict = processed_results.convert()

        cols: list[str] = processed_results['head']['vars']

        out: list[list[str]] = [[row.get(c, {}).get('value') for c in cols] for row in
                                processed_results['results']['bindings']]
        return pd.DataFrame(out, columns=cols), max_size

    def query_all(self, query: str) -> pd.DataFrame:
        origin: int = 0
        df, limit = self.query_once(query)
        frames: List[pd.DataFrame] = [df]
        original_limit: int = limit
        while limit:
            origin += limit
            if self.verbose:
                print('\r', f"Hit the server limit, offseting query to {origin}", end='')
            limited_query = query + f'\n OFFSET {origin} LIMIT {original_limit}'
            df, limit = self.query_once(limited_query)
            frames.append(df)
        result = pd.concat(frames, ignore_index=True)
        if self.verbose:
            print(f" --> Retrieved {len(result)} rows", end='\n')
        return result

### Querying pronunciation data

In [3]:
q_without_pron = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?p ?e ?t ?pr ?mweOrLE
FROM <http://kaiko.getalp.org/dbnary/eng>
WHERE {
  ?e a ?mweOrLE ; rdfs:label ?t .
  OPTIONAL { ?e ontolex:canonicalForm / ontolex:phoneticRep ?pr }
  FILTER not exists { ?e ontolex:canonicalForm/ontolex:phoneticRep ?pr}

  VALUES ?mweOrLE
           { %%TYPE%% }
  ?p a dbnary:Page ; dbnary:describes ?e
}"""

q_with_pron = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?p ?e ?t ?pr ?mweOrLE
FROM <http://kaiko.getalp.org/dbnary/eng>
WHERE {
  ?e a ?mweOrLE ; rdfs:label ?t ;
  ontolex:canonicalForm / ontolex:phoneticRep ?pr.
  VALUES ?mweOrLE { %%TYPE%% }
  ?p a dbnary:Page ; dbnary:describes ?e
}"""

In [4]:
endpoint = "http://kaiko.getalp.org/sparql"

dbnary = SPARQLEndpoint(endpoint, True)

In [5]:
mwe_with_pron = dbnary.query_all(q_with_pron.replace('%%TYPE%%', 'ontolex:MultiWordExpression'))

 --> Retrieved 8143 rows


In [162]:
mwe_with_pron.nunique()

p          4656
e          4977
t          4656
pr         7366
mweOrLE       1
dtype: int64

In [165]:
mwe_without_pron[['p', 'e', 'pr']].nunique()

p     210845
e     214243
pr         0
dtype: int64

In [6]:
mwe_without_pron = dbnary.query_all(q_without_pron.replace('%%TYPE%%', 'ontolex:MultiWordExpression'))

 Hit the server limit, offseting query to 210000 --> Retrieved 214244 rows


In [7]:
entries_with_pron = dbnary.query_all(q_with_pron.replace('%%TYPE%%', 'ontolex:LexicalEntry'))

 Hit the server limit, offseting query to 170000 --> Retrieved 173517 rows


In [49]:
entries_with_pron

Unnamed: 0,p,e,t,pr,mweOrLE
0,http://kaiko.getalp.org/dbnary/eng/Bloomingdale,http://kaiko.getalp.org/dbnary/eng/Bloomingdal...,Bloomingdale,/ˈblu.mɪŋ.deɪl/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
1,http://kaiko.getalp.org/dbnary/eng/DINB,http://kaiko.getalp.org/dbnary/eng/DINB__Noun__1,DINB,/ˈdɪnbiː/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
2,http://kaiko.getalp.org/dbnary/eng/Gowanus,http://kaiko.getalp.org/dbnary/eng/Gowanus__Pr...,Gowanus,/ɡəˈwɑːnəs/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
3,http://kaiko.getalp.org/dbnary/eng/Juul,http://kaiko.getalp.org/dbnary/eng/Juul__Noun__1,Juul,/d͡ʒuːl/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
4,http://kaiko.getalp.org/dbnary/eng/Juul,http://kaiko.getalp.org/dbnary/eng/Juul__Prope...,Juul,/d͡ʒuːl/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
...,...,...,...,...,...
173512,http://kaiko.getalp.org/dbnary/eng/unattach,http://kaiko.getalp.org/dbnary/eng/unattach__V...,unattach,/ˌʌnəˈtæt͡ʃ/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
173513,http://kaiko.getalp.org/dbnary/eng/unctional,http://kaiko.getalp.org/dbnary/eng/unctional__...,unctional,/ˈʌŋkt͡ʃnəl/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
173514,http://kaiko.getalp.org/dbnary/eng/unctional,http://kaiko.getalp.org/dbnary/eng/unctional__...,unctional,/ˈʌŋkt͡ʃənəl/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry
173515,http://kaiko.getalp.org/dbnary/eng/updateability,http://kaiko.getalp.org/dbnary/eng/updateabili...,updateability,/ˈʌp.deɪtəˈbɪlɪti/,http://www.w3.org/ns/lemon/ontolex#LexicalEntry


In [166]:
entries_with_pron.nunique()

p           75082
e          107327
t           75086
pr         111934
mweOrLE         1
dtype: int64

In [48]:
mwe_with_pron

Unnamed: 0,p,e,t,pr,mweOrLE
0,http://kaiko.getalp.org/dbnary/eng/absentee_ow...,http://kaiko.getalp.org/dbnary/eng/absentee_ow...,absentee ownership,/ˌæb.sn̩ˈti ˈoʊ.nɚˌʃɪp/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
1,http://kaiko.getalp.org/dbnary/eng/absolutely_...,http://kaiko.getalp.org/dbnary/eng/absolutely_...,absolutely convergent,/ˈæb.səˌlut.li kənˈvɚ.d͡ʒn̩t/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
2,http://kaiko.getalp.org/dbnary/eng/absolutely_...,http://kaiko.getalp.org/dbnary/eng/absolutely_...,absolutely convergent,/ˌæb.səˈlut.li kənˈvɚ.d͡ʒn̩t/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
3,http://kaiko.getalp.org/dbnary/eng/cold_abscess,http://kaiko.getalp.org/dbnary/eng/cold_absces...,cold abscess,/koʊld ˈæbˌsɛs/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
4,http://kaiko.getalp.org/dbnary/eng/cold_abscess,http://kaiko.getalp.org/dbnary/eng/cold_absces...,cold abscess,/koʊld ˈæbˌsɪs/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
...,...,...,...,...,...
8138,http://kaiko.getalp.org/dbnary/eng/raræ_aves,http://kaiko.getalp.org/dbnary/eng/raræ_aves__...,raræ aves,/ˌɹɑːɹiː ˈeɪviːs/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
8139,http://kaiko.getalp.org/dbnary/eng/raræ_aves,http://kaiko.getalp.org/dbnary/eng/raræ_aves__...,raræ aves,/ˌɹɛːɹaɪ-/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
8140,http://kaiko.getalp.org/dbnary/eng/raræ_aves,http://kaiko.getalp.org/dbnary/eng/raræ_aves__...,raræ aves,/ˌɹɛːɹiː-/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...
8141,http://kaiko.getalp.org/dbnary/eng/whame_fly,http://kaiko.getalp.org/dbnary/eng/whame_fly__...,whame fly,/ˈweɪm ˌflaɪ/,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...


In [136]:
mwe_with_pron['p'].nunique()

4656

In [137]:
mwe_without_pron['p'].nunique()

210845

##### Merging the different pronunciations for each entry

When an MWT *entry* has several pronunciations, it will have several lines in the mwe_with_pron dataframe (one per pronunication). Hence, in order to prepare the processing, we need to merge such lines and produce a list of pronunciations for each entry, so that the evaluation will compare the list of *known* pronunciations with the list of *proposed* pronunciations.

In [12]:
aggregated_mwe_with_pron = mwe_with_pron.groupby('e').aggregate(
    {
        "p": lambda tdf: tdf.iloc[0],
        "t": lambda tdf: tdf.iloc[0],
        'pr': lambda tdf: ','.join(sorted(tdf.unique().tolist()))
     }
    )

In [13]:
aggregated_mwe_with_pron.count()

p     4977
t     4977
pr    4977
dtype: int64

In [14]:
# mwe_without_pron

In [15]:
mwe_without_pron['p'].nunique()

210845

Now, we have all entries with their pronunciation and all mwe (with and without) pronunciation.

Next, we should try to decide if a page is a heteronym (i.e.: has different entries with different pronunciation).

Then, try to create a pronunciation for MWE (by decomposing and constructing the pronunciation), and warn if an entry part is an heteronym.

Evaluate also.

### Playing with heteronymy

In [16]:
q_heteronyms_template = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
PREFIX dbnary: <http://kaiko.getalp.org/dbnary#>
PREFIX dbnary-eng: <http://kaiko.getalp.org/dbnary/eng/>

SELECT ?e ?prons
FROM <http://kaiko.getalp.org/dbnary/eng>
WHERE {
    %%PAGE%% dbnary:describes ?e.
    {
        SELECT ?e (GROUP_CONCAT(?pr ; SEPARATOR=",") as ?prons) {
            SELECT ?pr ?e {
                ?e ontolex:canonicalForm / ontolex:phoneticRep ?pr .
            }
            GROUP BY ?e ?pr
            ORDER BY ?pr
        } GROUP BY ?e
    }
}
"""

def get_page_uri(label: str) -> str:
    return '<http://kaiko.getalp.org/dbnary/eng/' + label.replace('/', '!slash!') + '>'

def get_entries_and_pronunciations_from_dbnary(page: str) -> pd.DataFrame:
    query = q_heteronyms_template.replace('%%PAGE%%', get_page_uri(page))
    df = dbnary.query_all(query)
    return df

get_entries_and_pronunciations_from_dbnary('bass')

 --> Retrieved 5 rows


Unnamed: 0,e,prons
0,http://kaiko.getalp.org/dbnary/eng/bass__Noun__3,/bæs/
1,http://kaiko.getalp.org/dbnary/eng/bass__Noun__2,/bæs/
2,http://kaiko.getalp.org/dbnary/eng/bass__Noun__1,/beɪs/
3,http://kaiko.getalp.org/dbnary/eng/bass__Verb__1,/beɪs/
4,http://kaiko.getalp.org/dbnary/eng/bass__Adjec...,/beɪs/


In [17]:
def get_entries_and_pronunciations(page: str) -> pd.DataFrame:
    return entries_with_pron[entries_with_pron.t == page][['e', 'pr']].groupby('e').aggregate(lambda tdf: ','.join(sorted(tdf.unique().tolist()))).rename(
        columns={"pr": "prons"}
    )

In [18]:
get_entries_and_pronunciations_from_dbnary('bank')

 --> Retrieved 7 rows


Unnamed: 0,e,prons
0,http://kaiko.getalp.org/dbnary/eng/bank__Verb__1,"/bæŋk/,[beɪŋk]"
1,http://kaiko.getalp.org/dbnary/eng/bank__Noun__2,"/bæŋk/,[beɪŋk]"
2,http://kaiko.getalp.org/dbnary/eng/bank__Noun__4,"/bæŋk/,[beɪŋk]"
3,http://kaiko.getalp.org/dbnary/eng/bank__Noun__1,"/bæŋk/,[beɪŋk]"
4,http://kaiko.getalp.org/dbnary/eng/bank__Noun__3,"/bæŋk/,[beɪŋk]"
5,http://kaiko.getalp.org/dbnary/eng/bank__Verb__2,"/bæŋk/,[beɪŋk]"
6,http://kaiko.getalp.org/dbnary/eng/bank__Verb__3,"/bæŋk/,[beɪŋk]"


In [19]:
get_entries_and_pronunciations('bank')

Unnamed: 0_level_0,prons
e,Unnamed: 1_level_1
http://kaiko.getalp.org/dbnary/eng/bank__Noun__1,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Noun__2,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Noun__3,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Noun__4,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Verb__1,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Verb__2,"/bæŋk/,[beɪŋk]"
http://kaiko.getalp.org/dbnary/eng/bank__Verb__3,"/bæŋk/,[beɪŋk]"


In [20]:
get_entries_and_pronunciations_from_dbnary('s/he')

 --> Retrieved 1 rows


Unnamed: 0,e,prons
0,http://kaiko.getalp.org/dbnary/eng/s!slash!he_...,/hi ɔː(ɹ) ʃi/


In [21]:
get_entries_and_pronunciations('s/he')

Unnamed: 0_level_0,prons
e,Unnamed: 1_level_1
http://kaiko.getalp.org/dbnary/eng/s!slash!he__Pronoun__1,/hi ɔː(ɹ) ʃi/


In [160]:
get_entries_and_pronunciations('lead')


Unnamed: 0_level_0,prons
e,Unnamed: 1_level_1
http://kaiko.getalp.org/dbnary/eng/lead__Adjective__1,"/lid/,/liːd/"
http://kaiko.getalp.org/dbnary/eng/lead__Noun__1,/lɛd/
http://kaiko.getalp.org/dbnary/eng/lead__Noun__2,"/lid/,/liːd/"
http://kaiko.getalp.org/dbnary/eng/lead__Verb__1,/lɛd/
http://kaiko.getalp.org/dbnary/eng/lead__Verb__2,"/lid/,/liːd/"
http://kaiko.getalp.org/dbnary/eng/lead__Verb__3,"/lid/,/liːd/"


#### Trying to get all heteronyms in one giant SPARQL query

In [22]:
p_pages_and_prons = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
PREFIX dbnary: <http://kaiko.getalp.org/dbnary#>
PREFIX dbnary-eng: <http://kaiko.getalp.org/dbnary/eng/>

SELECT ?p ?prons (GROUP_CONCAT(?e ; SEPARATOR = ",") as ?entries)
FROM <http://kaiko.getalp.org/dbnary/eng>
WHERE {
    ?p a dbnary:Page; dbnary:describes ?e.
    {
        SELECT ?e (GROUP_CONCAT(?pr ; SEPARATOR=",") as ?prons) {
            SELECT ?pr ?e {
                ?e ontolex:canonicalForm / ontolex:phoneticRep ?pr .
            }
            GROUP BY ?e ?pr
            ORDER BY ?pr
        } GROUP BY ?e
    }
} GROUP BY ?p ?prons

"""

pages_and_prons = dbnary.query_all(p_pages_and_prons)
pages_and_prons

 Hit the server limit, offseting query to 70000 --> Retrieved 76131 rows


Unnamed: 0,p,prons,entries
0,http://kaiko.getalp.org/dbnary/eng/prophetess,/pɹɒfɪˈtɛs/,http://kaiko.getalp.org/dbnary/eng/prophetess_...
1,http://kaiko.getalp.org/dbnary/eng/old-fashioned,"/oʊldˈfæʃənd/,/əʊldˈfæʃənd/",http://kaiko.getalp.org/dbnary/eng/old-fashion...
2,http://kaiko.getalp.org/dbnary/eng/maltreatment,/malˈtɹiːtmənt/,http://kaiko.getalp.org/dbnary/eng/maltreatmen...
3,http://kaiko.getalp.org/dbnary/eng/irrigation,/ˌɪɹəˈɡeɪʃən/,http://kaiko.getalp.org/dbnary/eng/irrigation_...
4,http://kaiko.getalp.org/dbnary/eng/symposium,/sɪm.ˈpoʊ.zi.əm/,http://kaiko.getalp.org/dbnary/eng/symposium__...
...,...,...,...
76126,http://kaiko.getalp.org/dbnary/eng/umbra_recta,/ˈʌmbɹə ˈɹɛktə/,http://kaiko.getalp.org/dbnary/eng/umbra_recta...
76127,http://kaiko.getalp.org/dbnary/eng/peacemaker,/ˈpiːsmeɪkə(ɹ)/,http://kaiko.getalp.org/dbnary/eng/peacemaker_...
76128,http://kaiko.getalp.org/dbnary/eng/sift_through,/ˈsɪft θɹuː/,http://kaiko.getalp.org/dbnary/eng/sift_throug...
76129,http://kaiko.getalp.org/dbnary/eng/secretory,/sɪˈkɹiːt(ə)ɹiː/,http://kaiko.getalp.org/dbnary/eng/secretory__...


In [23]:
pages_and_prons['p'].nunique()

75082

In [24]:
pages_and_prons['Counts'] = pages_and_prons.groupby(['p'])['prons'].transform('count')
heteronyms = pages_and_prons.query('Counts > 1')

In [25]:
heteronyms['p'].nunique()

970

In [26]:
heteronyms.nunique()

p           970
prons      1992
entries    2019
Counts        4
dtype: int64

In [155]:
heteronyms.sample(10)

Unnamed: 0,p,prons,entries,Counts
19571,http://kaiko.getalp.org/dbnary/eng/darkling,/ˈdɑː(ɹ)kəlɪŋ/,http://kaiko.getalp.org/dbnary/eng/darkling__A...,2
43424,http://kaiko.getalp.org/dbnary/eng/stonish,"/ˈstoʊnɪʃ/,/ˈstəʊnɪʃ/",http://kaiko.getalp.org/dbnary/eng/stonish__Ad...,2
64704,http://kaiko.getalp.org/dbnary/eng/ake,/ɑːˈkeɪ/,http://kaiko.getalp.org/dbnary/eng/ake__Adverb__1,2
13266,http://kaiko.getalp.org/dbnary/eng/footling,/ˈfʊt.lɪŋ/,http://kaiko.getalp.org/dbnary/eng/footling__N...,2
24039,http://kaiko.getalp.org/dbnary/eng/subject,"/səbˈd͡ʒɛkt/,/sʌbˈd͡ʒɛkt/",http://kaiko.getalp.org/dbnary/eng/subject__Ve...,2
68588,http://kaiko.getalp.org/dbnary/eng/tower,/ˈtəʊ.ə(ɹ)/,http://kaiko.getalp.org/dbnary/eng/tower__Noun__2,2
44601,http://kaiko.getalp.org/dbnary/eng/wang,/wæŋ/,http://kaiko.getalp.org/dbnary/eng/wang__Noun_...,2
66084,http://kaiko.getalp.org/dbnary/eng/mater,/ˈmeɪtɚ/,http://kaiko.getalp.org/dbnary/eng/mater__Noun__3,2
44837,http://kaiko.getalp.org/dbnary/eng/ay,/ˈeɪ/,http://kaiko.getalp.org/dbnary/eng/ay__Adverb_...,4
43667,http://kaiko.getalp.org/dbnary/eng/concord,/kənˈkɔɹd/,http://kaiko.getalp.org/dbnary/eng/concord__Ve...,3


In [161]:
heteronyms.query("p == 'http://kaiko.getalp.org/dbnary/eng/lead'")

Unnamed: 0,p,prons,entries,Counts
23811,http://kaiko.getalp.org/dbnary/eng/lead,"/lid/,/liːd/",http://kaiko.getalp.org/dbnary/eng/lead__Verb_...,2
53309,http://kaiko.getalp.org/dbnary/eng/lead,/lɛd/,http://kaiko.getalp.org/dbnary/eng/lead__Verb_...,2


In [27]:
heteronyms.to_pickle('../data/heteronyms.pkl')
pages_and_prons.to_pickle('../data/pages_and_prons.pkl')

### Querying derivation data for heteronym resolution

In [76]:
q_derivations = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
PREFIX dbnary: <http://kaiko.getalp.org/dbnary#>
PREFIX dbnary-eng: <http://kaiko.getalp.org/dbnary/eng/>

SELECT DISTINCT ?deriv_from ?source_label ?deriv_to ?target_label
FROM <http://kaiko.getalp.org/dbnary/eng>
WHERE {
    ?deriv_to dbnary:derivedFrom ?deriv_from ; dbnary:describes / rdfs:label ?target_label.
    ?deriv_from rdfs:label ?source_label.
}
"""

derivations = dbnary.query_all(q_derivations)

 Hit the server limit, offseting query to 230000 --> Retrieved 239284 rows


In [77]:
derivations

Unnamed: 0,deriv_from,source_label,deriv_to,target_label
0,http://kaiko.getalp.org/dbnary/eng/above__Prep...,above,http://kaiko.getalp.org/dbnary/eng/Above_Derwent,Above Derwent
1,http://kaiko.getalp.org/dbnary/eng/absolution_...,absolution,http://kaiko.getalp.org/dbnary/eng/Absolution_Day,Absolution Day
2,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian banana
3,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian,http://kaiko.getalp.org/dbnary/eng/Abyssinian_cat,Abyssinian cat
4,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian gold
...,...,...,...,...
239279,http://kaiko.getalp.org/dbnary/eng/trip_sitter...,trip sitter,http://kaiko.getalp.org/dbnary/eng/trip_sit,trip sit
239280,http://kaiko.getalp.org/dbnary/eng/turgite__No...,turgite,http://kaiko.getalp.org/dbnary/eng/turgitic,turgitic
239281,http://kaiko.getalp.org/dbnary/eng/uranocentri...,uranocentric,http://kaiko.getalp.org/dbnary/eng/uranocentri...,uranocentric orbit
239282,http://kaiko.getalp.org/dbnary/eng/vitapathy__...,vitapathy,http://kaiko.getalp.org/dbnary/eng/vitapathic,vitapathic


In [99]:
def get_pronunciations_for_entry(uri: str) -> pd.DataFrame:
    return entries_with_pron[entries_with_pron.e == uri][['e', 'pr']].groupby('e').aggregate(lambda tdf: ','.join(sorted(tdf.unique().tolist()))).rename(
        columns={"pr": "prons"}
    )

def get_derives_from(page: str, component:str) -> str:
    derivations_for_page = derivations[derivations.deriv_to == page]
    deriv_from = derivations_for_page[derivations_for_page.source_label == component]['deriv_from']
    if len(deriv_from) > 0:
        return deriv_from.iloc[0]
    return None

def get_pronunciations_from_derivation(page: str, component:str) -> pd.DataFrame:
    deriv_from = get_derives_from(page, component)
    if deriv_from:
        return get_pronunciations_for_entry(deriv_from)

In [100]:
# get_pronunciations_for_entry('http://kaiko.getalp.org/dbnary/eng/servant__Noun__1')
get_pronunciations_from_derivation('http://kaiko.getalp.org/dbnary/eng/bass_guitar', 'bass')

Unnamed: 0_level_0,prons
e,Unnamed: 1_level_1
http://kaiko.getalp.org/dbnary/eng/bass__Noun__1,/beɪs/


### Decomposing the terms

We will decompose the terms by spaces (we disregard the case of composition of MWTs).

Maybe stick to binary MWTs ?

Limitation to be expressed in the paper: we do not have the regional information on pronunciations and this would be handy to avoid composing a US pronunciation with an Australian one. In next version of DBnary, we may try to extract the pronunciation and keep the region information along with it.


In [116]:
import itertools

def cleanup_pronunciation(pron: str):
    if pron.startswith('/') and pron.endswith('/'):
        return pron[1:-1]
    else:
        return None

def combine(l: List[List[str]]) -> List[str]:
    combined = []
    for p in itertools.product(*l):
        combined.append('/' + ' '.join(p) + '/')
    return combined

## TODO: build pronunciation from dataframe instead of query to kaiko
def build_pronunciations(term: str, page: str) -> Tuple[List[str], str]:
    components = term.split(sep=' ')
    method = "Combination"
    pronunciations = []
    for component in components:
        prons: List[str] = []
        entries_and_pronunciations = get_entries_and_pronunciations(component)
        if entries_and_pronunciations['prons'].nunique() == 0:
            # print(f'WARN: component {component} has no pronunciation', end='\n')
            return [], 'Missing Pronunciation for a Component'

        if entries_and_pronunciations['prons'].nunique() > 1:
            # print(f'WARN: Component {component} is an heteronym', end='\n')
            # Check if the current MWT is known to derive from one of this components entry
            derivation_pronunciations = get_pronunciations_from_derivation(page, component)
            method = "Heteronymy"
            if derivation_pronunciations is not None and len(derivation_pronunciations) > 0:
                prons = str.split(derivation_pronunciations['prons'][0], sep=',')
            else:
                # What should I do if no derivation pronunciation are available when I have heteronymy ?
                return [], "No Derivation for Heteronymy"
        else:
            prons = str.split(entries_and_pronunciations['prons'][0], sep=',')
        prons = [cleanup_pronunciation(pron) for pron in prons]
        prons = [pron for pron in prons if pron]
        pronunciations.append(prons)
    return combine(pronunciations), method

build_pronunciations('bass guitar', 'http://kaiko.getalp.org/dbnary/eng/bass_guitar')


(['/beɪs ɡiˈtɑɹ/',
  '/beɪs ɡəˈtɑɹ/',
  '/beɪs ɡɪˈtɑɹ/',
  '/beɪs ɡɪˈtɑː(ɹ)/',
  '/beɪs ˈɡɪ.tɑɹ/'],
 'Heteronymy')

In [110]:
build_pronunciations('sea bass', 'http://kaiko.getalp.org/dbnary/eng/sea_bass')


(['/seɪ bæs/', '/siː bæs/'], 'Heteronymy')

In [111]:
build_pronunciations('Abyssinian banana', 'http://kaiko.getalp.org/dbnary/eng/Abyssinian_banana')

(['/ˌæb.əˈsɪn.i.ən bəˈnæ.nə/',
  '/ˌæb.əˈsɪn.i.ən bəˈnɑː.nə/',
  '/ˌæb.əˈsɪn.jən bəˈnæ.nə/',
  '/ˌæb.əˈsɪn.jən bəˈnɑː.nə/',
  '/ˌæb.ɪˈsɪn.i.ən bəˈnæ.nə/',
  '/ˌæb.ɪˈsɪn.i.ən bəˈnɑː.nə/',
  '/ˌæb.ɪˈsɪn.ɪ.ən bəˈnæ.nə/',
  '/ˌæb.ɪˈsɪn.ɪ.ən bəˈnɑː.nə/'],
 'Combination')

In [112]:
build_pronunciations('bass guitar', 'http://kaiko.getalp.org/dbnary/eng/bass_guitar')

(['/beɪs ɡiˈtɑɹ/',
  '/beɪs ɡəˈtɑɹ/',
  '/beɪs ɡɪˈtɑɹ/',
  '/beɪs ɡɪˈtɑː(ɹ)/',
  '/beɪs ˈɡɪ.tɑɹ/'],
 'Heteronymy')

In [117]:
# aggregated_mwe_with_pron['Computed Pronunciations'] = aggregated_mwe_with_pron['t'].map(build_pronunciations)
aggregated_mwe_with_pron['Computed Pronunciations'] = aggregated_mwe_with_pron.apply(lambda x : build_pronunciations(x['t'], x['p']), axis=1)

In [118]:
aggregated_mwe_with_pron['computed'], aggregated_mwe_with_pron['method'] = zip(*aggregated_mwe_with_pron['Computed Pronunciations'])
aggregated_mwe_with_pron = aggregated_mwe_with_pron.drop('Computed Pronunciations', axis=1)

In [130]:
aggregated_mwe_with_pron

Unnamed: 0_level_0,p,t,pr,computed,method,precision,recall,f1
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://kaiko.getalp.org/dbnary/eng/!Kung__Proper_noun__1,http://kaiko.getalp.org/dbnary/eng/!Kung,!Kung,"/kʊŋ/,/ǃkʊŋ/","[/kʊŋ/, /ǃkʊŋ/]",Combination,1.0,1.0,1.0
http://kaiko.getalp.org/dbnary/eng/'sall_good__Interjection__1,http://kaiko.getalp.org/dbnary/eng/'sall_good,'sall good,[sɔːl ɡʊːd],[],Missing Pronunciation for a Component,0.0,0.0,0.0
http://kaiko.getalp.org/dbnary/eng/++__Suffix__1,http://kaiko.getalp.org/dbnary/eng/++,++,/plʌsˈplʌs/,[/plʌsˈplʌs/],Combination,1.0,1.0,1.0
http://kaiko.getalp.org/dbnary/eng/10x_developer__Noun__1,http://kaiko.getalp.org/dbnary/eng/10x_developer,10x developer,/ˌtɛˈnɛks dɪˈvɛ.lə.pɚ/,"[/θæŋks dɪˈvɛləpə(ɹ)/, /θæŋks dɪˈvɛləpɚ/]",Combination,0.0,0.0,0.0
http://kaiko.getalp.org/dbnary/eng/13th_month__Noun__1,http://kaiko.getalp.org/dbnary/eng/13th_month,13th month,/θɜːˌtiːnθ ˈmʌnθ/,[],Missing Pronunciation for a Component,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
http://kaiko.getalp.org/dbnary/eng/Đà_Nẵng__Proper_noun__1,http://kaiko.getalp.org/dbnary/eng/Đà_Nẵng,Đà Nẵng,"/dəˈnæŋ/,/ˈdɑːnæŋ/",[],Missing Pronunciation for a Component,0.0,0.0,0.0
http://kaiko.getalp.org/dbnary/eng/Đắk_Lắk__Proper_noun__1,http://kaiko.getalp.org/dbnary/eng/Đắk_Lắk,Đắk Lắk,/ˈdæklæk/,[],Missing Pronunciation for a Component,0.0,0.0,0.0
http://kaiko.getalp.org/dbnary/eng/Đắk_Nông__Proper_noun__1,http://kaiko.getalp.org/dbnary/eng/Đắk_Nông,Đắk Nông,/ˈdæknɒŋ/,[],Missing Pronunciation for a Component,0.0,0.0,0.0
http://kaiko.getalp.org/dbnary/eng/Đồng_Nai__Proper_noun__1,http://kaiko.getalp.org/dbnary/eng/Đồng_Nai,Đồng Nai,/ˈdɒŋnaɪ/,[],Missing Pronunciation for a Component,0.0,0.0,0.0


In [129]:
aggregated_mwe_with_pron.groupby('method').count()

Unnamed: 0_level_0,p,t,pr,computed,precision,recall,f1
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Combination,2160,2160,2160,2160,2160,2160,2160
Heteronymy,128,128,128,128,128,128,128
Missing Pronunciation for a Component,2448,2448,2448,2448,2448,2448,2448
No Derivation for Heteronymy,241,241,241,241,241,241,241


In [131]:
aggregated_mwe_with_pron.to_pickle('../data/aggregated_mwe_with_pron.pkl')

In [132]:
gold_with_heteronyms = aggregated_mwe_with_pron.query("method == 'Heteronymy'")

In [133]:
gold_with_heteronyms

Unnamed: 0_level_0,p,t,pr,computed,method,precision,recall,f1
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://kaiko.getalp.org/dbnary/eng/Bath_chair__Noun__1,http://kaiko.getalp.org/dbnary/eng/Bath_chair,Bath chair,/bɑːθ t͡ʃɛə/,"[/bæːθ t͡ʃɛə(ɹ)/, /bæːθ t͡ʃɛəɹ/, /bæːθ t͡ʃɪə(ɹ...",Heteronymy,0.000000,0.000000,0.000000
http://kaiko.getalp.org/dbnary/eng/absciss_layer__Noun__1,http://kaiko.getalp.org/dbnary/eng/absciss_layer,absciss layer,"/ˈleɪ.ə/,/ˈleɪ.ɚ/,/ˈlɜ.ə/,/ˈlɜ.ɚ/,/ˈæb.sɪs /,/...","[/ˈæb.sɪs leɪ.ə/, /ˈæb.sɪs lɛə/, /ˈæb.sɪs ˈleɪ...",Heteronymy,0.333333,0.166667,0.222222
http://kaiko.getalp.org/dbnary/eng/abuse_of_distress__Noun__1,http://kaiko.getalp.org/dbnary/eng/abuse_of_di...,abuse of distress,/əˈbjuːsəvdɪstɹɛs/,"[/əˈbjus ɒv dɪˈstɹɛs/, /əˈbjus ɔv dɪˈstɹɛs/, /...",Heteronymy,0.100000,1.000000,0.181818
http://kaiko.getalp.org/dbnary/eng/access_time__Noun__1,http://kaiko.getalp.org/dbnary/eng/access_time,access time,/ˈaksɛsˌtʌɪm/,"[/ˈæksɛs taɪm/, /ˈæksɛs tɑɪm/, /ˈæksɛs tɜːm/, ...",Heteronymy,0.000000,0.000000,0.000000
http://kaiko.getalp.org/dbnary/eng/air_mass__Noun__1,http://kaiko.getalp.org/dbnary/eng/air_mass,air mass,"/ɛəɹ mæs/,/ɛːɹ mæs/,/ˈɛː mas/","[/eə̯ mæs/, /eː mæs/, /ɛə̯ mæs/, /ɛɚ mæs/, /ɛɹ...",Heteronymy,0.166667,0.333333,0.222222
...,...,...,...,...,...,...,...,...
http://kaiko.getalp.org/dbnary/eng/wind_farm__Noun__1,http://kaiko.getalp.org/dbnary/eng/wind_farm,wind farm,"/ˈwɪndfɑːm/,/ˈwɪndˌfɑɹm/","[/ˈwaɪnd fɑːm/, /ˈwaɪnd fɑːɹm/, /ˈwɪnd fɑːm/, ...",Heteronymy,0.500000,1.000000,0.666667
http://kaiko.getalp.org/dbnary/eng/wind_up__Noun__1,http://kaiko.getalp.org/dbnary/eng/wind_up,wind up,/waɪnd ˈʌp/,"[/waɪnd ap/, /waɪnd ʌp/]",Heteronymy,0.500000,1.000000,0.666667
http://kaiko.getalp.org/dbnary/eng/wind_up__Verb__1,http://kaiko.getalp.org/dbnary/eng/wind_up,wind up,/waɪnd ˈʌp/,"[/waɪnd ap/, /waɪnd ʌp/]",Heteronymy,0.500000,1.000000,0.666667
http://kaiko.getalp.org/dbnary/eng/winding_hole__Noun__1,http://kaiko.getalp.org/dbnary/eng/winding_hole,winding hole,/ˈwɪndɪŋˌhəʊl/,"[/ˈwɪndɪŋ hoʊl/, /ˈwɪndɪŋ hɐʉl/, /ˈwɪndɪŋ həʊl/]",Heteronymy,0.333333,1.000000,0.500000


### Evaluating the methodology

For each entry, compare the known prons with the computed ones, by calculating recall and precision

In [173]:
# aggregated_mwe_with_pron['recall'], aggregated_mwe_with_pron['precision'] = zip(*aggregated_mwe_with_pron[['pr','computed']].applymap(lambda pair: print(pair)))
ipa_normalizer = str.maketrans('', '', 'ˈˌːˑ.ǀǁ‿' )
ipa_normalizer_no_spaces = str.maketrans('', '', 'ˈˌːˑ.ǀǁ‿ ' )

def no_suprasegmentals(pron: str) -> str:
 return pron.translate(ipa_normalizer)

def no_space(pron: str) -> str:
 return pron.translate(str.maketrans('', '', ' ' ))

def no_suprasegmentals_no_spaces(pron: str) -> str:
 return pron.translate(ipa_normalizer_no_spaces)

def precision_recall(line, normalizer=None):
    gold = str.split(line['pr'], sep=',')
    if normalizer:
        gold = [normalizer(x) for x in gold]
    answer = line['computed']
    if normalizer:
        answer = [normalizer(x) for x in answer]
    intersection = list(set(gold) & set(answer))
    precision, recall, f1_measure = 0, 0, 0
    if len(answer) > 0 :
        precision = len(intersection) / len(answer)
    if len(gold) > 0 :
        recall = len(intersection) / len(gold)
    if precision > 0 or recall > 0:
        f1_measure = 2 * precision * recall / (precision + recall)
    return precision, recall, f1_measure

def precision_recall_no_suprasegmentals(line):
    return precision_recall(line, normalizer=no_suprasegmentals)

def precision_recall_no_suprasegmentals_no_spaces(line):
    return precision_recall(line, normalizer=no_suprasegmentals_no_spaces)

def precision_recall_no_space(line):
    return precision_recall(line, normalizer=no_space)

#aggregated_mwe_with_pron.sample(10)[['pr','computed']].apply(precision_recall, axis=1)
aggregated_mwe_with_pron['precision'], aggregated_mwe_with_pron['recall'], aggregated_mwe_with_pron['f1'] = zip(*aggregated_mwe_with_pron[['pr','computed']].apply(precision_recall_no_suprasegmentals, axis=1))
#aggregated_mwe_with_pron['precision'], aggregated_mwe_with_pron['recall'], aggregated_mwe_with_pron['f1'] = zip(*aggregated_mwe_with_pron[['pr','computed']].apply(precision_recall_no_suprasegmentals_no_spaces, axis=1))
#aggregated_mwe_with_pron['precision'], aggregated_mwe_with_pron['recall'], aggregated_mwe_with_pron['f1'] = zip(*aggregated_mwe_with_pron[['pr','computed']].apply(precision_recall_no_space, axis=1))
#aggregated_mwe_with_pron['precision'], aggregated_mwe_with_pron['recall'], aggregated_mwe_with_pron['f1'] = zip(*aggregated_mwe_with_pron[['pr','computed']].apply(precision_recall, axis=1))


In [174]:
# F-measure on all MWT
f1_all = aggregated_mwe_with_pron['f1'].sum() / aggregated_mwe_with_pron['f1'].count()
prec_all = aggregated_mwe_with_pron['precision'].sum() / aggregated_mwe_with_pron['precision'].count()
recall_all = aggregated_mwe_with_pron['recall'].sum() / aggregated_mwe_with_pron['recall'].count()

combinations_results = aggregated_mwe_with_pron[aggregated_mwe_with_pron['method'] == 'Combination']
f1_combination = combinations_results['f1'].sum() / combinations_results['f1'].count()
prec_combination = combinations_results['precision'].sum() / combinations_results['precision'].count()
recall_combination = combinations_results['recall'].sum() / combinations_results['recall'].count()

heteronymy_results = aggregated_mwe_with_pron[aggregated_mwe_with_pron['method'] == 'Heteronymy']
f1_heteronym = heteronymy_results['f1'].sum() / heteronymy_results['f1'].count()
prec_heteronym = heteronymy_results['precision'].sum() / heteronymy_results['precision'].count()
recall_heteronym = heteronymy_results['recall'].sum() / heteronymy_results['recall'].count()

print('F1:        all: ', f1_all,     ' / Combinations: ', f1_combination,     ' / Heteronymy: ', f1_heteronym, end="\n")
print('Precision: all: ', prec_all,   ' / Combinations: ', prec_combination,   ' / Heteronymy: ', prec_heteronym, end="\n")
print('Recall:    all: ', recall_all, ' / Combinations: ', recall_combination, ' / Heteronymy: ', recall_heteronym, end="\n")


F1:        all:  0.14956708102049934  / Combinations:  0.3324374651970818  / Heteronymy:  0.20570654229162993
Precision: all:  0.13182251126862948  / Combinations:  0.2937373620722372  / Heteronymy:  0.16881200396825397
Recall:    all:  0.22923105013219539  / Combinations:  0.5045214212228101  / Heteronymy:  0.3993489583333333


Here, the brute precision/recall/f1 is :

F1:        all:  0.05607510458121018  / Combinations:  0.1269479189362928  / Heteronymy:  0.038111645299145305
Precision: all:  0.0516666467337052  / Combinations:  0.1172070013962375  / Heteronymy:  0.03107638888888889
Recall:    all:  0.07717165628558033  / Combinations:  0.17318672839506172  / Heteronymy:  0.078125

When removing SPACES :

F1:        all:  0.05704945673096919  / Combinations:  0.1285977505861244  / Heteronymy:  0.048156288156288155
Precision: all:  0.05246587875656591  / Combinations:  0.11862418246619637  / Heteronymy:  0.038237847222222225
Recall:    all:  0.0789799745495948  / Combinations:  0.17619598765432098  / Heteronymy:  0.09765625

When removing the suprasegmentals markers in the gold and computed pronunciation, we get :
F1:        all:  0.14956708102049934  / Combinations:  0.3324374651970818  / Heteronymy:  0.20570654229162993
Precision: all:  0.13182251126862948  / Combinations:  0.2937373620722372  / Heteronymy:  0.16881200396825397
Recall:    all:  0.22923105013219539  / Combinations:  0.5045214212228101  / Heteronymy:  0.3993489583333333

When removing the suprasegmentals markers AND SPACES in the gold and computed pronunciation, we get :
F1:        all:  0.1766825076287711  / Combinations:  0.3896924625335098  / Heteronymy:  0.29385251090634856
Precision: all:  0.15615959534302604  / Combinations:  0.34578628218254814  / Heteronymy:  0.236780753968254
Recall:    all:  0.2748492270794863  / Combinations:  0.5994481187536743  / Heteronymy:  0.5712239583333334

### Computing the larger problem

We apply the very same technique on all MWT for which we have no known pronunciation.


In [140]:
mwe_without_pron['Computed Pronunciations'] = mwe_without_pron.apply(
    lambda x: build_pronunciations(x['t'], x['p']), axis=1)
mwe_without_pron['computed'], mwe_without_pron['method'] = zip(
    *mwe_without_pron['Computed Pronunciations'])
mwe_without_pron = mwe_without_pron.drop('Computed Pronunciations', axis=1)


KeyboardInterrupt: 

In [141]:
mwe_without_pron.count()

p           214244
e           214244
t           214244
pr               0
mweOrLE     214244
computed    214244
method      214244
dtype: int64

In [142]:
mwe_without_pron.to_pickle('../data/mwe_wihout_pron.pkl')

In [143]:
mwe_without_pron

Unnamed: 0,p,e,t,pr,mweOrLE,computed,method
0,http://kaiko.getalp.org/dbnary/eng/Above_Derwent,http://kaiko.getalp.org/dbnary/eng/Above_Derwe...,Above Derwent,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,[],Missing Pronunciation for a Component
1,http://kaiko.getalp.org/dbnary/eng/Absolution_Day,http://kaiko.getalp.org/dbnary/eng/Absolution_...,Absolution Day,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,[],Missing Pronunciation for a Component
2,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian banana,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,"[/ˌæb.əˈsɪn.i.ən bəˈnæ.nə/, /ˌæb.əˈsɪn.i.ən bə...",Combination
3,http://kaiko.getalp.org/dbnary/eng/Abyssinian_cat,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian cat,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,"[/ˌæb.əˈsɪn.i.ən kat/, /ˌæb.əˈsɪn.i.ən kæt/, /...",Combination
4,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,http://kaiko.getalp.org/dbnary/eng/Abyssinian_...,Abyssinian gold,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,"[/ˌæb.əˈsɪn.i.ən ɡaʉld/, /ˌæb.əˈsɪn.i.ən ɡoʊld...",Combination
...,...,...,...,...,...,...,...
214239,http://kaiko.getalp.org/dbnary/eng/zips_it,http://kaiko.getalp.org/dbnary/eng/zips_it__Ve...,zips it,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,[],Missing Pronunciation for a Component
214240,http://kaiko.getalp.org/dbnary/eng/zombie_cell,http://kaiko.getalp.org/dbnary/eng/zombie_cell...,zombie cell,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,"[/ˈzɑmbi sɛl/, /ˈzɒmbi sɛl/]",Combination
214241,http://kaiko.getalp.org/dbnary/eng/zombie_cells,http://kaiko.getalp.org/dbnary/eng/zombie_cell...,zombie cells,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,"[/ˈzɑmbi sɛlz/, /ˈzɒmbi sɛlz/]",Combination
214242,http://kaiko.getalp.org/dbnary/eng/Łódź_Voivod...,http://kaiko.getalp.org/dbnary/eng/Łódź_Voivod...,Łódź Voivodeship,,http://www.w3.org/ns/lemon/ontolex#MultiWordEx...,[],Missing Pronunciation for a Component


In [144]:
mwe_without_pron.groupby('method').count()

Unnamed: 0_level_0,p,e,t,pr,mweOrLE,computed
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Combination,114969,114969,114969,0,114969,114969
Heteronymy,2246,2246,2246,0,2246,2246
Missing Pronunciation for a Component,86689,86689,86689,0,86689,86689
No Derivation for Heteronymy,10340,10340,10340,0,10340,10340
