In [1]:
import sys
import re
from collections import defaultdict,namedtuple
import json
import os
import pickle

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('max_colwidth', 400)

parser_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/bio/c2/resources/tools/wiktionary-de-parser'
sys.path.append(parser_dir)

from wiktionary_de_parser.dump_processor import WiktionaryDump
from wiktionary_de_parser import WiktionaryParser
from pprint import pprint

# Generate nouns lookup table

In [72]:
def is_german(wiki_record):
    return wiki_record.language.lang == 'Deutsch'

def get_flexion_field(wiki_record, field_name):
    if field_name in wiki_record.flexion:
        wordform = wiki_record.flexion[field_name].strip()
        wordform = re.sub(r".*:'' ",r"",wordform) #remarks like 'selten:'/'militarisch:'
        return wordform
    else:
        return None

In [73]:
def is_noun(wiki_record):
    return (wiki_record.pos 
            and 'Substantiv' in wiki_record.pos 
            and wiki_record.flexion is not None #we can't do much without any flexion information
           )

In [74]:
def get_noun_wordforms_adjective_declination(lemma):
    '''
    Decline nouns as adjectives
    e.g. Beamte(r), Unbekannte(r)
    '''

    strong_declinations = {'m':{'Nominativ Singular':'er','Genitiv Singular':'en','Dativ Singular':'em','Akkusativ Singular':'en',
                           'Nominativ Plural':'e', 'Genitiv Plural':'er','Dativ Plural':'en', 'Akkusativ Plural':'e'},
                           'f':{'Nominativ Singular':'e','Genitiv Singular':'er','Dativ Singular':'er','Akkusativ Singular':'e',
                           'Nominativ Plural':'e', 'Genitiv Plural':'er','Dativ Plural':'en', 'Akkusativ Plural':'e'},
                           'n':{'Nominativ Singular':'es','Genitiv Singular':'en','Dativ Singular':'em','Akkusativ Singular':'es',
                           'Nominativ Plural':'e', 'Genitiv Plural':'er','Dativ Plural':'en', 'Akkusativ Plural':'e'},}
    
    weak_declinations = {'m':{'Nominativ Singular':'e','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'en',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},
                           'f':{'Nominativ Singular':'e','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'e',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},
                           'n':{'Nominativ Singular':'e','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'e',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},}
    
    mixed_declinations = {'m':{'Nominativ Singular':'er','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'en',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},
                           'f':{'Nominativ Singular':'e','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'e',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},
                           'n':{'Nominativ Singular':'es','Genitiv Singular':'en','Dativ Singular':'en','Akkusativ Singular':'es',
                           'Nominativ Plural':'en', 'Genitiv Plural':'en','Dativ Plural':'en', 'Akkusativ Plural':'en'},}
    
    wordforms = defaultdict(set)
        
    Lemma = namedtuple('lemma', 'lemma connection genus')
    
    for declinations in (strong_declinations,weak_declinations,mixed_declinations):
        for genus in ('m','f','n'):
            for category, ending in declinations[genus].items():
                wordform = re.sub(r'er?$','',lemma) + ending
                wordforms[wordform].add(Lemma(lemma,category,genus))

    return wordforms

In [75]:
def get_noun_forms(wiki_record):

    base_categories = ['Nominativ Singular', 'Nominativ Plural', 'Genitiv Singular', 'Genitiv Plural', 'Dativ Singular', 
                         'Dativ Plural', 'Akkusativ Singular', 'Akkusativ Plural']
    
    #remove these rare nouns in order not to confuse with some forms of more common nouns
    stoplist = {'gedanken':'.*', 'real':'.*', 'studium':'Plural', 'fleck':'Plural 2', 'post':'Plural', 'gemein':'.*', 'willen':'.*',
                'namen':'.*', 'arme':'.*', 'schade':'Plural', 'zeug':'Plural', 'omme':'.*', 'praxis':'Plural 2','schranken':'.*','fliegen':'.*'}

    Lemma = namedtuple('lemma', 'lemma connection genus')

    lemmas = defaultdict(set)

    lemma = wiki_record.lemma.lemma.lower() #we will use lowercase lemmas in the dictionary

    if ' ' in lemma:
        #we exclude fixed expressions consisting of multiple words (often entities, e.g. 'Vereinigte Arabische Emirate')
        return {}

    if 'adjektivische Deklination' in wiki_record.pos['Substantiv']:
        #decline as an adjective
        if lemma.endswith('r'):
            #don't use feminine (use masculine) lemma for plural forms, in accordance with other lemmatizers
            lemmas = get_noun_wordforms_adjective_declination(lemma)
        
    genus_categories = [x for x in wiki_record.flexion.keys() if x.startswith('Genus')] # 'Genus', 'Genus 1', 'Genus 2' in case the noun admits multiple genera
            
    if len(genus_categories)==0 and 'Nominativ Plural' in wiki_record.flexion:
        #the noun admits only plural form
        genus_categories = ['only_plural']

    for genus_category in genus_categories:

        if genus_category=='only_plural':
            genus = 'only_plural'
        else:
            genus = wiki_record.flexion[genus_category]
                        
        for base_category in base_categories:

            if len(genus_categories)==1:
                #if there is only a single genus, extra forms ending with digits or stars are possible
                extended_categories = [base_category+genus_suffix+'*'*n for genus_suffix in ('',' 1',' 2',' 3',' 4') for n in range(3)] # additional forms with an asterisk
            else:
                #if there are multiple genera, extra forms endings should match the genus index or be empty
                genus_suffix = genus_category.replace('Genus','') #empty or ' 1', ' 2', etc...
                extended_categories = [base_category+genus_suffix_+'*'*n for genus_suffix_ in ('',genus_suffix) for n in range(3)] # additional forms with an asterisk

            for extended_category in extended_categories:

                if lemma in stoplist and re.search(stoplist[lemma],extended_category):
                    continue
                        
                if extended_category in wiki_record.flexion:
                    
                    wordform = get_flexion_field(wiki_record,extended_category).lower()
        
                    if not re.match('^[\w -]+$',wordform):
                        print(f'Unrecognized characters in wordform {wordform} for {lemma}')
                        return {}
          
                    lemmas[wordform].add(Lemma(lemma,base_category,genus))#assign lemma to the wordform


    return lemmas

In [76]:
dump = WiktionaryDump(
    dump_file_path=parser_dir + "/wiktionary_german/dewiktionary-latest-pages-articles-multistream.xml.bz2"
)

In [77]:
def get_nouns(dump):

    nouns = defaultdict(set)
    
    n_records = 0

    parser = WiktionaryParser()

    for page in dump.pages():
        if page.redirect_to:
            continue
                
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if is_german(wiki_record) and is_noun(wiki_record):
                wordlemmas = get_noun_forms(wiki_record)
                for k,v in wordlemmas.items():
                    nouns[k] = nouns[k].union(v)
                n_records += 1
                if n_records%1000==0:
                    print(f'{n_records} records processed')


    nouns = {k:[lemma_record._asdict() for lemma_record in v] for k,v in nouns.items()} #defaultdict with sets of named tuples to dict with lists of dicts

    return nouns

In [None]:
nouns = get_nouns(dump)

In [None]:
#for wordform, lemmas in nouns.items():
#    lemmas_NP = []
#    for lemma in lemmas:
#        if lemma['connection'] == 'Nominativ Plural':
#            lemmas_NP.append((lemma['lemma'],lemma['genus']))
#    if lemmas_NP:
#        lemmas_, genera_ = zip(*lemmas_NP)
#        if 'only_plural' in genera_ and len(set(lemmas_))>1:
#            nouns[wordform] = [lemma for lemma in nouns[wordform] if lemma['genus']=='only_plural' or (lemma['lemma'],lemma['genus']) not in lemmas_NP]
#            print(wordform,lemmas_NP)

In [79]:
#with open('data/nouns.json', 'wt', encoding='UTF-8') as json_file:
#    json.dump(nouns, json_file, ensure_ascii=False)

# Unknown nouns

In [159]:
parser = WiktionaryParser()
    
noun_flexion_cat = []

n_records = 0

for page in dump.pages():
    if page.redirect_to:
        continue
        
    #page_names.append(page.name)
                
    if page.name in ("Angehöriger",):
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if is_german(wiki_record) and is_noun(wiki_record):
                print(wiki_record)
                keys = tuple([x for x in wiki_record.flexion.keys() if x[0].isupper()])
                noun_flexion_cat.append((page.name,keys))
                n_records += 1
                break
        break
        
            #pprint(wiki_record)
            #break
        #break

name='Angehöriger' hyphenation=['An', 'ge', 'hö', 'ri', 'ger'] flexion={'Genus': 'm', 'Stamm': 'Angehörige'} ipa=['ˈanɡəˌhøːʁɪɡɐ'] language=Language(lang='Deutsch', lang_code='de') lemma=Lemma(lemma='Angehöriger', inflected=False) pos={'Substantiv': ['adjektivische Deklination']} rhymes=None


In [None]:
get_noun_forms(wiki_record)

In [12]:
words, cats = zip(*noun_flexion_cat)

words = np.array(words)
cats_flatten = pd.Series([y for x in cats for y in x])

# Lemmatizer class

In [2]:
def get_verb_lemma_morpho(word, query_verb_dict_fnc, prefixes_list, use_longest_subword=True):

    def get_verb_lemma_recursive(word, morphemes=[]):
        '''
        Recursively detect prefixes and yield all possible lemmas
        '''
        res = []
        
        base_lemma = query_verb_dict_fnc(word)
        
        if base_lemma:
            #print(word,morphemes,base_lemma)
            if len(morphemes)>1 and morphemes[-1] == 'zu' and base_lemma == word:
                #suspect a zu-infinitive: some prefix on the right + zu + infinitive
                #e.g. abzuheben, aufzuatmen
                res.append((''.join(morphemes[:-1]+[base_lemma]),word)) #add without "zu"
            else:
                res.append((''.join(morphemes+[base_lemma]),word)) #compose the infinitive out of the collected prefixes and the base lemma
                
        for prefix_end_idx in range(1,len(word)-2):
            #look for the next prefix which ends at prefix_end_idx
            if word[:prefix_end_idx] in prefixes_list:
                #prefix in the list of known prefixes
                prefix = word[:prefix_end_idx] 
                res.extend(get_verb_lemma_recursive(word[prefix_end_idx:], morphemes+[prefix])) #detach the prefix, call the function again
    
        return res
        
    lemmas = get_verb_lemma_recursive(word)

    if lemmas:
        lemmas.sort(key=lambda x:-len(x[1])) #sort according to subword length, ascending=False
        lemmas, subwords = zip(*lemmas)     
        if (use_longest_subword or len(set(lemmas))==1):
        #all possible splits lead to the same lemma or taking the longest subword allowed
            return lemmas[0]
    else:
        return None

In [3]:
def get_noun_lemma_fwdsearch(word, query_noun_dict_fnc, use_longest_subword=True):

    '''
    Get all possible noun lemmas using forward search
    '''

    lemmas = []
    
    for start_idx in range(0,len(word)-2):
        #remove letters one by one, until the rest of the word matches one in the vocabulary
        trial_word = word[start_idx:]
        base_lemma = query_noun_dict_fnc(trial_word)
        if base_lemma:
            #print(word[0:start_idx] + base_lemma, base_lemma)
            lemmas.append(word[0:start_idx] + base_lemma)

    if lemmas and (use_longest_subword or len(set(lemmas))==1):
        #all possible splits lead to the same lemma or taking the longest subword allowed
        return lemmas[0]
    else:
        return None

In [4]:
article_constraints = {'dieser': (('f', 'Genitiv Singular'), ('n', 'Genitiv Plural'), ('m', 'Genitiv Plural'), ('m', 'Nominativ Singular'), ('f', 'Dativ Singular'), ('f', 'Genitiv Plural'), ('only_plural', 'Genitiv Plural')), 
 'der': (('f', 'Genitiv Singular'), ('n', 'Genitiv Plural'), ('m', 'Genitiv Plural'), ('m', 'Nominativ Singular'), ('f', 'Dativ Singular'), ('f', 'Genitiv Plural'), ('only_plural', 'Genitiv Plural')), 
 'kein': (('m', 'Nominativ Singular'), ('n', 'Nominativ Singular'), ('n', 'Akkusativ Singular')), 
 'dieses': (('n', 'Nominativ Singular'), ('n', 'Akkusativ Singular'), ('m', 'Genitiv Singular'), ('n', 'Genitiv Singular')), 
 'des': (('m', 'Genitiv Singular'), ('n', 'Genitiv Singular')), 'keines': (('m', 'Genitiv Singular'), ('n', 'Genitiv Singular')), 
 'diesem': (('n', 'Dativ Singular'), ('m', 'Dativ Singular')), 'dem': (('n', 'Dativ Singular'), ('m', 'Dativ Singular')), 
 'keinem': (('n', 'Dativ Singular'), ('m', 'Dativ Singular')), 'diesen': (('n', 'Dativ Plural'), ('only_plural', 'Dativ Plural'), ('f', 'Dativ Plural'), ('m', 'Akkusativ Singular'), ('m', 'Dativ Plural')), 
 'den': (('n', 'Dativ Plural'), ('only_plural', 'Dativ Plural'), ('f', 'Dativ Plural'), ('m', 'Akkusativ Singular'), ('m', 'Dativ Plural')), 
 'keinen': (('n', 'Dativ Plural'), ('only_plural', 'Dativ Plural'), ('f', 'Dativ Plural'), ('m', 'Akkusativ Singular'), ('m', 'Dativ Plural')), 
 'die': (('only_plural', 'Nominativ Plural'), ('n', 'Akkusativ Plural'), ('f', 'Akkusativ Singular'), ('only_plural', 'Akkusativ Plural'), ('f', 'Nominativ Singular'), ('m', 'Nominativ Plural'), ('m', 'Akkusativ Plural'), ('f', 'Nominativ Plural'), ('n', 'Nominativ Plural'), ('f', 'Akkusativ Plural')), 
 'diese': (('only_plural', 'Nominativ Plural'), ('n', 'Akkusativ Plural'), ('f', 'Akkusativ Singular'), ('only_plural', 'Akkusativ Plural'), ('f', 'Nominativ Singular'), ('m', 'Nominativ Plural'), ('m', 'Akkusativ Plural'), ('f', 'Nominativ Plural'), ('n', 'Nominativ Plural'), ('f', 'Akkusativ Plural')), 
 'keine': (('only_plural', 'Nominativ Plural'), ('n', 'Akkusativ Plural'), ('f', 'Akkusativ Singular'), ('only_plural', 'Akkusativ Plural'), ('f', 'Nominativ Singular'), ('m', 'Nominativ Plural'), ('m', 'Akkusativ Plural'), ('f', 'Nominativ Plural'), ('n', 'Nominativ Plural'), ('f', 'Akkusativ Plural')), 
 'keiner': (('f', 'Genitiv Singular'), ('f', 'Genitiv Plural'), ('m', 'Genitiv Plural'), ('f', 'Dativ Singular'), ('n', 'Genitiv Plural'), ('only_plural', 'Genitiv Plural')), 'das': (('n', 'Nominativ Singular'), ('n', 'Akkusativ Singular'))
}

prep_constraints = {
                 'zu': (('m','Dativ Singular'),('n','Dativ Singular'),('f','Dativ Singular'),('m','Dativ Plural'),('n','Dativ Plural'),('f','Dativ Plural')),
                 'von': (('m','Dativ Singular'),('n','Dativ Singular'),('f','Dativ Singular'),('m','Dativ Plural'),('n','Dativ Plural'),('f','Dativ Plural')),
                 'bei': (('m','Dativ Singular'),('n','Dativ Singular'),('f','Dativ Singular'),('m','Dativ Plural'),('n','Dativ Plural'),('f','Dativ Plural')),
                 'durch': (('m','Akkusativ Singular'),('n','Akkusativ Singular'),('f','Akkusativ Singular'),('m','Akkusativ Plural'),('n','Akkusativ Plural'),('f','Akkusativ Plural')),
                 'für': (('m','Akkusativ Singular'),('n','Akkusativ Singular'),('f','Akkusativ Singular'),('m','Akkusativ Plural'),('n','Akkusativ Plural'),('f','Akkusativ Plural')),
                 'um': (('m','Akkusativ Singular'),('n','Akkusativ Singular'),('f','Akkusativ Singular'),('m','Akkusativ Plural'),('n','Akkusativ Plural'),('f','Akkusativ Plural')),
                 'im':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'beim':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'zum':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'vom':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'zur':(('f','Dativ Singular'),),
                 'hintern':(('m','Akkusativ Singular'),),
                 'übern':(('m','Akkusativ Singular'),),
                 'untern':(('m','Akkusativ Singular'),),
                 'ins':(('n','Akkusativ Singular'),),
                 'aufs':(('n','Akkusativ Singular'),),
                 'durchs':(('n','Akkusativ Singular'),),
                 'fürs':(('n','Akkusativ Singular'),),
                 'ums':(('n','Akkusativ Singular'),),
                 'vors':(('n','Akkusativ Singular'),),
                 'übers':(('n','Akkusativ Singular'),),
                 'unters':(('n','Akkusativ Singular'),),
                 'hinterm':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'überm':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'unterm':(('m','Dativ Singular'),('n','Dativ Singular')),
                 'vorm':(('m','Dativ Singular'),('n','Dativ Singular')),
                }

def get_base_determiner(word):
    '''
    Check if a given word is a determiner and return the base form
    '''
    kein_style = re.match(r'(mein|dein|sein|ihr|Ihr|euer|unser|ein|kein|welch|solch|manch)($|e[rsnm]?$)',word)
    if kein_style:
        return 'kein'+kein_style.groups()[1]
    der_form = re.match(r'(der|die|das|dem|den|des)$',word)
    if der_form:
        return word
    der_style = re.match(r'(diese|jede|jene)([rsnm]?$)',word)
    if der_style:
        return 'diese'+der_style.groups()[1]
    return None

In [75]:
class NounsNBC():

    def __init__(self, path):

        with open(path,'rb') as f:

            data = pickle.load(f)
            
            self.nbc_clf = data['clf']
            self.features_encoder = data['features_encoder']
            self.features_list = data['features_list']
            self.rules_list = data['rules_list']
            self.n_last = data['n_last']
        
    def __call__(self, word, constraints=None):
        
        word_parts = [word[idx:] for idx in range(-self.n_last,0)]

        if not constraints:
            constraints = ((-1,-1),)

        data = [word_parts+list(constraint) for constraint in constraints]

        word_enc = self.features_encoder.transform(data).astype(int)
        
        if len(constraints)==1:

            pred = self.nbc_clf.predict(word_enc)[0]
            
        else:
                        
            pred = self.nbc_clf.predict_proba(word_enc).mean(0).argmax()
        
        rule = self.rules_list[pred]
    
        if rule=='-':
            return None
        else:
            seq_to_remove,seq_to_add = rule
            return re.sub(f'{seq_to_remove}$',seq_to_add,word)
            
        return None

In [76]:
class CategoricalNaiveBayes():

    def __init__(self, kappa=2, epsilon=1e-20):
        
        self.kappa = kappa
        self.epsilon = epsilon

    def _compute_priors_logprobs(self, y):

        priors_probs = [class_counts/len(y) for class_counts in self.class_counts]

        self.priors_logprobs = np.log(priors_probs)
        
    def _compute_loglikelihood(self, X, y):
        
        feature_counts = {feature_idx:np.zeros((self.n_categories[feature_idx]+2,self.n_classes)) for feature_idx in range(self.n_features)}
        
        for features, class_idx in zip(X, y):
            
            for feature_idx,feature_value in enumerate(features):
                
                feature_counts[feature_idx][feature_value,class_idx] += 1

        loglikelihood = {feature_idx:np.zeros((self.n_categories[feature_idx]+2,self.n_classes)) for feature_idx in range(self.n_features)}

        for feature_idx in range(self.n_features):
            loglikelihood[feature_idx] = np.log((feature_counts[feature_idx]+self.epsilon)
                                                          / (np.repeat(self.class_counts[None,...], self.n_categories[feature_idx]+2, axis=0)
                                                            + self.kappa*self.epsilon))

            loglikelihood[feature_idx][-1,:] = 0

        self.loglikelihood = loglikelihood

        
    def fit(self, X_train, y_train, priors_logprobs=None):

        counter = Counter(y_train)
        
        class_ids, class_counts = zip(*sorted(counter.items()))
        
        self.class_counts = np.array(class_counts)
        self.n_classes = np.max(class_ids)+1

        self.n_features = X_train.shape[1]
        self.n_categories = X_train.max(axis=0)

        if priors_logprobs is None:
            self._compute_priors_logprobs(y_train)
        else:
            self.priors_logprobs = priors_logprobs

        self._compute_loglikelihood(X_train, y_train)

    def _get_bayes_numerator(self, X):

        n_samples = X.shape[0]

        sample_loglikelihood = np.zeros((n_samples,self.n_features,self.n_classes))

        for feature_idx in range(self.n_features):
            
            sample_loglikelihood[:,feature_idx,:] = self.loglikelihood[feature_idx][X[:,feature_idx]] #N_samplesxN_classes

        numerator = sample_loglikelihood.sum(axis=1)  + self.priors_logprobs[None,...]

        return numerator
            
    def predict_proba(self, X):

        numerator = np.exp(self._get_bayes_numerator(X))
        
        probs = numerator/numerator.sum(axis=1,keepdims=True)
                            
        return probs

    def predict(self, X):

        predicted_class_ids = self._get_bayes_numerator(X).argmax(1)

        return predicted_class_ids
        
    def score(self, X, y):

        y_pred = self.predict(X)

        return (y_pred==np.array(y)).mean()

In [87]:
class NounsStatRules():

    def __init__(self, path):

        with open(path,'rb') as f:
            
            data = pickle.load(f)
            
            self.rules_dict = data['rules_dict']
            self.n_last = data['n_last']
            
    def __call__(self,word):

        for idx in range(-self.n_last,0):
            rule =  self.rules_dict[f'last_{abs(idx)}'].get(word[idx:],None)
            if rule:
                seq_to_remove,seq_to_add = rule
                return re.sub(f'{seq_to_remove}$',seq_to_add,word)

        return None

In [89]:
class GLemma():

    """Wiktionary-based German lemmatizer.

    Provides a lemma for a given word given the POS tag:
    NOUN, VERB, ADJ, ADV.

    Parameters
    ----------
    lemmatizer_data_path : str
        Data path to the lemmatizer dictionaries.
        
    unknown_strategy : str
        How to treat words not represented in the dictionary.
        When contains {POS}_morpho string, performs morphological analysis of the given word
        by decomposing it into known parts: a verb is split into its prefixes and the root and
        the compound noun is split into more simple nouns. When contains {POS}_fwds string,
        looks for all the words that are contained in the given word and uses their lemmas.
        When multiple lemmas for the given word are possible, returns None unless the longest_subword_ukn flag is True.

    longest_subword_ukn : bool, default=True
        Use the lemma of the longest subword for unknown words.
        
    wordfreq_csv : str, default=None
        A file with approximate word frequencies. When multiple lemmas
        for a given word form are possible, the most frequent lemma is taken.
        Does not have to be a lemma list.
    
    Examples
    --------
    >>> lemmatizer = GLemma('./data', unknown='VERB_morpho;VERB_fwds',
                    wordfreq_csv='data/third-party/FrequencyWords/content/2018/de/de_full.txt')
    >>> lemmatizer('vermalt','VERB')
    'vermalen'

    Notes
    -----
    If a verb prefix is separated in the sentence, it should be attached to the root before the lemmatization:
    Ich hole dich ab --> lemmatizer('abhole','VERB')
    """

    def __init__(self, lemmatizer_data_path, unknown_strategy='skip', nouns_nbc_path=None, longest_subword_ukn=True, wordfreq_csv=None):

        self.vocab = {}

        
        with open(os.path.join(lemmatizer_data_path,'nouns.json'), 'rt', encoding='UTF-8') as json_file:
            self.vocab['N'] = json.load(json_file)

        if nouns_nbc_path:
            self.nouns_nbc = NounsNBC(nouns_nbc_path)
        else:
            self.nouns_nbc = None

        self.nouns_stat_rules = NounsStatRules(os.path.join(lemmatizer_data_path,'nouns_stat_rules.pickle'))
    
        if wordfreq_csv:
            self.wordfreq = pd.read_csv(wordfreq_csv, sep=' ', names=['word','freq']).set_index('word').freq.sort_values(ascending=False) #sort by frequency
            self.wordfreq = self.wordfreq.to_dict()
        else:
            self.wordfreq = None

        self.unknown_strategy = unknown_strategy
        self.longest_subword_ukn = longest_subword_ukn

    def get_most_frequent_word(self, lemmalist):
        
        if self.wordfreq:
            freqs = [self.wordfreq.get(word, np.nan) for word in lemmalist]
            if all(np.isnan(freqs)):
                return None
            else:
                #if at least one word in the wordrank dictionary
                return lemmalist[np.nanargmax(freqs)]
        else:
            return None

    def get_noun_constraints(self, spacy_token):

        if spacy_token is None:
            return None
        
        ancestors_lemmas = [x.text.lower() for x in spacy_token.ancestors] #hope to find prepositions here, can be fused with articles
        for ancestors_lemma in ancestors_lemmas:
            ancestors_constraints = prep_constraints.get(ancestors_lemma, None)
            if ancestors_constraints:
                return ancestors_constraints
            
        childeren_lemmas = [x.text.lower() for x in spacy_token.children] #hope to find determiners here
        for childeren_lemma in childeren_lemmas:
            base_determiner = get_base_determiner(childeren_lemma)#conver determiners to canonical form
            if base_determiner: 
                return article_constraints[base_determiner]
                
        return None
            
    def get_word_lemma(self, word, pos, spacy_token=None):

        lemmas = self.vocab[pos].get(word, None)   

        if not lemmas:
            #maybe old orthography? try to replace ß with ss at the end of the stem 
            newform=re.sub(r'ß($|es$|t?en$|t?e$|t?e?t$|t?est$)',r'ss\1', word) 
            lemmas = self.vocab[pos].get(newform, None)   

        if not lemmas:
            return None
            
        n_unique_lemmas = len(set([x['lemma'] for x in lemmas])) #count unique lemmas, e.g. 'konzentriert' will have to records: one for the infinitive and one for the Partizip II
        
        if n_unique_lemmas>1 and spacy_token:

            if pos=='N':
                
                constraints = self.get_noun_constraints(spacy_token)
                if constraints:
                    lemmas = [lemma for lemma in lemmas if (lemma['genus'],lemma['connection']) in constraints]
                
            elif pos=='V':
                pass
                
        lemmas = list(set([x['lemma'] for x in lemmas])) #remove all meta info, take unique lemmas
        
        if not lemmas:
            return None
            
        elif len(lemmas)>1:
            return self.get_most_frequent_word(lemmas)
            
        else:
            
            return lemmas[0]

    def __call__(self, word=None, pos=None, spacy_token=None):

        lemma = None

        if not word:
            word, pos = spacy_token.text, spacy_token.pos_

        word = word.lower()

        for pos_tag in ('N','V','ADJ','ADV'):
            if pos.startswith(pos_tag):
                pos = pos_tag

        lemma = self.get_word_lemma(word, pos, spacy_token=spacy_token)

        if not lemma:
            if pos=='N':
                #if the given wordform not found in the dictionary
                if 'NOUN_morpho' in self.unknown_strategy:
                    #first try to get the lemma by decomposing the word into the prefixes and the root
                    lemma = get_noun_lemma_morpho(word, lambda x:self.get_word_lemma(x, pos, spacy_token=spacy_token), use_longest_subword=self.longest_subword_ukn) 
                if not lemma and 'NOUN_fwds' in self.unknown_strategy:
                    #then try to get the lemma by finding a subword that is in the dictionary
                    lemma = get_noun_lemma_fwdsearch(word, lambda x:self.get_word_lemma(x, pos, spacy_token=spacy_token), use_longest_subword=self.longest_subword_ukn)

        if pos=='N':
            
            if not lemma:
                if self.nouns_nbc:
                    constraints = self.get_noun_constraints(spacy_token)
                    lemma = self.nouns_nbc(word, constraints)
                else:
                    lemma = self.nouns_stat_rules(word)
                
            if lemma:
                lemma = lemma.title()
            
        return lemma

In [92]:
nouns_nbc_path='data/nouns-nbc-top100.pickle'
nouns_nbc_path=None

lemmatizer = GLemma('./data', unknown_strategy='skip',
                    wordfreq_csv='data/third-party/FrequencyWords/content/2018/de/de_full.txt',
                    nouns_nbc_path=nouns_nbc_path)

In [16]:
def get_spacy_model(language='en'):

    import spacy

    language = {
        'en': 'en_core_web_lg',
        'fr': 'french',
        'de': 'de_core_news_lg',
    }[language]
    return spacy.load(language) 

spacy_model = get_spacy_model('de')

## Test TIGER

In [17]:
tiger_dataset = 'data/third-party/tiger_release_aug07.corrected.16012013.xml'

In [18]:
def read_tiger():
    
    sentences = []

    with open(tiger_dataset,'r', encoding='iso-8859-15') as f:
        while True:
            line = f.readline()
            while not '<terminals>' in line:
                line = f.readline()
                if '</corpus>' in line:
                    return sentences
            words = []
            while not '</terminals>' in line:
                line = f.readline()
                s = re.search(r'word="(\w+)" lemma="(\w+)" pos="(\w+)"',line)
                if s:
                    words.append(s.groups(0))
            if len(words)>0:
                sentences.append(words)

tiger_sentences = read_tiger()

In [19]:
len(tiger_sentences)

49827

In [94]:
tiger_res = []

for idx,sentence in enumerate(tiger_sentences):
    words, lemmas, pos = zip(*sentence)
    text = ' '.join(words)
    doc = spacy_model(text)
    if len(doc)==len(lemmas):
        for token, tiger_word, tiger_lemma, tiger_pos in zip(doc,words,lemmas,pos):
             if tiger_word == token.text:
                if token.pos_=='NOUN':
                   lemma = lemmatizer(spacy_token=token)
                   #lemma = None
                   tiger_res.append((text,tiger_word, tiger_pos, tiger_lemma, token.pos_, token.lemma_, lemma))
    if (idx+1)%2000==0:
        print(idx)
        break

tiger_res = pd.DataFrame(tiger_res, columns = ['sentence','word','tiger_pos','tiger_lemma','spacy_pos','spacy_lemma','pred_lemma'])

1999


In [95]:
tiger_res = pd.DataFrame(tiger_res, columns = ['sentence','word','tiger_pos','tiger_lemma','spacy_pos','spacy_lemma','pred_lemma'])

In [96]:
tiger_res = pd.DataFrame(tiger_res, columns = ['sentence','word','tiger_pos','tiger_lemma','spacy_pos','spacy_lemma','pred_lemma'])

In [97]:
tiger_res = tiger_res[~tiger_res.tiger_lemma.str.endswith('ß')]
tiger_res = tiger_res[~tiger_res.tiger_lemma.apply(lambda x:x.lower()==x)]
tiger_res = tiger_res[~tiger_res.tiger_lemma.apply(lambda x:x.upper()==x)]

In [98]:
((tiger_res.tiger_pos.str.startswith('N')) & (tiger_res.spacy_pos=='NOUN')).mean()

1.0

In [99]:
(tiger_res.tiger_lemma==tiger_res.pred_lemma).mean()

0.9763926874910033

In [100]:
(tiger_res.tiger_lemma==tiger_res.spacy_lemma).mean()

0.9959694832301713

In [259]:
fails_df = tiger_res[(tiger_res.tiger_lemma!=tiger_res.pred_lemma)&(tiger_res.tiger_lemma==tiger_res.spacy_lemma)].drop_duplicates(subset=['word','tiger_lemma'])
fails_df.loc[~fails_df.pred_lemma.isna(),['sentence','word','tiger_lemma','pred_lemma']]

Unnamed: 0,sentence,word,tiger_lemma,pred_lemma
203,Während die neuen Tiger Asiens wirtschaftlich vorbeizogen kam der Gigant dessen Geschäftsleute nur im Ausland demonstrieren konnten was in ihnen steckt nicht voran,Geschäftsleute,Geschäftsleute,Geschäftsmann
241,Er weiß um die Ungeduld seiner Landsleute für die notwendige Strukturanpassungen erst einmal Gürtel enger schnallen heißen,Landsleute,Landsleute,Landsmann
340,FR Auf die Wahlerfolge der rechtsradikalen Parteien haben die Etablierten in den Parlamenten und viele Medien bisher mit einer Strategie gezielter Ignoranz reagiert,Medien,Medium,Media
831,Ungefähr die Hälfte des Schadens soll durch Betriebsfremde verursacht sein,Schadens,Schaden,Schade
1244,Am April demonstrierten dort bis fünfhundert Kriegsinvaliden gegen die Kürzung der ihnen zustehenden Lebensmittelrationen,Kriegsinvaliden,Kriegsinvalider,Kriegsinvalide
1335,Sie benutzte die Lumpen Asozialen Dealer Messerstecher Schmarotzer und Geier für die eigenen Ziele,Lumpen,Lump,Lumpen
2324,1960 hatte seine Familie ihren Besitz in die örtliche LPG einbringen müssen die nun Agrar GmbH heißt und nur noch wenige der einst 400 Menschen beschäftigt zumeist in Arbeitsbeschaffungsmaßnahmen,GmbH,GmbH,Gmbh
2476,Fachleute sind sich sicher,Fachleute,Fachleute,Fachmann
3336,Russen und Japaner besiedelten die Inseln zu Beginn des Jahrhunderts gemeinsam und rotteten gemeinsam das dort lebende aus,Russen,Russe,Russ
3854,Schlechtes Orientierungsvermögen zeigten zwei dänische Seeleute beim Auffinden ihres Schiffes,Seeleute,Seeleute,Seemann


In [175]:
sentence='Ich gehe zum Haus'


doc = spacy_model(sentence)

In [176]:
for spacy_token in doc:
    if spacy_token.pos_=='NOUN' and spacy_token.text=='Haus':
        lemma = lemmatizer(spacy_token=spacy_token)
        print(token, spacy_token.dep_, spacy_token.head, list(spacy_token.children),list(spacy_token.ancestors), lemma)
        break

setzt nk zum [] [zum, gehe] Haus


## Test HDT

In [102]:
hdt_dataset = 'data/third-party/UD_German-HDT/de_hdt-ud-train-a-1.conllu'

In [103]:
def read_hdt():
    
    sentences = []
    words = []

    prev_sent_id = ''
    
    with open(hdt_dataset,'r', encoding='utf-8') as f:
        for line in f:
            if 'sent_id' in line:
                sent_id = re.search(r'sent_id = (.+)',line).groups(1)
                if sent_id!=prev_sent_id:
                    if words:
                        sentences.append(words)
                    words = []
                    prev_sent_id = sent_id
            else:
                s = re.match(r'[0-9]+\t([\w]+)\t([\w]+)\t([\w]+)',line)
                if s:
                    words.append(s.groups(0))

    return sentences

sentences_hdt = read_hdt()

In [109]:
hdt_res = []

for idx,sentence in enumerate(sentences_hdt):
    words, lemmas, pos = zip(*sentence)
    text = ' '.join(words)
    doc = spacy_model(text)
    if len(doc)==len(lemmas):
        for token, hdt_word, hdt_lemma, hdt_pos in zip(doc,words,lemmas,pos):
             if hdt_word == token.text:
                if token.pos_=='NOUN':
                   lemma = lemmatizer(spacy_token=token)
                   hdt_res.append((text,hdt_word, hdt_pos, hdt_lemma, token.pos_, token.lemma_, lemma))
    if (idx+1)%2000==0:
        print(idx)
        break

1999


In [110]:
hdt_res = pd.DataFrame(hdt_res, columns = ['sentence','word','hdt_pos','hdt_lemma','spacy_pos','spacy_lemma','pred_lemma'])

In [111]:
((hdt_res.hdt_pos.str.startswith('N')) & (hdt_res.spacy_pos=='NOUN')).mean()

0.943731643221518

In [112]:
(hdt_res.hdt_lemma==hdt_res.pred_lemma).mean()

0.7367444736435307

In [113]:
(hdt_res.hdt_lemma==hdt_res.spacy_lemma).mean()

0.7633328180553408

In [265]:
fails_df = hdt_res[(hdt_res.hdt_lemma!=hdt_res.pred_lemma)&(hdt_res.hdt_lemma==hdt_res.spacy_lemma)].drop_duplicates(subset=['word','hdt_lemma'])
fails_df.loc[~fails_df.pred_lemma.isna(),['sentence','word','hdt_lemma','pred_lemma']]

Unnamed: 0,sentence,word,hdt_lemma,pred_lemma
153,Die RegTP erteilte die Genehmigung weil ihr die bisher vorliegenden Daten nicht ausreichend erschienen um das abschließend beurteilen zu können,Daten,Datum,Daten
160,Nutzer des können für Mark monatlich einschließlich und für normale Werktage an und Feiertagen kostenlos in dem gesamten deutschen telefonieren,können,können,Können
183,Letztlich dürfte es sich bei dem Gerücht lediglich um eine Begleiterscheinung der Talfahrt des Kurses der handeln,handeln,handeln,Handeln
331,Die Bundesregierung spricht sich desweiteren dafür aus dass die Abgaben bei wegen der mehrfachen Überschreibbarkeit der Medien höher ausfallen sollten als bei da hier nur eine einmalige Vervielfältigungsmöglichkeit gegeben sei,Medien,Medium,Medien
408,Geplant ist gleich eine ganze Serie von Spielen zu veröffentlichen die sich nach Aussage von Don Mattrick President der EA worldwide studios eng an den Romanvorlagen und an dem demnächst erscheinenden Harry orientieren,Spielen,Spiel,Spielen
433,Unter kann jeder Interessierte neue eingeben und beschreiben,Interessierte,Interessierter,Interessierte
524,Der Kopierschutz besteht nach Angaben von Intel aus einem das die sichere Verwaltung von Musik auf dem PC und deren Übertragung auf tragbare Audiogeräte und Speichermedien ermöglichen soll,PC,PC,Pc
544,Bei dem zweiten Modul handelt es sich um einen Secure Music Manager der alle Funktionen umfassen soll die benötigt werden um die einzuhalten,einzuhalten,einhalten,einzuhalten
713,Nach dem März können sich für flat den Pauschaltarif für und nicht mehr neu anmelden,flat,flat,Flat
861,Ausbau des Kabelnetzes in für,Kabelnetzes,Kabelnetz,Kabelnetzes


## Statistics

In [2]:
with open('data/nouns.json', 'rt', encoding='UTF-8') as json_file:
    nouns = json.load(json_file)

wordfreq = pd.read_csv('data/third-party/FrequencyWords/content/2018/de/de_full.txt', sep=' ', names=['word','freq']).set_index('word').freq.sort_values(ascending=False) #sort by frequency
wordfreq = wordfreq.to_dict()

In [31]:
n_wordforms = len(nouns)

print(f'Total wordforms in the dictionary: {n_wordforms}')

lemmas = [[lemma['lemma'] for lemma in lemmas] for wordform,lemmas in nouns.items()]

n_lemmas = len(set([x for y in lemmas for x in y]))

print(f'Total unique lemmas in the dictionary: {n_lemmas}')

n_lemmas_per_wordform = [len(set(x)) for x in lemmas]

Total wordforms in the dictionary: 284144
Total unique lemmas in the dictionary: 99072


In [45]:
wordforms = list(nouns.keys())

wordforms_freq = [wordfreq.get(wordform,0) for wordform in wordforms]

freq_counts = defaultdict(list)

for wordform_freq,lemmas_per_wordform in zip(wordforms_freq,n_lemmas_per_wordform):
    freq_counts[lemmas_per_wordform].append(wordform_freq)

freq_counts = {lemmas_per_wordform:np.mean(wordform_freq).astype(int) for lemmas_per_wordform, wordform_freq in freq_counts.items()}

In [54]:
unique, unique_counts = np.unique(n_lemmas_per_wordform,return_counts=True)

df = pd.DataFrame({'unique lemmas':unique,'count':unique_counts})
df['wordform freq'] = df['unique lemmas'].map(freq_counts)
df

Unnamed: 0,unique lemmas,count,wordform freq
0,1,282882,227
1,2,1205,3319
2,3,55,1259
3,4,1,26
4,5,1,178


In [5]:
#wordforms with more than 3 possible lemmas

res = []
wordforms = list(nouns.keys())
for  idx,x in enumerate(lemmas):
    if len(set(x))>3:
        print(wordforms[idx],set(x))

alben {'alb', 'album', 'albe', 'alba', 'alben'}
folien {'folio', 'folia', 'folie', 'folium'}


In [26]:
#number of wordforms with ambiguos lemmas even if declination and gender is known
ambiguous_singular = set()
ambiguous_plural = set()

for wordform,lemmas in nouns.items():
    res = defaultdict(set)
    for lemma in lemmas:
        connection, genus = lemma['connection'],lemma['genus']
        res[(connection, genus)].add(lemma['lemma'])
        if len(res[(connection, genus)])>1:
            if 'Singular' in connection:
                ambiguous_singular.add(wordform)
            else:
                ambiguous_plural.add(wordform)

print(f'wordforms which admit more than 1 lemma: {len(ambiguous_singular.union(ambiguous_plural))}')
print(f'singular wordforms which admit more than 1 lemma: {len(ambiguous_singular)}')
print(f'plural wordforms which admit more than 1 lemma: {len(ambiguous_plural)}')

wordforms which admit more than 1 lemma: 448
singular wordforms which admit more than 1 lemma: 149
plural wordforms which admit more than 1 lemma: 358
