In [4]:
import sys
import re
from collections import defaultdict,namedtuple
import json
import os

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)

parser_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/bio/c2/resources/tools/wiktionary-de-parser'
sys.path.append(parser_dir)

from wiktionary_de_parser.dump_processor import WiktionaryDump
from wiktionary_de_parser import WiktionaryParser
from pprint import pprint

# Generate verb lookup table

In [5]:
def is_german(wiki_record):
    """Return True when the parsed Wiktionary entry is tagged as German ('Deutsch')."""
    language_name = wiki_record.language.lang
    return language_name == 'Deutsch'

def get_flexion_field(wiki_record, field_name):
    """Return the cleaned word form stored under `field_name` in the flexion table.

    Surrounding whitespace is stripped and a leading usage remark
    (e.g. ''selten:'' or ''militärisch:'') is removed.
    Returns None when the flexion table has no such field.
    """
    flexion_table = wiki_record.flexion
    if field_name not in flexion_table:
        return None
    raw_form = flexion_table[field_name].strip()
    # everything up to and including the remark terminator ":'' " is dropped
    return re.sub(r".*:'' ", r"", raw_form)

In [6]:
# Flexion fields from which word forms are collected.
base_categories = [
    'Präsens_ich',
    'Präsens_du',
    'Präsens_er, sie, es',
    'Präteritum_ich',
    'Konjunktiv II_ich',
    'Imperativ Singular',
    'Imperativ Plural',
]

# Each base field may occur with up to three trailing asterisks (alternative forms);
# 'Imperativ Singular 2' is the only extra field name without the asterisk convention.
categories = [
    field + stars
    for field in base_categories
    for stars in ('', '*', '**', '***')
]
categories.append('Imperativ Singular 2')

In [7]:
def is_verb(wiki_record):
    """Tell whether the entry is a plain verb or an auxiliary verb with flexion data.

    The result is truthy only when the part-of-speech dict has a 'Verb' key whose
    value is [] or ['Hilfsverb'] and the entry carries a flexion table.
    A falsy `pos` (None, {}) is returned unchanged.
    """
    pos = wiki_record.pos
    if not pos:
        return pos
    if 'Verb' not in wiki_record.pos:
        return False
    if wiki_record.pos['Verb'] not in ([], ['Hilfsverb']):
        return False
    # we can't do much without any flexion information
    return wiki_record.flexion is not None

def stem_verb(verb):
    """Strip a common verb ending (-en, -n, -e, -et, -t, -est, -st) from `verb`.

    An 's' directly before a final 'st' is treated as part of the root and kept,
    e.g. lassen -> lass.
    """
    ending_pattern = r'e?n$|e$|e?t$|est$|([^s])st$'
    # \1 restores the character captured in front of 'st'; it is empty for the other endings
    return re.sub(ending_pattern, r'\1', verb)

def generate_verbforms(verb, lemma, category):
    '''
    Add verb forms that are usually absent in the Wiktionary flexion entry.

    Parameters
    ----------
    verb : str
        Word form taken from the flexion field `category` (any separable
        prefix is expected to be attached already).
    lemma : str
        Infinitive of the verb; used to pick special cases.
    category : str
        Name of the flexion field. It is matched with `startswith`, so the
        asterisk variants of a field (e.g. 'Präsens_ich*') are treated like
        the field itself.

    Returns
    -------
    list of str
        `verb` itself followed by the derived forms (duplicates are possible).
        For categories other than 'Präsens_ich', 'Präteritum_ich' and
        'Konjunktiv II_ich' only `[verb]` is returned. An empty list is
        returned for the Konjunktiv II of 'fahren'.
    '''
    wordforms = [verb,]

    if category.startswith('Präsens_ich'):

        if lemma == 'sein':

            wordforms += ['sei','seiest','seist','seiet', 'seien', 'sind'] # Konjunktiv I (subjunctive I) forms and indicative 3rd person plural

        elif lemma in ('können', 'sollen', 'müssen', 'dürfen', 'wollen', 'mögen'):

            # modal verbs: forms are built from the infinitive, e.g. könne, könnest, könnet, könnt
            wordforms += [lemma[:-1],lemma[:-1]+'st',lemma[:-1]+'t', lemma[:-2]+'t']

        elif lemma.endswith('ern') or lemma.endswith('eln'): 
            # e.g. wandern, sammeln
            if verb.endswith('ere') or verb.endswith('ele'):
                # the final 'e' is dropped before the ending, e.g. wandere -> wanderst, wandert
                wordforms += [verb[:-1]+'st', verb[:-1]+'t']  # indicative 2nd person plural, Konjunktiv I 2nd person singular/plural
            elif verb.endswith('le'):
                # contracted stem, e.g. sammle -> sammlest, sammlet
                wordforms += [verb+'st', verb+'t']  # Konjunktiv I 2nd person singular/plural, alternative forms
        else:

            wordforms += [verb+'st', verb+'t']  # Konjunktiv I 2nd person singular and plural

            # Strip the final 'e' to get the stem, but keep it when it is preceded by t or d,
            # or by m/n that follows one of the consonants listed in the character class.
            # NOTE: `verb` is reassigned here, so the statements below see the stem.
            verb = re.sub(r'([^td])e$|([wtzpsdfghjkxvb][mn]e)$',r'\1\2',verb)

            wordforms += [verb+'st', verb+'t']  # indicative 2nd person plural

        if lemma.endswith('auern'):
            # e.g. bedauern, kauern
            wordforms += [re.sub(r'auere$',r'aure',verb)] # contracted form: bedauere --> bedaure

        if lemma == 'werden':
            # hard-coded extra form for 'werden'
            wordforms += ['worden']

    elif category.startswith('Präteritum_ich'):

        if verb[-1]=='e':
            # regular + mixed verbs
            # e.g. machen, denken
            wordforms += [verb+'st', verb+'t', verb+'n']
        elif verb[-1] in ('s','ß','z'):
            # e.g. lassen, schmelzen, blasen
            wordforms += [verb+'est', verb+'t', verb+'en']  
        elif re.search(r'([td]$)|([wtzpsdfghjkxvb][mn]$)', verb):
            # e.g. halten, finden
            wordforms += [verb+'est', verb+'st', verb+'et', verb+'en']  
        else:
            # irregular verbs
            # e.g. sprechen
            wordforms += [verb+'st', verb+'t', verb+'en']  

    elif category.startswith('Konjunktiv II_ich'):

        if lemma=='fahren':
            # don't add Konjunktiv II for 'fahren' to avoid confusion with the indicative of 'führen'
            return []

        wordforms += [verb+'st', verb+'t', verb+'n'] # Konjunktiv II 2nd person singular, 2nd/3rd person plural

    return wordforms
    
# Record attached to every word form: the lemma it belongs to, how the form is
# connected to the lemma (None, 'Partizip II' or 'zu-inf') and, for participles,
# the auxiliary verbs. Created once instead of on every call; records from
# different calls compare and hash as plain tuples.
_VerbLemma = namedtuple('lemma', 'lemma connection via')


def get_verb_forms(wiki_record):
    """Collect all word forms of a Wiktionary verb entry and map them to its lemma.

    Parameters
    ----------
    wiki_record
        Parsed Wiktionary entry; `lemma.lemma`, `flexion` and `name` are read.

    Returns
    -------
    (verblemmas, prefix)
        `verblemmas` maps each word form to a set of `(lemma, connection, via)`
        named tuples; `prefix` is the separable prefix found in the flexion
        table, or None. `({}, None)` is returned when the entry is skipped:
        the lemma contains a space, a word form contains characters other than
        word characters and spaces, or a form consists of more than two parts.
    """
    verblemmas = defaultdict(set)

    lemma = wiki_record.lemma.lemma  # word lemma

    if ' ' in lemma:
        # we ignore the cases where the prefix is not attached to the verb in subordinate clauses
        # e.g. frei geben, bekannt machen
        # dependency parsers like SpaCy can't recognize that these are both parts of the same verb anyway
        return {}, None

    verblemmas[lemma].add(_VerbLemma(lemma, None, ()))

    is_separable, prefix = False, None

    for category in categories:

        # category with asterisk for alternative forms, e.g. ich anerkenne, ich erkenne an
        if category not in wiki_record.flexion:
            continue

        verb = get_flexion_field(wiki_record, category)

        # raw string: '\w' in a normal string literal is an invalid escape sequence
        if not re.match(r'^[\w ]+$', verb):
            print(f'Unrecognized characters in wordform {verb} for {lemma}')
            return {}, None

        if ' ' in verb:  # separable verb

            verb_split = verb.split()

            if len(verb_split) != 2:
                # we don't treat cases with more than 1 prefix, e.g. wiederherstellen
                print(f'Verb morphology not identified for {wiki_record.name}')
                return {}, None

            is_separable = True

            verb, prefix = verb_split
            # attach the prefix to the word without any space in-between, as used in subordinate clauses
            verb = prefix + verb

        # all possible word forms derived from this form in this category
        for wordform in generate_verbforms(verb, lemma, category):
            verblemmas[wordform].add(_VerbLemma(lemma, None, ()))  # assign lemma to each wordform

    if 'Partizip II' in wiki_record.flexion:
        partizip_II = get_flexion_field(wiki_record, 'Partizip II')
        hilfs_verbs = []
        for hilfsverb_cat in ('Hilfsverb', 'Hilfsverb2', 'Hilfsverb*'):
            hilfsverb = get_flexion_field(wiki_record, hilfsverb_cat)
            if hilfsverb:
                hilfs_verbs.append(hilfsverb)
        verblemmas[partizip_II].add(_VerbLemma(lemma, 'Partizip II', tuple(hilfs_verbs)))

    if is_separable:
        # add the zu-infinitive form used in subordinate clauses (prefix + 'zu' + base verb);
        # the prefix is stripped with plain string operations instead of an unescaped regex
        base_verb = lemma[len(prefix):] if lemma.startswith(prefix) else lemma
        verblemmas[prefix + 'zu' + base_verb].add(_VerbLemma(lemma, 'zu-inf', ()))

    return verblemmas, prefix

In [8]:
def is_prefix(wiki_record):
    """Return True when the entry's part of speech is exactly an affix of type 'Präfix'."""
    prefix_pos = {'Affix': ['Präfix']}
    return wiki_record.pos == prefix_pos

In [9]:
# Handle on the German Wiktionary dump stored inside the parser checkout;
# the cells below iterate over it with dump.pages().
dump = WiktionaryDump(
    dump_file_path=parser_dir + "/wiktionary_german/dewiktionary-latest-pages-articles-multistream.xml.bz2"
)

In [10]:
def get_verbs(dump):
    """Build the word-form -> lemma lookup table from all German verb entries of the dump.

    Returns a plain dict mapping every word form to a list of dicts with the
    keys 'lemma', 'connection' and 'via'. Progress is printed every 1000
    processed verb entries.
    """
    parser = WiktionaryParser()
    verbs = defaultdict(set)
    n_records = 0

    for page in dump.pages():
        if page.redirect_to:
            continue

        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if not (is_german(wiki_record) and is_verb(wiki_record)):
                continue

            # the separable prefix returned alongside is not needed here
            wordlemmas, sep_prefix = get_verb_forms(wiki_record)
            for wordform, lemma_records in wordlemmas.items():
                verbs[wordform] = verbs[wordform] | lemma_records

            n_records += 1
            if n_records % 1000 == 0:
                print(f'{n_records} records processed')

    # defaultdict with sets of named tuples -> dict with lists of dicts (JSON-serialisable)
    return {
        wordform: [lemma_record._asdict() for lemma_record in lemma_records]
        for wordform, lemma_records in verbs.items()
    }

In [11]:
def get_prefixes(dump):
    """Collect the German prefixes listed in the Wiktionary dump.

    Every non-redirect page is parsed; entries that are German and whose part
    of speech is a prefix contribute their lemma with all hyphens removed.
    Lemmas in title case (starting with a capital) are skipped.

    Returns
    -------
    list of str
        The unique prefixes in sorted order, so that repeated runs give the
        same result.
    """
    # Own parser instance, as in get_verbs: the function previously relied on a
    # global `parser` that only exists if a later notebook cell has been run.
    parser = WiktionaryParser()

    prefixes = set()

    for page in dump.pages():
        if page.redirect_to:
            continue

        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if not (is_german(wiki_record) and is_prefix(wiki_record)):
                continue
            lemma = wiki_record.lemma.lemma.replace('-', '')  # drop the hyphen(s), e.g. the final one in 'ab-'
            if not lemma.istitle():
                # ignore a few prefixes starting with a capital
                prefixes.add(lemma)

    return sorted(prefixes)

In [12]:
verbs = get_verbs(dump)

Verb morphology not identified for spulgen
Unrecognized characters in wordform -fiziere for -fizier
Unrecognized characters in wordform -ele for -el
1000 records processed
Unrecognized characters in wordform -ige for -ig
2000 records processed
Verb morphology not identified for wiederherstellen
3000 records processed
4000 records processed
5000 records processed
Unrecognized characters in wordform -iere for -ier
Unrecognized characters in wordform e-maile for e-mailen
Unrecognized characters in wordform (schneibte) for schneiben
Unrecognized characters in wordform -ze for -zen
Unrecognized characters in wordform (dergab) for dergeben
Unrecognized characters in wordform (sprang umhin) for umhinspringen
Unrecognized characters in wordform (derfror) for derfrieren
Unrecognized characters in wordform (bätzte) for bätzen
Unrecognized characters in wordform (bätzte aus) for ausbätzen
Unrecognized characters in wordform (bätzte anhin) for anhinbätzen
Unrecognized characters in wordform (zerbä

In [13]:
with open('data/verbs.json', 'wt', encoding='UTF-8') as json_file:
    json.dump(verbs, json_file, ensure_ascii=False)

In [None]:
#prefixes = get_prefixes(dump)

In [None]:
#with open('data/prefixes.json', 'wt', encoding='UTF-8') as json_file:
#    json.dump(prefixes, json_file, ensure_ascii=False)

# Unknown verbs

In [271]:
# Debugging aid: print the parsed Wiktionary record of a single page.
# The cell deliberately does not touch `verbs` / `prefixes`: re-initialising
# them here would silently wipe the lookup tables built above.
parser = WiktionaryParser()

page_names = []  # names of all non-redirect pages seen before the target page

for page in dump.pages():
    if page.redirect_to:
        continue

    page_names.append(page.name)

    if page.name in ("entwickeln",):
        # only the first entry of the page is shown
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            pprint(wiki_record)
            print('Hilfsverb' in wiki_record.flexion)
            break
        break

ParsedWiktionaryPageEntry(name='entwickeln', hyphenation=['ent', 'wi', 'ckeln'], flexion={'Präsens_ich': 'entwickle', 'Präsens_du': 'entwickelst', 'Präsens_er, sie, es': 'entwickelt', 'Präteritum_ich': 'entwickelte', 'Partizip II': 'entwickelt', 'Konjunktiv II_ich': 'entwickelte', 'Imperativ Singular': 'entwickle', 'Imperativ Plural': 'entwickelt', 'Hilfsverb': 'haben'}, ipa=['ɛntˈvɪkl̩n'], language=Language(lang='Deutsch', lang_code='de'), lemma=Lemma(lemma='entwickeln', inflected=False), pos={'Verb': []}, rhymes=['ɪkl̩n'])
True


In [64]:
with open('data/verbs.json', 'rt', encoding='UTF-8') as json_file:
    verbs = json.load(json_file)

with open('data/prefixes.json', 'rt', encoding='UTF-8') as json_file:
    prefixes = json.load(json_file)

In [68]:
for k,v in verbs.items():
    if len(v)>1:
        print(k,v)

# Lemmatizer class

In [16]:
def get_verb_lemma_morpho(word, query_verb_dict_fnc, prefixes_list, use_longest_subword=True):
    """Lemmatize an unknown verb form by splitting off known prefixes.

    Parameters
    ----------
    word : str
        Word form to analyse.
    query_verb_dict_fnc : callable
        Maps a word form to its lemma, or to a falsy value when unknown.
    prefixes_list : iterable of str
        Known verb prefixes.
    use_longest_subword : bool, default=True
        When several decompositions are found, return the lemma built from the
        longest dictionary subword. When False, a lemma is returned only if
        all decompositions agree.

    Returns
    -------
    str or None
        The composed lemma (prefixes + lemma of the root), or None when no
        decomposition is found or the result is ambiguous.
    """
    # a set makes every prefix test O(1) instead of scanning the list each time
    known_prefixes = frozenset(prefixes_list)

    def get_verb_lemma_recursive(word, morphemes=()):
        '''
        Recursively detect prefixes and return all possible (lemma, subword) pairs
        '''
        res = []

        base_lemma = query_verb_dict_fnc(word)

        if base_lemma:
            if len(morphemes) > 1 and morphemes[-1] == 'zu' and base_lemma == word:
                # suspect a zu-infinitive: some prefix + zu + infinitive
                # e.g. abzuheben, aufzuatmen
                res.append((''.join(morphemes[:-1]) + base_lemma, word))  # add without "zu"
            else:
                # compose the infinitive out of the collected prefixes and the base lemma
                res.append((''.join(morphemes) + base_lemma, word))

        # look for the next prefix which ends at prefix_end_idx;
        # at least three characters are left for the remaining word
        for prefix_end_idx in range(1, len(word) - 2):
            prefix = word[:prefix_end_idx]
            if prefix in known_prefixes:
                # detach the prefix, analyse the rest of the word
                res.extend(get_verb_lemma_recursive(word[prefix_end_idx:], morphemes + (prefix,)))

        return res

    candidates = get_verb_lemma_recursive(word)

    if not candidates:
        return None

    # longest subword first; the sort is stable, so ties keep their discovery order
    candidates.sort(key=lambda candidate: -len(candidate[1]))
    lemmas = [lemma for lemma, _subword in candidates]

    if use_longest_subword or len(set(lemmas)) == 1:
        # taking the longest subword is allowed, or all possible splits lead to the same lemma
        return lemmas[0]

    return None

In [17]:
def get_verb_lemma_fwdsearch(word, query_verb_dict_fnc, use_longest_subword=True):
    """Lemmatize an unknown verb form by forward search, without a prefix list.

    Leading letters are cut off one by one until the remainder is found by
    `query_verb_dict_fnc`; the cut-off letters are then put in front of the
    lemma of the remainder. Returns the first candidate (the one with the
    longest remainder) when `use_longest_subword` is True or all candidates
    agree, otherwise None.
    """
    candidates = []

    for cut in range(len(word) - 2):
        tail = word[cut:]
        tail_lemma = query_verb_dict_fnc(tail)
        preceded_by_zu = cut > 2 and word[cut - 2:cut] == 'zu'

        if preceded_by_zu and tail_lemma == tail:
            # suspect a zu-infinitive: some prefix + zu + infinitive
            # e.g. abzuheben, aufzuatmen -> compose the lemma without "zu"
            candidates.append(word[:cut - 2] + tail_lemma)
        elif tail_lemma and not tail_lemma.startswith('zu'):
            candidates.append(word[:cut] + tail_lemma)

    if not candidates:
        return None

    if use_longest_subword or len(set(candidates)) == 1:
        # taking the longest subword is allowed, or all possible splits lead to the same lemma
        return candidates[0]

    return None

In [18]:
class GLemma():

    """Wiktionary-based German lemmatizer.

    Provides a lemma for a given word given its POS tag. Only verbs (POS tags
    starting with 'V') are handled by this implementation; for every other
    tag the lemmatizer returns None.

    Parameters
    ----------
    lemmatizer_data_path : str
        Directory with the lemmatizer dictionaries ('verbs.json', 'prefixes.json').

    unknown_strategy : str, default='skip'
        How to treat words not represented in the dictionary.
        When it contains the string 'VERB_morpho', the verb is decomposed into
        known prefixes and a known root. When it contains 'VERB_fwds', the
        word is searched for a subword present in the dictionary and the lemma
        of that subword is used. Both can be combined ('VERB_morpho;VERB_fwds');
        the morphological analysis is then tried first.
        When multiple lemmas for the given word are possible, None is returned
        unless the longest_subword_ukn flag is True.

    longest_subword_ukn : bool, default=True
        Use the lemma of the longest subword for unknown words.

    wordfreq_csv : str, default=None
        A space-separated file with approximate word frequencies. When multiple
        lemmas for a given word form are possible, the most frequent lemma is taken.
        Does not have to be a lemma list.

    Examples
    --------
    >>> lemmatizer = GLemma('./data', unknown_strategy='VERB_morpho;VERB_fwds',
    ...                 wordfreq_csv='data/third-party/FrequencyWords/content/2018/de/de_full.txt')
    >>> lemmatizer('vermalt','VERB')
    'vermalen'

    Notes
    -----
    If a verb prefix is separated in the sentence, it should be attached to the root before the lemmatization:
    Ich hole dich ab --> lemmatizer('abhole','VERB')
    """

    def __init__(self, lemmatizer_data_path, unknown_strategy='skip', longest_subword_ukn=True, wordfreq_csv=None):

        with open(os.path.join(lemmatizer_data_path,'verbs.json'), 'rt', encoding='UTF-8') as json_file:
            self.verbs = json.load(json_file)

        with open(os.path.join(lemmatizer_data_path,'prefixes.json'), 'rt', encoding='UTF-8') as json_file:
            self.prefixes = json.load(json_file)

        if wordfreq_csv:
            self.wordfreq = pd.read_csv(wordfreq_csv, sep=' ', names=['word','freq']).set_index('word').freq.sort_values(ascending=False) #sort by frequency
            self.wordfreq = self.wordfreq.to_dict()
        else:
            self.wordfreq = None

        self.unknown_strategy = unknown_strategy
        self.longest_subword_ukn = longest_subword_ukn

    def get_most_frequent_word(self, lemmalist):
        """Return the entry of `lemmalist` with the highest frequency.

        Returns None when no frequency table is loaded or none of the words
        is listed in it. On equal frequencies the first entry wins.
        """
        if not self.wordfreq:
            return None

        freqs = [self.wordfreq.get(word, np.nan) for word in lemmalist]
        if all(np.isnan(freqs)):
            return None

        # at least one word is in the frequency dictionary
        return lemmalist[np.nanargmax(freqs)]

    def get_verb_lemma(self, verb, spacy_token=None):
        """Look up the lemma of a verb form in the dictionary.

        Parameters
        ----------
        verb : str
            Word form (lower case, separable prefix attached).
        spacy_token : optional
            Token whose `head.lemma_` is used to choose between several
            candidate lemmas (Perfekt / Passiv / Futur constructions).

        Returns
        -------
        str or None
            The lemma, or None when the form is unknown or the candidates
            cannot be disambiguated.
        """
        lemmas = self.verbs.get(verb, None)

        if not lemmas:
            #maybe old orthography? try to replace ß with ss at the end of the stem
            newform=re.sub(r'ß($|t?en$|t?e$|t?e?t$|t?est$)',r'ss\1', verb)
            lemmas = self.verbs.get(newform, None)

        if not lemmas:
            return None

        #count unique lemmas, e.g. 'konzentriert' will have two records:
        #one for the infinitive and one for the Partizip II
        n_unique_lemmas = len(set([x['lemma'] for x in lemmas]))

        if n_unique_lemmas>1 and spacy_token is not None:

            if spacy_token.head.lemma_ in ('haben','sein'):
                #Perfekt suspected

                n_hilfsverb = len(set([y for x in lemmas for y in x['via']]))

                if n_hilfsverb>1: #do we really have to choose between sein and haben?
                    lemmas = [x for x in lemmas if x['connection']=='Partizip II'
                                    and spacy_token.head.lemma_ in x['via']]

            elif spacy_token.head.lemma_=='werden':
                #Passiv or Futur suspected

                lemmas = [x for x in lemmas if x['connection']=='Partizip II'
                                or x['lemma']==verb]

            else:
                #no evidence for Perfekt or Passiv

                lemmas = [x for x in lemmas if not x['connection']=='Partizip II']

        #remove all meta info, take unique lemmas; sorted so that the choice between
        #equally frequent candidates does not depend on set iteration order
        lemmas = sorted(set([x['lemma'] for x in lemmas]))

        if not lemmas:
            return None

        if len(lemmas)>1:
            return self.get_most_frequent_word(lemmas)

        return lemmas[0]

    def __call__(self, word=None, pos=None, spacy_token=None):
        """Lemmatize `word` with POS tag `pos`, or the given spaCy token.

        When `word` is not given, both the word and the POS tag are read from
        `spacy_token`. When `word` is given without `pos`, the POS tag is
        taken from `spacy_token` if one is passed. Returns the lemma, or None
        when the word is not a verb or no lemma is found.
        """
        lemma = None

        if not word:
            word, pos = spacy_token.text, spacy_token.pos_
        elif pos is None and spacy_token is not None:
            pos = spacy_token.pos_

        if pos.startswith('V'):
            word = word.lower()
            lemma = self.get_verb_lemma(word, spacy_token=spacy_token)
            if not lemma:
                #if the given wordform not found in the dictionary
                if 'VERB_morpho' in self.unknown_strategy:
                    #first try to get the lemma by decomposing the word into the prefixes and the root
                    lemma = get_verb_lemma_morpho(word, lambda x:self.get_verb_lemma(x, spacy_token=spacy_token), self.prefixes, use_longest_subword=self.longest_subword_ukn)
                if not lemma and 'VERB_fwds' in self.unknown_strategy:
                    #then try to get the lemma by finding a subword that is in the dictionary
                    lemma = get_verb_lemma_fwdsearch(word, lambda x:self.get_verb_lemma(x, spacy_token=spacy_token), use_longest_subword=self.longest_subword_ukn)

        return lemma

In [19]:
# Lemmatizer used in the evaluations below: dictionaries are read from ./data,
# unknown verb forms are resolved with the forward subword search only ('VERB_fwds'),
# and the frequency list is used to choose between several candidate lemmas.
lemmatizer = GLemma('./data', unknown_strategy='VERB_fwds',
                    wordfreq_csv='data/third-party/FrequencyWords/content/2018/de/de_full.txt')

In [20]:
def get_spacy_model(language='en'):
    """Load the large spaCy pipeline for the given language code.

    Parameters
    ----------
    language : str, default='en'
        One of 'en', 'fr', 'de'.

    Returns
    -------
    The loaded spaCy pipeline.

    Raises
    ------
    KeyError
        If `language` is not one of the supported codes.
    """
    import spacy

    # spaCy pipelines are loaded by package name; 'french' is not a package name,
    # the French counterpart of the other two entries is 'fr_core_news_lg'
    model_names = {
        'en': 'en_core_web_lg',
        'fr': 'fr_core_news_lg',
        'de': 'de_core_news_lg',
    }
    return spacy.load(model_names[language])

spacy_model = get_spacy_model('de')

## Test TIGER

In [21]:
tiger_dataset = 'data/third-party/tiger_release_aug07.corrected.16012013.xml'

In [22]:
def read_tiger(path=None):
    """Read (word, lemma, pos) triples per sentence from a TIGER XML file.

    Parameters
    ----------
    path : str, optional
        TIGER XML file (ISO-8859-15 encoded). Defaults to the notebook-level
        `tiger_dataset`.

    Returns
    -------
    list of list of tuple
        One list per <terminals> section with the (word, lemma, pos) tuples of
        its tokens. Tokens whose word, lemma or pos contains characters other
        than word characters (e.g. punctuation) are skipped, and sections
        without any remaining token are dropped.

    Notes
    -----
    Reading stops at the </corpus> tag or at the end of the file, so a
    truncated file no longer makes the function loop forever. Tokens of an
    unterminated last section are kept.
    """
    if path is None:
        path = tiger_dataset

    token_re = re.compile(r'word="(\w+)" lemma="(\w+)" pos="(\w+)"')

    sentences = []
    words = []
    in_terminals = False

    with open(path, 'r', encoding='iso-8859-15') as f:
        for line in f:
            if not in_terminals:
                if '</corpus>' in line:
                    break
                if '<terminals>' in line:
                    words = []
                    # an opening and closing tag on the same line is an empty section
                    in_terminals = '</terminals>' not in line
                continue

            token = token_re.search(line)
            if token:
                words.append(token.groups())

            if '</terminals>' in line:
                if words:
                    sentences.append(words)
                words = []
                in_terminals = False

    if in_terminals and words:
        # file ended inside a <terminals> section
        sentences.append(words)

    return sentences

tiger_sentences = read_tiger()

In [23]:
# Lemmatize the verbs of the first 2000 TIGER sentences and collect, per verb,
# the gold annotation next to the spaCy and GLemma predictions.
tiger_res = []

for idx, sentence in enumerate(tiger_sentences):
    words, lemmas, pos = zip(*sentence)
    text = ' '.join(words)
    doc = spacy_model(text)

    # only sentences that spaCy tokenizes into the same number of tokens can be aligned
    if len(doc) == len(lemmas):
        for token, tiger_word, tiger_lemma, tiger_pos in zip(doc, words, lemmas, pos):
            if tiger_word != token.text or token.pos_ != 'VERB':
                continue
            lemma = lemmatizer(spacy_token=token)
            tiger_res.append((text, tiger_word, tiger_pos, tiger_lemma, token.pos_, token.lemma_, lemma))

    # stop after 2000 sentences
    if (idx + 1) % 2000 == 0:
        print(idx)
        break

1999


In [24]:
tiger_res = pd.DataFrame(tiger_res, columns = ['sentence','word','tiger_pos','tiger_lemma','spacy_pos','spacy_lemma','pred_lemma'])

In [25]:
((tiger_res.tiger_pos.str.startswith('V')) & (tiger_res.spacy_pos=='VERB')).mean()

0.9979570990806946

In [28]:
(tiger_res.tiger_lemma==tiger_res.pred_lemma).mean()

0.993190330268982

In [27]:
(tiger_res.tiger_lemma==tiger_res.spacy_lemma).mean()

0.989445011916922

In [290]:
fails_df = tiger_res[tiger_res.tiger_lemma!=tiger_res.pred_lemma]#.drop_duplicates(subset=['word','tiger_lemma'])
fails_df

Unnamed: 0,sentence,word,tiger_pos,tiger_lemma,spacy_pos,spacy_lemma,pred_lemma
98,Den Herren Rao und Singh gebührt ein Platz in ...,resumiert,VVFIN,resumieren,VERB,resumieren,
526,Zur Bedarfsdeckung müßten bis 2000 rund 90 000...,hinzukommen,VVINF,hinzukommen,VERB,hinzukommen,hinkommen
565,Erst mit Hilfe der aus anderen Orten herbeigee...,gelang,VVFIN,gelingen,VERB,gelingen,gelangen
1008,Doch die meisten der nach der Wende rund 4500 ...,verschuldet,ADJD,verschuldet,VERB,verschulden,verschulden
1147,Als positiv führen die Experten an daß Versorg...,führen,VVFIN,fahren,VERB,fahren,führen
1202,Aber dann spannt der Gigant die Muskeln die Fe...,spannt,VVFIN,spannen,VERB,spannen,spinnen
1228,Wie nicht anders zu erwarten rutschten die mei...,dotierte,ADJA,dotiert,VERB,dotiert,dotieren
1278,Von den Banken gelang es während der vergangen...,gelang,VVFIN,gelingen,VERB,gelingen,gelangen
1436,Der Münsteraner Theologe Tiemo Rainer Peters p...,prangert,VVFIN,prangern,VERB,prangern,prangeren
1437,Einen Gleichklang zwischen verstärkten Tendenz...,prangert,VVFIN,prangern,VERB,prangern,prangeren


In [121]:
sentence='Es ist wichtig, dich zu bewerten'
#sentence='Zum neuen Stil gehört daß die Junge Union nicht mehr allein im Saft eigener Vorstellungen schmoren will sondern gezielt andere Parteien zu Wort kommen läßt'

doc =spacy_model(sentence)

In [122]:
for token in doc:
    if token.pos_=='VERB' and token.text=='bewerten':
        lemma = lemmatizer(spacy_token=token)
        print(token, token.dep_, token.head.lemma_, lemma)
        break

bewerten re es bewerten


## Test HDT

In [282]:
hdt_dataset = 'data/third-party/UD_German-HDT/de_hdt-ud-train-a-1.conllu'

In [283]:
def read_hdt(path=None):
    """Read (word, lemma, pos) triples per sentence from a CoNLL-U file.

    Parameters
    ----------
    path : str, optional
        CoNLL-U file (UTF-8). Defaults to the notebook-level `hdt_dataset`.

    Returns
    -------
    list of list of tuple
        One list per sentence id with the (form, lemma, upos) tuples of its
        tokens. Token lines whose first three columns after the index are not
        made of word characters only (e.g. punctuation) are skipped, and
        sentences without any remaining token are dropped.
    """
    if path is None:
        path = hdt_dataset

    token_re = re.compile(r'[0-9]+\t([\w]+)\t([\w]+)\t([\w]+)')

    sentences = []
    words = []
    prev_sent_id = None

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            header = re.search(r'sent_id = (.+)', line) if 'sent_id' in line else None
            if header:
                sent_id = header.group(1)
                if sent_id != prev_sent_id:
                    # a new sentence starts: store the tokens collected so far
                    if words:
                        sentences.append(words)
                    words = []
                    prev_sent_id = sent_id
            else:
                token = token_re.match(line)
                if token:
                    words.append(token.groups())

    # the last sentence is not followed by another sent_id line, so it has to
    # be stored here (it used to be dropped)
    if words:
        sentences.append(words)

    return sentences

sentences_hdt = read_hdt()

In [302]:
# Lemmatize the verbs of the first 2000 HDT sentences and collect, per verb,
# the gold annotation next to the spaCy and GLemma predictions.
hdt_res = []

for idx, sentence in enumerate(sentences_hdt):
    words, lemmas, pos = zip(*sentence)
    text = ' '.join(words)
    doc = spacy_model(text)

    # only sentences that spaCy tokenizes into the same number of tokens can be aligned
    if len(doc) == len(lemmas):
        for token, hdt_word, hdt_lemma, hdt_pos in zip(doc, words, lemmas, pos):
            if hdt_word != token.text or token.pos_ != 'VERB':
                continue
            lemma = lemmatizer(spacy_token=token)
            hdt_res.append((text, hdt_word, hdt_pos, hdt_lemma, token.pos_, token.lemma_, lemma))

    # stop after 2000 sentences
    if (idx + 1) % 2000 == 0:
        print(idx)
        break

1999


In [303]:
hdt_res = pd.DataFrame(hdt_res, columns = ['sentence','word','hdt_pos','hdt_lemma','spacy_pos','spacy_lemma','pred_lemma'])

In [304]:
((hdt_res.hdt_pos.str.startswith('V')) & (hdt_res.spacy_pos=='VERB')).mean()

0.9800214056368177

In [308]:
(hdt_res.hdt_lemma==hdt_res.pred_lemma).mean()

0.9921512665001784

In [306]:
(hdt_res.hdt_lemma==hdt_res.spacy_lemma).mean()

0.9700321084552266

In [309]:
fails_df = hdt_res[hdt_res.hdt_lemma!=hdt_res.pred_lemma]#.drop_duplicates(subset=['word','tiger_lemma'])
fails_df[fails_df.hdt_pos=='VERB']

Unnamed: 0,sentence,word,hdt_pos,hdt_lemma,spacy_pos,spacy_lemma,pred_lemma
583,Und das Angebot der Telekom dafür Leitungen vo...,führe,VERB,fahren,VERB,führen,führen
892,HP will nach gutem Geschäftsergebnis Aktien sp...,splitten,VERB,splitten,VERB,splitten,spleiden
1110,Auch bei dem Streit um die Rechtmäßigkeit der ...,gelang,VERB,gelingen,VERB,gelingen,gelangen
1236,Die Aktien sollen in dem Verhältnis eins zu zw...,gesplittet,VERB,splitten,VERB,splitten,gespleiden
1241,Bereits in der Vergangeheit wurden die Aktien ...,gesplittet,VERB,splitten,VERB,splitten,gespleiden
1787,Über verschiedene auf diesem Portal soll das G...,hereinkmmen,VERB,unknown,VERB,hereinkmmen,
2052,Der Bereich EADS Telecommunications unter sein...,gehört,VERB,gehören,VERB,gehören,hören
2465,Einem Bericht der Saarbrücker Zeitung zufolge ...,zusamenarbeiten,VERB,unknown,VERB,zusamenarbein,zusamenarbeiten
2628,Durchschnittlich fast fünfeinhalb Millionen in...,gebracht,VERB,bringen,VERB,bringen,gebrechen
2770,Durch die neuen Netzwerke entwickele sich die ...,entwickele,VERB,entwickeln,VERB,entwickeln,


In [270]:
lemmatizer.verbs['entwickele']

KeyError: 'entwickele'

In [253]:
fails_df.loc[2052].sentence

'Der Bereich EADS Telecommunications unter seiner Leitung wird bei der EADS Defence Security Networks angesiedelt zu der auch die AEG Mobile Communications Ulm gehört'

In [254]:
sentence='Es ist wichtig, dich zu bewerten'
sentence='Der Bereich EADS Telecommunications unter seiner Leitung wird bei der EADS Defence Security Networks angesiedelt zu der auch die AEG Mobile Communications Ulm gehört'

doc =spacy_model(sentence)

In [255]:
for token in doc:
    if token.pos_=='VERB' and token.text=='gehört':
        lemma = lemmatizer(spacy_token=token)
        print(token, token.dep_, token.head.lemma_, lemma)
        break

gehört cj werden hören
