In [1]:
import sys
import re
from collections import defaultdict,namedtuple
import json
import os
import pickle

import pandas as pd
import numpy as np

# Raise the pandas display limits (number of rows shown, maximum column width)
pd.set_option('display.max_rows', 500)
pd.set_option('max_colwidth', 400)

# NOTE(review): absolute, machine-specific location of the wiktionary-de-parser checkout - adjust when running elsewhere
parser_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/bio/c2/resources/tools/wiktionary-de-parser'
# make the parser package importable for the imports that follow
sys.path.append(parser_dir)

from wiktionary_de_parser.dump_processor import WiktionaryDump
from wiktionary_de_parser import WiktionaryParser
from pprint import pprint

In [2]:
def is_german(wiki_record):
    '''
    Tell whether a parsed Wiktionary entry belongs to the German ('Deutsch') language section.
    '''
    entry_language = wiki_record.language.lang
    return entry_language == 'Deutsch'

def get_flexion_field(wiki_record, field_name):
    '''
    Fetch one field of the record's flexion table as a cleaned word form.

    Returns None when field_name is not a key of wiki_record.flexion; otherwise the
    value with surrounding whitespace stripped and any leading remark removed.
    '''
    flexion = wiki_record.flexion
    if field_name not in flexion:
        return None
    raw_value = flexion[field_name].strip()
    #drop everything up to the end of a remark like 'selten:'/'militarisch:'
    return re.sub(r".*:'' ", r"", raw_value)

In [3]:
# Handle on the bz2-compressed German Wiktionary dump; the extraction functions below iterate over it via dump.pages()
dump = WiktionaryDump(
    dump_file_path=parser_dir + "/wiktionary_german/dewiktionary-latest-pages-articles-multistream.xml.bz2"
)

# Generate noun lookup table

In [73]:
def is_noun(wiki_record):
    '''
    Check that the entry is tagged 'Substantiv' and comes with a flexion table.

    Behaves like a chained `and`: a falsy `pos` is handed back unchanged,
    in every other case the result is a bool.
    '''
    pos = wiki_record.pos
    if not pos:
        return pos
    if 'Substantiv' not in pos:
        return False
    #we can't do much without any flexion information
    return wiki_record.flexion is not None

In [74]:
def get_noun_wordforms_adjective_declination(lemma):
    '''
    Build every adjective-style inflection of a noun that is declined like an adjective
    e.g. Beamte(r), Unbekannte(r)

    A trailing 'e' or 'er' is cut off the lemma and the strong, weak and mixed endings
    of all three genera are appended to the remaining stem.

    Returns a defaultdict mapping each generated word form to a set of
    (lemma, declination, genus) named tuples.
    '''

    #the ending tuples below follow exactly this order
    case_numbers = ('Nominativ Singular', 'Genitiv Singular', 'Dativ Singular', 'Akkusativ Singular',
                    'Nominativ Plural', 'Genitiv Plural', 'Dativ Plural', 'Akkusativ Plural')

    strong_endings = {'m': ('er', 'en', 'em', 'en', 'e', 'er', 'en', 'e'),
                      'f': ('e', 'er', 'er', 'e', 'e', 'er', 'en', 'e'),
                      'n': ('es', 'en', 'em', 'es', 'e', 'er', 'en', 'e')}

    weak_endings = {'m': ('e', 'en', 'en', 'en', 'en', 'en', 'en', 'en'),
                    'f': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en'),
                    'n': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en')}

    mixed_endings = {'m': ('er', 'en', 'en', 'en', 'en', 'en', 'en', 'en'),
                     'f': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en'),
                     'n': ('es', 'en', 'en', 'es', 'en', 'en', 'en', 'en')}

    Lemma = namedtuple('lemma', 'lemma declination genus')

    #the stem does not depend on the ending, so it is computed once
    stem = re.sub(r'er?$', '', lemma)

    wordforms = defaultdict(set)

    for paradigm in (strong_endings, weak_endings, mixed_endings):
        for genus in ('m', 'f', 'n'):
            for category, ending in zip(case_numbers, paradigm[genus]):
                wordforms[stem + ending].add(Lemma(lemma, category, genus))

    return wordforms

In [75]:
def get_noun_forms(wiki_record):
    '''
    Collect the inflected forms of one noun entry and map each of them to its lemma.

    Parameters
    ----------
    wiki_record : parsed Wiktionary entry; reads .lemma.lemma, .pos['Substantiv'] and .flexion

    Returns
    -------
    Mapping lowercase word form -> set of (lemma, declination, genus) named tuples.
    A plain empty dict is returned when the lemma consists of several words or when
    one of its word forms contains characters other than word characters, spaces and hyphens.
    '''

    base_categories = ['Nominativ Singular', 'Nominativ Plural', 'Genitiv Singular', 'Genitiv Plural', 'Dativ Singular', 
                         'Dativ Plural', 'Akkusativ Singular', 'Akkusativ Plural']
    
    #remove these rare nouns in order not to confuse with some forms of more common nouns
    #(lemma -> regex; flexion categories matching the regex are skipped for that lemma)
    stoplist = {'gedanken':'.*', 'real':'.*', 'studium':'Plural', 'fleck':'Plural 2', 'post':'Plural', 'gemein':'.*', 'willen':'.*',
                'namen':'.*', 'arme':'.*', 'schade':'Plural', 'zeug':'Plural', 'omme':'.*', 'praxis':'Plural 2','schranken':'.*','fliegen':'.*'}

    Lemma = namedtuple('lemma', 'lemma declination genus')

    lemmas = defaultdict(set)

    lemma = wiki_record.lemma.lemma.lower() #we will use lowercase lemmas in the dictionary

    if ' ' in lemma:
        #we exclude fixed expressions consisting of multiple words (often entities, e.g. 'Vereinigte Arabische Emirate')
        return {}

    if 'adjektivische Deklination' in wiki_record.pos['Substantiv']:
        #decline as an adjective
        if lemma.endswith('r'):
            #don't use feminine (use masculine) lemma for plural forms, in accordance with other lemmatizers
            lemmas = get_noun_wordforms_adjective_declination(lemma)
        
    genus_categories = [x for x in wiki_record.flexion.keys() if x.startswith('Genus')] # 'Genus', 'Genus 1', 'Genus 2' in case the noun admits multiple genera
            
    if len(genus_categories)==0 and 'Nominativ Plural' in wiki_record.flexion:
        #the noun admits only plural form
        genus_categories = ['only_plural']

    #the stoplist pattern depends on the lemma only, so look it up once
    stop_pattern = stoplist.get(lemma)

    for genus_category in genus_categories:

        if genus_category=='only_plural':
            genus = 'only_plural'
        else:
            genus = wiki_record.flexion[genus_category]
                        
        for base_category in base_categories:

            if len(genus_categories)==1:
                #if there is only a single genus, extra forms ending with digits or stars are possible
                extended_categories = [base_category+genus_suffix+'*'*n for genus_suffix in ('',' 1',' 2',' 3',' 4') for n in range(3)] # additional forms with an asterisk
            else:
                #if there are multiple genera, extra forms endings should match the genus index or be empty
                genus_suffix = genus_category.replace('Genus','') #empty or ' 1', ' 2', etc...
                extended_categories = [base_category+genus_suffix_+'*'*n for genus_suffix_ in ('',genus_suffix) for n in range(3)] # additional forms with an asterisk

            for extended_category in extended_categories:

                if stop_pattern is not None and re.search(stop_pattern,extended_category):
                    continue
                        
                if extended_category in wiki_record.flexion:
                    
                    wordform = get_flexion_field(wiki_record,extended_category).lower()
        
                    #raw string: '\w' inside a normal string literal is an invalid escape sequence
                    if not re.match(r'^[\w -]+$',wordform):
                        print(f'Unrecognized characters in wordform {wordform} for {lemma}')
                        #the whole entry is dropped, including the forms collected so far
                        return {}
          
                    lemmas[wordform].add(Lemma(lemma,base_category,genus))#assign lemma to the wordform


    return lemmas

In [77]:
def get_nouns(dump):
    '''
    Walk through the dump and merge the word forms of every German noun entry.

    Returns a dict: word form -> list of dicts with the fields of the
    lemma records produced by get_noun_forms.
    '''

    forms_by_word = defaultdict(set)
    parser = WiktionaryParser()
    processed = 0

    for page in dump.pages():
        if page.redirect_to:
            continue

        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if not (is_german(wiki_record) and is_noun(wiki_record)):
                continue

            for wordform, lemma_records in get_noun_forms(wiki_record).items():
                forms_by_word[wordform] = forms_by_word[wordform].union(lemma_records)

            processed += 1
            if processed % 1000 == 0:
                print(f'{processed} records processed')

    #defaultdict with sets of named tuples to dict with lists of dicts
    return {
        wordform: [record._asdict() for record in lemma_records]
        for wordform, lemma_records in forms_by_word.items()
    }

In [None]:
# Build the noun lookup table from the dump
nouns = get_nouns(dump)

In [None]:
#for wordform, lemmas in nouns.items():
#    lemmas_NP = []
#    for lemma in lemmas:
#        if lemma['declination'] == 'Nominativ Plural':
#            lemmas_NP.append((lemma['lemma'],lemma['genus']))
#    if lemmas_NP:
#        lemmas_, genera_ = zip(*lemmas_NP)
#        if 'only_plural' in genera_ and len(set(lemmas_))>1:
#            nouns[wordform] = [lemma for lemma in nouns[wordform] if lemma['genus']=='only_plural' or (lemma['lemma'],lemma['genus']) not in lemmas_NP]
#            print(wordform,lemmas_NP)

In [73]:
#with open('glemma/datanouns.json', 'wt', encoding='UTF-8') as json_file:
#    json.dump(nouns, json_file, ensure_ascii=False)

# Generate verb lookup table

In [None]:
def is_verb(wiki_record):
    '''
    Check that the entry is a verb without extra tags (or tagged only 'Hilfsverb')
    and that it comes with a flexion table.

    Behaves like a chained `and`: a falsy `pos` is handed back unchanged,
    in every other case the result is a bool.
    '''
    pos = wiki_record.pos
    if not pos:
        return pos
    if 'Verb' not in pos:
        return False
    if pos['Verb'] not in ([], ['Hilfsverb']):
        return False
    #we can't do much without any flexion information
    return wiki_record.flexion is not None

In [7]:
def stem_verb(verb):
    '''
    Strip one trailing conjugation ending (-en/-n, -e, -et/-t, -est, -st) from a verb form.
    '''
    ending = re.compile(r'e?n$|e$|e?t$|est$|([^s])st$')
    #remove ending, leave s when it's part of the root, e.g. lassen->lass
    #(the captured group puts back the character in front of a removed 'st')
    return ending.sub(r'\1', verb)

def generate_verbforms(verb, lemma, category):
    '''
    Expand one verb form into the related forms that are usually absent in the Wiktionary flexion entry.

    verb     -- the word form found in the flexion table
    lemma    -- the infinitive the form belongs to
    category -- name of the flexion field the form was taken from

    Returns a list that starts with `verb` followed by the generated forms
    (duplicates are possible). Only categories starting with 'Präsens_ich',
    'Präteritum_ich' or 'Konjunktiv II_ich' generate extra forms.
    An empty list is returned for the Konjunktiv II of 'fahren'.
    '''
    forms = [verb]

    if category.startswith('Präsens_ich'):

        if lemma == 'sein':
            #Konjunktiv I, Indikativ 3. Person Plural
            forms.extend(('sei', 'seiest', 'seist', 'seiet', 'seien', 'sind'))

        elif lemma in ('können', 'sollen', 'müssen', 'dürfen', 'wollen', 'mögen'):
            without_n = lemma[:-1]
            forms.extend((without_n, without_n + 'st', without_n + 't', lemma[:-2] + 't'))

        elif lemma.endswith(('ern', 'eln')):
            #wandern, sammeln
            if verb.endswith(('ere', 'ele')):
                #Indikativ 2. Person Plural, Konjunktiv I 2. Person Singular, Konjunktiv I 2. Person Plural
                shortened = verb[:-1]
                forms.extend((shortened + 'st', shortened + 't'))
            elif verb.endswith('le'):
                #Konjunktiv I 2. Person Singular, Konjunktiv I 2. Person Plural, alternative forms
                forms.extend((verb + 'st', verb + 't'))

        else:
            #Konjunktiv I 2. Person Singular, Konjunktiv I 2. Person Plural
            forms.extend((verb + 'st', verb + 't'))

            #drop the final e, except when it is preceded by t or d, or by m/n following one of the listed consonants
            verb = re.sub(r'([^td])e$|([wtzpsdfghjkxvb][mn]e)$', r'\1\2', verb)

            #Indikativ 2. Person Plural
            forms.extend((verb + 'st', verb + 't'))

        if lemma.endswith('auern'):
            #bedauern, kauern: bedauere-->bedaure
            forms.append(re.sub(r'auere$', r'aure', verb))

        if lemma == 'werden':
            forms.append('worden')

    elif category.startswith('Präteritum_ich'):

        final = verb[-1]

        if final == 'e':
            #regular + mixed verbs
            #machen, denken
            endings = ('st', 't', 'n')
        elif final in ('s', 'ß', 'z'):
            #lassen,schmelzen,blasen
            endings = ('est', 't', 'en')
        elif re.search(r'([td]$)|([wtzpsdfghjkxvb][mn]$)', verb):
            #halten, finden
            endings = ('est', 'st', 'et', 'en')
        else:
            #irregular verbs
            #sprechen
            endings = ('st', 't', 'en')

        forms.extend(verb + ending for ending in endings)

    elif category.startswith('Konjunktiv II_ich'):

        if lemma == 'fahren':
            #don't add Konjunktiv II for 'fahren' to avoid confusion with Indikativ for 'führen'
            return []

        #Konjunktiv II 2. Person Singular, Konjunktiv II 2.,3. Person Plural
        forms.extend(verb + ending for ending in ('st', 't', 'n'))

    return forms
    
def get_verb_forms(wiki_record):
    '''
    Collect the conjugated forms of one verb entry and map each of them to its lemma.

    Parameters
    ----------
    wiki_record : parsed Wiktionary entry; reads .lemma.lemma, .flexion and .name

    Returns
    -------
    (verblemmas, prefix)
        verblemmas maps word form -> set of (lemma, connection, via) named tuples, where
        connection is None, 'Partizip II' or 'zu-inf' and via is a tuple of the auxiliary
        verbs found in the flexion table (filled for the Partizip II only).
        prefix is the detached prefix of the last separable form seen, or None.
        ({}, None) is returned for multi-word lemmas, for forms with unexpected characters
        and for forms split into more than two words.
    '''

    base_categories = ['Präsens_ich', 'Präsens_du', 'Präsens_er, sie, es', 'Präteritum_ich', 'Konjunktiv II_ich', 
                         'Imperativ Singular', 'Imperativ Plural']
    
    #category with asterisk for alternative forms, e.g. ich anerkenne, ich erkenne an
    categories = [x+'*'*n for x in base_categories for n in range(4)] + ['Imperativ Singular 2']

    Lemma = namedtuple('lemma', 'lemma connection via')

    verblemmas = defaultdict(set)

    lemma = wiki_record.lemma.lemma #word lemma
    
    if ' ' in lemma:
        #we ignore the cases where the prefix is not attached to the verb in subordinate clauses
        #e.g. frei geben, bekannt machen
        #dependency parses like SpaCy can't recognize that these are both parts of the same verb anyway
        return {}, None

    verblemmas[lemma].add(Lemma(lemma,None,()))

    is_separable, prefix = False, None

    for category in categories:
        
        if category not in wiki_record.flexion:
            continue
            
        verb = get_flexion_field(wiki_record,category)

        #raw string: '\w' inside a normal string literal is an invalid escape sequence
        if not re.match(r'^[\w ]+$',verb):
            print(f'Unrecognized characters in wordform {verb} for {lemma}')
            return {}, None
        
        if ' ' in verb: #separable verb
            
            verb_split = verb.split()
            
            if len(verb_split)!=2:
                #we don't treat cases with more than 1 prefix, e.g. wiederherstellen
                print(f'Verb morphology not identified for {wiki_record.name}')
                return {}, None

            is_separable = True

            verb, prefix = verb_split
            verb = prefix + verb #attach the prefix to the word without any space in-between, as they are used in subordinate clauses

        wordforms = generate_verbforms(verb,lemma,category) #get all possible wordforms from this word in this category

        for wordform in wordforms:
            verblemmas[wordform].add(Lemma(lemma,None,())) #assign lemma to each wordform

    if 'Partizip II' in wiki_record.flexion:
        partizip_II = get_flexion_field(wiki_record,'Partizip II')
        hilfs_verbs = []
        for hilfsverb_cat in ('Hilfsverb','Hilfsverb2','Hilfsverb*'):
            hilfsverb = get_flexion_field(wiki_record, hilfsverb_cat)
            if hilfsverb:
                hilfs_verbs.append(hilfsverb)
        verblemmas[partizip_II].add(Lemma(lemma,'Partizip II',tuple(hilfs_verbs)))
        
    if is_separable:
        #add the zu-infinitive form used in subordinate clauses 
        #cut the prefix off with plain string operations instead of interpolating it into a regex
        remainder = lemma[len(prefix):] if lemma.startswith(prefix) else lemma
        verblemmas[prefix+'zu'+remainder].add(Lemma(lemma,'zu-inf',()))

    return verblemmas, prefix

In [10]:
def get_verbs(dump):
    '''
    Walk through the dump and merge the word forms of every German verb entry.

    Returns a dict: word form -> list of dicts with the fields of the
    lemma records produced by get_verb_forms. The separable prefix that
    get_verb_forms also returns is not used here.
    '''

    forms_by_word = defaultdict(set)
    parser = WiktionaryParser()
    processed = 0

    for page in dump.pages():
        if page.redirect_to:
            continue

        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if not (is_german(wiki_record) and is_verb(wiki_record)):
                continue

            wordlemmas, _sep_prefix = get_verb_forms(wiki_record)
            for wordform, lemma_records in wordlemmas.items():
                forms_by_word[wordform] = forms_by_word[wordform].union(lemma_records)

            processed += 1
            if processed % 1000 == 0:
                print(f'{processed} records processed')

    #defaultdict with sets of named tuples to dict with lists of dicts
    return {
        wordform: [record._asdict() for record in lemma_records]
        for wordform, lemma_records in forms_by_word.items()
    }

In [None]:
# Build the verb lookup table from the dump
verbs = get_verbs(dump)

In [13]:
#with open('glemma/dataverbs.json', 'wt', encoding='UTF-8') as json_file:
#    json.dump(verbs, json_file, ensure_ascii=False)

In [11]:
def is_prefix(wiki_record):
    '''
    Tell whether the entry's part of speech is exactly an affix tagged 'Präfix'.
    '''
    prefix_pos = {'Affix': ['Präfix']}
    return wiki_record.pos == prefix_pos

def get_prefixes(dump):
    '''
    Collect the German prefixes listed in the dump.

    Returns a list of prefix strings with the hyphens removed; prefixes for
    which str.istitle() is true (starting with a capital) are left out.
    '''

    prefixes = set()

    #create the parser locally, like the other extraction functions do, instead of
    #relying on a notebook-level `parser` variable that may not exist yet
    parser = WiktionaryParser()
        
    for page in dump.pages():
        if page.redirect_to:
            continue
                
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if is_german(wiki_record) and is_prefix(wiki_record):
                lemma = wiki_record.lemma.lemma
                lemma = lemma.replace('-','') #remove final - (note: replace drops any other hyphen as well)
                if not lemma.istitle():
                    #ignore a few prefixes starting with a capital
                    prefixes.add(lemma)

    return list(prefixes)

# Generate adjective lookup table

In [24]:
def is_adjective(wiki_record):
    '''
    Check that the entry is an adjective without extra tags and comes with a flexion table.

    Behaves like a chained `and`: a falsy `pos` is handed back unchanged,
    in every other case the result is a bool.
    '''
    pos = wiki_record.pos
    if not pos:
        return pos
    if 'Adjektiv' not in pos:
        return False
    if pos['Adjektiv'] != []:
        return False
    #we can't do much without any flexion information
    return wiki_record.flexion is not None

In [29]:
def get_adj_declination(lemma):
    '''
    Generate the declined forms of an adjective for the strong, weak and mixed
    paradigms of all three genera.

    Returns a list of (wordform, declination, genus) tuples. A final 'e' of the
    lemma is dropped before the ending is attached. For lemmas ending in
    'el', 'en' or 'er' an additional form without that 'e' is emitted right
    after each regular one.
    '''

    #the ending tuples below follow exactly this order
    case_numbers = ('Nominativ Singular', 'Genitiv Singular', 'Dativ Singular', 'Akkusativ Singular',
                    'Nominativ Plural', 'Genitiv Plural', 'Dativ Plural', 'Akkusativ Plural')

    strong_endings = {'m': ('er', 'en', 'em', 'en', 'e', 'er', 'en', 'e'),
                      'f': ('e', 'er', 'er', 'e', 'e', 'er', 'en', 'e'),
                      'n': ('es', 'en', 'em', 'es', 'e', 'er', 'en', 'e')}

    weak_endings = {'m': ('e', 'en', 'en', 'en', 'en', 'en', 'en', 'en'),
                    'f': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en'),
                    'n': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en')}

    mixed_endings = {'m': ('er', 'en', 'en', 'en', 'en', 'en', 'en', 'en'),
                     'f': ('e', 'en', 'en', 'e', 'en', 'en', 'en', 'en'),
                     'n': ('es', 'en', 'en', 'es', 'en', 'en', 'en', 'en')}

    #both stems are independent of the ending, so they are prepared once
    regular_stem = lemma[:-1] if lemma[-1] == 'e' else lemma

    contracted_stem = None
    if lemma[-2:] in ('el', 'en', 'er'):
        #dunkel,düster
        contracted_stem = lemma[:-2] + lemma[-1]

    wordforms = []

    for paradigm in (strong_endings, weak_endings, mixed_endings):
        for genus in ('m', 'f', 'n'):
            for declination, ending in zip(case_numbers, paradigm[genus]):
                wordforms.append((regular_stem + ending, declination, genus))
                if contracted_stem is not None:
                    wordforms.append((contracted_stem + ending, declination, genus))

    return wordforms

In [30]:
def get_adjective_forms(wiki_record):
    '''
    Collect the positive, comparative and superlative forms of one adjective entry
    together with their declined forms, and map each of them to the lemma.

    Parameters
    ----------
    wiki_record : parsed Wiktionary entry; reads .lemma.lemma and .flexion

    Returns
    -------
    Mapping lowercase word form -> set of (lemma, form, declination, genus) named tuples;
    declination and genus are None for the undeclined form.
    A plain empty dict is returned when the lemma consists of several words or when
    one of its forms contains characters other than word characters, spaces and hyphens.
    '''

    categories = ['Positiv', 'Komparativ', 'Superlativ', 'Positiv*', 'Komparativ*', 'Superlativ*',
                 'Positiv**', 'Komparativ**', 'Superlativ**']
    
    Lemma = namedtuple('lemma', 'lemma form declination genus')

    lemmas = defaultdict(set)

    lemma = wiki_record.lemma.lemma.lower() #we will use lowercase lemmas in the dictionary

    if ' ' in lemma:
        #we exclude fixed expressions consisting of multiple words (often entities, e.g. 'Vereinigte Arabische Emirate')
        return {}
        
    for category in categories:
     
        if category not in wiki_record.flexion:
            continue
                    
        wordform = get_flexion_field(wiki_record,category).lower()
        
        #raw string: '\w' inside a normal string literal is an invalid escape sequence
        if not re.match(r'^[\w -]+$',wordform):
            print(f'Unrecognized characters in wordform {wordform} for {lemma}')
            #the whole entry is dropped, including the forms collected so far
            return {}

        adj_form = category.replace('*','') #alternative forms count as the same degree

        if adj_form=='Superlativ':
            #strip a final 'en' so that the regular endings can be attached below
            wordform = re.sub('en$','',wordform)

        lemmas[wordform].add(Lemma(lemma,adj_form,None,None))

        #separate name for the declined forms, so the base form is not shadowed
        for declined_form, declination, genus in get_adj_declination(wordform):
            lemmas[declined_form].add(Lemma(lemma,adj_form,declination,genus))#assign lemma to the wordform


    return lemmas

In [31]:
def get_adjectives(dump):
    '''
    Walk through the dump and merge the word forms of every German adjective entry.

    Returns a dict: word form -> list of dicts with the fields of the
    lemma records produced by get_adjective_forms.
    '''

    forms_by_word = defaultdict(set)
    parser = WiktionaryParser()
    processed = 0

    for page in dump.pages():
        if page.redirect_to:
            continue

        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if not (is_german(wiki_record) and is_adjective(wiki_record)):
                continue

            for wordform, lemma_records in get_adjective_forms(wiki_record).items():
                forms_by_word[wordform] = forms_by_word[wordform].union(lemma_records)

            processed += 1
            if processed % 1000 == 0:
                print(f'{processed} records processed')

    #defaultdict with sets of named tuples to dict with lists of dicts
    return {
        wordform: [record._asdict() for record in lemma_records]
        for wordform, lemma_records in forms_by_word.items()
    }

In [32]:
# Build the adjective lookup table from the dump; entries whose forms contain unexpected characters are reported and skipped
adjectives = get_adjectives(dump)

1000 records processed
Unrecognized characters in wordform d’hondtsch for d’hondtsch
2000 records processed
3000 records processed
Unrecognized characters in wordform humorigg.s for humorig
4000 records processed
5000 records processed
6000 records processed
Unrecognized characters in wordform eingebildetsten<!-- for eingebildet
7000 records processed
Unrecognized characters in wordform ([[wenig]]) for minder
8000 records processed
9000 records processed
10000 records processed
Unrecognized characters in wordform äußer(er) for äußer-
Unrecognized characters in wordform inner(er) for inner-
11000 records processed
Unrecognized characters in wordform mendel’sch for mendel’sch
12000 records processed
13000 records processed
Unrecognized characters in wordform d’hondt’sch for d’hondt’sch
14000 records processed
Unrecognized characters in wordform 08/15 for 08/15


In [33]:
#Lemma = namedtuple('lemma', 'lemma form declination genus')
#
#adj_from_verbs = defaultdict(set)
#
#for wordform, lemmas in verbs.items():
#    for lemma in lemmas:
#        if lemma['connection'] == 'Partizip II':
#            partizip_II = wordform
#            for declined_wordform, declination, genus in get_adj_declination(partizip_II):
#                adj_from_verbs[declined_wordform].add(Lemma(partizip_II,'Positiv',declination,genus))
#            partizip_I = lemma['lemma'] + 'd'
#            for declined_wordform, declination, genus in get_adj_declination(partizip_I):
#                adj_from_verbs[declined_wordform].add(Lemma(partizip_I,'Positiv',declination,genus))

In [80]:
# Collect the word forms that are merely comparatives/superlatives of another adjective.
# For every undeclined Komparativ/Superlativ record the form itself is collected; when
# neither of the two stripped candidates equals the record's lemma, the candidates are
# collected as well.
adj_derived_forms = set()

for wordform, lemmas in adjectives.items():
    for lemma in lemmas:
        if lemma['form'] in ('Komparativ','Superlativ') and lemma['declination'] is None:
            lemma_from_derivative1 = re.sub('er$|est$','',wordform)
            lemma_from_derivative2 = re.sub('r$|st$','',wordform)
            if lemma_from_derivative1!=lemma['lemma'] and lemma_from_derivative2!=lemma['lemma']:
                adj_derived_forms.update((lemma_from_derivative1, lemma_from_derivative2))
            adj_derived_forms.add(wordform)

In [82]:
# Keep only the lemma records whose lemma is not itself a derived (comparative/superlative) form
adjectives = {
    wordform: [record for record in records if record['lemma'] not in adj_derived_forms]
    for wordform, records in adjectives.items()
}

In [83]:
# Persist the adjective table as UTF-8 JSON; ensure_ascii=False writes non-ASCII characters as they are
with open('glemma/dataadjectives.json', 'wt', encoding='UTF-8') as json_file:
    json.dump(adjectives, json_file, ensure_ascii=False)

# Generate adverb lookup table

In [60]:
def is_adverb(wiki_record):
    '''
    Check that the entry is tagged 'Adverb'.

    A falsy `pos` is handed back unchanged, otherwise the result is a bool.
    '''
    pos = wiki_record.pos
    if not pos:
        return pos
    return 'Adverb' in pos

In [174]:
def get_adverbs(dump):
    '''
    Walk through the dump and collect the lemmas of all German adverb entries.

    Returns a set of lowercase lemmas. Lemmas containing characters other than
    word characters, spaces and hyphens are reported and skipped.
    '''

    adverbs = set()
    
    n_records = 0

    parser = WiktionaryParser()

    for page in dump.pages():
        if page.redirect_to:
            continue
                
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if is_german(wiki_record) and is_adverb(wiki_record):
                lemma = wiki_record.lemma.lemma.lower() #we will use lowercase lemmas in the dictionary
                #raw string: '\w' inside a normal string literal is an invalid escape sequence
                if not re.match(r'^[\w -]+$',lemma):
                    print(f'Unrecognized characters in {lemma}')
                    continue
                adverbs.add(lemma)
                n_records += 1
                if n_records%1000==0:
                    print(f'{n_records} records processed')

    return adverbs

In [175]:
# Collect the set of adverb lemmas from the dump
adverbs = get_adverbs(dump)

Unrecognized characters in d’accord
1000 records processed
Unrecognized characters in ’naus


In [176]:
# Reshape into a word form -> list of lemma records mapping; every adverb maps to itself
adverbs = {adv:[{'lemma':adv}] for adv in adverbs}

In [177]:
# Persist the adverb table as UTF-8 JSON; ensure_ascii=False writes non-ASCII characters as they are
with open('glemma/dataadverbs.json', 'wt', encoding='UTF-8') as json_file:
    json.dump(adverbs, json_file, ensure_ascii=False)

# Combine all together

In [84]:
# Merge the per-part-of-speech lookup tables into a single vocabulary file.
# NOTE(review): the paths join 'glemma/data' and the file name without a separator
# (e.g. 'glemma/datanouns.json'), which matches the writer cells of this notebook - confirm this is intended.
pos_dict_files = {'N': 'nouns.json', 'V': 'verbs.json', 'ADJ': 'adjectives.json', 'ADV': 'adverbs.json'}

vocab = {}

# Load every table first: a missing or broken input then fails before the
# output file has been opened for writing (and thereby truncated)
for pos, pos_dict_file in pos_dict_files.items():

    with open(f'glemma/data{pos_dict_file}', 'rt', encoding='UTF-8') as pos_json:
        vocab[pos] = json.load(pos_json)

with open('glemma/datavocab.json', 'wt', encoding='UTF-8') as vocab_json:
    json.dump(vocab, vocab_json, ensure_ascii=False)

## Test area

In [45]:
# Scratch cell: scan the whole dump and print the parsed German entries of the selected pages.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, and
# noun_flexion_cat stays empty because the append below is commented out.
parser = WiktionaryParser()
    
noun_flexion_cat = []

n_records = 0

for page in dump.pages():
    if page.redirect_to:
        continue
        
    #page_names.append(page.name)
                
    # only the pages named in this tuple are inspected
    if page.name in ("spätest",):
        for entry in parser.entries_from_page(page):
            wiki_record = parser.parse_entry(entry)
            if is_german(wiki_record):# and is_adjective(wiki_record):
                print(wiki_record)
                #adj_forms = get_adjective_forms(wiki_record)
                #keys = tuple([x for x in wiki_record.flexion.keys() if x[0].isupper()])
                #noun_flexion_cat.append((page.name,keys))
                n_records += 1
                
            #pprint(wiki_record)
            #break
        #break

KeyboardInterrupt: 

In [19]:
# Split the collected pairs into two parallel sequences
# (presumably (page name, flexion keys) pairs - see the commented-out append in the scan cell).
# zip(*[]) yields nothing to unpack, so fall back to empty tuples when nothing was collected
words, cats = zip(*noun_flexion_cat) if noun_flexion_cat else ((), ())

words = np.array(words)
# flatten the per-page key tuples into one Series
cats_flatten = pd.Series([y for x in cats for y in x])

In [21]:
# Distinct flexion keys among the collected entries
cats_flatten.unique()

array(['Positiv', 'Komparativ', 'Superlativ', 'Komparativ*',
       'Superlativ*', 'Komparativ**', 'Superlativ**'], dtype=object)