In [1]:
# !pip install -r requirements.txt
# !pip install git+https://github.com/repodiac/german_compound_splitter
#
# get dictionary from https://sourceforge.net/projects/germandict/files/latest/download and store it in data/german.dic

# Lemmatization and Linguistic Transformations

In [18]:
# English sample passage used by the stemming/lemmatization cells.
# Project Gutenberg, 244: A Study in Scarlet (en), Arthur Conan Doyle
# .strip() removes the whitespace at both ends of the triple-quoted literal
# (the leading newline plus the first line's indent, and the trailing
# newline); the indentation of all later lines stays part of the string.
text_en = """
    This was a lofty chamber, lined and littered with countless bottles.
    Broad, low tables were scattered about, which bristled with retorts,
    test-tubes, and little Bunsen lamps, with their blue flickering flames.
    There was only one student in the room, who was bending over a distant
    table absorbed in his work. At the sound of our steps he glanced round
    and sprang to his feet with a cry of pleasure. “I’ve found it! I’ve
    found it,” he shouted to my companion, running towards us with a
    test-tube in his hand. “I have found a re-agent which is precipitated
    by hæmoglobin, and by nothing else.” Had he discovered a gold mine,
    greater delight could not have shone upon his features.
""".strip()


# German sample passage used by the stemming/lemmatization cells.
# Project Gutenberg, 34811: Buddenbrooks: Verfall einer Familie (de), Thomas Mann
# .strip() removes the leading and trailing newline of the triple-quoted literal.
text_de = """
»Ich rechne«, sagte der Konsul trocken. Die Kerze flammte auf, und man
sah, wie er gerade aufgerichtet und mit Augen, so kalt und aufmerksam,
wie sie während des ganzen Nachmittags noch nicht darein geschaut
hatten, fest in die tanzende Flamme blickte. -- »Einerseits: Sie geben
33335 an Gotthold und 15000 an die in Frankfurt, und das macht 48335 in
Summa. Andererseits: Sie geben nur 25000 an die in Frankfurt, und das
bedeutet für die Firma einen Gewinn von 23335. Das ist aber nicht alles.
Gesetzt, Sie leisten an Gotthold eine Entschädigungssumme für den Anteil
am Hause, so ist das Prinzip durchbrochen, so ist er damals =nicht=
endgültig abgefunden worden, so kann er nach Ihrem Tode ein gleich
großes Erbe beanspruchen, wie meine Schwester und ich, und dann handelt
es sich für die Firma um einen Verlust von Hunderttausenden, mit dem sie
nicht rechnen kann, mit dem ich als künftiger alleiniger Inhaber nicht
rechnen kann ... Nein, Papa!« beschloß er mit einer energischen
Handbewegung und richtete sich noch höher auf. »Ich muß Ihnen abraten,
nachzugeben!«
""".strip()

# French sample passage used by the stemming/lemmatization cells.
# Project Gutenberg, 13951: Les trois mousquetaires (fr), Alexandre Dumas
# .strip() removes the leading and trailing newline of the triple-quoted literal.
text_fr = """
D’Artagnan, tout en marchant et en monologuant, était arrivé à quelques
pas de l’hôtel d’Aiguillon, et devant cet hôtel il avait aperçu Aramis
causant gaiement avec trois gentilshommes des gardes du roi. De son
côté, Aramis aperçut d’Artagnan; mais comme il n’oubliait point que
c’était devant ce jeune homme que M. de Tréville s’était si fort
emporté le matin, et qu’un témoin des reproches que les mousquetaires
avaient reçus ne lui était d’aucune façon agréable, il fit semblant de
ne pas le voir. D’Artagnan, tout entier au contraire à ses plans de
conciliation et de courtoisie, s’approcha des quatre jeunes gens en
leur faisant un grand salut accompagné du plus gracieux sourire. Aramis
inclina légèrement la tête, mais ne sourit point. Tous quatre, au
reste, interrompirent à l’instant même leur conversation.
""".strip()

## Stemming

### English text

Define a helper function that maps a universal POS tag to the corresponding WordNet POS constant, so that the WordNet lemmatizer can apply the right rules to each word.

In [19]:
import nltk
from nltk.corpus import wordnet
import spacy

def get_wordnet_pos(universal_tag):
    """Map a universal POS tag to the matching WordNet POS constant.

    Args:
        universal_tag: POS tag name such as 'ADJ', 'NOUN', 'VERB' or 'ADV'.

    Returns:
        The WordNet constant for adjectives, nouns, verbs or adverbs; any
        other tag falls back to the noun constant.
    """
    fallback = wordnet.NOUN
    wordnet_by_tag = dict(
        ADJ=wordnet.ADJ,
        NOUN=wordnet.NOUN,
        VERB=wordnet.VERB,
        ADV=wordnet.ADV,
    )
    if universal_tag in wordnet_by_tag:
        return wordnet_by_tag[universal_tag]
    return fallback

In [20]:
from IPython.display import Markdown
from tabulate import tabulate

# Tokenize the English sample, keep only purely alphabetic tokens in lower
# case, and POS-tag that filtered list with the universal tagset.
alpha_tokens = [tok.lower() for tok in nltk.word_tokenize(text_en) if tok.isalpha()]
tokens_tagged = nltk.pos_tag(alpha_tokens, tagset='universal')

# Normalisers to compare. Each entry pairs a tool with an adapter of the form
# (tool, term, universal POS tag) -> normalised term; only the WordNet
# adapter actually uses the POS tag.
stemmers = {
    'porter': (nltk.PorterStemmer(), lambda tool, term, pos: tool.stem(term)),
    'lancaster': (nltk.LancasterStemmer(), lambda tool, term, pos: tool.stem(term)),
    'snowball': (nltk.SnowballStemmer("english"), lambda tool, term, pos: tool.stem(term)),
    'wordnet': (nltk.WordNetLemmatizer(), lambda tool, term, pos: tool.lemmatize(term, get_wordnet_pos(pos))),
    # spaCy is run on the isolated word; the lemma of its first token is used
    'spaCy': (spacy.load('en_core_web_sm'), lambda tool, term, pos: tool(term)[0].lemma_),
}

headers = ['term', 'pos(universal)', *stemmers]
words_with_diff = set()
rows = []
for word, tag in tokens_tagged:
    outputs = [normalise(tool, word, tag) for tool, normalise in stemmers.values()]
    # skip terms on which all tools agree, and terms already listed
    if len(set(outputs)) <= 1 or word in words_with_diff:
        continue
    rows.append([word, tag, *outputs])
    words_with_diff.add(word)

# Show the alphabetically first 30 differing terms as a GitHub-style table.
first_rows = sorted(rows)[:30]
Markdown(tabulate(first_rows, headers, tablefmt='github'))

| term       | pos(universal)   | porter    | lancaster   | snowball   | wordnet    | spaCy     |
|------------|------------------|-----------|-------------|------------|------------|-----------|
| blue       | ADJ              | blue      | blu         | blue       | blue       | blue      |
| bottles    | NOUN             | bottl     | bottl       | bottl      | bottle     | bottle    |
| bristled   | VERB             | bristl    | bristl      | bristl     | bristle    | bristle   |
| bunsen     | ADJ              | bunsen    | buns        | bunsen     | bunsen     | bunsen    |
| chamber    | NOUN             | chamber   | chamb       | chamber    | chamber    | chamber   |
| companion  | NOUN             | companion | comp        | companion  | companion  | companion |
| cry        | NOUN             | cri       | cry         | cri        | cry        | cry       |
| discovered | VERB             | discov    | discov      | discov     | discover   | discover  |
| distant    | ADJ              | distant   | dist        | distant    | distant    | distant   |
| features   | NOUN             | featur    | feat        | featur     | feature    | feature   |
| feet       | NOUN             | feet      | feet        | feet       | foot       | foot      |
| flames     | NOUN             | flame     | flam        | flame      | flame      | flame     |
| flickering | NOUN             | flicker   | flick       | flicker    | flickering | flicker   |
| found      | ADP              | found     | found       | found      | found      | find      |
| glanced    | VERB             | glanc     | glant       | glanc      | glance     | glance    |
| greater    | ADJ              | greater   | gre         | greater    | great      | great     |
| had        | VERB             | had       | had         | had        | have       | have      |
| have       | VERB             | have      | hav         | have       | have       | have      |
| his        | PRON             | hi        | his         | his        | his        | his       |
| i          | NOUN             | i         | i           | i          | i          | I         |
| is         | VERB             | is        | is          | is         | be         | be        |
| lined      | VERB             | line      | lin         | line       | line       | line      |
| littered   | VERB             | litter    | lit         | litter     | litter     | litter    |
| little     | ADJ              | littl     | littl       | littl      | little     | little    |
| lofty      | ADJ              | lofti     | lofty       | lofti      | lofty      | lofty     |
| mine       | NOUN             | mine      | min         | mine       | mine       | mine      |
| nothing    | NOUN             | noth      | noth        | noth       | nothing    | nothing   |
| one        | NUM              | one       | on          | one        | one        | one       |
| only       | ADV              | onli      | on          | onli       | only       | only      |
| over       | ADP              | over      | ov          | over       | over       | over      |

### German text

In [26]:
from IPython.display import Markdown
from tabulate import tabulate

# Run the German spaCy pipeline once over the whole passage so that every
# token carries its part of speech and lemma.
nlp = spacy.load('de_core_news_sm')
tokens = nlp(text_de)

# Normalisers to compare. Each adapter has the form
# (tool, surface form, spaCy lemma) -> normalised term.
stemmers = {
    'snowball': (nltk.SnowballStemmer("german"), lambda tool, surface, lemma: tool.stem(surface)),
    # the lemma was already computed by the pipeline run above
    'spaCy': (nlp, lambda tool, surface, lemma: lemma),
}

headers = ['term', 'pos', *stemmers]
words_with_diff = set()
rows = []
for token in tokens:
    word = token.text.lower()
    if word.isalpha():
        outputs = [normalise(tool, token.text, token.lemma_) for tool, normalise in stemmers.values()]
        # list a term only once (case-insensitively) and only if the tools disagree
        if len(set(outputs)) > 1 and word not in words_with_diff:
            rows.append([token.text, token.pos_, *outputs])
            words_with_diff.add(word)

# Case-insensitive alphabetical order, first 30 rows, GitHub-style table.
rows_by_term = sorted(rows, key=lambda entry: entry[0].lower())
Markdown(tabulate(rows_by_term[:30], headers, tablefmt='github'))


| term                | pos   | snowball           | spaCy               |
|---------------------|-------|--------------------|---------------------|
| abgefunden          | VERB  | abgefund           | abfunden            |
| abraten             | VERB  | abrat              | abraten             |
| alleiniger          | ADJ   | allein             | alleinig            |
| alles               | PRON  | all                | alle                |
| am                  | ADP   | am                 | an                  |
| Andererseits        | ADV   | andererseit        | andererseits        |
| Anteil              | NOUN  | anteil             | Anteil              |
| aufgerichtet        | VERB  | aufgerichtet       | aufrichten          |
| Augen               | NOUN  | aug                | Auge                |
| beanspruchen        | VERB  | beanspruch         | beanspruchen        |
| bedeutet            | VERB  | bedeutet           | bedeuten            |
| beschloß            | VERB  | beschloss          | beschließen         |
| blickte             | VERB  | blickt             | blicken             |
| damals              | ADV   | damal              | damals              |
| das                 | PRON  | das                | der                 |
| dem                 | PRON  | dem                | der                 |
| den                 | DET   | den                | der                 |
| des                 | DET   | des                | der                 |
| Die                 | DET   | die                | der                 |
| durchbrochen        | VERB  | durchbroch         | durchbrochen        |
| Einerseits          | ADV   | einerseit          | einerseits          |
| endgültig           | ADV   | endgult            | endgültig           |
| energischen         | ADJ   | energ              | energisch           |
| Entschädigungssumme | NOUN  | entschadigungssumm | Entschädigungssumme |
| Erbe                | NOUN  | erb                | Erbe                |
| Firma               | NOUN  | firma              | Firma               |
| Flamme              | NOUN  | flamm              | Flamme              |
| flammte             | VERB  | flammt             | flammen             |
| Frankfurt           | PROPN | frankfurt          | Frankfurt           |
| für                 | ADP   | fur                | für                 |
| geben               | VERB  | geb                | geben               |
| gerade              | ADV   | gerad              | gerade              |
| geschaut            | VERB  | geschaut           | schauen             |
| Gesetzt             | VERB  | gesetzt            | setzen              |
| Gewinn              | NOUN  | gewinn             | Gewinn              |
| Gotthold            | PROPN | gotthold           | Gotthold            |
| großes              | ADJ   | gross              | groß                |
| Handbewegung        | NOUN  | handbeweg          | Handbewegung        |
| handelt             | VERB  | handelt            | handeln             |
| hatten              | AUX   | hatt               | haben               |
| Hause               | NOUN  | haus               | Haus                |
| Hunderttausenden    | NOUN  | hunderttaus        | Hunderttausend      |
| höher               | ADV   | hoh                | hoch                |
| Ihnen               | PRON  | ihn                | Ihnen               |
| Inhaber             | NOUN  | inhab              | Inhaber             |
| ist                 | AUX   | ist                | sein                |
| kann                | AUX   | kann               | können              |
| Kerze               | NOUN  | kerz               | Kerze               |
| Konsul              | NOUN  | konsul             | Konsul              |
| künftiger           | ADJ   | kunftig            | künftig             |

### French text

In [41]:
from IPython.display import Markdown
from tabulate import tabulate

# Run the French spaCy pipeline once over the whole passage so that every
# token carries its part of speech and lemma.
nlp = spacy.load('fr_core_news_sm')
tokens = nlp(text_fr)

# Normalisers to compare. Each adapter has the form
# (tool, surface form, spaCy lemma) -> normalised term.
stemmers = {
    'snowball': (nltk.SnowballStemmer("french"), lambda tool, surface, lemma: tool.stem(surface)),
    # the lemma was already computed by the pipeline run above
    'spaCy': (nlp, lambda tool, surface, lemma: lemma),
}

headers = ['term', 'pos', *stemmers]
words_with_diff = set()
rows = []
for token in tokens:
    word = token.text
    if word.isalpha():
        outputs = [normalise(tool, token.text, token.lemma_) for tool, normalise in stemmers.values()]
        # list a term only once and only if the tools disagree
        # NOTE(review): de-duplication and sorting are case-sensitive here,
        # unlike in the German cell, so capitalised terms sort first.
        if len(set(outputs)) > 1 and word not in words_with_diff:
            rows.append([word, token.pos_, *outputs])
            words_with_diff.add(word)

# First 30 rows in (case-sensitive) sort order, GitHub-style table.
first_rows = sorted(rows)[:30]
Markdown(tabulate(first_rows, headers, tablefmt='github'))


| term          | pos   | snowball    | spaCy        |
|---------------|-------|-------------|--------------|
| Aiguillon     | PROPN | aiguillon   | Aiguillon    |
| Aramis        | X     | aram        | Aramis       |
| Artagnan      | PROPN | artagnan    | Artagnan     |
| Tous          | ADJ   | tous        | tout         |
| Tréville      | PROPN | trévill     | Tréville     |
| accompagné    | VERB  | accompagn   | accompagner  |
| agréable      | ADJ   | agréabl     | agréable     |
| aperçu        | ADJ   | aperçu      | apercevoir   |
| aperçut       | VERB  | aperçut     | apercevoir   |
| approcha      | PROPN | approch     | approcher    |
| arrivé        | VERB  | arriv       | arriver      |
| avaient       | AUX   | avaient     | avoir        |
| avait         | AUX   | avait       | avoir        |
| causant       | VERB  | caus        | causer       |
| cet           | DET   | cet         | ce           |
| comme         | SCONJ | comm        | comme        |
| conciliation  | NOUN  | concili     | conciliation |
| contraire     | NOUN  | contrair    | contraire    |
| conversation  | NOUN  | convers     | conversation |
| courtoisie    | NOUN  | courtois    | courtoisie   |
| côté          | NOUN  | côt         | côté         |
| devant        | ADP   | dev         | devant       |
| du            | ADP   | du          | de           |
| emporté       | ADJ   | emport      | emporté      |
| entier        | ADJ   | enti        | entier       |
| faisant       | VERB  | fais        | faire        |
| fit           | VERB  | fit         | faire        |
| gaiement      | VERB  | gai         | gaiemer      |
| gardes        | NOUN  | gard        | garde        |
| gentilshommes | NOUN  | gentilshomm | gentilhomme  |

## Splitting compounds

In [None]:
from german_compound_splitter import comp_split

# Sample German compounds to be split into their constituent words.
compounds = [
    'Hundehalter',
    'Abwasserkanal',
    'Gartenhaus',
    'Tischplatte',
    'Kuhstall',
    # spelling fixed: 'mötzen' -> 'mützen' (Mütze = cap); the misspelt part
    # is presumably not found in the dictionary
    'Donaudampfschifffahrtskapitänsmützenabzeichen',
]
# The splitter needs an external word list; this notebook expects it at
# data/german.dic (see the download hint in the first cell).
input_file = './data/german.dic'
ahocs = comp_split.read_dictionary_from_file(input_file)

for compound in compounds:
    # dissect() returns the parts found for the compound (singular forms
    # requested); merge_fractions() post-processes that result.
    dissection = comp_split.dissect(compound, ahocs, make_singular=True)
    print(f'SPLIT WORDS (plain): {dissection}')
    print(f'SPLIT WORDS (post-merge): {comp_split.merge_fractions(dissection)}')
    print()


In [18]:
# Fetch NLTK's 'tagsets' help resource, then print the description and
# examples of every Penn Treebank (upenn) tag.
# NOTE(review): the English stemming cell tags with tagset='universal',
# whereas this prints the Penn Treebank tag documentation.
nltk.download('tagsets')
nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Roger\AppData\Roaming\nltk_data...


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data]   Unzipping help\tagsets.zip.


In [28]:
import spacy

# Load the small English spaCy pipeline; English pipelines include a
# rule-based lemmatizer.
# To inspect it: print(nlp.get_pipe("lemmatizer").mode)  # 'rule'
nlp = spacy.load("en_core_web_sm")

doc = nlp("I was reading the paper.")
# Print, in turn, the coarse POS tags, the fine-grained tags and the lemmas.
for attribute in ('pos_', 'tag_', 'lemma_'):
    print([getattr(token, attribute) for token in doc])
# lemmas: ['I', 'be', 'read', 'the', 'paper', '.']

['PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'PUNCT']
['PRP', 'VBD', 'VBG', 'DT', 'NN', '.']
['I', 'be', 'read', 'the', 'paper', '.']
