In [11]:
import pandas as pd
import spacy

# spaCy pipeline used by the cells below for tokenisation, lemmas, POS tags
# and morphology.
nlp = spacy.load("de_core_news_lg")

# Forward slashes are accepted on Windows as well as Linux/macOS, whereas a
# backslash separator is only understood on Windows.
file_path = "data/Goethe Wortliste A2.csv"

# The word list is pipe-delimited and its first row holds the column names.
df_inp = pd.read_csv(file_path, header=0, delimiter="|")
# Verb table; later cells read its "word" and "lemma" columns.
dfv = pd.read_csv("data/clean_verbs.csv")

Import the file with sentences and drop the unneeded columns, keeping only the columns that contain sentences.

In [77]:
# Drop every column except the sentence columns used below.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
non_sentence_columns = [
    "Audio_Wort", "Word_DE", "Word_EN", "Plural", "Hinweis", "Verbformen",
    "S1_EN", "S2_EN", "S3_EN", "S4_EN",
]
df_inp = df_inp.drop(columns=non_sentence_columns, errors="ignore")

sentence_columns = ["S1_DE", "S2_DE", "S3_DE", "S4_DE"]
# Sum of per-column unique counts; a sentence that occurs in two different
# columns is counted twice here, unlike in the de-duplicated frame below.
print("Total number of sentences: ", sum(df_inp[col].nunique() for col in sentence_columns))

# One single-column frame per sentence slot, all sharing the name "sentence"
# so they can be stacked on top of each other.
df1, df2, df3, df4 = (
    df_inp[[col]].rename(columns={col: "sentence"}) for col in sentence_columns
)
df_s = (
    pd.concat([df1, df2, df3, df4], ignore_index=True)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
print("Total number of sentences in concated DF: ", len(df_s))


Total number of sentences:  1180
Total number of sentences in concated DF:  1178


In [78]:
# Run every sentence through spaCy and collect one record per token.
# The records are gathered in a plain list and converted to a DataFrame once:
# appending with df.loc[len(df)] re-allocates the frame for every token, which
# makes the loop quadratic in the number of tokens.
token_records = [
    {
        "word": token.text,
        "lemma_predicted": token.lemma_,
        "pos": token.pos_,
        "morph": token.morph,
    }
    for sentence in df_s["sentence"]
    for token in nlp(sentence)
]
# Explicit column list keeps the column order stable (and the columns present
# even if no tokens were produced).
df_w = pd.DataFrame(token_records, columns=["word", "lemma_predicted", "pos", "morph"])

# Attach the reference lemma from the verb table. how="right" keeps every
# token; tokens without an entry in dfv get NaN in "lemma".
# NOTE(review): if dfv lists the same "word" more than once, the merge repeats
# the matching token rows — confirm dfv["word"] is unique.
df_w = dfv[["word", "lemma"]].merge(df_w, on="word", how="right")
df_w.head(10)

Unnamed: 0,word,lemma,lemma_predicted,pos,morph
0,Darf,dürfen,dürfen,AUX,"(Mood=Ind, Number=Sing, Person=1, Tense=Pres, ..."
1,ich,,ich,PRON,"(Case=Nom, Number=Sing, Person=1, PronType=Prs)"
2,Ihnen,,ihnen,PRON,"(Case=Dat, Number=Sing, Person=2, PronType=Prs)"
3,ein,,ein,DET,"(Case=Acc, Definite=Ind, Gender=Neut, Number=S..."
4,Stück,,Stück,NOUN,"(Case=Acc, Gender=Neut, Number=Sing)"
5,Kuchen,,Kuchen,NOUN,"(Case=Acc, Gender=Masc, Number=Sing)"
6,anbieten,anbieten,anbieten,VERB,(VerbForm=Inf)
7,?,,--,PUNCT,()
8,Ich,,ich,PRON,"(Case=Nom, Number=Sing, Person=1, PronType=Prs)"
9,muss,müssen,mussen,AUX,"(Mood=Ind, Number=Sing, Person=1, Tense=Pres, ..."


In [80]:
# Share of distinct verb lemmas on which spaCy agrees with the reference table:
# unique reference lemmas among agreeing verb tokens, divided by the unique
# predicted lemmas among all verb tokens.
is_verb_token = df_w["pos"].isin(["AUX", "VERB"])
lemma_agrees = df_w["lemma"] == df_w["lemma_predicted"]
mask = is_verb_token & lemma_agrees
df_w.loc[mask, "lemma"].nunique() / df_w.loc[is_verb_token, "lemma_predicted"].nunique()

0.7873417721518987

In [81]:
# testing various attributes
for sentence in df_s.loc[:1, "sentence"]:
    for token in nlp(sentence):
        print(f"{token.text:<16} => {token.lemma_:<16} | {token.pos_:<8} | {token.tag_}")

Darf             => dürfen           | AUX      | VMFIN
ich              => ich              | PRON     | PPER
Ihnen            => ihnen            | PRON     | PPER
ein              => ein              | DET      | ART
Stück            => Stück            | NOUN     | NN
Kuchen           => Kuchen           | NOUN     | NN
anbieten         => anbieten         | VERB     | VVINF
?                => --               | PUNCT    | $.
Ich              => ich              | PRON     | PPER
muss             => mussen           | AUX      | VMFIN
meinen           => mein             | DET      | PPOSAT
Schlüssel        => Schlüssel        | NOUN     | NN
an               => an               | ADP      | APPR
der              => der              | DET      | ART
Rezeption        => Rezeption        | NOUN     | NN
abgeben          => abgeben          | VERB     | VVINF
.                => --               | PUNCT    | $.


### Importing database from dwds

In [82]:
# Forward slash instead of a backslash so the relative path also resolves on
# Linux/macOS; Windows accepts both.
dwds_df = pd.read_csv("data/dwds_korpora.csv")

In [None]:
# Remove the two text-variant columns that are not used below.
# errors="ignore" makes the cell safe to re-run: dwds_df is overwritten, so on
# a second run the columns no longer exist and a plain drop() raises KeyError.
dwds_df = dwds_df.drop(columns=["transliterated_text", "cab_normalized_text"], errors="ignore")
dwds_df.info()

#### Cleaning the corpus dataset of foreign words, symbols and numbers.

In [84]:
# List the distinct POS tags present in the corpus, in order of first appearance.
pd.unique(dwds_df["pos"])

array(['$.', '$(', 'NN', 'ADJA', 'NE', 'FM.fr', 'FM', 'XY', 'CARD',
       'TRUNC', 'VVINF', 'APPR', 'ADV', 'FM.xy', 'FM.en', 'ART', 'FM.es',
       'FM.la', 'FM.nl', 'FM.sv', 'PPER', 'FM.da', 'FM.it', 'FM.el', '$,',
       'VAFIN', 'ADJD', 'VVIZU', 'VVFIN', 'VVPP', 'VMFIN', 'APPRART',
       'PTKVZ', 'KON', 'KOKOM', 'KOUS', 'PIAT', 'PTKA', 'APZR', 'ITJ',
       'VVIMP', 'PIS', 'PDS', 'PAV', 'KOUI', 'PTKANT', 'VMINF', 'PRELS',
       'PDAT', 'PPOSAT', 'PWAV', 'PPOSS', 'PRF', 'PWS', 'PTKNEG', 'VAIMP',
       'PWAT', 'PTKZU', 'PRELAT', 'APPO', 'VAPP', 'VMPP', 'VAINF'],
      dtype=object)

CARD – Kardinalzahl (cardinal number)

In [89]:
# Dropping duplicates, NaN and useless columns
dwds_df = dwds_df.dropna().drop_duplicates().drop(["transliterated_text", "cab_normalized_text"], axis=1)

In [None]:
# Number of leading rows discarded after filtering.
# NOTE(review): 260 is carried over unchanged; why exactly these rows are
# skipped is not documented in the notebook — confirm.
N_LEADING_ROWS_TO_DROP = 260

# Raw strings: sequences such as "\$", "\W" and "\d" are invalid escapes in
# ordinary string literals and trigger a warning; the regex text is unchanged.
# POS tags to exclude: XY, "$.", "$(", FM... and "$," at the start of the tag,
# or CARD anywhere in it.
POS_EXCLUDE_PATTERN = r"^XY|^\$\.|^\$\(|^FM|^\$\,|CARD"
# Tokens to exclude: only non-word characters/digits; one or two word
# characters optionally preceded by up to two non-word characters; anything
# containing a non-word character; anything containing a digit.
TEXT_EXCLUDE_PATTERN = r"^[\W\d]+$|^[\W]{0,2}[\wöäüßÖÄÜ]{1,2}$|[^\wöäüßÖÄÜ]|^\d+|\d"

mask = (
    dwds_df["pos"].str.contains(POS_EXCLUDE_PATTERN, regex=True, case=False) |
    dwds_df["utf8_text"].str.contains(TEXT_EXCLUDE_PATTERN, regex=True, case=False)
    )
dwds_df = dwds_df[~mask]
print(len(dwds_df))
# Positional slice instead of drop(index=range(...)): same rows are removed,
# but it does not raise when fewer rows than the cut-off remain.
# NOTE(review): dwds_df is overwritten, so re-running this cell removes a
# further N_LEADING_ROWS_TO_DROP rows each time.
dwds_df = dwds_df.iloc[N_LEADING_ROWS_TO_DROP:].reset_index(drop=True)
dwds_df.head(50)

In [106]:
# Attach the corpus lemma to every token from the sentence table. Both sides
# have a "lemma" column, so after the merge "lemma_x" is the corpus lemma and
# "lemma_y" is the lemma already present in df_w.
corpus_lemmas = dwds_df[["utf8_text", "lemma"]]
df_test = corpus_lemmas.merge(df_w, left_on="utf8_text", right_on="word", how="right")

# Verb tokens whose two lemmas differ.
# NOTE(review): NaN != anything is True, so rows where either lemma is missing
# are counted as differing — confirm this is intended.
verb_rows = df_test["pos"].isin(["AUX", "VERB"])
lemmas_differ = df_test["lemma_x"] != df_test["lemma_y"]
mask = verb_rows & lemmas_differ
df_test.loc[mask, "lemma_y"].nunique() / df_test.loc[verb_rows, "lemma_x"].nunique()

0.26628895184135976

In [108]:
# Number of rows selected by the mask.
df_test.loc[mask].shape[0]

284

Importing Text

In [None]:
# Forward slash so the relative path also resolves outside Windows.
text_file_path = "data/tintenherz_text.txt"

# NOTE(review): no encoding is passed, so open() falls back to the platform
# default — confirm the file's encoding and pass it explicitly.
with open(text_file_path) as f:
    # Flatten the text to a single line: paragraph breaks and line breaks
    # both become a single space.
    text = f.read().replace("\n\n", " ").replace("\n", " ")
# Chapters are separated by "## "; whatever precedes the first marker is dropped.
chapters = text.split("## ")[1:]

# Only the first chapter is analysed.
doc = nlp(chapters[0])
sentences = list(doc.sents)

# Collect one record per token and build the frame once; appending rows with
# df.loc[len(df)] inside the loop is quadratic in the number of tokens.
token_records = [
    {
        "word": token.text,
        "lemma_spacy": token.lemma_,
        "pos": token.pos_,
        "morph": token.morph,
    }
    for s in sentences
    for token in s
]
df1 = pd.DataFrame(token_records, columns=["word", "lemma_spacy", "pos", "morph"])

# Attach the reference lemma from the verb table; tokens without an entry in
# dfv keep NaN in "lemma".
df1 = dfv[["word", "lemma"]].merge(df1, on="word", how="right")
# Verb tokens for which a reference lemma exists (~ / notna() instead of
# arithmetic negation of a boolean Series).
mask = df1["pos"].isin(["AUX", "VERB"]) & df1["lemma"].notna()
# True where spaCy's lemma equals the reference lemma; a missing reference
# lemma compares as False.
df1["comp"] = df1["lemma"] == df1["lemma_spacy"]

In [23]:
# Accuracy over the rows selected by `mask`: matching rows divided by all
# selected rows.
scored_rows = df1.loc[mask]
print("Correct predictions ratio:", scored_rows["comp"].sum() / len(scored_rows))
# Only show verb rows where the lemmas do not match. This uses every verb
# token, so rows with no reference lemma (comp is False for them) are listed too.
# `~` is the boolean negation operator; unary minus on a boolean Series only
# works because pandas special-cases it.
mask_different_rows = ~df1["comp"] & df1["pos"].isin(["AUX", "VERB"])
df1[mask_different_rows].head(50)

Correct predictions ratio: 0.9347826086956522


Unnamed: 0,word,lemma,lemma_spacy,pos,morph,comp
22,musste,müssen,mussen,AUX,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
42,klopften,,klopfen,VERB,"(Mood=Ind, Number=Plur, Person=3, Tense=Past, ...",False
48,bellte,,bellen,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
69,drehte,,drehen,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
74,lag,,liegen,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
85,drückte,,drücken,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
101,locken,,locken,VERB,(VerbForm=Inf),False
137,entdeckte,,entdecken,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",False
140,Gib,geben,gib,VERB,"(Mood=Imp, Number=Sing, Person=2)",False
144,flüstert,,flüstern,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ...",False
