In [1]:
'''
Compound splitter
'''

# https://github.com/dtuggener/CharSplit
import urllib
url = "https://github.com/dtuggener/CharSplit/archive/master.zip"
filename = "CharSplit.zip"
urllib.request.urlretrieve(url, filename)

import zipfile
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall()
zip_ref.close()

import os, sys
#os.rename("CharSplit-master", "CharSplit")
from shutil import copyfile
for filename in os.listdir("CharSplit-master"):
    if filename != "README.md": 
        copyfile("CharSplit-master/" + filename, filename)
    
import char_split, ngram_probs
def compoundSplit(string):
    request = char_split.split_compound(string)[0]
    if request[0] <= 0:
        return [string]
    else:
        return compoundSplit(request[1]) + compoundSplit(request[2])

In [4]:
'''
Functions for obtaining the morphological features.  Goals:

• Derivational
- hand-compiled list by Hancke et al (2012) 

• Inflectional (RFTagger)
- mood, person, tense, type of verbs
- case of nouns

• Compound words

'''

from nltk import word_tokenize
from collections import defaultdict, Counter, deque
import random
import numpy as np
import pandas as pd
import urllib.request
import io
import ast

url = "https://raw.githubusercontent.com/soerenetler/TextDifficultyAssessmentGerman/master/preprocessed_text_df.csv"
df= pd.read_csv(url, sep="|", lineterminator='\n')

# preprocessing Level classifications to get rid of unwanted data and "B1 " vs. "B1"
df = df[(df["Level"] == "A1")|(df["Level"] == "A2")|(df["Level"] == "B1")|(df["Level"] == "B1 ")|(df["Level"] == "B2")|(df["Level"] == "C1")|(df["Level"] == "C2")]
df = df.replace("B1 ","B1")

# returns a list of all the features included in the dataset, to be used as attributes in feature vector
def get_all_features(df):
    all_features = [] 
    index = 0
    for all_sentences in df.loc[:,'RFTagger']:
        if all_sentences is not None:
            all_sentences = all_sentences[:-5] # remove the empty last sentence [] as following line won't work
            all_sentences += "]" # nasty preprocessing necessary to keep list structure for literal_eval
            all_sentences = ast.literal_eval(all_sentences)
            for sentence in all_sentences:
                for word_features in sentence:
                    word_features = word_features[1:] #exclude the word itself and lemma at end (leads to too many features)                    
                    for token in word_features:
                        token = "MO-"+token
                        if token not in all_features:
                            all_features.append(token)
            index += 1
                    
    return all_features

def get_classifications(df):
    y = []
    
    for classification in df.loc[:,'Level']:
        y.append(classification)
        
    return y

def add_morph_columns(df_document):
    df_document["MO-inf2verbs"] = np.zeros(len(df_document))
    df_document["MO-part2verbs"] = np.zeros(len(df_document))
    df_document["MO-imp2verbs"] = np.zeros(len(df_document))
    df_document["MO-1st2finverbs"] = np.zeros(len(df_document))
    df_document["MO-2nd2finverbs"] = np.zeros(len(df_document))
    df_document["MO-3rd2finverbs"] = np.zeros(len(df_document))
    df_document["MO-subj2finverbs"] = np.zeros(len(df_document))
    df_document["MO-finverbs2verbs"] = np.zeros(len(df_document))
    df_document["MO-modverbs2verbs"] = np.zeros(len(df_document))
    df_document["MO-auxverbs2verbs"] = np.zeros(len(df_document))
    df_document["MO-verbspersent"] = np.zeros(len(df_document))
    df_document["MO-nom2nouns"] = np.zeros(len(df_document))
    df_document["MO-gen2nouns"] = np.zeros(len(df_document))
    df_document["MO-dat2nouns"] = np.zeros(len(df_document))
    df_document["MO-acc2nouns"] = np.zeros(len(df_document))
    df_document["MO-keit2nouns"] = np.zeros(len(df_document))
    df_document["MO-ung2nouns"] = np.zeros(len(df_document))
    df_document["MO-werk2nouns"] = np.zeros(len(df_document))
    #df_document["MO-derived2nouns"] = np.zeros(len(df_document))
    df_document["MO-compounds2nouns"] = np.zeros(len(df_document))
    
def fill_morph_features(df_document, word_list, morph_dict, index):
    derived_list = ["ant", "anten", "antin", "antinnen", "arium", "arien", "ast", "asten", "astin", "astinnen", 
                    "at", "ate", "ator", "atoren", "atorin", "atorinnen", "atur", "aturen", "ei", "eien", "er", 
                    "erin", "erinnen", "ent", "ents", "enz", "enzen", 'eur', 'eure', 'eurin', 'eurinnen', 'heit', 
                    'heiten', 'ist', 'isten', 'istin', 'istinnen', 'ion', 'ionen', 'ismus', 'ismen', 'ität', 
                    'itäten', 'keit', 'keiten', 'ling', 'lingen', 'nis', 'nisse', 'schaft', 'schaften', 'tum', 
                    'tümer', 'ung', 'ungen', 'ur', 'werk', 'werke', 'wesen']
    
    total_derived = 0
    total_nouns = morph_dict["N"]
    total_verbs = morph_dict["VFIN"] + morph_dict["VINF"] + morph_dict["VIMP"] + morph_dict["VPP"]
    total_keit = 0
    total_ung = 0
    total_werk = 0
    total_nom = morph_dict["Nom"]
    total_gen = morph_dict["Gen"]
    total_dat = morph_dict["Dat"]
    total_acc = morph_dict["Acc"]
    total_subj = morph_dict["Subj"]
    total_first = morph_dict["1"]
    total_second = morph_dict["2"]
    total_third = morph_dict["3"]
    total_compounds = 0
    total_infinitives = morph_dict["VINF"]
    total_participles = morph_dict["PART"]
    total_imperatives = morph_dict["VIMP"]
    total_finite = morph_dict["VFIN"]
    total_modal = morph_dict["MOD"]
    total_auxiliary = morph_dict["AUX"]
    num_sentences = 1 + word_list.count(".")
    
    for word in word_list:          
        for derivation in derived_list:
            if word[(len(word)-len(derivation)):len(word)] == derivation:
                #total_derived += 1
                    
                if derivation == "keit":
                    total_keit += 1
                elif derivation == "ung":
                    total_ung += 1
                elif derivation == "werk":
                    total_werk += 1
                    
        try: compoundSplit(word)
        except:
            total_compounds += 1 # upon inspection almost all exceptions are hyphenated words, i.e. compounds
            continue
        if len(compoundSplit(word)) > 1:
            total_compounds += 1
                    
    try: # catch total_nouns divide by 0 exception
        df_document.loc[index,"MO-nom2nouns"] = total_nom/total_nouns
        df_document.loc[index,"MO-gen2nouns"] = total_gen/total_nouns
        df_document.loc[index,"MO-dat2nouns"] = total_dat/total_nouns
        df_document.loc[index,"MO-acc2nouns"] = total_acc/total_nouns
        #df_document.loc[index,"MO-derived2nouns"] = total_derived/total_nouns
        df_document.loc[index,"MO-compounds2nouns"] = total_compounds/total_nouns
    except:
        df_document.loc[index,"MO-nom2nouns"] = 0
        df_document.loc[index,"MO-gen2nouns"] = 0
        df_document.loc[index,"MO-dat2nouns"] = 0
        df_document.loc[index,"MO-acc2nouns"] = 0
        #df_document.loc[index,"MO-derived2nouns"] = 0
        df_document.loc[index,"MO-compounds2nouns"] = 0
        
    try: # catch total_verbs divide by 0 exception
        df_document.loc[index,"MO-inf2verbs"] = total_infinitives/total_verbs
        df_document.loc[index,"MO-part2verbs"] = total_participles/total_verbs
        df_document.loc[index,"MO-imp2verbs"] = total_imperatives/total_verbs
        df_document.loc[index,"MO-1st2finverbs"] = total_first/total_finite
        df_document.loc[index,"MO-2nd2finverbs"] = total_second/total_finite
        df_document.loc[index,"MO-3rd2finverbs"] = total_third/total_finite
        df_document.loc[index,"MO-subj2finverbs"] = total_subj/total_finite
        df_document.loc[index,"MO-finverbs2verbs"] = total_finite/total_verbs
        df_document.loc[index,"MO-modverbs2verbs"] = total_modal/total_verbs
        df_document.loc[index,"MO-auxverbs2verbs"] = total_auxiliary/total_verbs
    except:
        df_document.loc[index,"MO-inf2verbs"] = 0
        df_document.loc[index,"MO-part2verbs"] = 0
        df_document.loc[index,"MO-imp2verbs"] = 0
        df_document.loc[index,"MO-1st2finverbs"] = 0
        df_document.loc[index,"MO-2nd2finverbs"] = 0
        df_document.loc[index,"MO-3rd2finverbs"] = 0
        df_document.loc[index,"MO-subj2finverbs"] = 0
        df_document.loc[index,"MO-finverbs2verbs"] = 0
        df_document.loc[index,"MO-modverbs2verbs"] = 0
        df_document.loc[index,"MO-auxverbs2verbs"] = 0
              
    df_document.loc[index,"MO-verbspersent"] = total_verbs/num_sentences
    df_document.loc[index,"MO-keit2nouns"] = total_keit/len(word_list)
    df_document.loc[index,"MO-ung2nouns"] = total_ung/len(word_list)
    df_document.loc[index,"MO-werk2nouns"] = total_werk/len(word_list)
    
def fill_document_vector(df,df_document):
    add_morph_columns(df_document)
    index = 0
    for all_sentences in df.loc[:,'RFTagger']:
        if all_sentences is not None:
            all_sentences = all_sentences[:-5] # remove the empty last sentence [] as following line won't work
            all_sentences += "]" # nasty preprocessing necessary to keep list structure for literal_eval
            all_sentences = ast.literal_eval(all_sentences)
            text_length = 0
            word_list = []
            morph_dict = defaultdict(int)
            for sentence in all_sentences:
                text_length += len(sentence)
                for word_features in sentence:
                    word_list.append(word_features[0])
                    word_features = word_features[1:] #exclude the word itself and lemma at end (leads to too many features)
                    for token in word_features:
                        df_document.loc[index,"MO-"+token] +=1
                        if token == "Nom" or token== "Gen" or token=="Dat" or token=="Acc":
                            if "N" in word_features: # only add nouns with above cases
                                morph_dict[token] += 1
                                continue
                            else: continue
                        morph_dict[token] += 1
            df_document.loc[index] = df_document.loc[index] / text_length # normalize each value with the total text length       
            fill_morph_features(df_document, word_list, morph_dict, index)
            index +=1
            
def save_df(path,df):
    df.to_csv(path,sep="|",index=False)

# get total number of sentences and all features contained in dataset in order to create vector columns
all_features = get_all_features(df)  

# document level vector, i.e. each row in the feature vector represents one document
zero_data = np.zeros(shape=(len(df),len(all_features))) # create 1 row in the feature vector per document
df_document = pd.DataFrame(zero_data,columns=sorted(all_features))
y = get_classifications(df)

fill_document_vector(df,df_document)
display(df_document.head())
print(list(df_document))

df = pd.concat([df,df_document], axis=1)

display(df.head())

filename = "06_MorphologicalFeatures_df.csv"
save_df(filename,df)

Unnamed: 0,MO-*,MO--,MO-1,MO-2,MO-3,MO-ADJA,MO-ADJD,MO-ADV,MO-APPO,MO-APPR,...,MO-auxverbs2verbs,MO-verbspersent,MO-nom2nouns,MO-gen2nouns,MO-dat2nouns,MO-acc2nouns,MO-keit2nouns,MO-ung2nouns,MO-werk2nouns,MO-compounds2nouns
0,0.142857,0.092437,0.07563,0.0,0.117647,0.016807,0.02521,0.10084,0.0,0.07563,...,0.0,1.230769,0.375,0.0,0.291667,0.333333,0.0,0.0,0.0,0.5
1,0.081395,0.104651,0.093023,0.0,0.110465,0.005814,0.034884,0.05814,0.0,0.069767,...,0.0,1.3,0.534884,0.023256,0.186047,0.255814,0.0,0.0,0.0,0.232558
2,0.046512,0.108527,0.093023,0.0,0.116279,0.0,0.031008,0.03876,0.0,0.03876,...,0.0,1.333333,0.583333,0.0,0.138889,0.277778,0.0,0.0,0.0,0.5
3,0.052174,0.052174,0.0,0.0,0.217391,0.034783,0.017391,0.113043,0.0,0.069565,...,0.0,1.266667,0.3,0.033333,0.366667,0.3,0.0,0.0,0.0,0.166667
4,0.097222,0.045139,0.072917,0.0,0.121528,0.013889,0.017361,0.052083,0.0,0.038194,...,0.0,1.205882,0.615385,0.0,0.087912,0.296703,0.0,0.0,0.0,0.241758


['MO-*', 'MO--', 'MO-1', 'MO-2', 'MO-3', 'MO-ADJA', 'MO-ADJD', 'MO-ADV', 'MO-APPO', 'MO-APPR', 'MO-APPRART', 'MO-APZR', 'MO-ART', 'MO-Aber', 'MO-Acc', 'MO-Adj', 'MO-Als', 'MO-An', 'MO-Ans', 'MO-Attr', 'MO-Auf', 'MO-Aux', 'MO-Außer', 'MO-Bis', 'MO-CARD', 'MO-CONJ', 'MO-Colon', 'MO-Comma', 'MO-Comp', 'MO-Cont', 'MO-Coord', 'MO-Dat', 'MO-Def', 'MO-Deg', 'MO-Dem', 'MO-Denn', 'MO-Doch', 'MO-FM', 'MO-Fem', 'MO-Full', 'MO-Gen', 'MO-Haben', 'MO-Hinter', 'MO-Hyph', 'MO-ITJ', 'MO-In', 'MO-Ind', 'MO-Indef', 'MO-Inter', 'MO-Left', 'MO-Masc', 'MO-Mod', 'MO-N', 'MO-Name', 'MO-Neg', 'MO-Neut', 'MO-Noch', 'MO-Nom', 'MO-Noun', 'MO-Other', 'MO-PART', 'MO-PRO', 'MO-PROADV', 'MO-Paren', 'MO-Past', 'MO-Per', 'MO-Pers', 'MO-Pl', 'MO-Pos', 'MO-Poss', 'MO-Pres', 'MO-Pro', 'MO-Psp', 'MO-Pun', 'MO-Quot', 'MO-Refl', 'MO-Reg', 'MO-Rel', 'MO-Right', 'MO-SYM', 'MO-Sein', 'MO-Sent', 'MO-Sg', 'MO-Slash', 'MO-SubFin', 'MO-SubInf', 'MO-Subj', 'MO-Subst', 'MO-Sup', 'MO-TRUNC', 'MO-Unter', 'MO-VFIN', 'MO-VIMP', 'MO-VINF'

Unnamed: 0,Level,Title,Text,Source,Type,cleanedText,RFTagger,parsedText,newLevel,MO-*,...,MO-auxverbs2verbs,MO-verbspersent,MO-nom2nouns,MO-gen2nouns,MO-dat2nouns,MO-acc2nouns,MO-keit2nouns,MO-ung2nouns,MO-werk2nouns,MO-compounds2nouns
0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,Zu meiner Familie gehören vier Personen. Die M...,"[[['Zu', 'APPR', 'Dat'], ['meiner', 'PRO', 'Po...",['(ROOT (S (PP (APPR Zu) (PPOSAT meiner) (NN F...,A,0.142857,...,0.0,1.230769,0.375,0.0,0.291667,0.333333,0.0,0.0,0.0,0.5
1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,"[[['Mein', 'PRO', 'Poss', 'Attr', '-', 'Nom', ...",['(ROOT (S (NP (PPOSAT Mein) (NN Name)) (VAFIN...,A,0.081395,...,0.0,1.3,0.534884,0.023256,0.186047,0.255814,0.0,0.0,0.0,0.232558
2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,"[[['Hallo', 'ITJ'], ['!', 'SYM', 'Pun', 'Sent'...","['(ROOT (NUR (ITJ Hallo) ($. !)))', '(ROOT (S ...",A,0.046512,...,0.0,1.333333,0.583333,0.0,0.138889,0.277778,0.0,0.0,0.0,0.5
3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,"[[['Ricarda', 'N', 'Name', 'Nom', 'Sg', 'Fem']...",['(ROOT (CS (S (NE Ricarda) (VAFIN ist) (AP (N...,A,0.052174,...,0.0,1.266667,0.3,0.033333,0.366667,0.3,0.0,0.0,0.0,0.166667
4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,Frau Meier geht heute in den Supermarkt. Ihr M...,"[[['Frau', 'N', 'Reg', 'Nom', 'Sg', 'Fem'], ['...",['(ROOT (S (NP (NN Frau) (NE Meier)) (VVFIN ge...,A,0.097222,...,0.0,1.205882,0.615385,0.0,0.087912,0.296703,0.0,0.0,0.0,0.241758
