## Dependency Features

In [1]:
# Dependency features following Hancke (2013):
# - maximum number of words between a head and a dependent in a text
# - average number of words between a head and a dependent per sentence
# - average number of dependents per verb (in words), including and excluding modifiers
# - number of dependents per NP (in words)

In [1]:
import ast
from collections import Counter, defaultdict
import de_core_news_sm
import itertools
from nltk import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

In [3]:
# Load the de_core_news_sm spaCy pipeline once; the feature functions below use it through the global `nlp`.
nlp = de_core_news_sm.load()

In [4]:
# Maximum number of words between a head and a dependent in a text
def max_no_words_head_dep(text):
    """Return the largest head-dependent distance found in ``text``.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence is parsed on its own with the module-level ``nlp`` pipeline.
    For each (head, child) pair whose child is not tagged ``PUNCT``, the
    distance is the number of tokens lying strictly between the two.

    NOTE(review): ``sent_tokenize`` is called with its default language
    model -- confirm that this is intended for the input texts.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    int
        The maximum distance over all sentences. 0 is returned when
        ``text`` is not a string (such as a missing value) or when no
        head-dependent pair is found, mirroring ``avg_no_dep``.
    """
    if not isinstance(text, str):
        return 0

    max_no = 0
    for sentence in sent_tokenize(text):
        doc = nlp(sentence)
        for token in doc:
            for child in token.children:
                if child.pos_ != "PUNCT":
                    # Token.i is the token's position inside its doc, so no
                    # linear list.index() scan is needed to locate it.
                    max_no = max(max_no, abs(token.i - child.i) - 1)

    return max_no

In [5]:
# Average number of words between a head and a dependent
def avg_no_words_head_dep(text):
    """Return the mean head-dependent distance over all pairs in ``text``.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence is parsed on its own with the module-level ``nlp`` pipeline.
    For each (head, child) pair whose child is not tagged ``PUNCT``, the
    distance is the number of tokens lying strictly between the two. The
    distances of all sentences are pooled and averaged once (this is not
    a mean of per-sentence means).

    NOTE(review): ``sent_tokenize`` is called with its default language
    model -- confirm that this is intended for the input texts.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    float or int
        The mean distance. 0 is returned when ``text`` is not a string
        (such as a missing value) or when no head-dependent pair is
        found, mirroring ``avg_no_dep`` instead of yielding NaN.
    """
    if not isinstance(text, str):
        return 0

    # Token.i is the token's position inside its doc, so no linear
    # list.index() scan is needed to locate head and child.
    distances = [
        abs(token.i - child.i) - 1
        for sentence in sent_tokenize(text)
        for token in nlp(sentence)
        for child in token.children
        if child.pos_ != "PUNCT"
    ]

    # np.mean([]) would emit a RuntimeWarning and return NaN.
    if not distances:
        return 0
    return np.mean(distances)

In [6]:
# Average number of dependents per verb/noun (in words), including or excluding modifiers
def avg_no_dep(text, pos, including_mod=True):
    """Return the mean number of direct dependents of tokens tagged ``pos``.

    ``text`` is parsed as a whole with the module-level ``nlp`` pipeline.
    For every token whose ``pos_`` equals ``pos``, its children are counted,
    skipping children tagged ``PUNCT``. When ``including_mod`` equals
    ``False``, children whose dependency label is ``"mo"`` are skipped too.

    Parameters
    ----------
    text : str
        Text to analyse.
    pos : str
        Coarse part-of-speech tag of the heads to consider (e.g. "VERB").
    including_mod : bool, default True
        Whether children labelled ``"mo"`` count as dependents.

    Returns
    -------
    float or int
        The mean count per matching token; 0 if ``text`` is not a string
        or no token carries the requested tag.
    """
    if not isinstance(text, str):
        return 0

    # Equality (not truthiness) is kept on purpose so that only values equal
    # to False switch modifier exclusion on.
    skip_modifiers = including_mod == False  # noqa: E712

    dependents_per_head = []
    for head in nlp(text):
        if head.pos_ != pos:
            continue
        n_dependents = 0
        for child in head.children:
            if child.pos_ == "PUNCT":
                continue
            if skip_modifiers and child.dep_ == "mo":
                continue
            n_dependents += 1
        dependents_per_head.append(n_dependents)

    if len(dependents_per_head) == 0:
        return 0
    return np.mean(dependents_per_head)
        
def avg_no_dep_verb_inc_mod(text):
    """Mean number of dependents per ``VERB`` token, modifiers included."""
    return avg_no_dep(text, pos="VERB", including_mod=True)

def avg_no_dep_noun_inc_mod(text):
    """Mean number of dependents per ``NOUN`` token, modifiers included."""
    return avg_no_dep(text, pos="NOUN", including_mod=True)

def avg_no_dep_verb_exc_mod(text):
    """Mean number of dependents per ``VERB`` token, ``"mo"`` children excluded."""
    return avg_no_dep(text, pos="VERB", including_mod=False)

def avg_no_dep_noun_exc_mod(text):
    """Mean number of dependents per ``NOUN`` token, ``"mo"`` children excluded."""
    return avg_no_dep(text, pos="NOUN", including_mod=False)
    

In [7]:
# Feature extractors applied to every text, in this order.
SD_feature_functions = [
    max_no_words_head_dep,
    avg_no_words_head_dep,
    avg_no_dep_verb_inc_mod,
    avg_no_dep_noun_inc_mod,
    avg_no_dep_verb_exc_mod,
    avg_no_dep_noun_exc_mod,
]

In [8]:
def save_df(path, df):
    """Write ``df`` to ``path`` as a pipe-separated CSV without the index column."""
    df.to_csv(path, index=False, sep="|")

# Directories are relative to the notebook's working directory.
INPUT_DIR = "datasets/RandomText/FeatureSet5_MO/"
OUTPUT_DIR = "datasets/RandomText/FeatureSet6_SD/"

# Only this file is processed. The former os.listdir() call was dropped: its
# result was overwritten right away, `os` is not imported in this notebook,
# and it pointed at a machine-specific absolute path.
input_files = ["01_Preprocessing_df.csv"]

for input_file in input_files:
    df = pd.read_csv(INPUT_DIR + input_file, sep="|", lineterminator="\n")

    # One new "SD-<function name>" column per feature function.
    for function in tqdm(SD_feature_functions):
        df["SD-" + function.__name__] = [function(text) for text in df["cleanedText"]]

    save_df(OUTPUT_DIR + input_file, df)

100%|██████████| 6/6 [26:38<00:00, 266.37s/it]


In [9]:
# Preview the first rows of the last processed frame, including the new "SD-" columns.
df.head()

Unnamed: 0,Level,Title,Text,Source,Type,newLevel,cleanedText,SE_ratioRealSpellingErrors,SE_ratioSpellingErrors,SE_ratioSpellingErrorsWithCorrection,...,MO-keit2nouns,MO-ung2nouns,MO-werk2nouns,MO-compounds2nouns,SD-max_no_words_head_dep,SD-avg_no_words_head_dep,SD-avg_no_dep_verb_inc_mod,SD-avg_no_dep_noun_inc_mod,SD-avg_no_dep_verb_exc_mod,SD-avg_no_dep_noun_exc_mod
0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,A,Zu meiner Familie gehören vier Personen. Die M...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,8,0.804348,2.615385,1.25,1.384615,1.166667
1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,A,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.232558,5,0.651515,2.833333,1.058824,1.5,1.029412
2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,A,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,6,0.483516,2.333333,0.962963,1.833333,0.925926
3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,A,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.166667,4,0.627907,2.866667,0.9375,1.4,0.875
4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,A,Frau Meier geht heute in den Supermarkt. Ihr M...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,4,0.494505,2.4,0.861111,1.866667,0.819444
