## Dependency Features

In [1]:
# Hancke 2013
# maximum number of words between a head and a dependent in a text, 
# average number of words between a head and a dependent per sentence, 
# average number of dependents per verb (in words) including and excluding modifers
# the number of dependents per NP (in words)

In [2]:
import ast
from collections import Counter, defaultdict
import de_core_news_sm
import itertools
from nltk import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
import os

In [3]:
nlp = de_core_news_sm.load()

In [4]:
#maximum number of words between a head and a dependent in a text
def max_no_words_head_dep(text):
    sent_text = sent_tokenize(text)
    max_no =  0
    for sentence in sent_text:
        doc = nlp(sentence)
        tok_sent =  [token for token in doc]
        for token in doc:
            for child in token.children:
                if child.pos_ != "PUNCT":
                    is_max = abs((tok_sent).index((token)) - list(tok_sent).index((child))) - 1
                    
                    if is_max > max_no:
                        max_no = is_max
                            
    return max_no

In [5]:
#the average number of words between a head and a dependent per sentence
def avg_no_words_head_dep(text):
    sent_text = sent_tokenize(text)
    lst = []
    for sentence in sent_text:
        doc = nlp(sentence)
        tok_sent =  [token for token in doc]
        for token in doc:
            for child in token.children:
                if child.pos_ != "PUNCT":
                    dist = abs(tok_sent.index((token)) - tok_sent.index((child))) - 1
                    lst.append(dist)
                        
    return np.mean(lst)

In [6]:
#the average number of dependents per verb/noun (in words) including and excluding modifers
def avg_no_dep(text, pos, including_mod = True):
    if not isinstance(text, str):
        return 0
    else:
        lst = []
        doc = nlp(text)
        for token in doc:
            if token.pos_ == pos:
                if including_mod == False:
                    lst.append(len(list([token for token in token.children if token.pos_ != "PUNCT" 
                                        and token.dep_ != "mo"])))
                else:
                    lst.append(len(list([token for token in token.children if token.pos_ != "PUNCT"])))
        if lst == []:
            return 0
        else:
            return np.mean(lst)
        
def avg_no_dep_verb_inc_mod(text):
    return avg_no_dep(text, "VERB")

def avg_no_dep_noun_inc_mod(text):
    return avg_no_dep(text, "NOUN")

def avg_no_dep_verb_exc_mod(text):
    return avg_no_dep(text, "VERB", including_mod=False)

def avg_no_dep_noun_exc_mod(text):
    return avg_no_dep(text, "NOUN", including_mod=False)
    

In [7]:
SD_feature_functions = [max_no_words_head_dep, avg_no_words_head_dep, avg_no_dep_verb_inc_mod, avg_no_dep_noun_inc_mod,
                       avg_no_dep_verb_exc_mod, avg_no_dep_noun_exc_mod]

In [11]:
def save_df(path,df):
    df.to_csv(path,sep="|",index=False)

input_files = os.listdir("/home/soeren/Documents/TextDifficultyAssessmentGerman/datasets/06_MorphologicalFeatures")
input_files = [input_file for input_file in input_files if not input_file in os.listdir("/home/soeren/Documents/TextDifficultyAssessmentGerman/datasets/07_SyntaxDependencyFeatures")]


for input_file in input_files:
    df = pd.read_csv("datasets/06_MorphologicalFeatures/" + input_file, sep="|", lineterminator = '\n')

    for function in tqdm(SD_feature_functions):
        df["SD-"+function.__name__]  = [function(text) for text in df["cleanedText"]]

    filename = "datasets/07_SyntaxDependencyFeatures/" + input_file
    save_df(filename, df)

100%|██████████| 6/6 [08:01<00:00, 80.20s/it] 
100%|██████████| 6/6 [09:31<00:00, 95.28s/it] 
100%|██████████| 6/6 [07:52<00:00, 78.68s/it] 
100%|██████████| 6/6 [08:07<00:00, 81.30s/it] 
100%|██████████| 6/6 [03:35<00:00, 35.88s/it]
100%|██████████| 6/6 [03:34<00:00, 35.71s/it]
100%|██████████| 6/6 [07:41<00:00, 76.92s/it] 
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 6/6 [06:39<00:00, 66.52s/it]
100%|██████████| 6/6 [03:50<00:00, 38.46s/it]
100%|██████████| 6/6 [03:29<00:00, 34.85s/it]
100%|██████████| 6/6 [03:43<00:00, 37.31s/it]
100%|██████████| 6/6 [03:18<00:00, 33.11s/it]
100%|██████████| 6/6 [15:28<00:00, 154.73s/it]
100%|██████████| 6/6 [08:51<00:00, 88.56s/it] 
100%|██████████| 6/6 [08:05<00:00, 80.84s/it] 
100%|██████████| 6/6 [07:59<00:00, 79.89s/it] 
100%|██████████| 6/6 [12:34<00:00, 125.68s/it]
100%|██████████| 6/6 [04:26<00:00, 44.37s/it]


In [9]:
df.head()

Unnamed: 0,Level,Title,Text,Source,cleanedSource,Type,newLevel,cleanedText,SE_ratioRealSpellingErrors,SE_ratioSpellingErrors,...,MO-keit2nouns,MO-ung2nouns,MO-werk2nouns,MO-compounds2nouns,SD-max_no_words_head_dep,SD-avg_no_words_head_dep,SD-avg_no_dep_verb_inc_mod,SD-avg_no_dep_noun_inc_mod,SD-avg_no_dep_verb_exc_mod,SD-avg_no_dep_noun_exc_mod
0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,german.net,Reading,A,Zu meiner Familie gehören vier Personen. Die M...,0.0,0.0,...,0.0,0.0,0.0,4.958333,8,0.804348,2.615385,1.25,1.384615,1.166667
1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,german.net,Reading,A,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,0.0,0.0,...,0.0,0.0,0.0,4.0,5,0.651515,2.833333,1.058824,1.5,1.029412
2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,german.net,Reading,A,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,0.0,0.0,...,0.0,0.0,0.0,3.583333,6,0.483516,2.333333,0.962963,1.833333,0.925926
3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,german.net,Reading,A,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,0.0,0.0,...,0.0,0.0,0.0,3.833333,4,0.627907,2.866667,0.9375,1.4,0.875
4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,german.net,Reading,A,Frau Meier geht heute in den Supermarkt. Ihr M...,0.0,0.0,...,0.0,0.0,0.0,2.945652,4,0.527174,2.4,0.901408,1.866667,0.859155
