## Lexical Density and Variation Features

In [2]:
from collections import Counter, defaultdict
import de_core_news_sm
from math import sqrt, log
from nltk import word_tokenize
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("preprocessed_text_df.csv", lineterminator = '\n')

In [4]:
#Lexical Density #Tok Lex / #Tok 
#Lexical Word Variation #Typ Lex / #Tok Lex 
#Noun Variation #Typ Noun / #Tok Lex
#Adjective Variation #Typ Adj / #Tok Lex
#Adverb Variation #Typ Adv / #Tok Lex
#Modifer Variation (#Typ Adj + #Typ Adv) / #Tok Lex 
#Verb Variation 2 #Typ Verb / #Tok Verb 
#Verb Variation 1 #Typ Verb / #Tok Lex
#Squared Verb Variation 1 #Typ Verb^2 / #Tok Verb 
#Corrected Verb Variation 1 #Typ Verb / sqrt(2*#Tok Verb) 
#Verb Token Ratio #Tok Verb/#Tok
#Noun Token Ratio #Tok Noun/#Tok 
#Verb-Noun Token Ratio #Tok Verb/#Tok Noun 

In [5]:
nlp = de_core_news_sm.load()

In [6]:
lexical_tags = ["ADJA", "ADJD", "ADV", "NE", "NNE", "NN", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP"]

In [7]:
def lexical_density(text):
    doc = nlp(text)
    no_lex = len([tok for tok in doc if tok.tag_ in lexical_tags])
    return no_lex / len(doc)

In [8]:
def lexical_word_variation(text):
    doc = nlp(text)
    no_lex = [str(tok) for tok in doc if tok.tag_ in lexical_tags]
    return len(set(no_lex)) / len(no_lex)
    

In [9]:
def pos_variation(text, pos):
    doc = nlp(text)
    no_nouns = len([tok for tok in doc if tok.pos_ == pos])
    no_lex = len([tok for tok in doc if tok.tag_ in lexical_tags])
    return no_nouns / no_lex

In [10]:
def noun_variation(text):
    return pos_variation(text, "NOUN")

In [11]:
def adj_variation(text):
    return pos_variation(text, "ADJ")

In [12]:
def adv_variation(text):
    return pos_variation(text, "ADV")

In [13]:
def modifier_variation(text):
    doc = nlp(text)
    no_mods = len([tok for tok in doc if tok.pos_ == "ADJ"  or tok.pos_ == "ADV"])
    no_lex = len([tok for tok in doc if tok.tag_ in lexical_tags])
    return no_mods / no_lex

In [14]:
def verb_variation1(text):
    return pos_variation(text, "VERB")

In [15]:
def verb_variation2(text):
    doc = nlp(text)
    verbs = [str(tok) for tok in doc if tok.pos_ == "VERB"]
    if len(verbs) != 0:
        return len(set(verbs)) / len(verbs)
    else:
        return 0

In [16]:
def squared_verb_variation(text):
    doc = nlp(text)
    verbs = [str(tok) for tok in doc if tok.pos_ == "VERB"]
    if len(verbs) != 0:
        return (len(set(verbs)) ** 2) / len(verbs)
    else:
        return 0

In [25]:
def corrected_verb_variation(text):
    doc = nlp(text)
    verbs = [str(tok) for tok in doc if tok.pos_ == "VERB"]
    if len(verbs) != 0:
        return len(set(verbs)) / (sqrt(2 * len(verbs)))
    else:
        return 0

In [18]:
def verb_token_ratio(text):
    doc = nlp(text)
    verbs = [str(tok) for tok in doc if tok.pos_ == "VERB"]
    return len(verbs) / len(doc)

In [19]:
def noun_token_ratio(text):
    doc = nlp(text)
    nouns = [str(tok) for tok in doc if tok.pos_ == "NOUN"]
    return len(nouns) / len(doc)

In [20]:
def verb_noun_token_ratio(text):
    doc = nlp(text)
    nouns = [str(tok) for tok in doc if tok.pos_ == "NOUN"]
    verbs = [str(tok) for tok in doc if tok.pos_ == "VERB"]
    if len(nouns) != 0:
        return len(verbs) / len(nouns)
    else:
        return 0

In [28]:
LV_feature_functions = [lexical_density, lexical_word_variation, noun_variation, adj_variation, adv_variation, 
                        modifier_variation, verb_variation1, verb_variation2, squared_verb_variation, 
                        corrected_verb_variation, verb_token_ratio, noun_token_ratio, verb_noun_token_ratio]

In [31]:
for function in LV_feature_functions:
    df["LV-"+function.__name__]  = [function(text) for text in df["cleanedText"]]

In [33]:
filename = "LV_features_df.csv"
df.to_csv(filename)

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Level,Title,Text,Source,Type,cleanedText,RFTagger,parsedText,LV-lexical_density,...,LV-adj_variation,LV-adv_variation,LV-modifier_variation,LV-verb_variation1,LV-verb_variation2,LV-squared_verb_variation,LV-corrected_verb_variation,LV-verb_token_ratio,LV-noun_token_ratio,LV-verb_noun_token_ratio
0,0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,Zu meiner Familie gehören vier Personen. Die M...,"[[['Zu', 'APPR', 'Dat'], ['meiner', 'PRO', 'Po...",['(ROOT (S (PP (APPR Zu) (PPOSAT meiner) (NN F...,0.436975,...,0.076923,0.25,0.326923,0.25,0.846154,9.307692,2.157277,0.109244,0.201681,0.541667
1,1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,"[[['Mein', 'PRO', 'Poss', 'Attr', '-', 'Nom', ...",['(ROOT (S (NP (PPOSAT Mein) (NN Name)) (VAFIN...,0.424419,...,0.109589,0.136986,0.246575,0.164384,0.916667,10.083333,2.245366,0.069767,0.197674,0.352941
2,2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,"[[['Hallo', 'ITJ'], ['!', 'SYM', 'Pun', 'Sent'...","['(ROOT (NUR (ITJ Hallo) ($. !)))', '(ROOT (S ...",0.376923,...,0.081633,0.102041,0.183673,0.122449,0.666667,2.666667,1.154701,0.046154,0.207692,0.222222
3,3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,"[[['Ricarda', 'N', 'Name', 'Nom', 'Sg', 'Fem']...",['(ROOT (CS (S (NE Ricarda) (VAFIN ist) (AP (N...,0.547826,...,0.095238,0.206349,0.301587,0.238095,0.666667,6.666667,1.825742,0.130435,0.13913,0.9375
4,4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,Frau Meier geht heute in den Supermarkt. Ihr M...,"[[['Frau', 'N', 'Reg', 'Nom', 'Sg', 'Fem'], ['...",['(ROOT (S (NP (NN Frau) (NE Meier)) (VVFIN ge...,0.487973,...,0.056338,0.112676,0.169014,0.211268,0.6,10.8,2.32379,0.103093,0.247423,0.416667
