## Lexical Density and Variation Features

In [23]:
from collections import Counter, defaultdict
import de_core_news_sm
from math import sqrt, log
from nltk import word_tokenize
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

In [24]:
def safe_div(a, b):
    """Divide ``a`` by ``b``, returning 0 when the divisor is zero.

    Only ``ZeroDivisionError`` is handled. Any other failure (for example a
    wrong operand type) or a keyboard interrupt propagates instead of being
    silently turned into a 0 feature value.

    Args:
        a: Numerator.
        b: Denominator.

    Returns:
        ``a / b``, or ``0`` if the division raises ``ZeroDivisionError``.
    """
    try:
        return a / b
    except ZeroDivisionError:
        return 0

In [25]:
#Lexical Density #Tok Lex / #Tok 
#Lexical Word Variation #Typ Lex / #Tok Lex 
#Noun Variation #Typ Noun / #Tok Lex
#Adjective Variation #Typ Adj / #Tok Lex
#Adverb Variation #Typ Adv / #Tok Lex
#Modifier Variation (#Typ Adj + #Typ Adv) / #Tok Lex
#Verb Variation 2 #Typ Verb / #Tok Verb 
#Verb Variation 1 #Typ Verb / #Tok Lex
#Squared Verb Variation 1 #Typ Verb^2 / #Tok Verb 
#Corrected Verb Variation 1 #Typ Verb / sqrt(2*#Tok Verb) 
#Verb Token Ratio #Tok Verb/#Tok
#Noun Token Ratio #Tok Noun/#Tok 
#Verb-Noun Token Ratio #Tok Verb/#Tok Noun 

In [26]:
nlp = de_core_news_sm.load()

In [27]:
lexical_tags = ["ADJA", "ADJD", "ADV", "NE", "NNE", "NN", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP"]

In [28]:
def lexical_density(text):
    """Return the number of lexical tokens in ``text`` per token.

    The text is parsed with the module-level ``nlp`` pipeline. A token counts
    as lexical when its ``tag_`` is listed in ``lexical_tags``. The two counts
    are combined with ``safe_div``.
    """
    parsed = nlp(text)
    lexical_count = sum(1 for token in parsed if token.tag_ in lexical_tags)
    return safe_div(lexical_count, len(parsed))

In [29]:
def lexical_word_variation(text):
    """Return distinct lexical word forms per lexical token in ``text``.

    Lexical tokens are those whose ``tag_`` is in ``lexical_tags``. Word forms
    are compared via ``str(token)``, so no lemmatisation or case folding is
    applied before counting distinct forms.
    """
    lexical_forms = []
    for token in nlp(text):
        if token.tag_ in lexical_tags:
            lexical_forms.append(str(token))
    distinct_forms = set(lexical_forms)
    return safe_div(len(distinct_forms), len(lexical_forms))
    

In [30]:
def pos_variation(text, pos):
    """Return the number of distinct word forms tagged ``pos`` per lexical token.

    This is the "#Typ <POS> / #Tok Lex" family of variation measures. The
    numerator counts types (unique ``str(token)`` values), so a word form that
    occurs several times with the requested tag contributes only once. The
    denominator counts every token whose ``tag_`` is in ``lexical_tags``.

    Args:
        text: Raw text; parsed with the module-level ``nlp`` pipeline.
        pos: Value compared against each token's ``pos_`` (e.g. ``"NOUN"``).

    Returns:
        ``safe_div(number of distinct forms, number of lexical tokens)``.
    """
    doc = nlp(text)
    # Types, not tokens: collect the surface forms in a set so repeats collapse.
    pos_types = {str(tok) for tok in doc if tok.pos_ == pos}
    no_lex = sum(1 for tok in doc if tok.tag_ in lexical_tags)
    return safe_div(len(pos_types), no_lex)

In [31]:
def noun_variation(text):
    """Return ``pos_variation`` of ``text`` for tokens whose ``pos_`` is ``"NOUN"``."""
    return pos_variation(text, pos="NOUN")

In [32]:
def adj_variation(text):
    """Return ``pos_variation`` of ``text`` for tokens whose ``pos_`` is ``"ADJ"``."""
    return pos_variation(text, pos="ADJ")

In [33]:
def adv_variation(text):
    """Return ``pos_variation`` of ``text`` for tokens whose ``pos_`` is ``"ADV"``."""
    return pos_variation(text, pos="ADV")

In [34]:
def modifier_variation(text):
    """Return distinct adjective plus distinct adverb forms per lexical token.

    This is the "(#Typ Adj + #Typ Adv) / #Tok Lex" measure. Adjective and
    adverb types are counted separately and then added, so a surface form that
    is tagged ``ADJ`` in one place and ``ADV`` in another counts once in each
    group. The denominator counts every token whose ``tag_`` is in
    ``lexical_tags``.

    Args:
        text: Raw text; parsed with the module-level ``nlp`` pipeline.

    Returns:
        ``safe_div(adjective types + adverb types, number of lexical tokens)``.
    """
    doc = nlp(text)
    # Types, not tokens: sets collapse repeated surface forms within each group.
    adj_types = {str(tok) for tok in doc if tok.pos_ == "ADJ"}
    adv_types = {str(tok) for tok in doc if tok.pos_ == "ADV"}
    no_lex = sum(1 for tok in doc if tok.tag_ in lexical_tags)
    return safe_div(len(adj_types) + len(adv_types), no_lex)

In [35]:
def verb_variation1(text):
    """Return ``pos_variation`` of ``text`` for tokens whose ``pos_`` is ``"VERB"``."""
    return pos_variation(text, pos="VERB")

In [36]:
def verb_variation2(text):
    """Return distinct verb forms per verb token in ``text``.

    Verbs are the tokens whose ``pos_`` is ``"VERB"``; forms are compared via
    ``str(token)``.
    """
    verb_forms = []
    for token in nlp(text):
        if token.pos_ == "VERB":
            verb_forms.append(str(token))
    type_count = len(set(verb_forms))
    token_count = len(verb_forms)
    return safe_div(type_count, token_count)

In [37]:
def squared_verb_variation(text):
    """Return the squared count of distinct verb forms per verb token.

    Verbs are the tokens whose ``pos_`` is ``"VERB"``; forms are compared via
    ``str(token)``.
    """
    verb_forms = [str(token) for token in nlp(text) if token.pos_ == "VERB"]
    type_count = len(set(verb_forms))
    squared_types = type_count * type_count
    return safe_div(squared_types, len(verb_forms))

In [38]:
def corrected_verb_variation(text):
    """Return distinct verb forms divided by ``sqrt(2 * verb token count)``.

    Verbs are the tokens whose ``pos_`` is ``"VERB"``; forms are compared via
    ``str(token)``.
    """
    verb_forms = [str(token) for token in nlp(text) if token.pos_ == "VERB"]
    type_count = len(set(verb_forms))
    denominator = sqrt(2 * len(verb_forms))
    return safe_div(type_count, denominator)


In [39]:
def verb_token_ratio(text):
    """Return the number of tokens with ``pos_ == "VERB"`` per token in ``text``."""
    parsed = nlp(text)
    verb_count = sum(1 for token in parsed if token.pos_ == "VERB")
    return safe_div(verb_count, len(parsed))

In [40]:
def noun_token_ratio(text):
    """Return the number of tokens with ``pos_ == "NOUN"`` per token in ``text``."""
    parsed = nlp(text)
    noun_count = sum(1 for token in parsed if token.pos_ == "NOUN")
    return safe_div(noun_count, len(parsed))

In [41]:
def verb_noun_token_ratio(text):
    """Return the number of ``"VERB"`` tokens per ``"NOUN"`` token in ``text``.

    Both counts are taken from each token's ``pos_`` value.
    """
    # One pass over the parse; a label that never occurs reads as 0 from the Counter.
    pos_counts = Counter(token.pos_ for token in nlp(text))
    return safe_div(pos_counts["VERB"], pos_counts["NOUN"])

In [42]:
# Feature functions applied to every text; each function's __name__ becomes
# part of the output column name, so the order here is the column order.
LV_feature_functions = [
    lexical_density,
    lexical_word_variation,
    noun_variation,
    adj_variation,
    adv_variation,
    modifier_variation,
    verb_variation1,
    verb_variation2,
    squared_verb_variation,
    corrected_verb_variation,
    verb_token_ratio,
    noun_token_ratio,
    verb_noun_token_ratio,
]

In [44]:
def save_df(path, df):
    """Write ``df`` to ``path`` as pipe-separated CSV without the index column."""
    csv_options = {"sep": "|", "index": False}
    df.to_csv(path, **csv_options)

# Directories are relative to the working directory (the project root), the
# same location the CSVs are read from and written to below. Listing and
# reading now use the same paths instead of mixing a machine-specific absolute
# path with a relative one.
INPUT_DIR = "datasets/03_LexicalDiversity"
OUTPUT_DIR = "datasets/04_LexicalVariation"

# List the output directory once; files already present there are skipped so
# an interrupted run can be resumed.
already_processed = set(os.listdir(OUTPUT_DIR))
input_files = [input_file for input_file in os.listdir(INPUT_DIR) if input_file not in already_processed]

for input_file in input_files:
    df = pd.read_csv(os.path.join(INPUT_DIR, input_file), sep="|", lineterminator='\n')

    # NOTE: every feature function parses the text itself, so each text is
    # parsed once per feature function.
    for function in tqdm(LV_feature_functions):
        df["LV-" + function.__name__] = [function(text) for text in df["cleanedText"]]

    filename = os.path.join(OUTPUT_DIR, input_file)
    save_df(filename, df)

100%|██████████| 13/13 [14:56<00:00, 68.95s/it]
100%|██████████| 13/13 [13:45<00:00, 63.54s/it]
100%|██████████| 13/13 [18:06<00:00, 83.56s/it]
100%|██████████| 13/13 [06:23<00:00, 29.48s/it]
100%|██████████| 13/13 [06:42<00:00, 30.93s/it]
100%|██████████| 13/13 [10:01<00:00, 46.27s/it]
100%|██████████| 13/13 [10:03<00:00, 46.40s/it]
100%|██████████| 13/13 [14:31<00:00, 67.05s/it]
100%|██████████| 13/13 [07:21<00:00, 33.96s/it]
100%|██████████| 13/13 [04:23<00:00, 20.24s/it]
100%|██████████| 13/13 [04:47<00:00, 22.11s/it]
100%|██████████| 13/13 [09:56<00:00, 45.85s/it]
100%|██████████| 13/13 [21:15<00:00, 98.12s/it]
100%|██████████| 13/13 [24:16<00:00, 112.04s/it]
100%|██████████| 13/13 [15:33<00:00, 71.82s/it]
100%|██████████| 13/13 [15:15<00:00, 70.43s/it]
100%|██████████| 13/13 [07:07<00:00, 32.89s/it]
100%|██████████| 13/13 [07:50<00:00, 36.17s/it]
100%|██████████| 13/13 [15:03<00:00, 69.47s/it]
100%|██████████| 13/13 [18:28<00:00, 85.29s/it]
100%|██████████| 13/13 [05:00<00:00, 23