## Lexical Density and Variation Features

In [30]:
from collections import Counter, defaultdict
import de_core_news_sm
from math import sqrt, log
from nltk import word_tokenize
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

In [31]:
def safe_div(a, b):
    """Return ``a / b``, or 0 when the divisor is zero.

    The feature functions in this file divide counts by other counts that
    can legitimately be zero (for example a text without verbs); such
    ratios are defined as 0.

    Only ``ZeroDivisionError`` is handled. Any other failure (wrong operand
    types, ``KeyboardInterrupt``, ...) propagates instead of being silently
    converted into a feature value of 0.
    """
    try:
        return a / b
    except ZeroDivisionError:
        return 0

In [32]:
#Lexical Density #Tok Lex / #Tok 
#Lexical Word Variation #Typ Lex / #Tok Lex 
#Noun Variation #Typ Noun / #Tok Lex
#Adjective Variation #Typ Adj / #Tok Lex
#Adverb Variation #Typ Adv / #Tok Lex
#Modifier Variation (#Typ Adj + #Typ Adv) / #Tok Lex
#Verb Variation 2 #Typ Verb / #Tok Verb 
#Verb Variation 1 #Typ Verb / #Tok Lex
#Squared Verb Variation 1 #Typ Verb^2 / #Tok Verb 
#Corrected Verb Variation 1 #Typ Verb / sqrt(2*#Tok Verb) 
#Verb Token Ratio #Tok Verb/#Tok
#Noun Token Ratio #Tok Noun/#Tok 
#Verb-Noun Token Ratio #Tok Verb/#Tok Noun 

In [33]:
# Load the German spaCy model once at module level; the feature functions
# in this file call this shared ``nlp`` object to parse their input text.
nlp = de_core_news_sm.load()

In [34]:
# Fine-grained tags (compared against ``tok.tag_``) that mark a token as a
# lexical word. The names follow the German STTS tag set: adjectives,
# adverbs, nouns and full verbs. Auxiliary and modal verb tags are not listed.
lexical_tags = [
    # adjectives
    "ADJA",
    "ADJD",
    # adverbs
    "ADV",
    # nouns
    "NE",
    "NNE",
    "NN",
    # full verbs
    "VVFIN",
    "VVIMP",
    "VVINF",
    "VVIZU",
    "VVPP",
]

In [35]:
def lexical_density(text):
    """Lexical density: #Tok Lex / #Tok.

    A token counts as lexical when its ``tag_`` is listed in
    ``lexical_tags``. The denominator is the number of tokens in the parsed
    document; ``safe_div`` yields 0 when that number is zero.
    """
    parsed = nlp(text)
    lexical_count = sum(1 for token in parsed if token.tag_ in lexical_tags)
    return safe_div(lexical_count, len(parsed))

In [36]:
def lexical_word_variation(text):
    """Lexical word variation: #Typ Lex / #Tok Lex.

    Types are the distinct string forms of the lexical tokens (tokens whose
    ``tag_`` is in ``lexical_tags``); the comparison is case-sensitive.
    """
    lexical_forms = []
    for token in nlp(text):
        if token.tag_ in lexical_tags:
            lexical_forms.append(str(token))
    distinct_forms = set(lexical_forms)
    return safe_div(len(distinct_forms), len(lexical_forms))
    

In [37]:
def pos_variation(text, pos):
    """Return #Typ(pos) / #Tok Lex for ``text``.

    Parameters:
        text: raw text to analyse; it is parsed with the module-level ``nlp``.
        pos:  coarse part-of-speech label compared against ``tok.pos_``
              (e.g. "NOUN", "ADJ", "ADV", "VERB").

    Returns:
        The number of distinct word forms (types) carrying ``pos`` divided
        by the number of lexical tokens (tokens whose ``tag_`` is in
        ``lexical_tags``), or 0 if the text contains no lexical tokens.

    The numerator counts types, not tokens, as the feature definitions
    (#Typ Noun / #Tok Lex etc.) require. Types are compared by their
    case-sensitive string form, as in the other type-based features.
    """
    doc = nlp(text)
    pos_types = {str(tok) for tok in doc if tok.pos_ == pos}
    lex_token_count = sum(1 for tok in doc if tok.tag_ in lexical_tags)
    return safe_div(len(pos_types), lex_token_count)

In [38]:
def noun_variation(text):
    """Noun variation feature: ``pos_variation`` evaluated for POS "NOUN"."""
    noun_pos = "NOUN"
    return pos_variation(text, noun_pos)

In [39]:
def adj_variation(text):
    """Adjective variation feature: ``pos_variation`` evaluated for POS "ADJ"."""
    adjective_pos = "ADJ"
    return pos_variation(text, adjective_pos)

In [40]:
def adv_variation(text):
    """Adverb variation feature: ``pos_variation`` evaluated for POS "ADV"."""
    adverb_pos = "ADV"
    return pos_variation(text, adverb_pos)

In [41]:
def modifier_variation(text):
    """Modifier variation: (#Typ Adj + #Typ Adv) / #Tok Lex.

    Parameters:
        text: raw text to analyse; it is parsed with the module-level ``nlp``.

    Returns:
        The number of distinct adjective forms plus the number of distinct
        adverb forms, divided by the number of lexical tokens (tokens whose
        ``tag_`` is in ``lexical_tags``), or 0 if there are no lexical tokens.

    The numerator counts types, not tokens, as the feature definition
    requires. Adjective and adverb types are collected separately, so a
    form tagged both "ADJ" and "ADV" in the same text contributes to both
    counts.
    """
    doc = nlp(text)
    adj_types = {str(tok) for tok in doc if tok.pos_ == "ADJ"}
    adv_types = {str(tok) for tok in doc if tok.pos_ == "ADV"}
    lex_token_count = sum(1 for tok in doc if tok.tag_ in lexical_tags)
    return safe_div(len(adj_types) + len(adv_types), lex_token_count)

In [42]:
def verb_variation1(text):
    """Verb variation 1 feature: ``pos_variation`` evaluated for POS "VERB"."""
    verb_pos = "VERB"
    return pos_variation(text, verb_pos)

In [43]:
def verb_variation2(text):
    """Verb variation 2: #Typ Verb / #Tok Verb.

    Verbs are the tokens whose ``pos_`` is "VERB"; types are their distinct
    string forms. ``safe_div`` yields 0 for a text without verbs.
    """
    verb_forms = [str(token) for token in nlp(text) if token.pos_ == "VERB"]
    type_count = len(set(verb_forms))
    token_count = len(verb_forms)
    return safe_div(type_count, token_count)

In [44]:
def squared_verb_variation(text):
    """Squared verb variation: (#Typ Verb)^2 / #Tok Verb.

    Verbs are the tokens whose ``pos_`` is "VERB"; types are their distinct
    string forms. ``safe_div`` yields 0 for a text without verbs.
    """
    verb_forms = []
    for token in nlp(text):
        if token.pos_ == "VERB":
            verb_forms.append(str(token))
    distinct_count = len(set(verb_forms))
    return safe_div(distinct_count * distinct_count, len(verb_forms))

In [45]:
def corrected_verb_variation(text):
    """Corrected verb variation: #Typ Verb / sqrt(2 * #Tok Verb).

    Verbs are the tokens whose ``pos_`` is "VERB"; types are their distinct
    string forms. With no verbs the denominator is 0.0 and ``safe_div``
    yields 0.
    """
    verb_forms = [str(token) for token in nlp(text) if token.pos_ == "VERB"]
    denominator = sqrt(2 * len(verb_forms))
    distinct_count = len(set(verb_forms))
    return safe_div(distinct_count, denominator)


In [46]:
def verb_token_ratio(text):
    """Verb token ratio: #Tok Verb / #Tok.

    Counts the tokens whose ``pos_`` is "VERB" and divides by the number of
    tokens in the parsed document; ``safe_div`` yields 0 for an empty one.
    """
    parsed = nlp(text)
    verb_count = sum(1 for token in parsed if token.pos_ == "VERB")
    return safe_div(verb_count, len(parsed))

In [47]:
def noun_token_ratio(text):
    """Noun token ratio: #Tok Noun / #Tok.

    Counts the tokens whose ``pos_`` is "NOUN" and divides by the number of
    tokens in the parsed document; ``safe_div`` yields 0 for an empty one.
    """
    parsed = nlp(text)
    noun_count = 0
    for token in parsed:
        if token.pos_ == "NOUN":
            noun_count += 1
    return safe_div(noun_count, len(parsed))

In [48]:
def verb_noun_token_ratio(text):
    """Verb-noun token ratio: #Tok Verb / #Tok Noun.

    Tallies the coarse ``pos_`` label of every token in one pass and divides
    the "VERB" count by the "NOUN" count; ``safe_div`` yields 0 when the
    text has no nouns.
    """
    pos_counts = Counter(token.pos_ for token in nlp(text))
    # A Counter returns 0 for labels that never occurred.
    return safe_div(pos_counts["VERB"], pos_counts["NOUN"])

In [49]:
# Feature extractors applied to every text. Each takes the raw text and
# returns one number; the function name becomes part of the output column
# name, so the order here determines the column order.
LV_feature_functions = [
    lexical_density,
    lexical_word_variation,
    noun_variation,
    adj_variation,
    adv_variation,
    modifier_variation,
    verb_variation1,
    verb_variation2,
    squared_verb_variation,
    corrected_verb_variation,
    verb_token_ratio,
    noun_token_ratio,
    verb_noun_token_ratio,
]

In [50]:
def save_df(path, df):
    """Write ``df`` to ``path`` as a pipe-separated CSV without the index.

    Parameters:
        path: destination file path.
        df:   the pandas DataFrame to store.
    """
    csv_options = {"sep": "|", "index": False}
    df.to_csv(path, **csv_options)

# Input and output directories, relative to the current working directory.
INPUT_DIR = os.path.join("datasets", "RandomText", "FeatureSet2_LD")
OUTPUT_DIR = os.path.join("datasets", "RandomText", "FeatureSet3_LV")

# Files to process. The earlier os.listdir() call on an absolute,
# user-specific path was dropped: its result was overwritten immediately
# and it raised on any machine without that directory. To process every
# file in the input directory, list INPUT_DIR instead.
input_files = ["01_Preprocessing_df.csv"]

# Create the output directory up front so a missing directory is not only
# discovered after the feature computation has finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for input_file in input_files:
    df = pd.read_csv(os.path.join(INPUT_DIR, input_file), sep="|", lineterminator="\n")

    # One new column per feature function, named "LV-<function name>".
    for function in tqdm(LV_feature_functions):
        df["LV-" + function.__name__] = [function(text) for text in df["Text"]]

    filename = os.path.join(OUTPUT_DIR, input_file)
    save_df(filename, df)




  0%|          | 0/13 [00:00<?, ?it/s][A[A[A


  8%|▊         | 1/13 [02:42<32:29, 162.42s/it][A[A[A


 15%|█▌        | 2/13 [05:28<30:07, 164.30s/it][A[A[A


 23%|██▎       | 3/13 [08:12<27:20, 164.08s/it][A[A[A


 31%|███       | 4/13 [10:48<24:18, 162.09s/it][A[A[A


 38%|███▊      | 5/13 [13:26<21:30, 161.29s/it][A[A[A


 46%|████▌     | 6/13 [16:33<19:19, 165.65s/it][A[A[A


 54%|█████▍    | 7/13 [19:14<16:29, 164.92s/it][A[A[A


 62%|██████▏   | 8/13 [22:00<13:45, 165.01s/it][A[A[A


 69%|██████▉   | 9/13 [24:43<10:59, 164.79s/it][A[A[A


 77%|███████▋  | 10/13 [27:37<08:17, 165.75s/it][A[A[A


 85%|████████▍ | 11/13 [31:01<05:38, 169.19s/it][A[A[A


 92%|█████████▏| 12/13 [33:39<02:48, 168.32s/it][A[A[A


100%|██████████| 13/13 [36:23<00:00, 167.97s/it][A[A[A


[A[A[A