# Traditional Features

In [1]:
import nltk
nltk.download('punkt')
import pyphen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy import random
from tqdm import tqdm
import os
import ast

[nltk_data] Downloading package punkt to /home/soeren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def average_character_per_word(preprocessed_text):
    '''Return the mean character count of all tokens whose tag is not "SYM".

    preprocessed_text is iterated as sentences; each sentence is iterated as
    tokens, where token[0] is the surface string and token[1] its tag.
    '''
    word_lengths = [
        len(token[0])
        for sentence in preprocessed_text
        for token in sentence
        if token[1] != "SYM"
    ]
    return np.mean(word_lengths)

# Smoke test: 3 + 8 + 2 + 7 characters over four words; "." is tagged SYM and skipped.
test1 = [[["Ich", ""], ["studiere", ""], ["in", ""], ["Potsdam", ""], [".", "SYM"]]]
assert average_character_per_word(test1) == 5

In [3]:
def text_length(preprocessed_text):
    '''Return the total number of entries (tokens, symbols included) over all sentences.'''
    return sum(len(sentence) for sentence in preprocessed_text)

In [4]:
def average_sentence_length(preprocessed_text):
    '''Mean number of tokens per sentence; every entry counts, symbols included.'''
    sentence_lengths = list(map(len, preprocessed_text))
    return np.mean(sentence_lengths)

In [5]:
def average_syllable_word(preprocessed_text):
    '''Mean number of syllables per word; tokens tagged "SYM" are skipped.

    Each token is indexed as token[0] (the word) and token[1] (its tag).
    Syllables are counted by syllable_count using a pyphen 'de_DE' dictionary.
    '''
    # The value returned by this call is not used; the dictionary below is
    # opened with 'de_DE'.
    pyphen.language_fallback('de_DE_variant1')
    hyphenator = pyphen.Pyphen(lang='de_DE')
    syllables_per_word = [
        syllable_count(token[0], hyphenator)
        for sentence in preprocessed_text
        for token in sentence
        if token[1] != "SYM"
    ]
    return np.mean(syllables_per_word)

In [6]:
def syllable_count(string, dic):
    '''Return how many hyphen-separated pieces dic.inserted(string) contains.

    That number of pieces is used as the syllable count of the string.
    '''
    hyphenated = dic.inserted(string)
    # n separators split a string into n + 1 pieces.
    return hyphenated.count("-") + 1

In [7]:
def Flesch_Reading_Ease_Score(preprocessed_text):
    '''Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW.

    ASL is the average sentence length and ASW the average number of
    syllables per word. Higher scores indicate easier text: documents scoring
    around 30 are very difficult to read, while those scoring around 70
    should be easy to read.
    '''
    sentence_term = 1.015 * average_sentence_length(preprocessed_text)
    syllable_term = 84.6 * average_syllable_word(preprocessed_text)
    return 206.835 - sentence_term - syllable_term

In [8]:
def Flesch_Kincaid_readability(preprocessed_text):
    '''Flesch-Kincaid grade level: 0.4 * ASL + 12 * ASW - 15.

    ASL is the average sentence length and ASW the average number of
    syllables per word. The result is meant to read as a school grade
    (a value of eight suggests a text suitable for an eighth grader).
    '''
    sentence_term = 0.4 * average_sentence_length(preprocessed_text)
    syllable_term = 12 * average_syllable_word(preprocessed_text)
    return sentence_term + syllable_term - 15

In [9]:
def percentag_hard_words(preprocessed_text):
    '''Fraction (0..1) of non-"SYM" words that have three or more syllables.

    When the text contains no non-"SYM" word at all, the text is printed and
    0 is returned.
    '''
    # The value returned by this call is not used; the dictionary below is
    # opened with 'de_DE'.
    pyphen.language_fallback('de_DE_variant1')
    hyphenator = pyphen.Pyphen(lang='de_DE')
    words = [
        token[0]
        for sentence in preprocessed_text
        for token in sentence
        if token[1] != "SYM"
    ]
    if not words:
        print(preprocessed_text)
        return 0

    hard_words = sum(1 for word in words if syllable_count(word, hyphenator) >= 3)
    return hard_words / len(words)

In [10]:
def fog(preprocessed_text):
    '''Gunning fog index: 0.4 * (ASL + percentage of hard words).

    ASL is the average sentence length; hard words are words with three or
    more syllables.
    '''
    hw = percentag_hard_words(preprocessed_text)
    asl = average_sentence_length(preprocessed_text)

    # percentag_hard_words returns a fraction in [0, 1], while the fog formula
    # adds the share of hard words as a percentage, hence the factor 100.
    return 0.4 * (asl + 100 * hw)

In [11]:
def polysyllable_count(preprocessed_text, sentences_per_part=10):
    '''Count words with three or more syllables in a sample of sentences.

    The text is cut into a beginning, a middle and an end part. If a third of
    the text is longer than `sentences_per_part` sentences, that many distinct
    sentences are drawn at random from each part (30 in total by default);
    otherwise every sentence of the text is used.

    The draw uses numpy's global random state, so the result can differ
    between calls unless the caller seeds it.

    Parameters:
        preprocessed_text: sequence of sentences, each a sequence of tokens
            where token[0] is the word.
        sentences_per_part: number of sentences drawn from each of the three
            parts.

    Returns:
        The number of sampled words with at least three syllables.
    '''
    # The value returned by this call is not used; the dictionary below is
    # opened with 'de_DE'.
    pyphen.language_fallback('de_DE_variant1')
    dic = pyphen.Pyphen(lang='de_DE')
    part_len = len(preprocessed_text) // 3

    if part_len > sentences_per_part:
        parts = (
            preprocessed_text[:part_len],
            preprocessed_text[part_len:-part_len],
            preprocessed_text[-part_len:],
        )
        picked_sentences = []
        for part in parts:
            # Draw positions instead of the sentences themselves: random.choice
            # needs 1-D input and would try to turn the nested sentence lists
            # into an array. replace=False keeps the drawn sentences distinct;
            # every part holds more than sentences_per_part sentences here.
            positions = random.choice(len(part), size=sentences_per_part, replace=False)
            picked_sentences.extend(part[position] for position in positions)
    else:
        picked_sentences = preprocessed_text

    return sum(
        1
        for sentence in picked_sentences
        for word in sentence
        if syllable_count(word[0], dic) >= 3
    )

In [12]:
def smog(preprocessed_text):
    '''Simplified SMOG grade: 3 plus the square root of the polysyllable count.'''
    return np.sqrt(polysyllable_count(preprocessed_text)) + 3

In [15]:
# Input and output locations, relative to the notebook's working directory.
DATASET_DIR = "datasets"
FEATURE_SET_DIR = os.path.join(DATASET_DIR, "RandomText", "FeatureSet1_BF")

# Files to process. To run over a whole directory instead, build this list
# with os.listdir(...) and skip names starting with "FeatureSet".
input_files = ["01_Preprocessing_df.csv"]

# Output column -> feature function, in the order the columns are appended.
# NOTE: polysyllable_count samples sentences at random and smog calls it again,
# so "BF-smog" is based on a different draw than "BF-polysyllable_count".
BASIC_FEATURES = {
    "BF-average_sentence_length": average_sentence_length,
    "BF-average_syllable_word": average_syllable_word,
    "BF-Flesch_Reading_Ease_Score": Flesch_Reading_Ease_Score,
    "BF-Flesch_Kincaid_readability": Flesch_Kincaid_readability,
    "BF-percentag_hard_words": percentag_hard_words,
    "BF-fog": fog,
    "BF-polysyllable_count": polysyllable_count,
    "BF-smog": smog,
    "BF-average_character_per_word": average_character_per_word,
    "BF-text_length": text_length,
}

def save_df(path,df):
    '''Write `df` to `path` as a "|"-separated CSV without the index column.'''
    df.to_csv(path,sep="|",index=False)

# Make sure the target directory exists before anything is written to it.
os.makedirs(FEATURE_SET_DIR, exist_ok=True)

for input_file in tqdm(input_files):
    df = pd.read_csv(os.path.join(DATASET_DIR, input_file), sep="|", lineterminator='\n')
    print(input_file)
    # The "RFTagger" column stores each text as the literal of a nested list.
    df["preprocessedText"] = [ast.literal_eval(text) for text in df["RFTagger"]]
    for column, feature in BASIC_FEATURES.items():
        df[column] = [feature(text) for text in df["preprocessedText"]]

    filename = os.path.join(FEATURE_SET_DIR, input_file)
    save_df(filename, df)

  0%|          | 0/1 [00:00<?, ?it/s]

01_Preprocessing_df.csv


100%|██████████| 1/1 [00:38<00:00, 38.15s/it]
