# Traditional Features

In [1]:
import nltk
nltk.download('punkt')
import pyphen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy import random
from tqdm import tqdm
import os
import ast

[nltk_data] Downloading package punkt to /home/soeren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def average_character_per_word(preprocessed_text):
    '''Return the mean number of characters per token over all sentences.

    Each token is read as ``token[0]`` (the word form) and ``token[1]``
    (its tag). Tokens tagged "SYM" are left out of the average.
    '''
    word_lengths = [
        len(token[0])
        for sentence in preprocessed_text
        for token in sentence
        if token[1] != "SYM"
    ]
    return np.mean(word_lengths)

# Sanity check: (3 + 8 + 2 + 7) / 4 == 5; the final "." is tagged SYM and ignored.
test1 = [[["Ich", ""], ["studiere", ""], ["in", ""], ["Potsdam", ""], [".", "SYM"]]]
assert average_character_per_word(test1) == 5

In [3]:
def text_length(preprocessed_text):
    '''Return the total number of tokens in a preprocessed text.

    The text is a sequence of sentences (RFTagger output format); every
    entry of every sentence is counted, whatever its tag.
    '''
    return sum(len(sentence) for sentence in preprocessed_text)

In [4]:
def average_sentence_length(preprocessed_text):
    '''Return the mean number of tokens per sentence.

    Every entry of a sentence counts towards its length, symbols included.
    '''
    sentence_lengths = list(map(len, preprocessed_text))
    return np.mean(sentence_lengths)

In [5]:
def average_syllable_word(preprocessed_text):
    '''Return the mean number of syllables per word.

    Tokens tagged "SYM" are excluded and not counted. Syllables are counted
    with syllable_count using the German (de_DE) pyphen dictionary.

    If the text holds no non-SYM token, the mean of an empty list is taken,
    for which numpy returns nan and emits a RuntimeWarning.
    '''
    dic = pyphen.Pyphen(lang='de_DE')
    counts = [
        syllable_count(word[0], dic)
        for sentence in preprocessed_text
        for word in sentence
        if word[1] != "SYM"
    ]
    return np.mean(counts)

In [6]:
def syllable_count(string, dic):
    '''Return the number of syllables of ``string``.

    ``dic`` must offer ``inserted(word)``, which returns the word with "-"
    placed at its hyphenation points. The syllable count is the number of
    pieces separated by "-", i.e. the number of hyphens plus one.
    '''
    hyphenated = dic.inserted(string)
    return hyphenated.count("-") + 1

In [7]:
def Flesch_Reading_Ease_Score(preprocessed_text):
    '''Flesch Reading Ease score; higher values indicate easier texts.

    Computed as 206.835 - 1.015 * ASL - 84.6 * ASW, where ASL is the average
    sentence length and ASW the average number of syllables per word.
    As a rule of thumb, documents scoring around 30 are very difficult to
    read, while those scoring around 70 should be easy to read.
    '''
    sentence_term = 1.015 * average_sentence_length(preprocessed_text)
    syllable_term = 84.6 * average_syllable_word(preprocessed_text)
    return 206.835 - sentence_term - syllable_term

In [8]:
def Flesch_Kincaid_readability(preprocessed_text):
    '''Flesch-Kincaid readability, expressed as a school "grade level".

    Computed as 0.4 * ASL + 12 * ASW - 15, where ASL is the average sentence
    length and ASW the average number of syllables per word. A text with a
    grade level of eight could be thought appropriate for an eighth grader.
    '''
    words_per_sentence = average_sentence_length(preprocessed_text)
    syllables_per_word = average_syllable_word(preprocessed_text)
    grade_level = 0.4 * words_per_sentence + 12 * syllables_per_word - 15
    return grade_level

In [9]:
def percentag_hard_words(preprocessed_text):
    '''Return the share of "hard" words, i.e. words with three or more syllables.

    Despite the name, the result is a fraction between 0 and 1 (hard words
    divided by all words), not a value scaled to 100. Tokens tagged "SYM"
    are ignored. Returns 0 when the text contains no countable word.
    '''
    dic = pyphen.Pyphen(lang='de_DE')
    word_total = 0
    hard_total = 0
    for sentence in preprocessed_text:
        for word in sentence:
            if word[1] == "SYM":
                continue
            word_total += 1
            if syllable_count(word[0], dic) >= 3:
                hard_total += 1

    # Guard against division by zero for empty or symbol-only texts.
    if word_total == 0:
        return 0

    return hard_total / word_total

In [10]:
def fog(preprocessed_text):
    '''Gunning fog index: 0.4 * (percentage of hard words + average sentence length).

    percentag_hard_words returns a fraction (hard words / all words), so it
    is multiplied by 100 here to get the percentage the formula calls for.
    '''
    hard_word_fraction = percentag_hard_words(preprocessed_text)
    asl = average_sentence_length(preprocessed_text)

    return 0.4 * (100 * hard_word_fraction + asl)

In [11]:
def polysyllable_count(preprocessed_text, rng=None):
    '''Count the words with three or more syllables in a 30-sentence sample.

    The text is cut into three parts (beginning, middle, end) and 10 distinct
    sentences are drawn at random from each part. When a third of the text is
    10 sentences or fewer, no sampling takes place and all sentences are used.

    Parameters
    ----------
    preprocessed_text : list of sentences; each sentence is a sequence of
        tokens whose first element is the word form.
    rng : optional object with a numpy-style ``choice(a, size, replace)``
        method, e.g. ``numpy.random.default_rng(seed)``. Defaults to the
        module-level ``numpy.random`` functions, in which case the sample
        (and thus the result) can differ between runs.

    Returns
    -------
    int
        Number of words with at least three syllables in the picked sentences.
    '''
    dic = pyphen.Pyphen(lang='de_DE')
    sampler = random if rng is None else rng
    part_len = len(preprocessed_text) // 3

    if part_len > 10:
        parts = (
            preprocessed_text[:part_len],
            preprocessed_text[part_len:-part_len],
            preprocessed_text[-part_len:],
        )
        picked_sentences = []
        for part in parts:
            # Draw sentence indices rather than sentences: numpy's choice()
            # only accepts 1-D input, which nested sentence lists are not.
            # replace=False keeps a sentence from being counted twice.
            chosen = sampler.choice(len(part), size=10, replace=False)
            picked_sentences.extend(part[int(index)] for index in chosen)
    else:
        picked_sentences = preprocessed_text

    total = 0
    for sentence in picked_sentences:
        for word in sentence:
            if syllable_count(word[0], dic) >= 3:
                total += 1

    return total

In [12]:
def smog(preprocessed_text):
    '''SMOG-style score: 3 + sqrt(polysyllable count).

    The polysyllable count comes from polysyllable_count, which looks at
    sentences taken from the beginning, middle and end of the text.
    '''
    polysyllables = polysyllable_count(preprocessed_text)
    score = np.sqrt(polysyllables) + 3
    return score

In [17]:
# Input and output folders, relative to the notebook's working directory.
# Listing, reading and writing all use the same two locations.
RAW_DIR = os.path.join("datasets", "01_RawDataset")
FEATURE_DIR = os.path.join("datasets", "02_BasicFeatures")

# Only process files that do not yet have a feature file of the same name.
# The output folder is listed once, not once per input file.
already_processed = set(os.listdir(FEATURE_DIR))
input_files = [input_file for input_file in os.listdir(RAW_DIR) if input_file not in already_processed]

def save_df(path,df):
    '''Write df to path as a "|"-separated CSV without the index column.'''
    df.to_csv(path,sep="|",index=False)

# (output column, feature function) pairs; columns are added in this order.
FEATURES = [
    ("BF-average_sentence_length", average_sentence_length),
    ("BF-average_syllable_word", average_syllable_word),
    ("BF-Flesch_Reading_Ease_Score", Flesch_Reading_Ease_Score),
    ("BF-Flesch_Kincaid_readability", Flesch_Kincaid_readability),
    ("BF-percentag_hard_words", percentag_hard_words),
    ("BF-fog", fog),
    ("BF-polysyllable_count", polysyllable_count),
    ("BF-smog", smog),
    ("BF-average_character_per_word", average_character_per_word),
    ("BF-text_length", text_length),
]

for input_file in tqdm(input_files):
    df = pd.read_csv(os.path.join(RAW_DIR, input_file), sep="|", lineterminator='\n')
    print(input_file)
    # The "RFTagger" column is parsed with ast.literal_eval into Python objects.
    df["preprocessedText"] = [ast.literal_eval(text) for text in df["RFTagger"]]
    for column, feature in FEATURES:
        df[column] = [feature(text) for text in df["preprocessedText"]]

    save_df(os.path.join(FEATURE_DIR, input_file), df)

  0%|          | 0/30 [00:00<?, ?it/s]

train_3_df.csv


  3%|▎         | 1/30 [00:10<04:51, 10.06s/it]

train_5_df.csv


  7%|▋         | 2/30 [00:19<04:32,  9.72s/it]

train_7_df.csv


 10%|█         | 3/30 [00:28<04:15,  9.45s/it]

test_14_df.csv


 13%|█▎        | 4/30 [00:32<03:32,  8.19s/it]

test_12_df.csv


 17%|█▋        | 5/30 [00:36<03:04,  7.39s/it]

train_10_df.csv


 20%|██        | 6/30 [00:46<03:05,  7.72s/it]

test_2_df.csv


 23%|██▎       | 7/30 [00:50<02:46,  7.23s/it]

train_9_df.csv


 27%|██▋       | 8/30 [00:59<02:44,  7.47s/it]

test_3_df.csv


 30%|███       | 9/30 [01:03<02:28,  7.05s/it]

test_7_df.csv


 33%|███▎      | 10/30 [01:07<02:14,  6.70s/it]

test_11_df.csv


 37%|███▋      | 11/30 [01:10<02:02,  6.42s/it]

test_13_df.csv


 40%|████      | 12/30 [01:14<01:51,  6.19s/it]

train_8_df.csv


 43%|████▎     | 13/30 [01:22<01:48,  6.35s/it]

train_4_df.csv


 47%|████▋     | 14/30 [01:31<01:44,  6.51s/it]

train_12_df.csv


 50%|█████     | 15/30 [01:40<01:40,  6.73s/it]

train_13_df.csv


 53%|█████▎    | 16/30 [01:49<01:35,  6.84s/it]

test_9_df.csv


 57%|█████▋    | 17/30 [01:53<01:26,  6.65s/it]

test_8_df.csv


 60%|██████    | 18/30 [01:56<01:17,  6.48s/it]

train_15_df.csv


 63%|██████▎   | 19/30 [02:05<01:12,  6.59s/it]

test_1_df.csv


 67%|██████▋   | 20/30 [02:09<01:04,  6.49s/it]

test_5_df.csv


 70%|███████   | 21/30 [02:13<00:57,  6.35s/it]

test_10_df.csv


 73%|███████▎  | 22/30 [02:17<00:49,  6.23s/it]

test_6_df.csv


 77%|███████▋  | 23/30 [02:20<00:42,  6.11s/it]

test_15_df.csv


 80%|████████  | 24/30 [02:24<00:36,  6.01s/it]

train_1_df.csv


 83%|████████▎ | 25/30 [02:34<00:30,  6.18s/it]

train_6_df.csv


 87%|████████▋ | 26/30 [02:43<00:25,  6.27s/it]

train_14_df.csv


 90%|█████████ | 27/30 [02:51<00:19,  6.35s/it]

train_11_df.csv


 93%|█████████▎| 28/30 [02:59<00:12,  6.42s/it]

train_2_df.csv


 97%|█████████▋| 29/30 [03:09<00:06,  6.52s/it]

test_4_df.csv


100%|██████████| 30/30 [03:12<00:00,  6.43s/it]
