In [1]:
import nltk
nltk.download('punkt')
import pyphen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy import random
from tqdm import tqdm

def average_character_per_word(preprocessed_text):
    characters_per_wors = []
    for sentence in preprocessed_text:
        characters_per_wors += [len(word[0]) for word in sentence]
    return np.mean(characters_per_wors)

def text_length(preprocessed_text):
    count = 0
    for sentence in preprocessed_text:
        count+= len(sentence)
    return count

def average_sentence_length(preprocessed_text):
    ''' Average length of a sentence. All characters and symbols are included.'''
    return np.mean([len(sentence) for sentence in preprocessed_text])

def average_syllable_word(preprocessed_text):
    '''Average length of a sentence. SYM are excluded and not counted'''
    pyphen.language_fallback('de_DE_variant1')
    dic = pyphen.Pyphen(lang='de_DE')
    counts = []
    for sentence in preprocessed_text:
        for word in sentence:
            if word[1]!="SYM":
                counts.append(syllable_count(word[0], dic))
    
    return np.mean(counts)

def syllable_count(string, dic):
    '''The number of syllables of a string is returned'''
    return len(dic.inserted(string).split("-"))

def Flesch_Reading_Ease_Score(preprocessed_text):
    ''' On a given text the score will produce a value between 1 and 100 where the higher
    the value the easier the text would be. Documents scoring 30 are very difficult to
    read while those scoring 70 should be easy to read.'''
    asl = average_sentence_length(preprocessed_text)
    asw = average_syllable_word(preprocessed_text)
    
    return 206.835 - (1.015 * asl) - (84.6 * asw)

def Flesch_Kincaid_readability(preprocessed_text):
    '''The Flesch-Kincaid readability formula (2.2) simplifies the Flesch score to produce a “grade
    level” which is easily interpretable (i.e., a text with a grade level of eight according to the formula
    could be thought appropriate for an eighth grader).'''
    asl = average_sentence_length(preprocessed_text)
    asw = average_syllable_word(preprocessed_text)
    
    return (0.4 * asl) + (12 * asw) - 15

def percentag_hard_words(preprocessed_text):
    pyphen.language_fallback('de_DE_variant1')
    dic = pyphen.Pyphen(lang='de_DE')
    count_hard_words=0
    length=0
    for sentence in preprocessed_text:
        for word in sentence:
            if word[1]!="SYM":
                length+=1
                if syllable_count(word[0], dic)>=3:
                    count_hard_words+=1
            
    return count_hard_words/length
                
def fog(preprocessed_text):
    hw = percentag_hard_words(preprocessed_text)
    asl = average_sentence_length(preprocessed_text)
                
    return 0.4 * (hw + asl)

def polysyllable_count(preprocessed_text):
    '''the number of words with 3 or more syllables in 30 sentences
    which shall be picked from the beginning, middle, and end of the document.'''
    pyphen.language_fallback('de_DE_variant1')
    dic = pyphen.Pyphen(lang='de_DE')
    part_len = int(len(preprocessed_text)/3)
    
    if part_len > 10:
        beginning_sentences = preprocessed_text[:part_len]
        middle_sentences = preprocessed_text[part_len:-part_len]
        end_sentences = preprocessed_text[-part_len:]
        picked_sentences = random.choice(beginning_sentences, 10) + random.choice(middle_sentences, 10) + random.choice(end_sentences, 10)
    else:
        picked_sentences = preprocessed_text
    
    polysyllable_count = 0
    for sentence in picked_sentences:
        for word in sentence:
            if syllable_count(word[0], dic)>=3:
                polysyllable_count+=1
    
    return polysyllable_count

def smog(preprocessed_text):
    psc = polysyllable_count(preprocessed_text)
    return 3 + np.sqrt(psc)

def save_df(path,df):
    df.to_csv(path,sep="|",index=False)

[nltk_data] Downloading package punkt to /home/soeren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df=pd.read_csv(r"datasets/01_Preprocessing_df.csv", sep="|", lineterminator='\n')
import ast
display(df.head())
df["preprocessedText"]  = [ast.literal_eval(text) for text in df["RFTagger"]]
df["BF-average_sentence_length"] = [average_sentence_length(text) for text in tqdm(df["preprocessedText"])]
df["BF-average_syllable_word"] = [average_syllable_word(text) for text in tqdm(df["preprocessedText"])]
df["BF-Flesch_Reading_Ease_Score"] = [Flesch_Reading_Ease_Score(text) for text in tqdm(df["preprocessedText"])]
df["BF-Flesch_Kincaid_readability"] = [Flesch_Kincaid_readability(text) for text in tqdm(df["preprocessedText"])]
df["BF-percentag_hard_words"] = [percentag_hard_words(text) for text in tqdm(df["preprocessedText"])]
df["BF-fog"] = [fog(text) for text in tqdm(df["preprocessedText"])]
df["BF-polysyllable_count"] = [polysyllable_count(text) for text in tqdm(df["preprocessedText"])]
df["BF-smog"] = [smog(text) for text in tqdm(df["preprocessedText"])]
df["BF-average_character_per_word"] = [average_character_per_word(text) for text in tqdm(df["preprocessedText"])]
df["BF-text_length"] = [text_length(text) for text in tqdm(df["preprocessedText"])]

display(df.head())
filename = "datasets/02_BasicFeatures_df.csv"
save_df(filename,df)

Unnamed: 0,Level,Title,Text,Source,Type,newLevel,cleanedText,SE_ratioRealSpellingErrors,SE_ratioSpellingErrors,SE_ratioSpellingErrorsWithCorrection,SE_ratioSpellingErrorsWithoutCorrection,SE_ratioUmlautSpellingErrors,SE_ratioCapitalisationErrors,RFTagger,parsedText
0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,A,Zu meiner Familie gehören vier Personen. Die M...,0.0,0.0,0.0,0.0,0.0,0.0,"[[['Zu', 'APPR', 'Dat'], ['meiner', 'PRO', 'Po...",['(ROOT (S (PP (APPR Zu) (PPOSAT meiner) (NN F...
1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,A,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,0.0,0.0,0.0,0.0,0.0,0.0,"[[['Mein', 'PRO', 'Poss', 'Attr', '-', 'Nom', ...",['(ROOT (S (NP (PPOSAT Mein) (NN Name)) (VAFIN...
2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,A,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,0.0,0.0,0.0,0.0,0.0,0.0,"[[['Hallo', 'ITJ'], ['!', 'SYM', 'Pun', 'Sent'...","['(ROOT (NUR (ITJ Hallo) ($. !)))', '(ROOT (S ..."
3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,A,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,0.0,0.0,0.0,0.0,0.0,0.0,"[[['Ricarda', 'N', 'Name', 'Nom', 'Sg', 'Fem']...",['(ROOT (CS (S (NE Ricarda) (VAFIN ist) (AP (N...
4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,A,Frau Meier geht heute in den Supermarkt. Ihr M...,0.0,0.0,0.0,0.0,0.0,0.0,"[[['Frau', 'N', 'Reg', 'Nom', 'Sg', 'Fem'], ['...",['(ROOT (S (NP (NN Frau) (NE Meier)) (VVFIN ge...


100%|██████████| 2565/2565 [00:00<00:00, 57205.70it/s]
100%|██████████| 2565/2565 [00:04<00:00, 521.89it/s] 
100%|██████████| 2565/2565 [00:02<00:00, 963.37it/s] 
100%|██████████| 2565/2565 [00:02<00:00, 968.53it/s] 
100%|██████████| 2565/2565 [00:02<00:00, 1067.77it/s]
100%|██████████| 2565/2565 [00:02<00:00, 978.85it/s] 
100%|██████████| 2565/2565 [00:02<00:00, 1048.05it/s]
100%|██████████| 2565/2565 [00:02<00:00, 1067.38it/s]
100%|██████████| 2565/2565 [00:00<00:00, 8646.37it/s] 
100%|██████████| 2565/2565 [00:00<00:00, 183364.97it/s]


Unnamed: 0,Level,Title,Text,Source,Type,newLevel,cleanedText,SE_ratioRealSpellingErrors,SE_ratioSpellingErrors,SE_ratioSpellingErrorsWithCorrection,...,BF-average_sentence_length,BF-average_syllable_word,BF-Flesch_Reading_Ease_Score,BF-Flesch_Kincaid_readability,BF-percentag_hard_words,BF-fog,BF-polysyllable_count,BF-smog,BF-average_character_per_word,BF-text_length
0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,A,Zu meiner Familie gehören vier Personen. Die M...,0.0,0.0,0.0,...,9.916667,1.605769,60.921506,8.235897,0.144231,4.024359,15,6.872983,4.277311,119
1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,A,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,0.0,0.0,0.0,...,9.052632,1.483444,72.147241,6.422377,0.086093,3.65549,13,6.605551,4.005814,172
2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,A,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,0.0,0.0,0.0,...,8.0625,1.654206,58.705768,8.075467,0.084112,3.258645,9,6.0,4.503876,129
3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,A,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,0.0,0.0,0.0,...,8.214286,1.58,64.8295,7.245714,0.1,3.325714,10,6.162278,4.365217,115
4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,A,Frau Meier geht heute in den Supermarkt. Ihr M...,0.0,0.0,0.0,...,6.466667,1.524229,71.321554,5.877416,0.105727,2.628957,21,6.741657,4.003436,291
