## Parse Tree Complexity and Specific Syntactic Constructions Features

In [42]:
import ast
from collections import Counter, defaultdict
import de_core_news_sm
import itertools
from nltk import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import os

In [43]:
def safe_div(a,b):
    try:
        return a/b
    except:
        return 0
    
def safe_literal_eval(text):
    try:
        return ast.literal_eval(text)
    except:
        return []

In [44]:
#Avg. Length of a Clause # W / # C - DONE
#Avg. Sentence Length # W / # S - Sören has it
#Avg. Length of a T-Unit # W / # TU
#Avg. Num. Clauses per Sentence # C / # S - DONE
#Avg. Num. T-Units per Sentence # TU / # S 
#Avg. Num. Clauses per T-Unit #C / # TU - per sentence
#Avg. Num. Complex-T-Units per T-Unit # comp. TU / # TU
#Avg. Num. Dep. Clause per Clause # DC / # C -DONE
#Avg. Num. Dep. Clause per T-Unit # DC / # TU -DONE
#Avg. Num. Co-ordinate Phrases per Clause # CP / # C -DONE
#Avg. Num. Co-ordinate Phrases per T-Unit # CP / # TU -DON
#Avg. Num. Complex Nominals per Clause # compl. Nom. / # C -DONE-
#Avg. Num. Complex Nominals per T-Unit # compl. Nom. / # TU -DONE
#Avg. Num. VPs per T-Unit # VP / # TU -DONE

In [45]:
#helper functions
def break_into_clauses(text):
    #takes text as a string a input
    #returns a list of clauses as lists
    clauses = []
    clause = []
    for ind, element in enumerate(word_tokenize(text)):
        if element == "S" and clause != []:
            clauses.append(clause)
            clause = []
        else:
            clause.append(element) 
                        
    clauses.append(clause)   
    return clauses[1:]

def list_of_clause_lengths(text):
    #takes a text as string as input
    #returns a list of numbers - number of elements in a clause
    clause_lengths = []
    clause_length = 0
    clauses = break_into_clauses(text)
    for clause in clauses:
        for ind, element in enumerate(clause):
            if element[-1].islower(): 
                clause_length += 1 
            
        clause_lengths.append(clause_length)
        clause_length = 0
           
    clause_lengths.append(clause_length)    
            
    return clause_lengths[:-1]     

def av_pos_per_clause_or_sentence(text, pos, clause_or_sentence):
    no_pos = len([word for word in word_tokenize(text) if word == pos])
    no_sentences = len(safe_literal_eval(text))
    no_clauses = len(break_into_clauses(text))
   
    if clause_or_sentence == "clause":
        safe_div(no_pos, no_sentences)
    else:
        safe_div(no_pos, no_clauses)

In [46]:
def av_clause_length(text):
    clause_lenghts = list_of_clause_lengths(text)
    return np.mean(clause_lenghts)

def max_clause_length(text):
    clause_lenghts = list_of_clause_lengths(text)
    if len(clause_lenghts) ==0:
        return 0
    return max(clause_lenghts)

def av_num_clauses_per_sentence(text):
    lengths = []
    
    try:
        sentences = ast.literal_eval(text)
    except:
        return 0
    for sentence in sentences:
        lengths.append(len(break_into_clauses(sentence)))
    return np.mean(lengths)

In [47]:
dependency_pronouns = ["KOUS", "PRELAT" "PRELS"]
def av_num_dep_clauses_per_sentence(text):
    no_dep_clauses = len([word for word in word_tokenize(text) if word in dependency_pronouns])
    no_sentences = len(safe_literal_eval(text)) 
    
    return safe_div(no_dep_clauses, no_sentences)

In [48]:
def av_num_dep_clauses_per_clause(text):
    no_dep_clauses = len([word for word in word_tokenize(text) if word in dependency_pronouns])
    no_clauses = len(break_into_clauses(text))
        
    return safe_div(no_dep_clauses, no_clauses)

In [49]:
#KON is coordinating conjunction
def av_coordinations_per_sentence(text):    
    return av_pos_per_clause_or_sentence(text, "KON", "sentence")

def av_coordinations_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "KON", "clause")

In [50]:
#Avg. Num. NPs per Sentence #NP / # S -DONE
#Avg. Num. VPs per Sentence # VP / # S -DONE
#Avg. Num. PPs per Sentence # VZ / # S -DONE
#Avg. Num. VZs per Sentence # PP / # S -DONE
#Avg. Num. NPs per Clause # NP / # C -DONE
#Avg. Num. VPs per Clause # VP / # C -DONE
#Avg. Num. PPs per Clause # PP / # C -DONE
#Avg. Num. VZs per Clause # VZ / # C -DONE

#Avg. Length of a NP sum(len(NP)) / # NP -- i have this with dependencies
#Avg. Length of a VP sum(len(VP)) / # NP -- i have this with dependencies
#Avg. Length of a PP sum(len(PP)) / # NP -- i have this with dependencies
#Avg. Num. Dep. Clauses per Sentence # DC / # S -- this is up
#Avg. Num. Complex T-Units per Sentence #compl. TU/ # S
#Avg. Num. Co-ordinate Phrases per Sentence # CP / # S -- this is up
#Avg. Parse Tree Height sum(parseTreeHeight) / # S -- done

In [51]:
def av_num_nps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "NP", "sentence")

def av_num_vps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "VP", "sentence")

def av_num_vzs_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "VZ", "sentence")

def av_num_pps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "PP", "sentence")

def av_num_nps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "NP", "clause")

def av_num_vps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "VP", "clause")

def av_num_vzs_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "VZ", "clause")

def av_num_pps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "PP", "clause")

In [52]:
#Avg. Num. Non-Terminals Per Sentence # NTs / # S - DONE
#Avg. Num. Non-Terminal Per Words # NTs / # W - DONE
#Avg. Num. Modifers Per NP # modifersInNPs / # NP -- dependency
#Avg. Num. Modifers Per VP # modif ersInVPs / # VPs -- dependency
#Passive Voice - Sentence Ratio # passiveVoice / # S -- done
#Passive Voice - Clause Ratio # passiveVoice /# C -- done

#Dep. Clauses with Conj. to dep. Clause Ratio # DC w. Conj. / # DC
#Conjunctional Clauses Ratio # Conj. C / # dep. C w. Conj.
#Interrogative Clauses Ratio # Inter. C / # dep. C w. Conj. -- i think so
#Relative Clauses Ratio # Rel. C / # DC w. Conj.  -- i think so
#Dep. Clauses w.o. Conj. to dep. Clause Ratio # DC w.o. Conj. / # DC
#`satzwertige Infnitive' to Clause Ratio # satzInf / # DC -- i think so

# + separated verbs
# + sein/haben ratio

In [53]:
def av_non_terminals_per_word(text):
    text = re.sub('[()]', '', text)

    no_words = len([word for word in word_tokenize(text) if word[:-1].islower()])
    no_non_terminals = len([word for word in word_tokenize(text) if word[:-1].isupper()]) - no_words
    return safe_div(no_non_terminals, no_words)

def av_non_terminals_per_sentence(text):
    text = re.sub('[()]', '', text)
    
    no_words = len([word for word in word_tokenize(text) if word[:-1].islower()])
    no_sentences = len(safe_literal_eval(text))
    no_non_terminals = len([word for word in word_tokenize(text) if word[:-1].isupper()]) - no_words
    
    return safe_div(no_non_terminals, no_sentences)


In [70]:
def no_passives(text):
    clauses = break_into_clauses(text)
    num = 0
    for clause in clauses:
        for ind, element in enumerate(clause):
            if (element == "VAFIN" and (clause[ind+1] == "wird" or clause[ind+1] == "werden" or clause[ind+1] == "wurden") 
            and "VVPP" in clause):                       
                num += 1
                
    return num

def no_passives_per_sentence(text):
    lenght = len(safe_literal_eval(text))
    return safe_div(no_passives(text), lenght)

def no_passives_per_clause(text):
    return safe_div(no_passives(text), len(break_into_clauses(text)))

In [71]:
def zu_infinitive_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "KOUI", "clause")

def zu_infinitive_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "KOUI", "sentence")

def separated_verb_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "PTKVZ", "clause")

def separated_verb_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "PTKVZ", "sentence")

In [72]:
#this needs the cleanedText!
nlp = de_core_news_sm.load()
def sein_haben_ratio(text):
    doc = nlp(text)
    no = 0
    for tok in doc:
        if tok.lemma_ == "sein" or tok.lemma_ == "haben" or tok.lemma_ == "habe":
            no += 1
    return safe_div(no,len(doc))

In [77]:
#PWS substituting interrogative pronoun wer, was
#PWAT attributive interrogative pronoun welche [Farbe], wessen [Hut]
#PWAV adverbial interrogative or relativ
interrogative_pronouns = ["PWS", "PWAT", "PWAV"]
def av_inter_clause_per_sentence(text):
    no_inter_clauses = len([word for word in word_tokenize(text) if word in interrogative_pronouns])
    no_sentences = len(safe_literal_eval(text))
    
    return safe_div(no_inter_clauses, no_sentences)

def av_inter_clause_per_clause(text):
    no_inter_clauses = len([word for word in word_tokenize(text) if word in interrogative_pronouns])
    no_clauses = len(break_into_clauses(text))
    
    return safe_div(no_inter_clauses, no_clauses)

In [78]:
def relative_clauses_to_dependent_clauses_ratio(text):
    no_rel_clauses = len([word for word in text if word == "PRELAT" or word == "PRELS"])
    no_dep_clauses = no_rel_clauses + len([word for word in text if word == "KOUS"])
    
    return safe_div(no_rel_clauses, no_dep_clauses)

In [79]:
SC_feature_functions = [av_clause_length, max_clause_length, av_num_clauses_per_sentence, av_num_dep_clauses_per_sentence,
                         av_num_dep_clauses_per_clause, av_coordinations_per_sentence, av_coordinations_per_clause, 
                         av_num_nps_per_sentence, av_num_vps_per_sentence, av_num_vzs_per_sentence,
                         av_num_pps_per_sentence, av_num_nps_per_clause, av_num_vps_per_clause, av_num_vzs_per_clause,
                         av_num_pps_per_clause, av_non_terminals_per_word, av_non_terminals_per_sentence, no_passives,
                         no_passives_per_sentence, no_passives_per_clause, zu_infinitive_per_clause, zu_infinitive_per_sentence,
                         separated_verb_per_clause, separated_verb_per_sentence, av_inter_clause_per_clause, 
                         av_inter_clause_per_sentence, relative_clauses_to_dependent_clauses_ratio]

SC_feature_function_with_clearedText = sein_haben_ratio                     

In [80]:
def save_df(path,df):
    df.to_csv(path,sep="|",index=False)
    
input_files = os.listdir("/home/soeren/Documents/TextDifficultyAssessmentGerman/datasets/07_SyntaxDependencyFeatures")
input_files = [input_file for input_file in input_files if not input_file in os.listdir("/home/soeren/Documents/TextDifficultyAssessmentGerman/datasets/08_SyntaxComplexity")]

#input_files = ["01_Preprocessing_df.csv"]

for input_file in input_files:
    df = pd.read_csv("datasets/07_SyntaxDependencyFeatures/" + input_file, sep="|", lineterminator = '\n')
    
    for function in tqdm(SC_feature_functions):
        df["SC-"+function.__name__]  = [function(text) for text in df["parsedText"]]

    df["SC-sein_haben_ratio"] = [sein_haben_ratio(text) for text in df["cleanedText"]]

    filename = "datasets/08_SyntaxComplexity/" + input_file
    save_df(filename,df)





  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)




  4%|▎         | 1/27 [00:07<03:16,  7.55s/it][A[A[A[A



  7%|▋         | 2/27 [00:15<03:07,  7.50s/it][A[A[A[A



 11%|█         | 3/27 [00:23<03:05,  7.74s/it][A[A[A[A



 15%|█▍        | 4/27 [00:29<02:51,  7.45s/it][A[A[A[A



 19%|█▊        | 5/27 [00:42<03:07,  8.54s/it][A[A[A[A



 22%|██▏       | 6/27 [00:55<03:15,  9.30s/it][A[A[A[A



 26%|██▌       | 7/27 [01:08<03:16,  9.83s/it][A[A[A[A



 30%|██▉       | 8/27 [01:21<03:14, 10.24s/it][A[A[A[A



 33%|███▎      | 9/27 [01:34<03:09, 10.54s/it][A[A[A[A



 37%|███▋      | 10/27 [01:47<03:03, 10.79s/it][A[A[A[A



 41%|████      | 11/27 [02:00<02:55, 10.98s/it][A[A[A[A



 44%|████▍     | 12/27 [02:13<02:47, 11.15s/it][A[A[A[A



 48%|████▊     | 13/27 [02:26<02:38, 11.29s/it][A[A[A[A



 52%|█████▏    | 14/27 [02:39<02:28, 11.41s/it][A[A[A[A



 56%|█████▌    | 15/27 [02:52<02:18, 11.52s/it][A[A[A[A



  7%|▋         | 2/27 [00:06<01:15,  3.03s/it][A[A[A[A



 11%|█         | 3/27 [00:09<01:16,  3.19s/it][A[A[A[A



 15%|█▍        | 4/27 [00:12<01:12,  3.16s/it][A[A[A[A



 19%|█▊        | 5/27 [00:18<01:20,  3.64s/it][A[A[A[A



 22%|██▏       | 6/27 [00:24<01:25,  4.06s/it][A[A[A[A



 26%|██▌       | 7/27 [00:30<01:28,  4.40s/it][A[A[A[A



 30%|██▉       | 8/27 [00:37<01:29,  4.69s/it][A[A[A[A



 33%|███▎      | 9/27 [00:43<01:27,  4.87s/it][A[A[A[A



 37%|███▋      | 10/27 [00:49<01:23,  4.93s/it][A[A[A[A



 41%|████      | 11/27 [00:55<01:21,  5.08s/it][A[A[A[A



 44%|████▍     | 12/27 [01:01<01:17,  5.15s/it][A[A[A[A



 48%|████▊     | 13/27 [01:07<01:12,  5.17s/it][A[A[A[A



 52%|█████▏    | 14/27 [01:13<01:08,  5.24s/it][A[A[A[A



 56%|█████▌    | 15/27 [01:19<01:03,  5.28s/it][A[A[A[A



 59%|█████▉    | 16/27 [01:22<00:56,  5.17s/it][A[A[A[A



 63%|██████▎   | 17/27 [01:26<00:50,  5.09s/it][A[A[A[A





 37%|███▋      | 10/27 [01:48<03:03, 10.81s/it][A[A[A[A



 41%|████      | 11/27 [02:03<02:58, 11.18s/it][A[A[A[A



 44%|████▍     | 12/27 [02:17<02:51, 11.42s/it][A[A[A[A



 48%|████▊     | 13/27 [02:31<02:43, 11.66s/it][A[A[A[A



 52%|█████▏    | 14/27 [02:45<02:33, 11.84s/it][A[A[A[A



 56%|█████▌    | 15/27 [03:00<02:24, 12.05s/it][A[A[A[A



 59%|█████▉    | 16/27 [03:10<02:10, 11.88s/it][A[A[A[A



 63%|██████▎   | 17/27 [03:19<01:57, 11.71s/it][A[A[A[A



 67%|██████▋   | 18/27 [03:25<01:42, 11.43s/it][A[A[A[A



 70%|███████   | 19/27 [03:33<01:29, 11.22s/it][A[A[A[A



 74%|███████▍  | 20/27 [03:48<01:19, 11.40s/it][A[A[A[A



 78%|███████▊  | 21/27 [04:02<01:09, 11.54s/it][A[A[A[A



 81%|████████▏ | 22/27 [04:15<00:58, 11.62s/it][A[A[A[A



 85%|████████▌ | 23/27 [04:28<00:46, 11.66s/it][A[A[A[A



 89%|████████▉ | 24/27 [04:42<00:35, 11.77s/it][A[A[A[A



 93%|█████████▎| 25/27 [04:55<00:23, 11.84s/it][A[A[

 67%|██████▋   | 18/27 [03:45<01:52, 12.55s/it][A[A[A[A



 70%|███████   | 19/27 [03:53<01:38, 12.31s/it][A[A[A[A



 74%|███████▍  | 20/27 [04:13<01:28, 12.69s/it][A[A[A[A



 78%|███████▊  | 21/27 [04:27<01:16, 12.72s/it][A[A[A[A



 81%|████████▏ | 22/27 [04:39<01:03, 12.71s/it][A[A[A[A



 85%|████████▌ | 23/27 [04:51<00:50, 12.69s/it][A[A[A[A



 89%|████████▉ | 24/27 [05:08<00:38, 12.84s/it][A[A[A[A



 93%|█████████▎| 25/27 [05:27<00:26, 13.11s/it][A[A[A[A



 96%|█████████▋| 26/27 [05:34<00:12, 12.85s/it][A[A[A[A



100%|██████████| 27/27 [05:34<00:00, 12.38s/it][A[A[A[A



[A[A[A[A



  0%|          | 0/27 [00:00<?, ?it/s][A[A[A[A



  4%|▎         | 1/27 [00:09<03:56,  9.10s/it][A[A[A[A



  7%|▋         | 2/27 [00:17<03:41,  8.88s/it][A[A[A[A



 11%|█         | 3/27 [00:27<03:41,  9.22s/it][A[A[A[A



 15%|█▍        | 4/27 [00:36<03:29,  9.09s/it][A[A[A[A



 19%|█▊        | 5/27 [00:53<03:54, 10.66s/it][A[A