## Parse Tree Complexity and Specific Syntactic Constructions Features

In [1]:
import ast
from collections import Counter, defaultdict
import de_core_news_sm
import itertools
from nltk import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_csv("preprocessed_text_df.csv", lineterminator = '\n')

In [3]:
#Avg. Length of a Clause # W / # C - DONE
#Avg. Sentence Length # W / # S - Sören has it
#Avg. Length of a T-Unit # W / # TU
#Avg. Num. Clauses per Sentence # C / # S - DONE
#Avg. Num. T-Units per Sentence # TU / # S 
#Avg. Num. Clauses per T-Unit #C / # TU - per sentence
#Avg. Num. Complex-T-Units per T-Unit # comp. TU / # TU
#Avg. Num. Dep. Clause per Clause # DC / # C -DONE
#Avg. Num. Dep. Clause per T-Unit # DC / # TU -DONE
#Avg. Num. Co-ordinate Phrases per Clause # CP / # C -DONE
#Avg. Num. Co-ordinate Phrases per T-Unit # CP / # TU -DON
#Avg. Num. Complex Nominals per Clause # compl. Nom. / # C -DONE-
#Avg. Num. Complex Nominals per T-Unit # compl. Nom. / # TU -DONE
#Avg. Num. VPs per T-Unit # VP / # TU -DONE

In [20]:
#helper functions
def break_into_clauses(text):
    #takes text as a string a input
    #returns a list of clauses as lists
    clauses = []
    clause = []
    for ind, element in enumerate(word_tokenize(text)):
        if element == "S" and clause != []:
            clauses.append(clause)
            clause = []
        else:
            clause.append(element) 
                        
    clauses.append(clause)   
    return clauses[1:]

def list_of_clause_lengths(text):
    #takes a text as string as input
    #returns a list of numbers - number of elements in a clause
    clause_lengths = []
    clause_length = 0
    clauses = break_into_clauses(text)
    for clause in clauses:
        for ind, element in enumerate(clause):
            if element[-1].islower(): 
                clause_length += 1 
            
        clause_lengths.append(clause_length)
        clause_length = 0
           
    clause_lengths.append(clause_length)    
            
    return clause_lengths[:-1]     

def av_pos_per_clause_or_sentence(text, pos, clause_or_sentence):
    no_pos = len([word for word in word_tokenize(text) if word == pos])
    no_sentences = len(ast.literal_eval(text))
    no_clauses = len(break_into_clauses(text))
    if clause_or_sentence == "clause":
        return no_pos / no_sentences
    else:
        return no_pos / no_clauses

In [21]:
def av_clause_length(text):
    clause_lenghts = list_of_clause_lengths(text)
    return np.mean(clause_lenghts)

def max_clause_length(text):
    clause_lenghts = list_of_clause_lengths(text)
    return max(clause_lenghts)

def av_num_clauses_per_sentence(text):
    lengths = []
    sentences = ast.literal_eval(text)
    for sentence in sentences:
        lengths.append(len(break_into_clauses(sentence)))
    return np.mean(lengths)

In [22]:
dependency_pronouns = ["KOUS", "PRELAT" "PRELS"]
def av_num_dep_clauses_per_sentence(text):
    no_dep_clauses = len([word for word in word_tokenize(text) if word in dependency_pronouns])
    no_sentences = len(ast.literal_eval(text))
    
    return no_dep_clauses / no_sentences

In [23]:
def av_num_dep_clauses_per_clause(text):
    no_dep_clauses = len([word for word in word_tokenize(text) if word in dependency_pronouns])
    no_clauses = len(break_into_clauses(text))
    
    return no_dep_clauses / no_clauses

In [36]:
#KON is coordinating conjunction
def av_coordinations_per_sentence(text):    
    return av_pos_per_clause_or_sentence(text, "KON", "sentence")

def av_coordinations_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "KON", "clause")

In [37]:
#Avg. Num. NPs per Sentence #NP / # S -DONE
#Avg. Num. VPs per Sentence # VP / # S -DONE
#Avg. Num. PPs per Sentence # VZ / # S -DONE
#Avg. Num. VZs per Sentence # PP / # S -DONE
#Avg. Num. NPs per Clause # NP / # C -DONE
#Avg. Num. VPs per Clause # VP / # C -DONE
#Avg. Num. PPs per Clause # PP / # C -DONE
#Avg. Num. VZs per Clause # VZ / # C -DONE

#Avg. Length of a NP sum(len(NP)) / # NP -- i have this with dependencies
#Avg. Length of a VP sum(len(VP)) / # NP -- i have this with dependencies
#Avg. Length of a PP sum(len(PP)) / # NP -- i have this with dependencies
#Avg. Num. Dep. Clauses per Sentence # DC / # S -- this is up
#Avg. Num. Complex T-Units per Sentence #compl. TU/ # S
#Avg. Num. Co-ordinate Phrases per Sentence # CP / # S -- this is up
#Avg. Parse Tree Height sum(parseTreeHeight) / # S -- done

In [38]:
def av_num_nps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "NP", "sentence")

def av_num_vps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "VP", "sentence")

def av_num_vzs_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "VZ", "sentence")

def av_num_pps_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "PP", "sentence")

def av_num_nps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "NP", "clause")

def av_num_vps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "VP", "clause")

def av_num_vzs_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "VZ", "clause")

def av_num_pps_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "PP", "clause")

In [39]:
#Avg. Num. Non-Terminals Per Sentence # NTs / # S - DONE
#Avg. Num. Non-Terminal Per Words # NTs / # W - DONE
#Avg. Num. Modifers Per NP # modifersInNPs / # NP -- dependency
#Avg. Num. Modifers Per VP # modif ersInVPs / # VPs -- dependency
#Passive Voice - Sentence Ratio # passiveVoice / # S -- done
#Passive Voice - Clause Ratio # passiveVoice /# C -- done

#Dep. Clauses with Conj. to dep. Clause Ratio # DC w. Conj. / # DC
#Conjunctional Clauses Ratio # Conj. C / # dep. C w. Conj.
#Interrogative Clauses Ratio # Inter. C / # dep. C w. Conj. -- i think so
#Relative Clauses Ratio # Rel. C / # DC w. Conj.  -- i think so
#Dep. Clauses w.o. Conj. to dep. Clause Ratio # DC w.o. Conj. / # DC
#`satzwertige Infnitive' to Clause Ratio # satzInf / # DC -- i think so

# + separated verbs
# + sein/haben ratio

In [48]:
def av_non_terminals_per_word(text):
    text = re.sub('[()]', '', text)

    no_words = len([word for word in word_tokenize(text) if word[:-1].islower()])
    no_non_terminals = len([word for word in word_tokenize(text) if word[:-1].isupper()]) - no_words
    if no_words != 0:
        return no_non_terminals / no_words

def av_non_terminals_per_sentence(text):
    text = re.sub('[()]', '', text)
    
    no_words = len([word for word in word_tokenize(text) if word[:-1].islower()])
    no_sentences = len(ast.literal_eval(text))
    no_non_terminals = len([word for word in word_tokenize(text) if word[:-1].isupper()]) - no_words
    
    if no_sentences != 0:
        return no_non_terminals / no_sentences


In [49]:
def no_passives(text):
    clauses = break_into_clauses(text)
    num = 0
    for clause in clauses:
        for ind, element in enumerate(clause):
            if (element == "VAFIN" and (clause[ind+1] == "wird" or clause[ind+1] == "werden" or clause[ind+1] == "wurden") 
            and "VVPP" in clause):                       
                num += 1
                
    return num

def no_passives_per_sentence(text):
    return no_passives(text) / len(ast.literal_eval(text))

def no_passives_per_clause(text):
    return no_passives(text) / len(break_into_clauses(text))

In [50]:
def zu_infinitive_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "KOUI", "clause")

def zu_infinitive_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "KOUI", "sentence")

def separated_verb_per_clause(text):
    return av_pos_per_clause_or_sentence(text, "PTKVZ", "clause")

def separated_verb_per_sentence(text):
    return av_pos_per_clause_or_sentence(text, "PTKVZ", "sentence")

In [51]:
#this needs the cleanedText!
nlp = de_core_news_sm.load()
def sein_haben_ratio(text):
    doc = nlp(text)
    no = 0
    for tok in doc:
        if tok.lemma_ == "sein" or tok.lemma_ == "haben" or tok.lemma_ == "habe":
            no += 1
    return no/len(doc)

In [57]:
#PWS substituting interrogative pronoun wer, was
#PWAT attributive interrogative pronoun welche [Farbe], wessen [Hut]
#PWAV adverbial interrogative or relativ
interrogative_pronouns = ["PWS", "PWAT", "PWAV"]
def av_inter_clause_per_sentence(text):
    no_inter_clauses = len([word for word in word_tokenize(text) if word in interrogative_pronouns])
    no_sentences = len(ast.literal_eval(text))
    
    return no_inter_clauses / no_sentences

def av_inter_clause_per_clause(text):
    no_inter_clauses = len([word for word in word_tokenize(text) if word in interrogative_pronouns])
    no_clauses = len(break_into_clauses(text))
    
    return no_inter_clauses / no_clauses

In [61]:
def relative_clauses_to_dependent_clauses_ratio(text):
    no_rel_clauses = len([word for word in text if word == "PRELAT" or word == "PRELS"])
    no_dep_clauses = no_rel_clauses + len([word for word in text if word == "KOUS"])
    
    if no_rel_clauses != 0:
        return no_rel_clauses / no_dep_clauses
    else:
        return 0

In [62]:
SC_feature_functions = [av_clause_length, max_clause_length, av_num_clauses_per_sentence, av_num_dep_clauses_per_sentence,
                         av_num_dep_clauses_per_clause, av_coordinations_per_sentence, av_coordinations_per_clause, 
                         av_num_nps_per_sentence, av_num_vps_per_sentence, av_num_vzs_per_sentence,
                         av_num_pps_per_sentence, av_num_nps_per_clause, av_num_vps_per_clause, av_num_vzs_per_clause,
                         av_num_pps_per_clause, av_non_terminals_per_word, av_non_terminals_per_sentence, no_passives,
                         no_passives_per_sentence, no_passives_per_clause, zu_infinitive_per_clause, zu_infinitive_per_sentence,
                         separated_verb_per_clause, separated_verb_per_sentence, av_inter_clause_per_clause, 
                         av_inter_clause_per_sentence, relative_clauses_to_dependent_clauses_ratio]

SC_feature_function_with_clearedText = sein_haben_ratio                     

In [66]:
for function in SC_feature_functions:
    df["SC-"+function.__name__]  = [function(text) for text in df["parsedText\r"]]
    
df["SC-sein_haben_ratio"] = [sein_haben_ratio(text) for text in df["cleanedText"]]


filename = "SC_features_df.csv"
df.to_csv(filename)

In [72]:
df.head()

Unnamed: 0.1,Unnamed: 0,Level,Title,Text,Source,Type,cleanedText,RFTagger,parsedText,SC-av_clause_length,...,SC-no_passives_per_sentence,SC-no_passives_per_clause,SC-zu_infinitive_per_clause,SC-zu_infinitive_per_sentence,SC-separated_verb_per_clause,SC-separated_verb_per_sentence,SC-av_inter_clause_per_clause,SC-av_inter_clause_per_sentence,SC-relative_clauses_to_dependent_clauses_ratio,SC-sein_haben_ratio
0,0,A1,Meine Familie,Zu meiner Familie gehören vier Personen. Die M...,https://german.net/reading/familie/,Reading,Zu meiner Familie gehören vier Personen. Die M...,"[[['Zu', 'APPR', 'Dat'], ['meiner', 'PRO', 'Po...",['(ROOT (S (PP (APPR Zu) (PPOSAT meiner) (NN F...,6.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.042017
1,1,A1,Maria und ihre Familie,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,https://german.net/reading/marias-familie/,Reading,Mein Name ist Maria. Ich bin 30 Jahre alt. Zu ...,"[[['Mein', 'PRO', 'Poss', 'Attr', '-', 'Nom', ...",['(ROOT (S (NP (PPOSAT Mein) (NN Name)) (VAFIN...,6.208333,...,0.0,0.0,0.0,0.0,0.105263,0.083333,0.0,0.0,0,0.075581
2,2,A1,Ich bin Tom,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,https://german.net/reading/tom/,Reading,Hallo! Ich bin Tom Maier. Ich bin 13 Jahre alt...,"[[['Hallo', 'ITJ'], ['!', 'SYM', 'Pun', 'Sent'...","['(ROOT (NUR (ITJ Hallo) ($. !)))', '(ROOT (S ...",5.263158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.092308
3,3,A1,Freundinnen,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,https://german.net/reading/freundinnen/,Reading,Ricarda ist 21 Jahre alt und wohnt in Lübeck. ...,"[[['Ricarda', 'N', 'Name', 'Nom', 'Sg', 'Fem']...",['(ROOT (CS (S (NE Ricarda) (VAFIN ist) (AP (N...,5.444444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.034783
4,4,A1,Einkaufen im Supermarkt,Frau Meier geht heute in den Supermarkt. Ihr M...,https://german.net/reading/einkaufen/,Reading,Frau Meier geht heute in den Supermarkt. Ihr M...,"[[['Frau', 'N', 'Reg', 'Nom', 'Sg', 'Fem'], ['...",['(ROOT (S (NP (NN Frau) (NE Meier)) (VVFIN ge...,5.022222,...,0.0,0.0,0.0,0.0,0.054054,0.044444,0.0,0.0,0,0.030928
