## Draft for processing UD treebanks - goal: Return question sentences from a select treebank

The treebanks we will be exploring were chosen according to diversity in syntactic and morphological features of the language and the absolute quality rank of the treebank as reported by 

In [1]:
import os
import udapi
from udapi.core.document import Document
from pathlib import Path
import logging
import json


en_atis_train = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_English-Atis/en_atis-ud-train.conllu")
en_ewt_train = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_English-EWT/en_ewt-ud-train.conllu")
hi_hdtb_train = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu")
ja_gsd_train = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_Japanese-GSD/ja_gsd-ud-train.conllu")
ko_kaist_train = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_Korean-Kaist/ko_kaist-ud-dev.conllu")

In [7]:
class QuestionFinder:
  QWORDS = {"en": {"who", "what", "when", "where", "why", "how", "which", "whose", "whom", "is", "are", "do", "does", "did", "have", "has", "had", "can", "could", "will", "would", "should", "may", "might", "must"}}

  def __init__(self, lang_code):
    self.lang_code = lang_code
    self.qwords = self.QWORDS.get(lang_code, set())
    

  def question(self, tree):
    
    if not tree.descendants:
      return False
    
    qmark = tree.descendants[-1].form == '?' if tree.descendants else False

    words = [node.form.lower() for node in tree.descendants]
    qword = any(word in self.qwords for word in words)

    return qmark or qword


  def run(self, input_file):
    doc = Document(input_file)
    filtered_doc = Document()

    for tree in doc.trees:
      if self.question(tree):
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)
    return filtered_doc
  

  def merge(self, docs):
    merged_doc = Document()
    for doc in docs:
      for tree in doc.trees:
        bundle = merged_doc.create_bundle()
        bundle.add_tree(tree)

      return merged_doc


  def run_splits(self, UD_bank, out_file):
    filtered_docs = []

    for path in UD_bank:
      filtered_doc = self.run(path)
      filtered_docs.append(filtered_doc)

      merged_doc = self.merge(filtered_docs)
      merged_doc.store_conllu(out_file)


def main():
  base_path = os.path.expanduser("~/Research/qtype-eval/data")

  TREEBANKS = {"en": ["UD_English-Atis/en_atis-ud"]}

  lang_code = "en"

  finder = QuestionFinder(lang_code)

  for name in TREEBANKS[lang_code]:
    input_files = [os.path.join(base_path, f"{name}-{split}.conllu") for split in ["train", "dev", "test"]]

    treebank_name = name.split('/')[0].replace('UD_', '')
    out_file = os.path.join(base_path, f"{treebank_name}-questions.conllu")

    finder.run_splits(input_files, out_file)

if __name__ == "__main__":
  main()


In [2]:
import udapi
import os


doc = udapi.Document(en_ewt_train)

doc[5].draw(layout="align", attributes="ord,form,upos,deprel")

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006
# text = The third was being run by the head of an investment firm.
─┮                            
 │   ╭─╼   [32m1[0m  [33mThe[0m        [31mDET[0m   [34mdet[0m
 │ ╭─┶     [32m2[0m  [33mthird[0m      [31mADJ[0m   [34mnsubj:pass[0m
 │ ┢─╼     [32m3[0m  [33mwas[0m        [31mAUX[0m   [34maux[0m
 │ ┢─╼     [32m4[0m  [33mbeing[0m      [31mAUX[0m   [34maux:pass[0m
 ╰─┾       [32m5[0m  [33mrun[0m        [31mVERB[0m  [34mroot[0m
   │ ╭─╼   [32m6[0m  [33mby[0m         [31mADP[0m   [34mcase[0m
   │ ┢─╼   [32m7[0m  [33mthe[0m        [31mDET[0m   [34mdet[0m
   ┡─┾     [32m8[0m  [33mhead[0m       [31mNOUN[0m  [34mobl:agent[0m
   │ │ ╭─╼ [32m9[0m  [33mof[0m         [31mADP[0m   [34mcase[0m
   │ │ ┢─╼ [32m10[0m [33man[0m         [31mDET[0m   [34mdet[0m
   │ │ ┢─╼ [32m11[0m [33minvestment[0m [31mNOUN[0m  [34mcompound[0m
   │ ╰─┶   [

possible attributes= ord, form, lemma, upos, xpos, feats, deprel, deps, misc.

In [55]:
import udapi
import os


doc = udapi.Document(ja_gsd_train)
doc[80].draw(layout="align", attributes='ord,form,lemma,upos,feats,deprel')


# sent_id = train-s81
# text = 加えて、この今回のリリース方法も面白いことがある。
─┮                             
 │ ╭─┮     [32m1[0m  [33m加え[0m   [36m加える[0m  [31mVERB[0m  _[0m [34madvcl[0m
 │ │ ┡─╼   [32m2[0m  [33mて[0m    [36mて[0m    [31mSCONJ[0m _[0m [34mmark[0m
 │ │ ╰─╼   [32m3[0m  [33m、[0m    [36m、[0m    [31mPUNCT[0m _[0m [34mpunct[0m
 │ │ ╭─╼   [32m4[0m  [33mこの[0m   [36m此の[0m   [31mDET[0m   _[0m [34mdet[0m
 │ │ ┢─┮   [32m5[0m  [33m今回[0m   [36m今回[0m   [31mNOUN[0m  _[0m [34mnmod[0m
 │ │ │ ╰─╼ [32m6[0m  [33mの[0m    [36mの[0m    [31mADP[0m   _[0m [34mcase[0m
 │ │ ┢─╼   [32m7[0m  [33mリリース[0m [36mリリース[0m [31mNOUN[0m  _[0m [34mcompound[0m
 │ ┢─┾     [32m8[0m  [33m方法[0m   [36m方法[0m   [31mNOUN[0m  _[0m [34mnsubj[0m
 │ │ ╰─╼   [32m9[0m  [33mも[0m    [36mも[0m    [31mADP[0m   _[0m [34mcase[0m
 ╰─┾       [32m10[0m [33m面白い[0m  [36m面白い[0m  [31mADJ[0m   _[0m [34mroot[0m
   ┡─┮     [32m11[0m [33mこと[0m   [36m事[

In [66]:
doc = udapi.Document(hi_hdtb_train)
doc[10].draw(layout="align", attributes='ord,form,upos')

# sent_id = train-s11
# text = इसे चार्ल्स कोरिया ने डिजाइन किया है ।
─┮                
 │ ╭─╼   [32m1[0m [33mइसे[0m     [31mPRON[0m
 │ │ ╭─╼ [32m2[0m [33mचार्ल्स[0m [31mNOUN[0m
 │ ┢─┾   [32m3[0m [33mकोरिया[0m  [31mPROPN[0m
 │ │ ╰─╼ [32m4[0m [33mने[0m      [31mADP[0m
 │ ┢─╼   [32m5[0m [33mडिजाइन[0m  [31mNOUN[0m
 ╰─┾     [32m6[0m [33mकिया[0m    [31mVERB[0m
   ┡─╼   [32m7[0m [33mहै[0m      [31mAUX[0m
   ╰─╼   [32m8[0m [33m।[0m       [31mPUNCT[0m



In [67]:
doc=udapi.Document(ko_kaist_train)
doc[1].draw(layout="align", attributes="ord,form,upos")

# sent_id = train-s2
# text = また行きたい、そんな気持ちにさせてくれるお店です。
─┮               
 │   ╭─╼   [32m1[0m  [33mまた[0m  [31mADV[0m
 │ ╭─┾     [32m2[0m  [33m行き[0m  [31mVERB[0m
 │ │ ┡─╼   [32m3[0m  [33mたい[0m  [31mAUX[0m
 │ │ ╰─╼   [32m4[0m  [33m、[0m   [31mPUNCT[0m
 │ │   ╭─╼ [32m5[0m  [33mそんな[0m [31mPRON[0m
 │ │ ╭─┾   [32m6[0m  [33m気持ち[0m [31mNOUN[0m
 │ │ │ ╰─╼ [32m7[0m  [33mに[0m   [31mADP[0m
 │ ┢─┾     [32m8[0m  [33mさ[0m   [31mVERB[0m
 │ │ ┡─╼   [32m9[0m  [33mせ[0m   [31mAUX[0m
 │ │ ╰─┮   [32m10[0m [33mて[0m   [31mSCONJ[0m
 │ │   ╰─╼ [32m11[0m [33mくれる[0m [31mVERB[0m
 │ ┢─╼     [32m12[0m [33mお[0m   [31mNOUN[0m
 ╰─┾       [32m13[0m [33m店[0m   [31mNOUN[0m
   ┡─╼     [32m14[0m [33mです[0m  [31mAUX[0m
   ╰─╼     [32m15[0m [33m。[0m   [31mPUNCT[0m



#### Configuring the question filtering process
1. create files where questions will live, make a file per data set split
2. think of how to configure the "filter", what logic should we use for finding questions in each treebank, can we use a universal filter or do we need language specific filters, are datasets aligned well in which feats they contain

In [7]:
from udapi.core.document import Document
import os

# configure input and output files, make sure to respect split names

def question(tree):
    # first pass at filtering anything but questions would look at the presence of '?'

    

    # more elaborate way would be to look for specific question words at specific places in the tree
    #
    interrogatives = {
        "how",
        "what",
        "which",
        "when",
        "where",
        "who",
        "why",
        "whose",
        "whom",
        "whether",
        "could",
        "would",
        "should",
    }

    if tree.descendants:
        first_node = tree.descendants[0]  # look at the first word in the sentence
        if first_node.form.lower() in interrogatives:
            for node in tree.descendants:
                if node.form == "?":
                    return True
            


doc = Document(en_ewt_train)
filtered_doc = Document()
UD_English_questions = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_English-questions.conllu")

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(UD_English_questions)
print(f"\nExtracted questions and saved to '{UD_English_questions}'")


Total questions found in this file: 188

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/UD_English-questions.conllu'


### Now we can continue with other splits for the same bank

1. create separate files for dev/train/test splits
2. select the correct input and output file
3. repeat for all splits
4. repeat for all tree banks

In [None]:
from udapi.core.document import Document
import os

# output files, make sure to respect split names

en_atis_dev = os.path.expanduser(
    "/home/robin/Research/qtype-eval/data/filtered_data/en-atis-questions/en_atis-questions.test.conllu"
)


def question(tree):
    # first pass at filtering anything but questions would look at the presence of '?'

    for node in tree.descendants:
        if node.form == "?":
            return True

    # more elaborate way would be to look for specific question words at specific places in the tree
    #
    interrogatives = {
        "how",
        "what",
        "which",
        "when",
        "where",
        "who",
        "why",
        "whose",
        "whom",
        "whether",
        "could",
        "would",
        "should",
    }

    if tree.descendants:
        first_node = tree.descendants[0]  # look at the first word in the sentence
        print(f"First word: {first_node.form}, UPOS: {first_node.upos}, Features: {first_node.feats}")
        if first_node.form.lower() in interrogatives:
            print("Found interrogative word at start!")
            return True


doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")

First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: explain, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: are, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: find, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: list, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: what, UPOS: DET, Features: PronType=Int,Rel
Found interrogative word at start!
First word: what, UPOS: PRON, Features: PronType=Int,

In [None]:
from udapi.core.document import Document
import os

# configure input and output files, make sure to respect split names

input_file = os.path.expanduser("~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-train.conllu")


def question(tree):
    # first pass at filtering anything but questions would look at the presence of '?'

    for node in tree.descendants:
        if node.form == "?":
            return True

    # more elaborate way would be to look for specific question words at specific places in the tree
    #
    interrogatives = {
        "how",
        "what",
        "which",
        "when",
        "where",
        "who",
        "why",
        "whose",
        "whom",
        "whether",
        "could",
        "would",
        "should",
    }

    if tree.descendants:
        first_node = tree.descendants[0]  # look at the first word in the sentence
        print(f"First word: {first_node.form}, UPOS: {first_node.upos}, Features: {first_node.feats}")
        if first_node.form.lower() in interrogatives:
            print("Found interrogative word at start!")
            return True


doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")

First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: now, UPOS: ADV, Features: Degree=Pos
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: list, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: now, UPOS: ADV, Features: Degree=Pos
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First w

After collecting data for the `UD_English-Atis` tree bank, we can move on to other treebanks, another interesting English tree bank is `UD_English-EWT`. 


## Cross lingual question filter
After collecting some .conllu files containing English questions we filtered from source English tree banks, we can move on to configuring question filter strategies for lanuages other than English. For example, we can try to come up with a similar strategy for filtering questions in Japanese from `UD_Japanese-GSD`

Japanese has several question words and a unique syntactic question particle:

'か' - ka - PART  
'何' - nan / nani - what  
'誰' - _ - who  
'どこ' - _ - where  
'いつ' - _ - when  
'なぜ ' - naze - why - for  
'どうして' - doshite - why - conv  
'何で' - nande - why  
'いくつ' - ikutsu - how many  
'いくら' - ikura - how much  
'どちら' - dochira - which of two - for  
'どっち' - docchi - which of two - conv  
'どれ' - dore - which of three or more  

In [10]:
from udapi.core.document import Document
import os

# configure input and output files, make sure to respect split names
ja_gsd_questions = os.path.expanduser("~/Research/qtype-eval/data/UD_japanese_questions.conllu")
def question(tree):
    # first pass at filtering anything but questions would look at the presence of '?'

    for node in tree.descendants:
        if node.form == "?":
            return True

    # more elaborate way would be to look for specific question words at specific places in the tree
    #
    # interrogatives = {}
     
    """    
    if tree.descendants:
        first_node = tree.descendants[0]  # look at the first word in the sentence
        if first_node.form.lower() in interrogatives:
            return True
    """


doc = Document(ja_gsd_train)
filtered_doc = Document()
UD_Japanese = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_Japanese-questions.conllu")

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(UD_Japanese)
print(f"\nExtracted questions and saved to '{UD_Japanese}'")


Total questions found in this file: 42

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/UD_Japanese-questions.conllu'


In [15]:
file = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_Japanese-questions.conllu")

doc=udapi.Document(file)
doc[1].draw(layout="align", attributes="ord,form,lemma,upos,deprel")

# sent_id = train-s129
# text = 疲労回復では他に譲るか?
─┮                    
 │   ╭─╼ [32m1[0m [33m疲労[0m [36m疲労[0m [31mNOUN[0m  [34mcompound[0m
 │ ╭─┾   [32m2[0m [33m回復[0m [36m回復[0m [31mNOUN[0m  [34mobl[0m
 │ │ ┡─╼ [32m3[0m [33mで[0m  [36mで[0m  [31mADP[0m   [34mcase[0m
 │ │ ╰─╼ [32m4[0m [33mは[0m  [36mは[0m  [31mADP[0m   [34mcase[0m
 │ ┢─┮   [32m5[0m [33m他[0m  [36m他[0m  [31mNOUN[0m  [34mobl[0m
 │ │ ╰─╼ [32m6[0m [33mに[0m  [36mに[0m  [31mADP[0m   [34mcase[0m
 ╰─┾     [32m7[0m [33m譲る[0m [36m譲る[0m [31mVERB[0m  [34mroot[0m
   ┡─╼   [32m8[0m [33mか[0m  [36mか[0m  [31mPART[0m  [34mmark[0m
   ╰─╼   [32m9[0m [33m?[0m  [36m?[0m  [31mPUNCT[0m [34mpunct[0m



repeat for the Korean treebank `UD_Korean-GSD`

In [8]:
from udapi.core.document import Document
import os

# configure input and output files, make sure to respect split names

hi_hdtb_questions = os.path.expanduser("~/Research/qtype-eval/data/UD_Hindi-questions.conllu")


def question(tree):
    for node in tree.descendants:
        if node.form == "?":
            return True


doc = Document(hi_hdtb_train)
filtered_doc = Document()

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(hi_hdtb_questions)
print(f"\nExtracted questions and saved to '{hi_hdtb_questions}'")


Total questions found in this file: 66

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/UD_Hindi-questions.conllu'


In [86]:
from udapi.core.document import Document
import os

# configure input and output files, make sure to respect split names

ko_kaist_questions = os.path.expanduser("~/Research/qtype-eval/data/UD_Korean-questions.conllu")


def question(tree):
    for node in tree.descendants:
        if node.form == "?":
            return True


doc = Document(ko_kaist_train)
filtered_doc = Document()

questions_found = 0  # counter for keeping track of how many question we add to our new file

# main loop to filter our file

for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

filtered_doc.store_conllu(ko_kaist_questions)
print(f"\nExtracted questions and saved to '{ko_kaist_questions}'")


Total questions found in this file: 29

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/UD_Korean-questions.conllu'


## Syntax and Morphology

Now that we have some questions, we can start by taking a look at how UD data stores the labels we might be interested in.


In [88]:
import udapi
import os


doc = udapi.Document(ko_kaist_questions)
doc[10].draw(layout="align", attributes='ord,form,upos,deprel')

# sent_id = MH2_0159-s68
# text = 문학비평가와 문학연구가를 굳이 따로 분리할 필요가 있을까?
─┮                         
 │     ╭─┮   [32m1[0m [33m문학비평가와[0m [31mCCONJ[0m [34mobj[0m
 │     │ ╰─╼ [32m2[0m [33m문학연구가를[0m [31mNOUN[0m  [34mconj[0m
 │     ┢─┮   [32m3[0m [33m굳이[0m     [31mADV[0m   [34madvmod[0m
 │     │ ╰─╼ [32m4[0m [33m따로[0m     [31mADV[0m   [34madvmod[0m
 │   ╭─┶     [32m5[0m [33m분리할[0m    [31mVERB[0m  [34macl[0m
 │ ╭─┶       [32m6[0m [33m필요가[0m    [31mNOUN[0m  [34mnsubj[0m
 ╰─┾         [32m7[0m [33m있을까[0m    [31mADJ[0m   [34mroot[0m
   ╰─╼       [32m8[0m [33m?[0m      [31mPUNCT[0m [34mpunct[0m



In [29]:
file = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_English-questions.conllu")

doc=udapi.Document(file)
doc[0].draw(layout="align", attributes="ord,form,lemma,upos,deprel")

# sent_id = weblog-juancole.com_juancole_20030911085700_ENG_20030911_085700-0001
# text = What do the new al-Qaeda videotape and audio speeches of Bin Laden and Ayman al-Zawahiri tell us about the hopes of the remaining top leadership of the organization?
─┮                                               
 │ ╭─╼         [32m1[0m  [33mWhat[0m         [36mwhat[0m         [31mPRON[0m  [34mobj[0m
 │ ┢─╼         [32m2[0m  [33mdo[0m           [36mdo[0m           [31mAUX[0m   [34maux[0m
 │ │ ╭─╼       [32m3[0m  [33mthe[0m          [36mthe[0m          [31mDET[0m   [34mdet[0m
 │ │ ┢─╼       [32m4[0m  [33mnew[0m          [36mnew[0m          [31mADJ[0m   [34mamod[0m
 │ │ ┢─┮       [32m5[0m  [33mal[0m           [36mal[0m           [31mPROPN[0m [34mcompound[0m
 │ │ │ ┡─╼     [32m6[0m  [33m-[0m            [36m-[0m            [31mPUNCT[0m [34mpunct[0m
 │ │ │ ╰─╼     [32m7[0m  [33mQaeda[0m        [36mQaeda[0m        [31mPROPN[0m [34mf

In [33]:
import pandas as pd


def complexity(file):
  doc = udapi.Document(file)
  results = []

  

  for bundle in doc.bundles:
    tree = bundle.get_tree()

    text = " ".join([node.form for node in tree.descendants])


    clauses = 0
    total_dep_len = 0
    for node in tree.descendants:
        if node.parent:
           dependency_length = abs(node.ord - node.parent.ord)
           total_dep_len += dependency_length

        if node.deprel == 'root' or node.deprel == 'csubj' or node.deprel =='advcl':
           clauses += 1

    results.append({
       'text': text,
       'clauses': clauses,
       'total dependency length': total_dep_len
    })

  df = pd.DataFrame(results)


  return df

file = os.path.expanduser("/home/robin/Research/qtype-eval/data/UD_English-questions.conllu")

df = complexity(file)
#pd.options.display.max_rows = 10
display(df)

Unnamed: 0,text,clauses,total dependency length
0,What do the new al - Qaeda videotape and audio...,1,151
1,What do you really know about George W. Bush ’...,1,63
2,How Would You Like To Know How YOU Can Live In...,1,104
3,What was Bush doing with his youth ?,1,20
4,What 's a few dead soldiers in comparison to k...,1,72
...,...,...,...
183,How long does it take to train new people at w...,2,28
184,How fast your support queries get answered ?,1,20
185,How much could it possibly cost ?,1,18
186,How has it gone so far ?,1,16


## References

[1] https://nbviewer.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb

[2] D. Zeman, “Udapi.” 2023. https://github.com/udapi/udapi-python/blob/master/tutorial/udapi-tutorial-dz.pdf
