## Draft for processing UD treebanks — goal: return the question sentences from a selected treebank

The treebanks we will be exploring were chosen for the diversity of the syntactic and morphological features of their languages and for the absolute quality rank of each treebank as reported by **[TODO: add source]**.

In [None]:
!pip install stanza conllu

In [5]:
import stanza
from stanza.utils.conll import CoNLL
import conllu


In [33]:
import os
from udapi.core.document import Document

path = os.path.expanduser('~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-dev.conllu')


# Peek at the raw CoNLL-U text before parsing it.
# CoNLL-U files are UTF-8, so the encoding is stated explicitly instead of
# relying on the platform default; only the first five lines are read.
with open(path, 'r', encoding='utf-8') as f:
    first_lines = [line for _, line in zip(range(5), f)]

print("First few lines:")
print(''.join(first_lines))


# Parse the same file with udapi and report how many bundles it holds.
doc = Document()
doc.load_conllu(path)

print(f"Number of sentences: {len(doc.bundles)}")

First few lines:
# sent_id = 0001.dev
# text = i would like the cheapest flight from pittsburgh to atlanta leaving april twenty fifth and returning may sixth
1	i	I	PRON	_	Case=Nom|Number=Sing|Person=1|PronType=Prs	3	nsubj	_	_
2	would	will	AUX	_	_	3	aux	_	_
3	like	like	VERB	_	VerbForm=Inf	0	root	_	_

Number of sentences: 572


In [None]:
! cat ~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-dev.conllu | udapy -T | head -n 40

2025-02-09 15:48:13,212 [   INFO] execute - No reader specified, using read.Conllu
2025-02-09 15:48:13,212 [   INFO] execute -  ---- ROUND ----
2025-02-09 15:48:13,212 [   INFO] execute - Executing block read.Conllu
2025-02-09 15:48:13,264 [   INFO] execute - Executing block write.TextModeTrees
global.Entity = None
loaded_from = -
# sent_id = 0001.dev
# text = i would like the cheapest flight from pittsburgh to atlanta leaving april twenty fifth and returning may sixth
─┮
 │ ╭─╼ [33mi[0m [31mPRON[0m [34mnsubj[0m
 │ ┢─╼ [33mwould[0m [31mAUX[0m [34maux[0m
 ╰─┾ [33mlike[0m [31mVERB[0m [34mroot[0m
   │ ╭─╼ [33mthe[0m [31mDET[0m [34mdet[0m
   │ ┢─╼ [33mcheapest[0m [31mADJ[0m [34mamod[0m
   ╰─┾ [33mflight[0m [31mNOUN[0m [34mobj[0m
     │ ╭─╼ [33mfrom[0m [31mADP[0m [34mcase[0m
     ┡─┾ [33mpittsburgh[0m [31mPROPN[0m [34mnmod[0m
     │ │ ╭─╼ [33mto[0m [31mADP[0m [34mcase[0m
     │ ╰─┶ [33matlanta[0m [31mPROPN[0m [34mnmod[0m
     ╰─┮ 

In [50]:
import udapi
import os

# Load the ATIS dev split through udapi.Document and draw one sentence as a text tree.
path = os.path.expanduser('~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-dev.conllu')
doc = udapi.Document(path)
doc[10].draw() # index is 0-based: doc[10] is the 11th entry, shown below as sent_id 0011.dev

# sent_id = 0011.dev
# text = i need information for ground transportation denver colorado
─┮
 │ ╭─╼ [33mi[0m [31mPRON[0m [34mnsubj[0m
 ╰─┾ [33mneed[0m [31mVERB[0m [34mroot[0m
   ╰─┮ [33minformation[0m [31mNOUN[0m [34mobj[0m
     │ ╭─╼ [33mfor[0m [31mADP[0m [34mcase[0m
     │ ┢─╼ [33mground[0m [31mNOUN[0m [34mcompound[0m
     ╰─┾ [33mtransportation[0m [31mNOUN[0m [34mnmod[0m
       ╰─┮ [33mdenver[0m [31mPROPN[0m [34mlist[0m
         ╰─╼ [33mcolorado[0m [31mPROPN[0m [34mappos[0m



In [44]:
# Relies on `doc` loaded in an earlier cell; draws the same entry with aligned
# columns for word order, form and morphological features.
doc[10].draw(layout="align", attributes="ord,form,feats")

# sent_id = 0011.dev
# text = i need information for ground transportation denver colorado
─┮                           
 │ ╭─╼       [32m1[0m [33mi[0m              Case=Nom|Number=Sing|Person=1|PronType=Prs[0m
 ╰─┾         [32m2[0m [33mneed[0m           Mood=Ind|Tense=Pres|VerbForm=Fin[0m
   ╰─┮       [32m3[0m [33minformation[0m    Number=Sing[0m
     │ ╭─╼   [32m4[0m [33mfor[0m            _[0m
     │ ┢─╼   [32m5[0m [33mground[0m         Number=Sing[0m
     ╰─┾     [32m6[0m [33mtransportation[0m Number=Sing[0m
       ╰─┮   [32m7[0m [33mdenver[0m         Number=Sing[0m
         ╰─╼ [32m8[0m [33mcolorado[0m       Number=Sing[0m



#### Configuring the question filtering process
1. create files where questions will live, make a file per data set split
2. Decide how to configure the "filter": what logic should we use to find questions in each treebank? Can we use one universal filter, or do we need language-specific filters? Are the datasets homogeneous in which feats they contain?

In [88]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Configure input and output files; keep the split name (dev/test/train) in sync between the two.
# Both paths start at '~' so the cell is not tied to one user's home directory.

input_file = os.path.expanduser('~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-dev.conllu')

output_file = os.path.expanduser('~/Research/qtype-eval/data/filtered_data/en-atis-questions/en_atis-questions.dev.conllu')


def question(tree):
    """Decide whether a parsed sentence should be kept as a question.

    Two heuristics are tried in order:

    1. any token whose form is a literal '?';
    2. otherwise, the first token (compared case-insensitively) is one of a
       fixed set of English wh-words / modal auxiliaries.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with
            ``form``, ``upos`` and ``feats`` attributes (a udapi tree in this
            notebook).

    Returns:
        bool: True when either heuristic fires, False otherwise (including for
        a tree without tokens).
    """
    nodes = tree.descendants

    # Heuristic 1: an explicit question mark anywhere in the sentence.
    if any(node.form == '?' for node in nodes):
        return True

    # Heuristic 2: a sentence-initial interrogative word.
    # NOTE(review): sentences opening with 'is', 'are', 'do', 'can', ... are not
    # matched because those words are not in the set.
    interrogatives = {'how', 'what', 'which', 'when', 'where', 'who', 'why', 'whose', 'whom', 'whether', 'could', 'would', 'should'}

    if not nodes:
        return False

    first_node = nodes[0]  # first word of the sentence
    print(f"First word: {first_node.form}, UPOS: {first_node.upos}, Features: {first_node.feats}")
    if first_node.form.lower() in interrogatives:
        print("Found interrogative word at start!")
        return True

    # Explicit negative result instead of falling off the end and returning None.
    return False


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")


First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: DET, Features: PronType=Int,Rel
Found interrogative word at start!
First word: could, UPOS: AUX, Features: _
Found interrogative word at start!
First word: list, UPOS: VERB, Features: VerbForm=Inf
First word: display, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: what, UPO

### Now we can continue with other splits for the same bank

1. create separate files for dev/train/test splits
2. select the correct input and output file
3. repeat for all splits
4. repeat for all treebanks

In [89]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Configure input and output files; keep the split name (dev/test/train) in sync between the two.
# Both paths start at '~' so the cell is not tied to one user's home directory.

input_file = os.path.expanduser('~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-test.conllu')

output_file = os.path.expanduser('~/Research/qtype-eval/data/filtered_data/en-atis-questions/en_atis-questions.test.conllu')


def question(tree):
    """Decide whether a parsed sentence should be kept as a question.

    A sentence is kept when it contains a literal '?' token or, failing that,
    when its first token (compared case-insensitively) is one of a fixed set of
    English wh-words / modal auxiliaries.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with
            ``form``, ``upos`` and ``feats`` attributes (a udapi tree in this
            notebook).

    Returns:
        bool: True when either check matches, False otherwise (including for a
        tree without tokens).
    """
    nodes = tree.descendants

    # Check 1: an explicit question mark anywhere in the sentence.
    if any(node.form == '?' for node in nodes):
        return True

    # Check 2: a sentence-initial interrogative word.
    # NOTE(review): sentences opening with 'is', 'are', 'do', 'can', ... are not
    # matched because those words are not in the set.
    interrogatives = {'how', 'what', 'which', 'when', 'where', 'who', 'why', 'whose', 'whom', 'whether', 'could', 'would', 'should'}

    if not nodes:
        return False

    first_node = nodes[0]  # first word of the sentence
    print(f"First word: {first_node.form}, UPOS: {first_node.upos}, Features: {first_node.feats}")
    if first_node.form.lower() in interrogatives:
        print("Found interrogative word at start!")
        return True

    # Explicit negative result instead of falling off the end and returning None.
    return False


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")


First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: explain, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: are, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: find, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: list, UPOS: VERB, Features: VerbForm=Inf
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: what, UPOS: DET, Features: PronType=Int,Rel
Found interrogative word at start!
First word: what, UPOS: PRON, Features: PronType=Int,

In [90]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Configure input and output files; keep the split name (dev/test/train) in sync between the two.
# Both paths start at '~' so the cell is not tied to one user's home directory.

input_file = os.path.expanduser('~/Research/qtype-eval/data/ud_data/UD_English-Atis/en_atis-ud-train.conllu')

output_file = os.path.expanduser('~/Research/qtype-eval/data/filtered_data/en-atis-questions/en_atis-questions.train.conllu')


def question(tree):
    """Decide whether a parsed sentence should be kept as a question.

    A sentence is kept when it contains a literal '?' token or, failing that,
    when its first token (compared case-insensitively) is one of a fixed set of
    English wh-words / modal auxiliaries.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with
            ``form``, ``upos`` and ``feats`` attributes (a udapi tree in this
            notebook).

    Returns:
        bool: True when either check matches, False otherwise (including for a
        tree without tokens).
    """
    nodes = tree.descendants

    # Check 1: an explicit question mark anywhere in the sentence.
    if any(node.form == '?' for node in nodes):
        return True

    # Check 2: a sentence-initial interrogative word.
    # NOTE(review): sentences opening with 'is', 'are', 'do', 'can', ... are not
    # matched because those words are not in the set.
    interrogatives = {'how', 'what', 'which', 'when', 'where', 'who', 'why', 'whose', 'whom', 'whether', 'could', 'would', 'should'}

    if not nodes:
        return False

    first_node = nodes[0]  # first word of the sentence
    print(f"First word: {first_node.form}, UPOS: {first_node.upos}, Features: {first_node.feats}")
    if first_node.form.lower() in interrogatives:
        print("Found interrogative word at start!")
        return True

    # Explicit negative result instead of falling off the end and returning None.
    return False


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")


First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: now, UPOS: ADV, Features: Degree=Pos
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: what, UPOS: PRON, Features: PronType=Int,Rel
Found interrogative word at start!
First word: show, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: list, UPOS: VERB, Features: VerbForm=Inf
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: now, UPOS: ADV, Features: Degree=Pos
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: show, UPOS: VERB, Features: Mood=Ind|Tense=Pres|VerbForm=Fin
First word: i, UPOS: PRON, Features: Case=Nom|Number=Sing|Person=1|PronType=Prs
First w

After collecting data for the `UD_English-Atis` treebank, we can move on to other treebanks. Another interesting English treebank is `UD_English-EWT`.


In [97]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Source split and destination file for the EWT dev questions (split names must match).
input_file, output_file = (
    os.path.expanduser(raw_path)
    for raw_path in (
        '~/Research/qtype-eval/data/ud_data/UD_English-EWT/en_ewt-ud-dev.conllu',
        '~/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-dev.conllu',
    )
)


def question(tree):
    """Return whether the sentence contains a literal '?' token.

    EWT has annotated punctuation, so the question mark alone is used here; no
    sentence-initial interrogative-word heuristic is applied for this treebank.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with a
            ``form`` attribute (a udapi tree in this notebook).

    Returns:
        bool: True if any token's form is exactly '?', otherwise False
        (including for a tree without tokens).
    """
    return any(node.form == '?' for node in tree.descendants)
    


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")



Total questions found in this file: 163

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-dev.conllu'


In [98]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Source split and destination file for the EWT test questions (split names must match).
input_file, output_file = (
    os.path.expanduser(raw_path)
    for raw_path in (
        '~/Research/qtype-eval/data/ud_data/UD_English-EWT/en_ewt-ud-test.conllu',
        '~/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-test.conllu',
    )
)


def question(tree):
    """Return whether the sentence contains a literal '?' token.

    EWT has annotated punctuation, so the question mark alone is used here; no
    sentence-initial interrogative-word heuristic is applied for this treebank.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with a
            ``form`` attribute (a udapi tree in this notebook).

    Returns:
        bool: True if any token's form is exactly '?', otherwise False
        (including for a tree without tokens).
    """
    return any(node.form == '?' for node in tree.descendants)
    


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")



Total questions found in this file: 166

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-test.conllu'


In [99]:
from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Source split and destination file for the EWT train questions (split names must match).
input_file, output_file = (
    os.path.expanduser(raw_path)
    for raw_path in (
        '~/Research/qtype-eval/data/ud_data/UD_English-EWT/en_ewt-ud-train.conllu',
        '~/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-train.conllu',
    )
)


def question(tree):
    """Return whether the sentence contains a literal '?' token.

    EWT has annotated punctuation, so the question mark alone is used here; no
    sentence-initial interrogative-word heuristic is applied for this treebank.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with a
            ``form`` attribute (a udapi tree in this notebook).

    Returns:
        bool: True if any token's form is exactly '?', otherwise False
        (including for a tree without tokens).
    """
    return any(node.form == '?' for node in tree.descendants)
    


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")



Total questions found in this file: 754

Extracted questions and saved to '/home/robin/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-train.conllu'


## Cross lingual question filter
After collecting some .conllu files containing the English questions we filtered from the source English treebanks, we can move on to configuring question-filter strategies for languages other than English. For example, we can try to come up with a similar strategy for filtering Japanese questions from `UD_Japanese-GSD`.

from udapi.core.block import Block
from udapi.core.document import Document
from udapi.block.write.conllu import Conllu
import os

# Source split and destination file for the Japanese GSD dev questions (split names must match).
input_file, output_file = (
    os.path.expanduser(raw_path)
    for raw_path in (
        '~/Research/qtype-eval/data/ud_data/UD_Japanese-GSD/ja_gsd-ud-dev.conllu',
        '~/Research/qtype-eval/data/filtered_data/ja-gsd-questions/ja-gsd-questions-dev.conllu',
    )
)


def question(tree):
    """First-pass question filter for Japanese.

    The original cell had no function body, which is a syntax error. This
    version applies two surface heuristics:

    1. any token that is a question mark, ASCII '?' or full-width '？';
    2. otherwise, the last non-punctuation token is the particle 'か'.

    NOTE(review): heuristic 2 is a starting point only. Check its precision on
    the treebank (e.g. rhetorical uses) before relying on the counts.

    Args:
        tree: object exposing ``descendants``, a sequence of nodes with
            ``form`` and ``upos`` attributes (a udapi tree in this notebook).

    Returns:
        bool: True when either heuristic matches, False otherwise (including
        for a tree without tokens).
    """
    question_marks = {'?', '？'}
    nodes = tree.descendants

    if any(node.form in question_marks for node in nodes):
        return True

    # Skip closing punctuation so that a final "か。" is still recognised.
    words = [node for node in nodes if node.upos != 'PUNCT']
    return bool(words) and words[-1].form == 'か'


# Load the source split and start an empty document that will receive the matches.
doc = Document(input_file)
filtered_doc = Document()

questions_found = 0  # how many trees were added to filtered_doc

# Keep every tree accepted by `question`, one new bundle per kept tree.
for tree in doc.trees:
    if question(tree):
        questions_found += 1
        bundle = filtered_doc.create_bundle()
        bundle.add_tree(tree)

print(f"\nTotal questions found in this file: {questions_found}")

# Make sure the target folder exists; on a fresh checkout it will not.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

filtered_doc.store_conllu(output_file)
print(f"\nExtracted questions and saved to '{output_file}'")


## Syntax and Morphology

Now that we have some questions, we can start by taking a look at how UD data stores the labels we might be interested in.


In [106]:
# Open the filtered EWT train questions file.
# This cell has no imports of its own: `os` and `udapi` must already be imported by an earlier cell.
file = os.path.expanduser('~/Research/qtype-eval/data/filtered_data/en-ewt-questions/en_ewt-questions-train.conllu')

doc = udapi.Document(file)

# Draw entry 69 (0-based index) with the default layout ...
doc[69].draw()

# ... and again with aligned columns for word order, form and morphological features.
doc[69].draw(layout="align", attributes="ord,form,feats")



# sent_id = weblog-blogspot.com_dakbangla_20050311135387_ENG_20050311_135387-0216
# text = What mosques exactly did they visit and who did they meet?
─┮
 │   ╭─╼ [33mWhat[0m [31mDET[0m [34mdet[0m
 │ ╭─┾ [33mmosques[0m [31mNOUN[0m [34mobj[0m
 │ │ ╰─╼ [33mexactly[0m [31mADV[0m [34madvmod[0m
 │ ┢─╼ [33mdid[0m [31mAUX[0m [34maux[0m
 │ ┢─╼ [33mthey[0m [31mPRON[0m [34mnsubj[0m
 ╰─┾ [33mvisit[0m [31mVERB[0m [34mroot[0m
   │ ╭─╼ [33mand[0m [31mCCONJ[0m [34mcc[0m
   │ ┢─╼ [33mwho[0m [31mPRON[0m [34mobj[0m
   │ ┢─╼ [33mdid[0m [31mAUX[0m [34maux[0m
   │ ┢─╼ [33mthey[0m [31mPRON[0m [34mnsubj[0m
   ┡─┶ [33mmeet[0m [31mVERB[0m [34mconj[0m
   ╰─╼ [33m?[0m [31mPUNCT[0m [34mpunct[0m

# sent_id = weblog-blogspot.com_dakbangla_20050311135387_ENG_20050311_135387-0216
# text = What mosques exactly did they visit and who did they meet?
─┮                 
 │   ╭─╼ [32m1[0m  [33mWhat[0m    PronType=Int[0m
 │ ╭─┾   [32m2[0m  [33mm

## References
