# Subword-level UD

In [1]:
import sys
# Add the parent directory to the module search path before the project imports
# below run. NOTE(review): presumably the `src` package lives one level above
# this notebook — confirm against the repository layout.
sys.path.append("../")

from IPython.display import HTML

from src import (
    LexItem, WFToken,
    RuleInfo, ComplexRuleInfo, CompoundRuleInfo,
    Inventory, unite_inventories
)

## Storing information about word formation

In [2]:
# Word-formation rules, keyed by the identifier that WFToken.rule_id refers to.
# NOTE(review): the positional RuleInfo arguments appear to be
# (rule id, rule type, source POS, target POS) — confirm against src.RuleInfo.
rules_by_ids = {
    "-able": RuleInfo("-able", "SFX", "NOUN", "ADJ"),  # comfort -> comfortable
    # Target POS is ADJ, not ADV: "definite" is registered as an ADJ in the
    # lexicon and is the input of the ADJ -> ADJ "in-" rule (definite -> indefinite).
    "-ite": RuleInfo("-ite", "SFX", "VERB", "ADJ"),  # define -> definite
    "-ly": RuleInfo("-ly", "SFX", "ADJ", "ADV"),  # indefinite -> indefinitely

    "in-": RuleInfo("in-", "PFX", "ADJ", "ADJ"),  # definite -> indefinite

    # Compound + suffix rule used for "green-eyed" (head "eye", modifier "green").
    # NOTE(review): the dict key (used as rule_id by WFToken) differs from the
    # first constructor argument "A+N+ed"; the meaning of the trailing
    # list arguments is not visible here — confirm against src.CompoundRuleInfo.
    "ADJ + NOUN + -ed -> ADJ": CompoundRuleInfo(
        "A+N+ed", "COMPOUND,SFX", "NOUN", "ADJ",
        [RuleInfo("-ed", "SFX", "NOUN", "ADJ")], [[]], []
    ),
}

In [3]:
# Toy lexicon: every entry is keyed by its lemma, and the surface form is the
# lemma itself, so only the (lemma, UPOS) pairs need to be spelled out.
lexicon = {
    lemma: LexItem(lemma=lemma, form=lemma, upos=upos)
    for lemma, upos in (
        ('comfort', 'NOUN'),
        ('comfortable', 'ADJ'),
        ('define', 'VERB'),
        ('definite', 'ADJ'),
        ('indefinite', 'ADJ'),
        ('indefinitely', 'ADV'),
        ('eye', 'NOUN'),
        ('green', 'ADJ'),
        ('green-eyed', 'ADJ'),
    )
}

In [4]:
# One word-formation analysis per derived lexicon entry: the key is the derived
# item, `d_from` is the item it is built from, and `rule_id` is a key of
# `rules_by_ids`.
word_analyses = {
    # comfort -> comfortable
    lexicon['comfortable']: WFToken(d_from=lexicon['comfort'], rule_id='-able'),
    # define -> definite -> indefinite -> indefinitely
    lexicon['definite']: WFToken(d_from=lexicon['define'], rule_id='-ite'),
    lexicon['indefinite']: WFToken(d_from=lexicon['definite'], rule_id='in-'),
    lexicon['indefinitely']: WFToken(d_from=lexicon['indefinite'], rule_id='-ly'),
    # green + eye + -ed -> green-eyed; "green" is passed as a modifier of "eye"
    lexicon['green-eyed']: WFToken(
        d_from=lexicon['eye'],
        rule_id="ADJ + NOUN + -ed -> ADJ",
        d_modifiers=[lexicon['green']],
    ),
}

In [5]:
# Bundle the analyses and the rule table into a single inventory object.
inventory = Inventory(word_analyses=word_analyses, rules_by_ids=rules_by_ids)

## Now let's have a look at some examples!

In [6]:
# Build a subword tree for every analysed word and pass a per-word file name
# (lemma + UPOS) to its HTML export.
# NOTE(review): presumably `html(fpath=...)` writes the markup to that file — confirm.
for word in word_analyses.keys():
    subword_tree = inventory.make_subword_tree(word)
    subword_tree.html(
        fpath=f"tree_{word.lemma}_{word.upos}.html"
    )

In [7]:
# A single annotated sentence in tab-separated, ten-column form; the two
# leading "#" lines inside the literal are part of the string (sentence id and
# raw text), not Python comments. `.strip()` removes the surrounding newlines.
ud_sentence = """
# sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0028
# text = So instead Posada may be held indefinitely, in comfortable custody.
1	So	so	ADV	RB	_	6	advmod	6:advmod	_
2	instead	instead	ADV	RB	_	6	advmod	6:advmod	_
3	Posada	Posada	PROPN	NNP	Number=Sing	6	nsubj:pass	6:nsubj:pass	_
4	may	may	AUX	MD	VerbForm=Fin	6	aux	6:aux	_
5	be	be	AUX	VB	VerbForm=Inf	6	aux:pass	6:aux:pass	_
6	held	hold	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
7	indefinitely	indefinitely	ADV	RB	_	6	advmod	6:advmod	SpaceAfter=No
8	,	,	PUNCT	,	_	6	punct	6:punct	_
9	in	in	ADP	IN	_	11	case	11:case	_
10	comfortable	comfortable	ADJ	JJ	Degree=Pos	11	amod	11:amod	_
11	custody	custody	NOUN	NN	Number=Sing	6	obl	6:obl:in	SpaceAfter=No
12	.	.	PUNCT	.	_	6	punct	6:punct	_
""".strip()

# Build the tree for the whole sentence from the inventory, then keep its HTML
# rendering in `tree_html` so a later cell can display it.
tree = inventory.make_tree(ud_sentence)
tree_html = tree.html()

# Alternative export, left disabled: print(tree.latex())

In [8]:
# Display the sentence-level markup stored in `tree_html` as rich HTML output.
HTML(tree_html)

In [9]:
# Subword tree for "indefinitely", rendered inline; the file name is handed to html().
HTML(
    inventory
    .make_subword_tree(lexicon["indefinitely"])
    .html("indefinitely.html")
)

In [10]:
# Subword tree for the compound "green-eyed", rendered inline; the file name is handed to html().
HTML(
    inventory
    .make_subword_tree(lexicon["green-eyed"])
    .html("green-eyed.html")
)

## Using data from existing resources

In [11]:
# Import the reader classes used in the sections below by name instead of via
# `*`, so each name's origin is visible and the notebook namespace stays clean.
from data_readers import (
    AuCoProReader,
    CharDepsReader,
    DemonextReader,
    GermaNetReader,
    UDerReader,
)

### [afr] Afrikaans: AuCoPro

In [21]:
# Afrikaans inventory built from the AuCoPro sample file.
# Alternative input noted by the author:
#   "../data/afr/aucopro/List.AUCOPRO.AfrikaansSplitting.txt"
# Bracketing strategies noted by the author: "last", "head", "chain".
afr_inventory = AuCoProReader(lang="afr").build_inventory(
    "../data/afr/aucopro/sample.txt", bracketing_strategy="last"
)

# Lexical item to look up in the inventory.
query = LexItem(lang="afr", lemma="woordeskatitems", form="woordeskatitems", upos="NOUN")

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = afr_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)

### [deu] German: DErivBase 2.0 from UDer 1.1 and GermaNet

In [20]:
# German derivation data (UDer-format sample plus a rules JSON file).
deu_derivation_inventory = UDerReader(lang="deu").build_inventory(
    "../data/deu/derivbase-uder/sample.txt", "../data/deu/rules_sample.json"
)

# German compounding data from the GermaNet sample.
# Alternative input noted by the author:
#   "../data/deu/germanet/split_compounds_from_GermaNet17.0.txt"
deu_compounding_inventory = GermaNetReader(lang="deu").build_inventory(
    "../data/deu/germanet/sample.txt"
)

# Combine both sources into one inventory.
deu_inventory = unite_inventories(
    deu_derivation_inventory,
    deu_compounding_inventory,
)

# Lexical item to look up in the combined inventory.
query = LexItem(
    lang="deu", lemma="Olympiamedaillengewinner", form="Olympiamedaillengewinner", upos="NOUN"
)

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = deu_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)

### [fra] French: Demonext

In [23]:
# French inventory built from the Demonext sample file.
# Alternative input noted by the author: "../data/fra/demonext/relations.csv"
fra_inventory = DemonextReader(lang="fra").build_inventory("../data/fra/demonext/sample.txt")

# Lexical item to look up; this query also carries a language-specific tag (xpos).
query = LexItem(
    lang="fra", lemma="anti-inflammatoire", form="anti-inflammatoire", upos="NOUN", xpos="Nm"
)

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = fra_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)

### [nld] Dutch: AuCoPro

In [25]:
# Dutch inventory built from the AuCoPro sample file.
# Alternative input noted by the author:
#   "../data/nld/aucopro/List.AUCOPRO.DutchSplitting.txt"
# Bracketing strategies noted by the author: "last", "head", "chain".
nld_inventory = AuCoProReader(lang="nld").build_inventory(
    "../data/nld/aucopro/sample.txt", bracketing_strategy="last"
)

# Lexical item to look up in the inventory.
query = LexItem(
    lang="nld", lemma="temperatuursverandering", form="temperatuursverandering", upos="NOUN"
)

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = nld_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)

### [rus] Russian: DerivBase.Ru from UDer 1.1 and RuCompounds from UDer 1.1

In [26]:
# Russian derivation data (UDer-format sample plus a rules JSON file).
rus_derivation_inventory = UDerReader(lang="rus").build_inventory(
    "../data/rus/derivbaseru-uder/sample.txt", "../data/rus/rules_sample.json"
)

# Russian compounding data, read with the same reader class and rules file.
rus_compounding_inventory = UDerReader(lang="rus").build_inventory(
    "../data/rus/rucompounds-uder/sample.txt", "../data/rus/rules_sample.json"
)

# Combine both sources into one inventory.
rus_inventory = unite_inventories(
    rus_derivation_inventory,
    rus_compounding_inventory,
)

# Lexical item to look up in the combined inventory.
query = LexItem(lang="rus", lemma="пятикомнатный", form="пятикомнатный", upos="ADJ")

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = rus_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)

### [zho] Chinese: SJTU (Shanghai Jiao Tong University) Chinese Character Dependency Treebank

In [27]:
# Chinese inventory built from the character-dependency sample file.
zho_inventory = CharDepsReader(lang="zho").build_inventory("../data/zho/chinesechardeps/sample.txt")

# Lexical item to look up; this query also carries a language-specific tag (xpos).
query = LexItem(lang="zho", lemma="聆听已久", form="聆听已久", upos="ADJ", xpos="a")

# Render its subword tree; the file name is derived from language, lemma and UPOS.
tree = zho_inventory.make_subword_tree(query).html(
    f"{query.lang}_{query.lemma}_{query.upos}.html"
)

HTML(tree)