In [None]:
import stanza

Installing Stanza can take a long time. Google Colab notebooks that document its usage are also available.

In [None]:
# Download an English model into the default directory.
# NOTE(review): presumably needs network access the first time it runs — confirm
# before relying on Restart & Run All in an offline environment.
print("Downloading English model...")
stanza.download("en")

# Pipeline
Stanza offers processing pipelines, similar to spaCy.

In [None]:
# Build an English pipeline, with all processors by default.
# The resulting `en_nlp` callable is applied to raw text in the next cell.
print("Building an English pipeline...")
en_nlp = stanza.Pipeline("en")

# Sentencize, Tokenize, Lemma

In [None]:
# Annotate two English sentences, then print one tab-separated row per word:
# text, lemma, POS tag, head index and dependency relation.
en_doc = en_nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
for i, sent in enumerate(en_doc.sentences):
    print(f"[Sentence {i + 1}]")
    for word in sent.words:
        # Fixed-width columns keep the rows aligned; `head` is formatted as an integer.
        print(
            f"{word.text:12s}\t{word.lemma:12s}\t{word.pos:6s}"
            f"\t{word.head:d}\t{word.deprel:12s}"
        )
    # Blank line between sentences.
    print()

In [None]:
# Tokenizer-only pipeline: list every token id and text, sentence by sentence.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize",
)
doc = nlp("This is a test sentence for stanza. This is another sentence.")
for i, sentence in enumerate(doc.sentences):
    print(f"====== Sentence {i+1} tokens =======")
    # One line per token, emitted with a single print call.
    print(
        "\n".join(
            f"id: {token.id}\ttext: {token.text}" for token in sentence.tokens
        )
    )

In [None]:
# List the raw text of each sentence found in `doc`.
print([sentence.text for sentence in doc.sentences])

In [None]:
# Same token listing as before, but the pipeline is built with
# tokenize_no_ssplit=True.
# NOTE(review): presumably the blank line ("\n\n") in the input then acts as the
# only sentence boundary — confirm against the Stanza tokenizer documentation.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize",
    tokenize_no_ssplit=True,
)
doc = nlp("This is a sentence.\n\nThis is a second. This is a third.")
for i, sentence in enumerate(doc.sentences):
    print(f"====== Sentence {i+1} tokens =======")
    print(
        *(f"id: {token.id}\ttext: {token.text}" for token in sentence.tokens),
        sep="\n",
    )

## spaCy can also be used for English tokenization in the Stanza pipeline

In [None]:
# Delegate tokenization to spaCy.
# spaCy tokenizer is currently only allowed in English pipeline.
nlp = stanza.Pipeline(lang="en", processors=dict(tokenize="spacy"))
doc = nlp("This is a test sentence for stanza. This is another sentence.")
for i, sentence in enumerate(doc.sentences):
    print(f"====== Sentence {i+1} tokens =======")
    print(
        "\n".join(
            f"id: {token.id}\ttext: {token.text}" for token in sentence.tokens
        )
    )

In [None]:
# Requesting `mwt` ('multi-word token') triggers a warning for languages that do
# not have mwt, for example English and Chinese.
nlp = stanza.Pipeline(lang="en", processors="tokenize,mwt,pos,lemma")
doc = nlp("Barack Obama was born in Hawaii.")
# Flatten all sentences into one "word / lemma" line per word.
print(
    "\n".join(
        f'word: {word.text+" "}\tlemma: {word.lemma}'
        for sent in doc.sentences
        for word in sent.words
    )
)

The lemmatizer can be improved by loading a custom dictionary provided by the user.

# Part-of-speech, morphological

# Dependency

# Constituency 

# Named entities

Use the named entity recognition (NER) module to identify spans of a particular entity type. Running the [NERProcessor](https://stanfordnlp.github.io/stanza/ner.html) requires the [TokenizeProcessor](https://stanfordnlp.github.io/stanza/tokenize.html). After the pipeline has been run, the named entities can be accessed via `doc.ents`.

In [None]:
from collections import defaultdict

# Location of the raw test text, relative to the notebook's working directory.
path = "../data/Original/iued_test_original.txt"

# Read the whole file and drop every newline so the text becomes one long line.
# NOTE(review): newlines are removed without inserting a space, so a word that
# ends one line is joined directly to the first word of the next unless the file
# already has trailing spaces — confirm this is intended before running NER.
with open(path, "r") as file:
    data = "".join(file.read().split("\n"))

In [None]:
# Tokenize + NER pipeline over the loaded text; print each entity with its type.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize, ner",
)
doc = nlp(data)
print(
    "\n".join(
        "entity:{} \ttype:{}".format(ent.text, ent.type) for ent in doc.ents
    )
)

In [None]:
def named_entities_stanza(doc):
    """Group the entities of a processed document by surface text and label.

    Parameters
    ----------
    doc
        Object exposing an ``ents`` iterable whose items provide ``text``,
        ``type``, ``start_char`` and ``end_char`` attributes.

    Returns
    -------
    collections.defaultdict
        Maps ``"Text: <text> |Label: <type>"`` to a list with one
        ``[start_char, end_char, index]`` entry per occurrence, where
        ``index`` is the entity's position within ``doc.ents``.
    """
    grouped = defaultdict(list)

    position = 0
    for entity in doc.ents:
        # Identical text/label pairs share one key, so repeated mentions
        # accumulate under the same entry.
        key = f"Text: {entity.text} |Label: {entity.type}"
        grouped[key].append([entity.start_char, entity.end_char, position])
        position += 1

    return grouped

Stanza doesn't seem to support labeling with corpus-wide token indices; instead, it assigns the tokens of each found entity IDs based
on their position within their respective sentence. Using the absolute start and end character offsets therefore seems more appropriate.

In [None]:
# Group the entities of `doc` by text and label, keeping their character offsets.
named_ent = named_entities_stanza(doc)

In [None]:
# Show the full mapping of "Text: ... |Label: ..." keys to their occurrence lists.
print(named_ent)

In [None]:
# Print every "Audi" (ORG) mention with up to 20 characters of context on each side.
# `.get` is used instead of indexing because `named_ent` is a defaultdict: indexing a
# missing key would silently insert an empty entry into the mapping.
for elem in named_ent.get("Text: Audi |Label: ORG", []):
    # elem is [start_char, end_char, entity_index] as built by named_entities_stanza.
    # Clamp the left edge at 0: a negative start would be read by Python as an offset
    # from the END of `data`, so an entity within the first 20 characters would print
    # an empty or unrelated snippet. An end beyond len(data) is already safe in slices.
    print(data[max(0, elem[0] - 20) : elem[1] + 20])

In [None]:
# Print the entities stored on the processed document (`doc.ents`).
print(doc.ents)