# Example Notebook for annotator package
In the following, the basic functionality of the package is introduced. We will load basic text in English and German and annotate it using the available features.

In [None]:
# import annotator.base as be
# import annotator.mspacy as msp

import base as be
import mspacy as msp

# The input
The input for the package would normally be set up in an input.json file passed to the package. There are two example .json files in this directory which we will use. Let's look at the contents of example_en.json. The first few parameters we encounter are:
```
    "input": "example_en.txt",
    "output": "output_en",
    "tool": "spacy",
```
These tell the program that the data we want to annotate is stored in example_en.txt, that we want to write the output to a file which we can identify as output_en, and that we want to use the tool spacy to annotate the data. There will be more tools to choose from, but for simplicity their configurations have been stripped for now. The spacy-specific config is found in the `"spacy_dict"` section. Here we find the parameters that tell spacy how to annotate the data. The entries usually come with a comment explaining what the parameters do. Let's look through the ones we set up in example_en.json:
```
    "model": false,
```
Here we can specify a model that spacy should use to annotate the text, if we want to. We leave it set to false for now, though.
```
    "lang": "en",
```
Here we specify that the language of the data we want to annotate is English. Since we didn't specify a model, this information is needed to choose one for us.
```
    "text_type": "news",
```
We specify what kind of text we want to annotate in order to choose an appropriate model for the task. Currently only "news" is supported for English. The setup we chose here will lead to the usage of the model en_core_web_md.
```
    "processors": "senter, tagger, parser, attribute_ruler, lemmatizer, ner",
```
Here we specify the processors for the pipeline we will apply to our data. This defines what kind of annotations we get in the end.

The remaining entries are not immediately important for this example and are all set to their default values. In particular, the `"config"` parameter and its contents are usually defined for a given model in its config.cfg file and should not be tampered with.

# Running based on the .json
Let's now look at what the program would do with the supplied information from the .json input files. First we would read in the .json files to make them available as dictionaries.

In [None]:
# read example_en.json into a dictionary
# (the name is passed without the ".json" extension)
dict_en = be.prepare_run.load_input_dict("example_en")
# print(dict_en)

# read example_de.json into a dictionary in the same way
dict_de = be.prepare_run.load_input_dict("example_de")

Now that we have access to the information from the .json files, we can load the data from the specified locations.

In [None]:
# read in the English example text; the file to read is taken from the
# "input" entry of the English config (example_en.txt)
data_en = be.prepare_run.get_text(dict_en["input"])
# print(data_en)

# read in the German example text; the file to read is taken from the
# "input" entry of the German config (example_de.txt)
data_de = be.prepare_run.get_text(dict_de["input"])
# print(data_de)

Next we would load the tool as specified by the .json. In this case we would load the spacy pipeline from the mspacy module. We are told what components we load and which model we are using.

In [None]:
# build the English spacy pipeline from the English config dict
pipe_en = msp.spacy_pipe(dict_en)

# build the German spacy pipeline from the German config dict
pipe_de = msp.spacy_pipe(dict_de)

After doing this we only have to apply the pipeline to the data we read in earlier.

In [None]:
# apply the English pipeline to the English text
annotated_en = pipe_en.apply_to(data_en)

# apply the German pipeline to the German text
annotated_de = pipe_de.apply_to(data_de)

The text has now been annotated. We can easily pass the results to a .vrt file using the output name defined in the .json. This would also directly encode the annotated results to cwb.

In [None]:
# write the annotated English results to .vrt and pass them on to cwb
annotated_en.pass_results()

# same for the annotated German results
annotated_de.pass_results()

Loading the pipeline, applying it and passing the results can be done conveniently in one line:

# example for English data: build the pipeline, apply it and pass the
# results on in a single chained expression
msp.spacy_pipe(dict_en).apply_to(data_en).pass_results()

# is equivalent to the step-by-step cells above:
# pipe_en = msp.spacy_pipe(dict_en)
# annotated_en = pipe_en.apply_to(data_en)
# annotated_en.pass_results()

mspacy main

if __name__ == "__main__":
    # Manual driver for the spacy pipeline: annotate the sample text once,
    # then run the pipeline over the chunked plenary text.
    # NOTE(review): `spacy_pipe` is referenced without the `msp.` prefix used in
    # the notebook cells; it only resolves where `spacy_pipe` itself is in
    # scope (presumably inside the mspacy module) - confirm before running here.
    data = be.prepare_run.get_sample_text()
    # or read the main dict and activate
    mydict = be.prepare_run.load_input_dict("src/annotator/input")
    # take only the part of dict pertaining to spacy
    # filename needs to be moved to/taken from top level of dict
    # spacy_dict = mydict["spacy_dict"]
    # remove comment lines starting with "_"
    # for now, we are not using "components" as these are defined through the pre-
    # made models; for making your own model, they will need to be used
    # we will worry about this later
    spacy_dict = be.prepare_run.update_dict(mydict)
    # build pipe from config, apply it to data, write results to vrt
    # spacy_pipe(spacy_dict).apply_to(data).pass_results()
    # if we use "outname", this needs to be passed the full dict
    spacy_pipe(mydict).apply_to(data).pass_results()

    # this throws a warning that the senter may not work as intended, it seems to work
    # fine though
    # senter_config = {
    #     "filename": "Test1",
    #     "lang": "en",
    #     "text_type": "news",
    #     "processors": "tok2vec,tagger,attribute_ruler,lemmatizer",
    #     "pretrained": False,
    #     "set_device": False,
    #     "config": {},
    # }

    # # spacy_pipe(senter_config).apply_to(data).begin_to_vrt()
    # # try to chunk the plenary text from example into pieces, annotate these and then reassemble to .vrt
    # # get chunked text
    # `data` is rebound here: from this point on it holds the chunked plenary
    # text instead of the sample text loaded at the top
    data = be.chunk_sample_text("data/Original/plenary.vrt")

    # # start with basic config as above if we use the pretrained keyword it
    # # replaces the lang and text_type keys so we don't need to specify them

    # # some testing to check whether spacy_pipe.pipe_multiple or spacy_pipe.get_multiple is faster
    # config = {
    #     "filename": "test_new",
    #     "processors": "tok2vec, tagger, parser,\
    #         attribute_ruler, lemmatizer, ner",
    #     "pretrained": "de_core_news_md",
    #     "set_device": False,
    #     "config": {"nlp.batch_size": 10},
    # }

    # config1 = {
    #     "filename": "test_class",
    #     "processors": "tok2vec, tagger, parser,\
    #         attribute_ruler, lemmatizer, ner",
    #     "pretrained": "de_core_news_md",
    #     "set_device": False,
    #     "config": {"nlp.batch_size": 10},
    # }

    # repeat = 50

    # NOTE(review): this pipeline is built from `spacy_dict` (the result of
    # update_dict), whereas the single-text run above was given `mydict`
    pipe = spacy_pipe(spacy_dict)
    pipe.pipe_multiple(data)

    # stmt = """pipe.pipe_multiple(data)"""

    # Time = timeit.repeat(
    #     stmt=stmt, number=1, repeat=repeat, timer=time.process_time, globals=globals()
    # )

    # check = []
    # with open("test_new_spacy.vrt", "r") as file:
    #     for line in file:
    #         if not line.startswith("!") and not line.startswith("<"):
    #             check.append(int(line.split()[0]))

    # for i, elem in enumerate(check):
    #     if i > 0:
    #         try:
    #             assert elem - check[i - 1] == 1
    #         except AssertionError:
    #             print(i, elem)
    # print("Asserted pipe indexing.")

    # pipe = spacy_pipe(config1)

    # stmt1 = """pipe.get_multiple(data)"""

    # Time1 = timeit.repeat(
    #     stmt=stmt1, number=1, repeat=repeat, timer=time.process_time, globals=globals()
    # )

    # check = []
    # with open("test_class_spacy.vrt", "r") as file:
    #     for line in file:
    #         if not line.startswith("!") and not line.startswith("<"):
    #             check.append(int(line.split()[0]))

    # for i, elem in enumerate(check):
    #     if i > 0:
    #         try:
    #             assert elem - check[i - 1] == 1
    #         except AssertionError:
    #             print(i, elem)
    # print("Asserted iterating indexing.")

    # print(
    #     "Using spacy.pipe: {:.2f}s | Iterating directly: {:.2f}s".format(
    #         min(Time), min(Time1)
    #     )
    # )

    # # check that output is indeed the same for both methods
    # with open("test_new_spacy.vrt", "r") as file:
    #     out1 = file.readlines()

    # with open("test_class_spacy.vrt", "r") as file:
    #     out = file.readlines()

    # i = 0
    # equal = True
    # for i, line in enumerate(out):
    #     if not line.startswith("!"):
    #         try:
    #             assert line == out1[i]
    #         except AssertionError:
    #             if i < 10:
    #                 print(line, out1[i])
    #                 equal = False
    #                 i += 1
    # print("Asserted equality")


# with open("out/test_spacy.vrt", "r") as file:
# for line in file:
# check if vrt file was written correctly
# lines with "!" are comments, <s> and </s> mark beginning and
# end of sentence, respectively
# if line != "<s>\n" and line != "</s>\n" and line.split()[0] != "!":
# try:
#    assert len(line.split()) == len(spacy_dict["processors"].split(","))
# except AssertionError:
#    print(line)

mstanza main

if __name__ == "__main__":
    # Manual driver for the stanza pipeline: load the input dict, prepare the
    # stanza-specific configuration, and run the pipeline on the sample text.
    # NOTE(review): `mstanza_preprocess` and `mstanza_pipeline` are referenced
    # unqualified; they only resolve where those names are in scope
    # (presumably inside the mstanza module) - confirm before running here.

    # bound to `input_dict` rather than `dict` so the builtin is not shadowed
    input_dict = be.prepare_run.load_input_dict("./src/annotator/input")
    # take only the part of the dict pertaining to stanza
    stanza_dict = input_dict["stanza_dict"]
    # to point to user-defined model directories
    # stanza does not accommodate fully at the moment
    mydict = mstanza_preprocess.fix_dict_path(stanza_dict)
    # debugging output: the stanza section as loaded and after fix_dict_path
    print(stanza_dict)
    print(mydict)
    # stanza does not care about the extra comment keys
    # but we remove them for subsequent processing just in case
    # now we need to select the processors and "activate" the sub-dictionaries
    mydict = be.prepare_run.update_dict(mydict)
    mydict = be.prepare_run.activate_procs(mydict, "stanza_")
    mytext = be.prepare_run.get_sample_text()
    # mytext = "This is an example. And here we go."
    # initialize instance of the class
    obj = mstanza_pipeline(mydict)
    obj.init_pipeline()
    # `out` is not consumed further in this script yet (see the note below)
    out = obj.process_text(mytext)
    obj.postprocess()
    # For the output:
    # We need a module that transforms a generic dict into xml.