In [None]:
import pandas as pd
from spacy.tokens.span_group import SpanGroup

from src.spacy_models.spacy_pipe_build import SpacyPipeBuild
from src.spacy_models.spacy_utils import SpacyExt, SpacyTask
from src.settings.enums import NaturalLanguage
from src.data.data_loader import DataLoader

In [None]:
# Language setting for this notebook; it is passed as `natural_language`
# to every SpacyPipeBuild(...) call further down.
nat_lang = NaturalLanguage.DE

In [None]:
# Build the NER pipeline wrapper for the selected language.
# NOTE(review): use_gpu=True presumably needs an available GPU — confirm how
# SpacyPipeBuild behaves on CPU-only machines.
ner = SpacyPipeBuild(
    natural_language=nat_lang,
    spacy_task=SpacyTask.NER,
    use_gpu=True,
)

In [None]:
# Display the component names of the NER pipeline as a quick sanity check
# (`ner.nlp` is presumably a spaCy Language object — confirm in SpacyPipeBuild).
ner.nlp.pipe_names

In [None]:
# Period to analyse; change it here to load a different month.
year, month = 2023, 5

# Load the monthly DataFrame for that period through the project's DataLoader.
dl = DataLoader()
df = dl.load_monthly_df(year=year, month=month)

In [None]:
# Row labels of interest (presumably rows whose text mentions companies,
# judging by the variable name — confirm).
# TODO: improve pattern for 34, 35
comp_containing_indexes = (5, 7, 30, 31, 34, 35, 41)

# Row currently under inspection; its 'pp_art_text' value is the pipeline input.
index = 41
text = df.loc[index, "pp_art_text"]
text

In [None]:
# text = """ Heute hat 1&1 AG und 3U Holding AG und Grifal-Vorstand und A.G. BARR und 4GLOBAL plc und 11880 solutions und ACTIA-Vorstand und Adidas-Aktie und ABIONYX Pharma seine Zahlen bekannt gegeben. """

In [None]:
%%time
doc = ner.nlp(text=text)

In [None]:
# Print the `pos_` tag of every token in the document, one per line.
for token in doc:
    print(token.pos_)

In [None]:
# For each entity: character offsets plus the value of the custom
# COMP_SYMBOL extension attribute stored on the span.
symbol_ext = SpacyExt.COMP_SYMBOL.value
for entity in doc.ents:
    print(entity.start_char, entity.end_char, getattr(entity._, symbol_ext))

In [None]:
from spacy.tokens import Span

# Probe hand-built spans: construct fixed-width spans starting at the first
# token offsets and print their bounds/text, the value of the COMP_NAME
# extension on them, and the entities they contain.
span_len = 3      # tokens per probe span
max_probes = 10   # number of start offsets to try

# Span(...) raises IndexError when `end` exceeds len(doc), so limit the number
# of probes on short documents instead of crashing.
n_probes = max(0, min(max_probes, len(doc) - span_len + 1))
for i in range(n_probes):
    span = Span(doc, start=i, end=i + span_len)
    print('span', type(span), (span.start, span.end, span.text), 'ext:', getattr(span._, SpacyExt.COMP_NAME.value), 'span.ent:', span.ents)

In [None]:
# Print every entity span found in the document, one per line.
for entity in doc.ents:
    print(entity)

In [None]:
# for t in doc:
#     print(t.i, t.idx, t.text)

In [None]:
# One line per entity: text, token bounds, label, and whether the COMP_NAME
# extension is the empty string.
comp_name_ext = SpacyExt.COMP_NAME.value
for entity in doc.ents:
    comp_name_is_empty = getattr(entity._, comp_name_ext) == ""
    print(entity.text, entity.start, entity.end, entity.label_, comp_name_is_empty)

In [None]:
# Detailed dump per entity: text, the three custom extension values
# (SET_IN, COMP_NAME, COMP_SYMBOL) and the entity label, followed by a
# separator line.
for entity in doc.ents:
    # Optional filters (disabled):
    # if getattr(ent._, SpacyExt.DOMAIN.ext_name) != '':
    # if ent.has_extension(SpacyExt.COMP_NAME.ext_name):
    ext = entity._
    fields = (
        ('ENT_TEXT:', entity.text),
        ('SET_IN:', getattr(ext, SpacyExt.SET_IN.value)),
        ('COMP_NAME:', getattr(ext, SpacyExt.COMP_NAME.value)),
        ('COMP_SYMBOL:', getattr(ext, SpacyExt.COMP_SYMBOL.value)),
        ('ent.label_:', entity.label_),
    )
    for caption, value in fields:
        print(caption, value)
    print('------------------------------------')

In [None]:
# List, per sentence, the non-empty COMP_NAME values of its entities together
# with the sentence text. Only runs when the pipeline has a "parser" component
# (presumably because `doc.sents` needs sentence boundaries — confirm).
if "parser" in ner.nlp.pipe_names:
    comp_name_ext = SpacyExt.COMP_NAME.value
    for ind, sent in enumerate(doc.sents):
        # Read the extension once per entity and build the list once per
        # sentence, instead of rebuilding the same list for the test and the print.
        comp_names = [
            name
            for name in (getattr(ent._, comp_name_ext) for ent in sent.ents)
            if name != ""
        ]
        if comp_names:
            print(ind, '-', comp_names, '-', sent.text)

In [None]:
from spacy import displacy

# Highlight colour per entity label for the displaCy entity visualizer.
label_colors = {
    "ORG": "red",
    "ENT-RULER": "green",
    "OWN-REGEX": "orange",
    "ORG-PART": "blue",
    "FUZZY": "pink",
    "PER": "grey",
    "LOC": "gray",
}
options = {"colors": label_colors}

# Render the document's entities inline with the colours above.
displacy.render(doc, style="ent", options=options)

In [None]:
# Build a second pipeline wrapper for the COREF task, same language as above.
coref = SpacyPipeBuild(
    natural_language=nat_lang,
    spacy_task=SpacyTask.COREF,
)

In [None]:
# Display the component names of the COREF pipeline for comparison with the
# NER pipeline.
coref.nlp.pipe_names

In [None]:
# Hand-written sample with several pronoun references, used as input for the
# COREF pipeline.
# NOTE(review): this rebinds `text`, which earlier held the text taken from
# `df`; re-running the NER cell after this one would process the sample instead.
text = (
    "Weil er mit seiner Arbeit sehr beschäftigt war, hatte Peter davon genug. "
    "Er und seine Frau haben entschieden, dass ihnen ein Urlaub gut tun würde. "
    "Sie sind nach Spanien gefahren, weil ihnen das Land sehr gefiel."
)
doc_new = coref.nlp(text)

In [None]:
# Dump the custom extension values (SET_IN, COMP_NAME, COMP_SYMBOL) and the
# label of every entity in the COREF document, followed by a separator line.
for ent in doc_new.ents:
    # if ent.has_extension(SpacyExt.COMP_NAME.ext_name):
    print('SET_IN:', getattr(ent._, SpacyExt.SET_IN.value))
    print('COMP_NAME:', getattr(ent._, SpacyExt.COMP_NAME.value))
    # The caption now names the extension actually printed (COMP_SYMBOL);
    # it previously read 'COMP_LABEL:'.
    print('COMP_SYMBOL:', getattr(ent._, SpacyExt.COMP_SYMBOL.value))
    print('ent.label_:', ent.label_)
    print('------------------------------------')

In [None]:
# coref.nlp.pipe_names

In [None]:
# doc_new._.coref_chains

In [None]:
# for t in doc_new:
#     print(t.text)

In [None]:
# coref.show_ents(doc=doc)