In [None]:
import pandas as pd
from spacy.tokens.span_group import SpanGroup
import torch
import random
import multiprocessing as mp

from src.spacy_models.spacy_pipe_build import SpacyPipeBuild
from src.spacy_models.spacy_pipe_funcs import PipeFunc
from src.spacy_models.data_models import EntsWithCustExts
from src.spacy_models.spacy_utils import SpacyExt, SpacyTask, ExtractionType
from src.settings.enums import NaturalLanguage
from src.data.data_loader import DataLoader

from src.spacy_models.data_models import EntsWithCustExts

In [None]:
# Ask PyTorch to release its cached, currently unused CUDA memory before the
# GPU-backed pipeline is built further down.
torch.cuda.empty_cache()

In [None]:
# Language setting that is handed to the pipeline builder below.
nat_lang = NaturalLanguage.EN

In [None]:
# Build the NER pipeline for the language selected in `nat_lang`, on the GPU,
# using the generative-LLM extraction type.
ner = SpacyPipeBuild(
    natural_language=nat_lang,
    spacy_task=SpacyTask.NER,
    use_gpu=True,
    extraction_type=ExtractionType.GENERATIVE_LLM,
)

In [None]:
# Display the component names of the pipeline that was just built (sanity check).
ner.nlp.pipe_names

In [None]:
# Month of articles to analyse, then load that month's DataFrame.
year, month = 2023, 5
dl = DataLoader()
df = dl.load_monthly_df(year=year, month=month)

In [None]:
# Number of articles to sample and the seed that makes the draw repeatable
# across "Restart Kernel -> Run All".
n_samples = 5
sample_seed = 42

# Draw index *labels*, not positions: the following cells select the sampled
# rows with the label-based `df.loc[indexes]`, which would raise or pick the
# wrong rows for positional integers whenever `df` does not carry a default
# 0..n-1 index. For a default index the labels equal the positions, so the
# result keeps its previous meaning.
# `k` is clamped so a month with fewer than `n_samples` rows does not raise.
# A local `random.Random` is used so the global RNG state is left untouched.
indexes = random.Random(sample_seed).sample(
    population=df.index.tolist(),
    k=min(n_samples, len(df.index)),
)
indexes

In [None]:
# Article texts ('pp_art_text' column) of the sampled rows as a plain list,
# displayed for inspection.
texts: list[str] = df['pp_art_text'].loc[indexes].tolist()
texts

In [None]:
%%time
# NOTE(review): the only statement in this cell is commented out, so the
# `%%time` magic currently has nothing to measure.
# docs = ner.nlp.pipe(texts=texts, batch_size=5)

In [None]:
# for ind, doc in enumerate(ner.nlp.pipe(texts=texts, batch_size=5)):
#     res = PipeFunc.get_sentences_with_custom_extensions(processed_doc=doc)
#     print('type(res):', type(res))
#     print(res)
#     print('--------------------------------------------------------------------------------------------------')

### Alternative: APPLY

In [None]:
# Work on an explicit copy of the sampled rows: a new column is assigned to
# this frame further down, and an independent copy keeps that assignment from
# ever being treated as a write into a slice of `df` (SettingWithCopyWarning).
df_reduced = df.loc[indexes].copy()
df_reduced

In [None]:
# df_reduced['spacy_test'] = df_reduced.apply(lambda x: PipeFunc.get_sentences_with_custom_extensions(processed_doc=ner.nlp(x.pp_art_text)), axis=1)

In [None]:
# df_reduced

In [None]:
# df_reduced.spacy_test.info()

In [None]:
# df_reduced['spacy_test'].tolist()

#### spaCy PIPE

In [None]:
# Stream the sampled article texts through the pipeline and store, per article,
# whatever `PipeFunc.get_sentences_with_custom_extensions` returns for the
# processed document.
df_reduced['test_spacy_3'] = [
    PipeFunc.get_sentences_with_custom_extensions(processed_doc=parsed)
    for parsed in ner.nlp.pipe(df_reduced['pp_art_text'])
]

In [None]:
# Inspect the sampled rows together with the newly added 'test_spacy_3' column.
df_reduced

In [None]:
# Show the per-article extraction results as a plain Python list.
df_reduced.test_spacy_3.tolist()

In [None]:

# Hand-built example input for the masking helper defined below: one sentence
# plus the entity records found in it.
# NOTE(review): start_char/end_char lie beyond the end of this sentence, so
# they are presumably offsets into the full article text -- confirm.
text_dict = {
    'sentence': 'Trotz der hinter den Erwartungen liegenden Ergebnisentwicklung verbleibe der Ausblick auf das uebrige Geschaeftsjahr positiv, teilte das SDax -Unternehmen am Freitagabend in Dortmund mit.',
    'entities': [
        EntsWithCustExts(
            start_char=248,
            end_char=269,
            ent_text='das SDax -Unternehmen',
            comp_name='adesso SE',
            comp_symbol='ADN1.DE',
            set_in='xx_coref_resolve',
            df_index=None,
        )
    ],
}

In [None]:
from functools import reduce
def mask_company_name(text_dict: dict, replace_str: str = "Comp@Name@Placeholder") -> str:
    """Replace every entity mention in a sentence with a placeholder string.

    Args:
        text_dict: Mapping with a ``'sentence'`` entry (the text to mask) and an
            ``'entities'`` entry (an iterable of objects exposing ``ent_text``).
        replace_str: Text inserted in place of each entity mention.

    Returns:
        The sentence with all occurrences of every non-empty ``ent_text``
        replaced by ``replace_str``; the sentence unchanged when there is
        nothing to replace.
    """
    import re  # local import so this cell stays self-contained

    text: str = text_dict['sentence']

    # De-duplicate and drop empty mentions: replacing '' would insert the
    # placeholder between every character of the sentence.
    mentions = {ent.ent_text for ent in text_dict['entities'] if ent.ent_text}
    if not mentions:
        return text

    # Longest mention first, so a mention that contains another one
    # ('adesso SE' vs. 'adesso') is masked as a whole instead of leaving a
    # dangling remainder behind. The secondary key only makes the order stable.
    ordered = sorted(mentions, key=lambda mention: (-len(mention), mention))
    pattern = '|'.join(re.escape(mention) for mention in ordered)

    # One pass over the text: inserted placeholders are never re-scanned, and
    # the callable keeps `replace_str` literal (no backslash/group expansion).
    return re.sub(pattern, lambda _match: replace_str, text)

In [None]:
# Try the masking helper on the hand-built example with the default placeholder.
mask_company_name(text_dict)

In [None]:
from pprint import pprint

# For every sampled article, mask the company mentions in each non-empty
# sentence entry; articles that yield no masked sentence are left out.
text_sentences = []
for article_entries in df_reduced.test_spacy_3.tolist():
    masked_sentences = [
        mask_company_name(text_dict=entry)
        for entry in article_entries
        if entry
    ]
    if masked_sentences:
        text_sentences.append(masked_sentences)

pprint(text_sentences)

In [None]:
# Collapse the per-article lists into one flat list of masked sentences,
# keeping the original order.
sentences_flattened = []
for article_sentences in text_sentences:
    sentences_flattened.extend(article_sentences)
sentences_flattened

In [None]:
# Persist the masked sentences as a single-column ("sents") parquet file.
# A dedicated name is used instead of rebinding `df`: `df` holds the article
# frame loaded at the top of the notebook, and overwriting it here would make
# the sampling cells fail (no 'pp_art_text' column) when they are re-run.
sentences_df = pd.DataFrame(sentences_flattened, columns=["sents"])
sentences_df.to_parquet('../../src/data/comp_sentences.parquet')

In [None]:
# from spacy import displacy
# options = { "colors": {"ORG": "red","ENT-RULER":"green", "OWN-REGEX":"orange", "ORG-PART":"blue", "FUZZY":"pink", "PER":"grey", "LOC":"gray"}}
# displacy.render(doc, style='ent', options=options)