In [1]:
import torch
import pathlib
from spacy import displacy
from src.B_spacy_pipeline.spacy_pipe_process import SpacyPipeProcess
from src.settings.enums import ExtractionType, NaturalLanguage, SpacyTask, SpacyExt
from src.A_data.data_loader import DataLoader

In [2]:
# Release unoccupied cached GPU memory held by PyTorch's CUDA caching allocator
# before the two pipelines below are created.
torch.cuda.empty_cache()

In [3]:
# Note: Rule-Based Model with Crosslingual-Coreference.
# Pipeline A combines the traditional (rule/regex based) entity extraction with
# the pretrained coreference method.
coref_xx = SpacyPipeProcess(
    natural_language=NaturalLanguage.DE,
    spacy_task=SpacyTask.COREF,
    ner_method=ExtractionType.TRADITIONAL,
    coref_method=ExtractionType.PRETRAINED,
)

# List the components that ended up in the underlying spaCy pipeline.
print('pipe components:', coref_xx.nlp.pipe_names)

GPU is used: True
custom extensions "init_extensions" initialized
regex_entity_pattern for own_regex_search were compiled.
Function "own_regex_search" initialized
Added patterns to token_matcher
Function "func_comp_name_token_regex_match" initialized
Function "own_coref_resolve_pretrained" initialized
pipe components: ['tok2vec', 'own_regex_search', 'comp_name_token_regex_match', 'xx_coref_resolve']


In [4]:
# Note: Generative LLM Model using LangChain and OpenAI
coref_gen_llm = SpacyPipeProcess(natural_language=NaturalLanguage.DE, spacy_task=SpacyTask.COREF, 
                ner_method=ExtractionType.TRADITIONAL, coref_method=ExtractionType.GENERATIVE_LLM)
print('pipe components:', coref_gen_llm.nlp.pipe_names)

GPU is used: True
custom extensions "init_extensions" initialized
regex_entity_pattern for own_regex_search were compiled.
Function "own_regex_search" initialized
Added patterns to token_matcher
Function "func_comp_name_token_regex_match" initialized
Function "own_coref_resolve_generative" initialized
pipe components: ['tok2vec', 'own_regex_search', 'comp_name_token_regex_match', 'llm_coref_resolve']


#### DATA

In [5]:
# Load the demo article DataFrame from a parquet file.
# NOTE(review): the path below is absolute and machine-specific, so this cell only
# runs where that mount exists. Consider resolving it relative to a configurable
# data directory instead.
dl = DataLoader()
path = pathlib.Path('/media/rainergo/PROJECTS/UASFRA-MS-Thesis/src/A_data/monthly/df_demo.parquet')
df_all = dl.load_df(path=path)

# The sample-text cell reads the preprocessed article text from this column;
# fail here with a clear message rather than later with a KeyError.
assert 'pp_art_text' in df_all.columns, "expected column 'pp_art_text' in the loaded DataFrame"

# Preview only the first rows instead of rendering the whole frame.
df_all.head(10)

Unnamed: 0,art_source,art_url,art_type,art_datetime,art_language,art_isin,art_company_name,art_title,art_text,art_author,art_abstract,art_video_url,pp_art_text
0,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 10:54:00+02:00,DE,DE000STAB1L8,Stabilus SE,AKTIE IM FOKUS: Stabilus verlieren - Anleger u...,FRANKFURT (dpa-AFX) - Die Ergebnisse von Stabi...,,,,Die Ergebnisse von Stabilus fuer das zweite Ge...
1,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:34:00+02:00,DE,DE0006450000,LPKF Laser & Electronics SE,EQS-DD: LPKF Laser & Electronics SE (deutsch),EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,,,,EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...
2,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...
3,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:30:00+02:00,DE,FR0000120578,Sanofi S.A.,ANALYSE-FLASH: Deutsche Bank Research senkt Sa...,FRANKFURT (dpa-AFX Broker) - Deutsche Bank Res...,,,,Deutsche Bank Research hat Sanofi nach Quartal...
4,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 08:43:00+02:00,DE,DE0005493092,Borussia Dortmund GmbH & Co. KGaA,Spekulationen um mögliche Sancho-Rückkehr zum BVB,DORTMUND (dpa-AFX) - Rund um Fußball-Bundeslig...,,,,Rund um Fussball-Bundesligist Borussia Dortmun...
5,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 10:56:00+02:00,DE,DE000A1MME74,Netfonds AG,EQS-News: Netfonds AG: Erwerb eigener Aktien -...,Netfonds AG: Erwerb eigener Aktien - 24. Zwisc...,,,,Netfonds AG: Erwerb eigener Aktien - 24. Zwisc...
6,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 08:05:00+02:00,DE,DE000A2LQ884,AUTO1 Group SE,Online-Gebrauchtwagenhändler Auto1 reduziert V...,BERLIN (dpa-AFX) - Der Online-Gebrauchtwagenhä...,,,,Der Online-Gebrauchtwagenhaendler Auto1 hat im...
7,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 10:35:00+02:00,DE,DE000A2YN900,TeamViewer SE,ANALYSE-FLASH: JPMorgan belässt Teamviewer auf...,NEW YORK (dpa-AFX Broker) - Die US-Bank JPMorg...,,,,Die US-Bank JPMorgan hat die Einstufung fuer T...
8,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 10:16:00+02:00,DE,DE0006452907,Nemetschek SE,ANALYSE-FLASH: Baader Bank senkt Nemetschek au...,MÜNCHEN (dpa-AFX Broker) - Die Baader Bank hat...,,,,Die Baader Bank hat Nemetschek nach Zahlen zum...
9,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 09:50:00+02:00,DE,DE000STAB1L8,Stabilus SE,ANALYSE-FLASH: SocGen hebt Stabilus auf 'Buy' ...,PARIS (dpa-AFX Broker) - Die französische Groß...,,,,Die franzoesische Grossbank Societe Generale h...


In [None]:
# Manually curated verdicts comparing the GEN LLM approach with the pretrained one.
# presumably the numbers are row labels of df_all — TODO confirm

# GEN LLM approach is worse
worse = [6, 7, 16, 17, 23]

# GEN LLM approach is better
better = [4, 10, 12, 22, 32, 33, 36, 37]

# Both approaches are approximately equal
# NOTE(review): the second element is the literal Ellipsis object (written `...`
# originally), i.e. a placeholder stored in the list, not a range from 8 to 11.
# Replace it with the actual indices.
equal = [8, Ellipsis, 11, 14]

In [34]:
# Select one article by its row label and keep its preprocessed text in `text`,
# which the coreference cells below process.
index = 32
text = df_all.loc[index, 'pp_art_text']

# Only the first 2000 characters are printed; `text` itself stays complete.
preview = text[:2000]
print('Sample Text:\n\n', preview, '\n')

Sample Text:

 EnviTec Biogas AG: Vorschlag fuer Dividende auf Basis des positiven Geschaeftsergebnisses 2022 und Prognose fuer das Geschaeftsjahr 2023.  EnviTec Biogas AG: Vorschlag fuer Dividende auf Basis des positiven Geschaeftsergebnisses 2022 und Prognose fuer das Geschaeftsjahr 2023.  Veroeffentlichung einer Insiderinformation nach Artikel 17 der Verordnung  Nr. 596 2014, uebermittelt durch EQS News - ein Service der EQS Group AG.  EnviTec Biogas AG: Vorschlag fuer Dividende auf Basis des positiven Geschaeftsergebnisses 2022 und Prognose fuer das Geschaeftsjahr 2023.  Aufgrund des positiven Abschlusses des Geschaeftsjahres 2022 hat der Aufsichtsrat der EnviTec Biogas AG heute uebereinstimmend mit dem Vorschlag des Vorstands beschlossen, der Hauptversammlung am 3. Juli 2023 die Ausschuettung einer Dividende in Hoehe von EUR 2,00 je Aktie fuer das Geschaeftsjahr 2022 vorzuschlagen. Der Vorstand der EnviTec Biogas AG hat heute zudem nach Abstimmung mit dem Aufsichtsrat die Prognose

#### COREF Processing

In [35]:
# Note: Doc-object from pretrained
# Runs the sample text through the pretrained coref pipeline (coref_xx) and keeps the
# result; the %time line magic additionally reports how long the call took.
doc_pretrained = %time coref_xx.process_text(text=text)

search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=0, end_idx=14, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=139, end_idx=153, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=428, end_idx=442, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas AG', label='OWN-REGEX', start_idx=653, end_idx=670, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTe

In [36]:
# Note: Doc-object from gen_llm
# Runs the same sample text through the generative-LLM coref pipeline (coref_gen_llm) and
# keeps the result; the %time line magic additionally reports how long the call took.
doc_gen_llm = %time coref_gen_llm.process_text(text=text)

search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=0, end_idx=14, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=139, end_idx=153, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas', label='OWN-REGEX', start_idx=428, end_idx=442, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTec Biogas AG', label='OWN-REGEX', start_idx=653, end_idx=670, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='EnviTec Biogas AG', comp_symbol='ETG.DE', text='EnviTe

In [37]:
# Note: entities of the generative-LLM doc (doc_gen_llm) that carry a company symbol.
# Their texts are collected in `comp_names`; the most frequent one becomes `comp_name`,
# which the counting cell further down uses as its search term.
comp_names = []
for ent in doc_gen_llm.ents:
    # Skip entities without a (non-empty) company symbol.
    if ent.has_extension('comp_symbol') and ent._.comp_symbol != '':
        comp_names.append(ent.text)
        print('TEXT:', ent.text,'COMP-Name:', ent._.comp_name, '--- COM-SYMBOL:', ent._.comp_symbol, '--- SET-IN-FUNCTION:', ent._.set_in)

# Most frequent entity text. On a tie the text that occurs first in the document wins
# (max() returns the first maximal item, matching a stable descending sort).
# None when no entity qualified.
comp_name = max(comp_names, key=comp_names.count) if comp_names else None
print('---------------------')
print('comp_name:', comp_name, '--- len(comp_names):', len(comp_names))

TEXT: EnviTec Biogas COMP-Name: EnviTec Biogas AG --- COM-SYMBOL: ETG.DE --- SET-IN-FUNCTION: own_regex_search
TEXT: EnviTec Biogas COMP-Name: EnviTec Biogas AG --- COM-SYMBOL: ETG.DE --- SET-IN-FUNCTION: own_regex_search
TEXT: EnviTec Biogas COMP-Name: EnviTec Biogas AG --- COM-SYMBOL: ETG.DE --- SET-IN-FUNCTION: own_regex_search
TEXT: EnviTec Biogas AG COMP-Name: EnviTec Biogas AG --- COM-SYMBOL: ETG.DE --- SET-IN-FUNCTION: own_regex_search
TEXT: EnviTec Biogas AG COMP-Name: EnviTec Biogas AG --- COM-SYMBOL: ETG.DE --- SET-IN-FUNCTION: own_regex_search
---------------------
comp_name: EnviTec Biogas --- len(comp_names): 5


In [38]:
# Note: Generative LLM approach
# Entity labels set by the non-coref components are drawn in grey; the label set by the
# LLM coreference step is highlighted in green.
colors = {label: 'grey' for label in ('OWN-REGEX', 'FUZZY', "ORG-PART")}
colors['LLM-COREF'] = 'green'

# Render exactly the labels that have a colour assigned, in the same order.
options = {'ents': list(colors), 'colors': colors}
displacy.render(doc_gen_llm, style="ent", options=options)

In [39]:
# Note: PRETRAINED approach: xx-coref
# Collect the distinct (start, end, label) character spans of the relevant entities and
# hand them to displaCy in manual mode together with the raw text.
found_comps = {
    (ent.start_char, ent.end_char, ent.label_)
    for ent in doc_pretrained.ents
    if ent.label_ in ['XX-COREF', 'OWN-REGEX', 'ORG-PART']
}
vis_dict = {
    'text': text,
    'ents': [{'start': begin, 'end': stop, 'label': tag} for begin, stop, tag in found_comps],
    'title': None,
}

# Non-coref labels in grey, the pretrained coreference label highlighted in red.
colors = {label: 'grey' for label in ('OWN-REGEX', 'FUZZY', "ORG-PART")}
colors['XX-COREF'] = 'red'
options = {'ents': list(colors), 'colors': colors}
displacy.render(vis_dict, manual=True, style="ent", options=options)

In [40]:
# Print every entity of the pretrained doc and collect those whose text contains the
# most frequent company name (`comp_name`), compared case-insensitively.
# `comp_name` is None when no company entity was found earlier; in that case nothing
# can match, so the list stays empty instead of raising on `None.lower()`.
search_term = comp_name.lower() if comp_name is not None else None  # lowercase once, not per entity

count_pretrained = []
for ent in doc_pretrained.ents:
    print('TEXT:', ent.text, '---', 'COMP-Name:', ent._.comp_name,'---' ,'COMP-Symbol:', ent._.comp_symbol, '---', 'label:', ent.label_)
    if search_term is not None and search_term in ent.text.lower():
        count_pretrained.append(ent.text)
print('-------------------------------------------')
print('len(count_pretrained):', len(count_pretrained))

TEXT: EnviTec Biogas --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: OWN-REGEX
TEXT: EnviTec Biogas --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: OWN-REGEX
TEXT: EnviTec Biogas --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: OWN-REGEX
TEXT: EnviTec Biogas AG --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: OWN-REGEX
TEXT: EnviTec Biogas AG --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: OWN-REGEX
TEXT: dem Aufsichtsrat --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: XX-COREF
TEXT: Aufsichtsrat --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: XX-COREF
TEXT: Vorstands --- COMP-Name: EnviTec Biogas AG --- COMP-Symbol: ETG.DE --- label: XX-COREF
-------------------------------------------
len(count_pretrained): 5
