In [1]:
import inspect
import pathlib
from collections import Counter

import torch
from spacy import displacy

from src.B_spacy_pipeline.spacy_pipe_process import SpacyPipeProcess
from src.settings.enums import ExtractionType, NaturalLanguage, SpacyTask, SpacyExt
from src.A_data.data_loader import DataLoader

In [2]:
# Ask PyTorch to release the GPU memory currently held in its CUDA cache
# before the NER pipelines below are constructed.
torch.cuda.empty_cache()

#### NER PIPELINES

##### 1. Rule-Based Approach with REGEX

In [3]:
# Build the NER pipeline for `NaturalLanguage.DE` that uses the TRADITIONAL
# (rule-based / REGEX) extraction method, then list its spaCy components.
ner_pipe_regex = SpacyPipeProcess(
    natural_language=NaturalLanguage.DE,
    spacy_task=SpacyTask.NER,
    ner_method=ExtractionType.TRADITIONAL,
)
regex_pipe_names = ner_pipe_regex.nlp.pipe_names
print('pipe components:', regex_pipe_names)

GPU is used: True
custom extensions "init_extensions" initialized
regex_entity_pattern for own_regex_search were compiled.
Function "own_regex_search" initialized
Added patterns to token_matcher
Function "func_comp_name_token_regex_match" initialized
custom extensions "own_sentencizer" initialized
pipe components: ['own_sentencizer', 'tok2vec', 'own_regex_search', 'comp_name_token_regex_match']


In [4]:
# Inspect the regex patterns held by the 'own_regex_search' component.
# The complete collection is very long, so report its size and display only
# the first few entries instead of dumping everything into the notebook.
# NOTE(review): assumes `regex_entity_patterns` is a sliceable sequence (the
# previous cell output rendered it as a list) - confirm.
regex_patterns = ner_pipe_regex.nlp.get_pipe('own_regex_search').regex_entity_patterns
print('number of regex patterns:', len(regex_patterns))
regex_patterns[:10]

['(?P<SYMB_1U1_DOT_DE>(?:(?<=\\s)|(?<=^))(?:(?:\\b1&1\\b)\\s*(?i:\\bAG\\b)?)(?=\\s))',
 '(?P<SYMB_ALMIL_DOT_PA>(?:(?<=\\s)|(?<=^))(?:(?:\\b1000mercis|1000Mercis\\b)\\s*(?i:\\bNone\\b)?)(?=\\s))',
 '(?P<SYMB_TGT_DOT_DE>(?:(?<=\\s)|(?<=^))(?:(?:\\b11880\\b)\\s*(?i:\\bSolutions\\b)\\s*(?i:\\bAG\\b)?)(?=\\s))',
 '(?P<SYMB_SPA_DOT_L>(?:(?<=\\s)|(?<=^))(?:(?:\\b1Spatial\\b)\\s*(?i:\\bPlc\\b)?)(?=\\s))',
 '(?P<SYMB_AL2SI_DOT_PA>(?:(?<=\\s)|(?<=^))(?:(?:\\b2CRSI|2Crsi\\b)\\s*(?i:\\bS.A.\\b)?)(?=\\s))',
 '(?P<SYMB_2GB_DOT_DE>(?:(?<=\\s)|(?<=^))(?:(?:\\b2G\\b)\\s*(?:\\bEnergy\\b)\\s*(?i:\\bAG\\b)?)(?=\\s))',
 '(?P<SYMB_UUU_DOT_DE>(?:(?<=\\s)|(?<=^))(?:(?:\\b3U\\b)\\s*(?i:\\bHolding\\b)\\s*(?i:\\bAG\\b)?)(?=\\s))',
 '(?P<SYMB_III_DOT_L>(?:(?<=\\s)|(?<=^))(?:(?:\\b3i|3I\\b)\\s*(?i:\\bGroup\\b)\\s*(?i:\\bplc\\b)?)(?=\\s))',
 '(?P<SYMB_3IN_DOT_L>(?:(?<=\\s)|(?<=^))(?:(?:\\b3i|3I\\b)\\s*(?:\\bInfrastructure\\b)\\s*(?i:\\bplc\\b)?)(?=\\s))',
 '(?P<SYMB_450_DOT_L>(?:(?<=\\s)|(?<=^))(?:(?:\\b450\\b)\\s*

##### 2. Pretrained spaCy GLiNER model

In [5]:
# Build the NER pipeline for `NaturalLanguage.DE` that uses the PRETRAINED
# extraction method, then list its spaCy components.
ner_pipe_pretrained = SpacyPipeProcess(
    natural_language=NaturalLanguage.DE,
    spacy_task=SpacyTask.NER,
    ner_method=ExtractionType.PRETRAINED,
)
pretrained_pipe_names = ner_pipe_pretrained.nlp.pipe_names
print('pipe components:', pretrained_pipe_names)

GPU is used: True
custom extensions "init_extensions" initialized
"GLINER" api initialized
custom extensions "own_sentencizer" initialized
pipe components: ['own_sentencizer', 'tok2vec', 'gliner_spacy']


#### DATA

In [6]:
# Resolve the demo data set relative to the module that defines `DataLoader`
# (imported from `src.A_data.data_loader`) instead of hard-coding an absolute,
# machine-specific path; the notebook then runs from any checkout location.
# NOTE(review): assumes the parquet file lives in the `monthly/` folder next to
# `data_loader.py`, as it did in the original absolute path - confirm.
DATA_DIR = pathlib.Path(inspect.getfile(DataLoader)).parent / 'monthly'

dl = DataLoader()
path = DATA_DIR / 'df_demo.parquet'
df_all = dl.load_df(path=path)

# Display only a preview rather than the complete frame.
df_all.head(10)

Unnamed: 0,art_source,art_url,art_type,art_datetime,art_language,art_isin,art_company_name,art_title,art_text,art_author,art_abstract,art_video_url,pp_art_text
0,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 10:54:00+02:00,DE,DE000STAB1L8,Stabilus SE,AKTIE IM FOKUS: Stabilus verlieren - Anleger u...,FRANKFURT (dpa-AFX) - Die Ergebnisse von Stabi...,,,,Die Ergebnisse von Stabilus fuer das zweite Ge...
1,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:34:00+02:00,DE,DE0006450000,LPKF Laser & Electronics SE,EQS-DD: LPKF Laser & Electronics SE (deutsch),EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,,,,EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...
2,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...
3,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:30:00+02:00,DE,FR0000120578,Sanofi S.A.,ANALYSE-FLASH: Deutsche Bank Research senkt Sa...,FRANKFURT (dpa-AFX Broker) - Deutsche Bank Res...,,,,Deutsche Bank Research hat Sanofi nach Quartal...
4,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 08:43:00+02:00,DE,DE0005493092,Borussia Dortmund GmbH & Co. KGaA,Spekulationen um mögliche Sancho-Rückkehr zum BVB,DORTMUND (dpa-AFX) - Rund um Fußball-Bundeslig...,,,,Rund um Fussball-Bundesligist Borussia Dortmun...
5,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 10:56:00+02:00,DE,DE000A1MME74,Netfonds AG,EQS-News: Netfonds AG: Erwerb eigener Aktien -...,Netfonds AG: Erwerb eigener Aktien - 24. Zwisc...,,,,Netfonds AG: Erwerb eigener Aktien - 24. Zwisc...
6,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 08:05:00+02:00,DE,DE000A2LQ884,AUTO1 Group SE,Online-Gebrauchtwagenhändler Auto1 reduziert V...,BERLIN (dpa-AFX) - Der Online-Gebrauchtwagenhä...,,,,Der Online-Gebrauchtwagenhaendler Auto1 hat im...
7,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 10:35:00+02:00,DE,DE000A2YN900,TeamViewer SE,ANALYSE-FLASH: JPMorgan belässt Teamviewer auf...,NEW YORK (dpa-AFX Broker) - Die US-Bank JPMorg...,,,,Die US-Bank JPMorgan hat die Einstufung fuer T...
8,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 10:16:00+02:00,DE,DE0006452907,Nemetschek SE,ANALYSE-FLASH: Baader Bank senkt Nemetschek au...,MÜNCHEN (dpa-AFX Broker) - Die Baader Bank hat...,,,,Die Baader Bank hat Nemetschek nach Zahlen zum...
9,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-03 09:50:00+02:00,DE,DE000STAB1L8,Stabilus SE,ANALYSE-FLASH: SocGen hebt Stabilus auf 'Buy' ...,PARIS (dpa-AFX Broker) - Die französische Groß...,,,,Die franzoesische Grossbank Societe Generale h...


In [None]:
# Manually recorded comparison of the two NER approaches per sample index.
# NOTE(review): the numbers are presumably row labels of `df_all` - confirm.

# Both approaches are approximately equal
equal = [
    0, 1, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 16, 17,
    18, 20, 21, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
]

# REGEX approach is better
better = [7, 8, 23]

# REGEX approach is worse
worse = [2]

In [1]:
# Select one article by its row label and print (at most the first 30000
# characters of) its `pp_art_text` column. Requires `df_all` from the DATA
# section above to be loaded first.
index = 6
text = df_all.loc[index, 'pp_art_text']
text_preview = text[:30000]
print('Sample Text:\n\n', text_preview, '\n')

NameError: name 'df_all' is not defined

#### NER Processing

In [14]:
# Note: GLiNER
doc_pretrained = %time ner_pipe_pretrained.process_text(text=text)

CPU times: user 1.93 s, sys: 21.8 ms, total: 1.95 s
Wall time: 355 ms


In [15]:
# Note: REGEX
doc_regex = %time ner_pipe_regex.process_text(text=text)

search_match in get_search_match_instance: SearchMatch(comp_name='JPMorgan Global Core Real Assets Limited', comp_symbol='JARA.L', text='JPMorgan', label='OWN-REGEX', start_idx=12, end_idx=20, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_match in get_search_match_instance: SearchMatch(comp_name='TeamViewer AG', comp_symbol='TMV.DE', text='Teamviewer', label='OWN-REGEX', start_idx=45, end_idx=55, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)
search_matches in get_matches_from_regex_search: [SearchMatch(comp_name='JPMorgan Global Core Real Assets Limited', comp_symbol='JARA.L', text='JPMorgan', label='OWN-REGEX', start_idx=12, end_idx=20, idx_refer_to=<IDXReferTo.CHARS: 'chars'>), SearchMatch(comp_name='TeamViewer AG', comp_symbol='TMV.DE', text='Teamviewer', label='OWN-REGEX', start_idx=45, end_idx=55, idx_refer_to=<IDXReferTo.CHARS: 'chars'>)]
CPU times: user 540 ms, sys: 5.41 ms, total: 545 ms
Wall time: 549 ms


In [16]:
# Note: REGEX
# Collect the text of every entity that carries a non-empty `comp_symbol`
# extension value and print where that value was set.
comp_names = []
for ent in doc_regex.ents:
    if ent.has_extension('comp_symbol') and ent._.comp_symbol != '':
        comp_names.append(ent.text)
        print('COMPANY:', ent.text, '--- COMP-SYMBOL:', ent._.comp_symbol, '--- SET-IN-FUNCTION:', ent._.set_in)

# Most frequently matched company name, or None when nothing was matched.
# Counter.most_common is linear in len(comp_names), whereas sorting by
# `comp_names.count` rescans the list for every element. Ties resolve to the
# name encountered first, which matches the previous stable-sort behaviour.
# The name is cut to its first 18 characters.
# NOTE(review): the cut is presumably meant to make the substring comparison
# against the GLiNER entities more lenient - confirm.
comp_name = Counter(comp_names).most_common(1)[0][0][:18] if comp_names else None
print('---------------------')
print('comp_name:', comp_name, '--- len(comp_names):', len(comp_names))

COMPANY: JPMorgan --- COMP-SYMBOL: JARA.L --- SET-IN-FUNCTION: comp_name_token_regex_match
COMPANY: Teamviewer --- COMP-SYMBOL: TMV.DE --- SET-IN-FUNCTION: comp_name_token_regex_match
---------------------
comp_name: JPMorgan --- len(comp_names): 2


#### DISPLACY

In [17]:
# Note: REGEX
# Render the entities of `doc_regex`, restricted to the three labels below,
# all highlighted in green.
regex_labels = ['OWN-REGEX', 'FUZZY', "ORG-PART"]
colors = dict.fromkeys(regex_labels, 'green')
options = {'ents': list(regex_labels), 'colors': colors}
displacy.render(doc_regex, style="ent", options=options)

In [18]:
# Note: GLiNER
# Keep only those GLiNER entities whose text contains (case-insensitively) one
# of the company names found by the REGEX pipeline; the set removes duplicates
# when several names hit the same entity span.
found_comps = {
    (ent.start_char, ent.end_char, ent.label_)
    for comp in comp_names
    for ent in doc_pretrained.ents
    if comp.lower() in ent.text.lower()
}

# displaCy's manual mode takes a plain dict with the text and entity spans.
span_keys = ('start', 'end', 'label')
vis_dict = {
    'text': text,
    'ents': [dict(zip(span_keys, span)) for span in found_comps],
    'title': None,
}

colors = {
    'organization': 'red',
    'ORG': 'red',
    'PER': 'orange',
    'MISC': 'yellow',
    'LOC': 'pink',
}
options = {'ents': list(colors), 'colors': colors}
displacy.render(vis_dict, manual=True, style="ent", options=options)

In [19]:
# List every GLiNER entity and count those whose text contains the most
# frequent REGEX company name (case-insensitive substring match).
# `comp_name` is None when the REGEX pipeline matched no company; calling
# `.lower()` on it would raise AttributeError, so in that case nothing is
# counted. The lower-cased name is computed once, outside the loop.
comp_name_lower = comp_name.lower() if comp_name is not None else None

count_pretrained = []
for ent in doc_pretrained.ents:
    print(ent.text, '---', ent.label_,)
    if comp_name_lower is not None and comp_name_lower in ent.text.lower():
        count_pretrained.append(ent.text)
print('-------------------------------------------')
print('len(count_pretrained):', len(count_pretrained))

US-Bank JPMorgan --- organization
Enterprise-Geschaeft --- organization
-------------------------------------------
len(count_pretrained): 1
