### MASTER-THESIS: Constructing a Knowledge Graph by extracting information from financial news articles

#### IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import torch
import pathlib
from tqdm.notebook import tqdm

In [2]:
from src.A_data.data_loader import DataLoader
from main_process import SpacyProcess, Process
from src.E_topic_model.traditional.topic_prepare import TextPreparer
from src.E_topic_model.traditional.topic_vectorize import Vectorizer
from src.E_topic_model.traditional.topic_dim_reduce import DimReducer
from src.E_topic_model.traditional.topic_cluster import Clusterer
from src.E_topic_model.traditional.topic_visualize import Visualizer
from src.E_topic_model.traditional.topic_model import TopicModel

from src.settings.config import ConfigTopic
from src.settings.enums import VectorizerType, ReductionMethod, ClusterMethod, SpacyTask, ExtractionType

#### DATA (After NER and COREF)

In [3]:
# Release unused cached GPU memory held by PyTorch before the pipelines below are created.
torch.cuda.empty_cache()

In [4]:
# Resolve the demo dataset relative to the project root instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the kernel's working directory is the project root
# (the directory that makes `from src...` importable) — confirm.
demo_parquet = pathlib.Path('src') / 'A_data' / 'monthly' / 'df_demo_after_ner_coref.parquet'

dl = DataLoader()
# load_df was originally handed a plain string path, so keep passing a str.
df = dl.load_df(str(demo_parquet))
print('Number of Sentences: ', len(df.index))

Number of Sentences:  163


In [5]:
# Show a bounded preview instead of dumping the whole frame into the notebook output.
df.head(10)

Unnamed: 0,art_source,art_url,art_type,art_datetime,art_language,art_isin,art_company_name,art_title,art_text,art_author,art_abstract,art_video_url,pp_art_text,ner_coref,art_id,top_sent,top_sent_masked
0,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 10:54:00+02:00,DE,DE000STAB1L8,Stabilus SE,AKTIE IM FOKUS: Stabilus verlieren - Anleger u...,FRANKFURT (dpa-AFX) - Die Ergebnisse von Stabi...,,,,Die Ergebnisse von Stabilus fuer das zweite Ge...,{'entities': [{'comp_name': 'JPMorgan Global C...,0,"Akshat Kacker von JPMorgan geht davon aus, das...",Akshat Kacker von Comp@Name@Placeholder geht d...
1,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:34:00+02:00,DE,DE0006450000,LPKF Laser & Electronics SE,EQS-DD: LPKF Laser & Electronics SE (deutsch),EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,,,,EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,{'entities': [{'comp_name': 'LPKF Laser & Elec...,1,EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,EQS-DD: Comp@Name@Placeholder SE: Dr. Klaus Fi...
2,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:34:00+02:00,DE,DE0006450000,LPKF Laser & Electronics SE,EQS-DD: LPKF Laser & Electronics SE (deutsch),EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,,,,EQS-DD: LPKF Laser & Electronics SE: Dr. Klaus...,{'entities': [{'comp_name': 'LPKF Laser & Elec...,1,LPKF Laser & Electronics SE.,Comp@Name@Placeholder SE.
3,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,"{'entities': [{'comp_name': 'SGL Carbon SE', '...",2,Der Kohlenstoffspezialist SGL Carbon duerfte n...,Der Kohlenstoffspezialist Comp@Name@Placeholde...
4,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,"{'entities': [{'comp_name': 'SGL Carbon SE', '...",2,Er beruecksichtigt dabei die Unternehmensziele...,Er beruecksichtigt dabei die Unternehmensziele...
5,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,"{'entities': [{'comp_name': 'SGL Carbon SE', '...",2,"Dies unterstreiche, dass SGL im Begriff sei, v...","Dies unterstreiche, dass Comp@Name@Placeholder..."
6,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,"{'entities': [{'comp_name': 'SGL Carbon SE', '...",2,2023 werde wohl noch ein Jahr der Investitione...,2023 werde wohl noch ein Jahr der Investitione...
7,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,{'entities': [{'comp_name': 'Deutsche Bank Akt...,2,Mit der Einstufung Buy empfiehlt die Deutsche ...,Mit der Einstufung Buy empfiehlt die Comp@Name...
8,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 11:39:00+02:00,DE,DE0007235301,SGL Carbon SE,ANALYSE/Deutsche Bank: Umstrukturierung von SG...,FRANKFURT (dpa-AFX) - Der Kohlenstoffspezialis...,,,,Der Kohlenstoffspezialist SGL Carbon duerfte n...,{'entities': [{'comp_name': 'Deutsche Bank Akt...,2,Analysierendes Institut,Comp@Name@Placeholder
9,dpa-afx-compact,https://mobile.traderfox.com/news/dpa-compact/...,unt,2023-05-02 09:30:00+02:00,DE,FR0000120578,Sanofi S.A.,ANALYSE-FLASH: Deutsche Bank Research senkt Sa...,FRANKFURT (dpa-AFX Broker) - Deutsche Bank Res...,,,,Deutsche Bank Research hat Sanofi nach Quartal...,{'entities': [{'comp_name': 'Deutsche Bank Akt...,3,Deutsche Bank Research hat Sanofi nach Quartal...,Comp@Name@Placeholder Research hat Comp@Name@P...


#### LOAD SPACY PIPELINES (EN / DE)

In [6]:
# Create the spaCy processing object for the BASIC task and list the
# components of each language pipeline.
spacy_process = SpacyProcess(spacy_task=SpacyTask.BASIC)
for label, nlp in (('EN pipeline:', spacy_process.nlp_en), ('DE pipeline:', spacy_process.nlp_de)):
    print(label, nlp.pipe_names)

CPU is used!
CPU is used!
EN pipeline: ['transformer', 'tagger', 'parser', 'lemmatizer']
DE pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser', 'lemmatizer']


#### START TOPIC MODELING

##### TRADITIONAL TOPIC MODELING

In [7]:
# Note: Prepare and reduce text
# Filters handed to TextPreparer.prepare(); all of them are switched on.
prepare_options = dict(
    rem_stopwords=True,
    rem_punctuation=True,
    rem_non_words=True,
    rem_non_content_words=True,
    lemmatize=True,
    lower_case=True,
)

preparer = TextPreparer(
    df=df,
    nlp_en=spacy_process.nlp_en,
    nlp_de=spacy_process.nlp_de,
    use_comp_mask=True,
    save_vocabulary=False,
)
# NOTE(review): a vocabulary is loaded here, but the one used downstream is the
# value returned by prepare() — confirm both are meant to be the same.
preparer.load_vocabulary()
df, vocabulary = preparer.prepare(**prepare_options)

GPU is used: True


In [8]:
# Prepared tokens per sentence (column presumably added by prepare() — confirm).
df['top_prep_sent_words']

0      [kacker, erreichen, gewinnausblick, verlauf, g...
1                                                 [kauf]
2                                                   <NA>
3      [kohlenstoffspezialist, einschaetzung, bank, f...
4      [beruecksichtigen, unternehmensziele, weisen, ...
5      [unterstreichen, begriff, umstrukturierungs, w...
6      [investition, stabilisierung, sparte, umsatz, ...
7      [einstufung, empfehlen, basis, gesamtrendite, ...
8                                                   <NA>
9      [quartalszahle, abgestuft, kursziel, euro, bel...
10     [jahresstart, bestaetigen, chance, jahresziel,...
11     [resultat, verdeutlichten, margenziel, tiefges...
12     [fussball, rueckkehr, offensivspieler, sommer,...
13        [zeitung, berichten, leihbasis, zurueckkehren]
14             [fluegelspieler, million, euro, wechseln]
15            [tv, sender, berichten, rueckkehr, sancho]
16                           [leihgeschaeft, ueberzeugt]
17                      [erwerb

In [9]:
# Rows whose prepared-token entry is missing.
no_tokens_mask = df['top_prep_sent_words'].isna()
df.loc[no_tokens_mask, 'top_prep_sent_words']

2     <NA>
8     <NA>
48    <NA>
99    <NA>
Name: top_prep_sent_words, dtype: object

In [10]:
# Note: Vectorizer
# Build TF-IDF vectors from the prepared frame, using the vocabulary returned
# by the preparation step; vectorize() hands back the frame and the vectors.
vectorizer = Vectorizer(
    df=df,
    vocabulary=vocabulary,
    vectorizer_type=VectorizerType.TFIDF,
    df_vector_name=ConfigTopic.vector_col_name,
)
df, all_vectors = vectorizer.vectorize()

Vectorizer.vectorize() was run.


In [11]:
# Shape of the vectorized data. The recorded run shows 159 rows, i.e. the 163
# loaded sentences minus the 4 rows whose prepared tokens are missing
# (presumably those rows are skipped during vectorization — confirm).
all_vectors.shape

(159, 542)

In [12]:
# Note: Dimension Reduction for Cluster
# NOTE(review): in the recorded run this cell fails with
# "ValueError: X has 542 features, but QuantileTransformer is expecting 15081
# features as input." Presumably the persisted scaler/reducer was fitted on
# vectors built from a larger vocabulary than the one passed to the Vectorizer
# in this notebook — confirm, and vectorize with the vocabulary the saved
# models were trained on before re-running.
cluster_dim_reducer = DimReducer(
    df=df,
    training_data=all_vectors,
    vectorizer_type=vectorizer.vectorizer_type,
    reduction_method=ReductionMethod.PCA,
    reduced_dim=20,
    df_vector_name=ConfigTopic.vector_col_name,
    df_red_vector_name=ConfigTopic.reduced_vector_col_name,
    model_name='cluster_dim_reducer.pkl',
    scaler_name='reduction_model_scaler.pkl',
)
cluster_dim_reducer.load_model()
df, all_reduced_vectors = cluster_dim_reducer.reduce()

ValueError: X has 542 features, but QuantileTransformer is expecting 15081 features as input.

In [None]:
# First five entries of the vectors stored on the reducer
# (presumably the same data returned by reduce() above — confirm).
cluster_dim_reducer.all_red_vectors[:5]

In [None]:
# Note: Cluster
# NOTE(review): a per-month model name ('cluster_model_monthly_<year>_<month>')
# was sketched here but is never passed; load_model() is called without arguments.
clusterer = Clusterer(
    df=df,
    dim_reduced_training_data=all_reduced_vectors,
    cluster_method=ClusterMethod.KMEANS,
    n_clusters=20,
)
clusterer.load_model()
clusterer.predict()

In [None]:
# Note: Dimension Reduction for Visualization
# Reduce the full vectors to 3 dimensions for plotting. The vectorizer type is
# read from the fitted vectorizer (as the cluster reducer and the Visualizer
# already do) so that changing the vectorizer in one place cannot leave this
# reducer configured for a different type.
# NOTE(review): the cluster reducer's model_name carries a '.pkl' suffix while
# this one does not, and no scaler_name is given here — confirm both are intended.
viz_dim_reducer = DimReducer(
    df=df,
    training_data=all_vectors,
    vectorizer_type=vectorizer.vectorizer_type,
    reduction_method=ReductionMethod.PCA,
    reduced_dim=3,
    df_vector_name=ConfigTopic.vector_col_name,
    df_red_vector_name=ConfigTopic.viz_reduced_vector_col_name,
    model_name='viz_dim_reducer',
)
viz_dim_reducer.load_model()
df, all_viz_reduced_vectors = viz_dim_reducer.reduce()

In [None]:
# Note: Topic Model
# Build the topic model on the clustered frame, configured with
# most_common_n_words=10, then create the topics.
top_model = TopicModel(
    df=df,
    vocabulary=vocabulary,
    most_common_n_words=10,
)
top_model.create_topics()

In [None]:
# First five rows of the per-sentence topic data.
top_model.df_data.head(5)

In [None]:
# Number of rows assigned to each cluster label.
top_model.df_data['cluster_label'].value_counts()

In [None]:
# Display the per-cluster frame held by the topic model
# (presumably one row per cluster with its topic words — confirm).
top_model.df_cluster

In [None]:
# Note: Visualize
# The Visualizer is told which vectorizer, reduction and clustering settings
# were used; those values are read from the objects created in earlier cells.
visualizer = Visualizer(
    df_data=top_model.df_data,
    df_cluster=top_model.df_cluster,
    vocabulary=vocabulary,
    vectorizer_type=vectorizer.vectorizer_type,
    dimension_reduction_method=cluster_dim_reducer.reduction_method,
    reduced_vector_dimension=cluster_dim_reducer.reduced_dim,
    cluster_method=clusterer.cluster_method,
    number_of_clusters=clusterer.n_clusters,
)
visualizer.plot(point_size=3, cluster_cross_size=5, template='seaborn')