### MASTER-THESIS: Constructing a Knowledge Graph by extracting information from financial news articles

#### IMPORT LIBRARIES

In [None]:
# Note: Restart the Jupyter notebook before running the cells below to avoid memory issues.

In [None]:
import pandas as pd
import numpy as np
import torch
import pathlib

In [None]:
from src.A_data.data_loader import DataLoader
from main_process import SpacyProcess, Process
from src.E_topic_model.traditional.topic_prepare import TextPreparer
from src.E_topic_model.traditional.topic_vectorize import Vectorizer
from src.E_topic_model.traditional.topic_dim_reduce import DimReducer
from src.E_topic_model.traditional.topic_cluster import Clusterer
from src.E_topic_model.traditional.topic_visualize import Visualizer
from src.E_topic_model.traditional.topic_model import TopicModel

from src.settings.config import ConfigTopic
from src.settings.enums import VectorizerType, ReductionMethod, ClusterMethod, SpacyTask, ExtractionType

#### DATA (After NER and COREF)

In [None]:
# Release the unoccupied GPU memory currently held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [None]:
# Load the demo DataFrame (state after the NER and COREF steps) from a parquet file.
dl = DataLoader()
# Alternative input file:
# df = dl.load_df('/media/rainergo/PROJECTS/UASFRA-MS-Thesis/src/E_topic_model/traditional/model_data/df_demo_after_ner_coref.parquet')
df = dl.load_df('/media/rainergo/PROJECTS/UASFRA-MS-Thesis/src/A_data/monthly/df_demo_after_ner_coref.parquet')
# The row count is reported as the sentence count (presumably one row per sentence -- TODO confirm).
print('Number of Sentences: ', len(df.index))

In [None]:
# Display the loaded DataFrame as the cell output.
df

#### CONVERT NESTED NER_COREF DICTIONARY

In [None]:
# Create the processing wrapper for the BASIC task; it exposes an English (`nlp_en`)
# and a German (`nlp_de`) pipeline (presumably spaCy `Language` objects -- TODO confirm).
spacy_process = SpacyProcess(spacy_task=SpacyTask.BASIC)
# Print the component names of each language pipeline.
print('EN pipeline:', spacy_process.nlp_en.pipe_names)
print('DE pipeline:', spacy_process.nlp_de.pipe_names)

#### START TOPIC MODELING

##### TRADITIONAL TOPIC MODELING

In [None]:
# Note: Build additional stop-word sets from the first n entries of the
# `en` and `de` columns of the most-common-words file.
n_most_common_words: int = 1000  # max 1000
df_most_common_words = pd.read_parquet(
    '/media/rainergo/PROJECTS/UASFRA-MS-Thesis/src/settings/files/1000_most_common_words.parquet'
)
add_stopwords_en = set(df_most_common_words['en'].iloc[:n_most_common_words].tolist())
add_stopwords_de = set(df_most_common_words['de'].iloc[:n_most_common_words].tolist())

In [None]:
# Note: Prepare and reduce text
# The preparer receives the DataFrame and both language pipelines.
preparer = TextPreparer(
    df=df,
    nlp_en=spacy_process.nlp_en,
    nlp_de=spacy_process.nlp_de,
    use_comp_mask=True,
    save_vocabulary=True,
)
# Register the most-common-word sets as extra stop words for both languages.
preparer.add_custom_stopwords(
    stop_words_de=add_stopwords_de,
    stop_words_en=add_stopwords_en,
)
# `prepare` returns the updated DataFrame together with the vocabulary;
# all cleaning options are switched on.
df, vocabulary = preparer.prepare(
    rem_stopwords=True,
    rem_punctuation=True,
    rem_non_words=True,
    rem_non_content_words=True,
    lemmatize=True,
    lower_case=True,
)

In [None]:
# Preview the first `n_words` entries of the vocabulary as a plain dict.
n_words = 2000
{key: value for key, value in list(vocabulary.items())[:n_words]}

In [None]:
# Keep only the rows whose `top_prep_sent` value is not missing, then display the result.
df = df.loc[df['top_prep_sent'].notna()]
df

In [None]:
# Note: Vectorizer
# TF-IDF vectorizer over the prepared DataFrame and vocabulary; the vector column
# name is taken from ConfigTopic.
vectorizer = Vectorizer(
    df=df,
    vocabulary=vocabulary,
    vectorizer_type=VectorizerType.TFIDF,
    df_vector_name=ConfigTopic.vector_col_name,
)
# `vectorize` returns the updated DataFrame and all vectors.
df, all_vectors = vectorizer.vectorize()

In [None]:
# Inspect the shape of the vectors returned by the vectorizer.
all_vectors.shape

In [None]:
# Note: Dimension Reduction for Cluster
# Kernel PCA down to 20 dimensions, trained on all vectors; input and output
# column names are taken from ConfigTopic.
cluster_dim_reducer = DimReducer(
    df=df,
    training_data=all_vectors,
    vectorizer_type=vectorizer.vectorizer_type,
    reduction_method=ReductionMethod.KernelPCA,
    reduced_dim=20,
    df_vector_name=ConfigTopic.vector_col_name,
    df_red_vector_name=ConfigTopic.reduced_vector_col_name,
    model_name='cluster_dim_reducer',
)
# Fit the reducer (with `save_model=True`), then reduce the vectors.
cluster_dim_reducer.fit(save_model=True)
df, all_reduced_vectors = cluster_dim_reducer.reduce()

In [None]:
# Inspect the shape of the dimension-reduced vectors.
all_reduced_vectors.shape

In [None]:
# Note: Cluster
# model_name = f'cluster_model_monthly_{year}_{month}'
# K-Means with 20 clusters on the dimension-reduced vectors.
clusterer = Clusterer(
    df=df,
    dim_reduced_training_data=all_reduced_vectors,
    cluster_method=ClusterMethod.KMEANS,
    n_clusters=20,
)
# Fit the cluster model first, then run the prediction step.
clusterer.fit()
clusterer.predict()

In [None]:
# Show the distinct labels assigned by the fitted cluster model.
set(clusterer.cluster_model.labels_.tolist())

In [None]:
# Note: Dimension Reduction for Visualization
# Same reduction method as the cluster reducer, but down to 3 dimensions and
# written to the separate visualization column named in ConfigTopic.
viz_dim_reducer = DimReducer(
    df=df,
    training_data=all_vectors,
    vectorizer_type=vectorizer.vectorizer_type,
    reduction_method=cluster_dim_reducer.reduction_method,
    reduced_dim=3,
    df_vector_name=ConfigTopic.vector_col_name,
    df_red_vector_name=ConfigTopic.viz_reduced_vector_col_name,
    model_name='viz_dim_reducer',
)
# Fit the reducer (no `save_model` argument here), then reduce the vectors.
viz_dim_reducer.fit()
df, all_viz_reduced_vectors = viz_dim_reducer.reduce()

In [None]:
# Note: Topic Model
# NOTE(review): the vectorizer type is hard-coded to TFIDF here, while other cells
# reuse `vectorizer.vectorizer_type` -- confirm whether they are meant to stay in sync.
top_model = TopicModel(
    df=df,
    vocabulary=vocabulary,
    cluster_vectorizer_type=VectorizerType.TFIDF,
    most_common_n_words=10,
)
# Build the topics; results are read from `top_model.df_data` and `top_model.df_cluster`.
top_model.create_topics()

In [None]:
# Display the topic model's `df_data` attribute.
top_model.df_data

In [None]:
# Maximum of the `x` column in the topic model's `df_data` (presumably a plot coordinate -- TODO confirm).
top_model.df_data.x.max()

In [None]:
# Display the topic model's `df_cluster` attribute.
top_model.df_cluster

In [None]:
# Note: Visualize
# The visualizer takes the topic model's data and cluster frames plus the settings
# used by the vectorizer, the cluster dimension reducer and the clusterer.
visualizer = Visualizer(
    df_data=top_model.df_data,
    df_cluster=top_model.df_cluster,
    vocabulary=vocabulary,
    vectorizer_type=vectorizer.vectorizer_type,
    dimension_reduction_method=cluster_dim_reducer.reduction_method,
    reduced_vector_dimension=cluster_dim_reducer.reduced_dim,
    cluster_method=clusterer.cluster_method,
    number_of_clusters=clusterer.n_clusters,
)
visualizer.plot(
    point_size=3,
    cluster_cross_size=5,
    template='plotly',
)