In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tqdm

from utilities.BTTools import groupSpeechesByDiscussionTitle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def get_speeches(path='../../bundestagsreden parser/speeches_20.jsonl'):
    """Read speech records from a JSON Lines file.

    Parameters
    ----------
    path : str, optional
        Location of the ``.jsonl`` file. Defaults to the dump used by this
        notebook, so ``get_speeches()`` keeps working unchanged.

    Returns
    -------
    list
        One decoded JSON value per non-empty line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-empty line is not valid JSON.
    """
    with open(path, 'r', encoding='utf8') as fp:
        # Stream the file line by line instead of materialising it with list(fp),
        # and skip blank lines (e.g. a trailing newline) that json.loads rejects.
        return [json.loads(line) for line in fp if line.strip()]

In [None]:
# Load every speech record from the JSONL dump and report how many were read.
original_speeches = get_speeches()
print(f'loaded {len(original_speeches)}')

In [None]:
# Group the speeches by discussion title with the project helper from utilities.BTTools.
# The result is consumed further down as a mapping whose values are lists of speeches.
grouped_speeches_by_tops = groupSpeechesByDiscussionTitle(original_speeches)
print(f'grouped speeches to {len(grouped_speeches_by_tops)} "TagesOrnungsPunkten"')

In [None]:
def get_corpus(tops):
    """Build one document per agenda item by concatenating its speeches.

    Parameters
    ----------
    tops : mapping
        Values are iterables of speech records; each record must provide a
        ``'text'`` entry holding a string.

    Returns
    -------
    list of str
        For every value of ``tops`` (in iteration order) the speech texts
        joined with single spaces. A group without speeches yields ``''``.
    """
    return [
        ' '.join(entry['text'] for entry in speeches_of_top)
        for speeches_of_top in tops.values()
    ]

In [None]:
# One concatenated text per discussion title; this is the document collection for the topic model.
corpus = get_corpus(grouped_speeches_by_tops)
print(f'created corpus with {len(corpus)} entries')

In [None]:
# spaCy pipeline 'de_core_news_sm'; corpus_by_POS relies on this global for
# part-of-speech tags (token.pos_) and lemmas (token.lemma_).
nlp = spacy.load('de_core_news_sm')

In [None]:
def corpus_by_POS(corpus, consider):
    """Reduce every document to the lemmas of tokens with selected POS tags.

    Each document is processed with the module-level ``nlp`` pipeline while a
    tqdm progress bar tracks the iteration.

    Parameters
    ----------
    corpus : iterable of str
        Documents to process.
    consider : container of str
        Coarse part-of-speech tags (``token.pos_`` values) to keep.

    Returns
    -------
    list of str
        One string per input document containing the kept lemmas separated
        by single spaces, in token order.
    """
    reduced_documents = []

    for text in tqdm.tqdm(corpus):
        kept_lemmas = [tok.lemma_ for tok in nlp(text) if tok.pos_ in consider]
        reduced_documents.append(' '.join(kept_lemmas))

    return reduced_documents

In [None]:
# Keep only tokens tagged 'NOUN'. This runs the spaCy pipeline over every
# document and is the slow step of the notebook.
consider = ['NOUN']
noun_groups = corpus_by_POS(corpus, consider)

 73%|███████▎  | 151/208 [09:25<02:35,  2.72s/it]

In [None]:
# Sanity check: number of reduced documents and the full text of the first one.
print(f'got {len(noun_groups)} noun_groups')
print(noun_groups[0])

In [None]:
# TF-IDF representation used to fit the topic model. Case is preserved
# (lowercase=False); terms occurring in more than 80% or fewer than 1% of the
# documents are dropped.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.01, lowercase=False)
tfidf_matrix = tfidf_vectorizer.fit_transform(noun_groups)
feature_names = tfidf_vectorizer.get_feature_names_out()

# L1-normalised term frequencies (no IDF) over exactly the same vocabulary.
# lowercase=False is required here as well: with the default lowercase=True the
# documents are lower-cased before matching, so every vocabulary entry that
# contains an upper-case character would never be counted.
tf_vectorizer = TfidfVectorizer(
    vocabulary=feature_names,
    use_idf=False,
    norm='l1',
    lowercase=False,
)
tf_matrix = tf_vectorizer.fit_transform(noun_groups)

In [None]:
# Both matrices are built from the same documents and the same vocabulary,
# so their shapes are expected to match.
print(tfidf_matrix.shape)
print(tf_matrix.shape)

In [None]:
def create_model(matrix, n_topics=10, random_state=None):
    """Fit an NMF topic model on a document-term matrix.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Non-negative document-term matrix handed to ``NMF.fit``.
    n_topics : int, optional
        Number of NMF components (topics). Defaults to 10.
    random_state : int or None, optional
        Forwarded to ``NMF`` so a run can be made reproducible. The default
        ``None`` keeps the estimator's own default.

    Returns
    -------
    NMF
        The fitted estimator.
    """
    m = NMF(n_components=n_topics, random_state=random_state)
    # Fit on the argument; the previous version silently used the global
    # `tfidf_matrix` regardless of what the caller passed in.
    m.fit(matrix)

    return m

In [None]:
# Fit the topic model on the TF-IDF matrix.
model = create_model(tfidf_matrix)

In [None]:
def get_topic_word_lists(topic_model, n_words=10, n_words_features=100):
    """Derive readable labels and top-word lists for every topic of a model.

    Feature indices are translated to words through the module-level
    ``feature_names`` sequence. One summary line per topic is printed.

    Parameters
    ----------
    topic_model : object
        Fitted model exposing ``components_`` with one weight row per topic.
    n_words : int, optional
        Number of top words used for the printed summary and the extended
        label. Defaults to 10.
    n_words_features : int, optional
        Number of top words collected per topic in the third return value.
        Defaults to 100.

    Returns
    -------
    topic_list : list of str
        ``'topic_<w1>_<w2>_<w3>'`` built from the three strongest words.
    extended_topic_list : list of str
        The ``n_words`` strongest words joined with spaces.
    topic_words : list of list
        The ``n_words_features`` strongest words of each topic, strongest first.
    """
    topic_list = []
    extended_topic_list = []
    topic_words = []

    # Use the model that was passed in; the previous version read the global
    # `model` and ignored its argument.
    for idx, topic in enumerate(topic_model.components_):
        # Rank the features once (descending weight) and slice the ranking
        # for both list lengths.
        ranked = topic.argsort()[::-1]

        top_n = [feature_names[i] for i in ranked[:n_words]]
        top_features = ' '.join(top_n)
        topic_list.append(f"topic_{'_'.join(top_n[:3])}")
        extended_topic_list.append(top_features)

        topic_words.append([feature_names[i] for i in ranked[:n_words_features]])

        print(f"Topic {idx}: {top_features}")

    return topic_list, extended_topic_list, topic_words

In [None]:
# tl: short topic labels, etl: space-joined top words per topic, tw: long top-word lists per topic.
tl, etl, tw = get_topic_word_lists(model)
print(f'got {len(tl)} topics, {len(etl)} extended topics and {len(tw)} words.')

In [None]:
# Topic-word weights of the fitted model: one row per topic, one column per vocabulary term.
topic2word = model.components_
# Project the term-frequency rows onto the topics: (docs x terms) @ (terms x topics).
# `@` always means matrix multiplication, whereas `*` is only a matrix product
# for the legacy scipy.sparse *matrix* classes and is element-wise for sparse arrays.
doc2topic = tf_matrix @ topic2word.T

print(f'tf_matrix: {tf_matrix.shape}')
print(f'topic2word: {topic2word.shape}')
print(f'doc2topic: {doc2topic.shape}')

In [None]:
def get_topic2_topic(tm):
    """Compute pairwise cosine similarities between the topics of a model.

    The rows of ``tm.components_`` are L1-normalised first; the shapes of the
    normalised matrix and of the similarity matrix are printed.

    Parameters
    ----------
    tm : object
        Fitted model exposing ``components_`` (one row per topic).

    Returns
    -------
    The square topic-by-topic matrix returned by ``cosine_similarity``.
    """
    l1_rows = normalize(tm.components_, norm='l1', axis=1)
    print(f'normalized_matrix: {l1_rows.shape}')

    similarity = cosine_similarity(l1_rows)
    print(f'topic2topic: {similarity.shape}')

    return similarity

In [None]:
# Topic-by-topic cosine similarity, shown as a matrix image.
topic2topic = get_topic2_topic(model)

plt.matshow(topic2topic)
plt.show()

In [None]:
def create_topic2topic_graph(t2t, extended_topic_list, threshold=0.15):
    """Turn a topic similarity matrix into a node/link graph dictionary.

    Parameters
    ----------
    t2t : numpy.ndarray
        Square similarity matrix, indexed as ``t2t[i, j]``. Only the upper
        triangle (``i < j``) is read.
    extended_topic_list : sequence of str
        Display name of topic ``i`` at position ``i``.
    threshold : float, optional
        A link is created only when the similarity is strictly greater than
        this value. Defaults to 0.15.

    Returns
    -------
    dict
        ``{'directed': False, 'graph': 'semant_graph', 'links': [...],
        'nodes': [...]}``. Nodes carry a 1-based ``'id'`` and a
        ``'topicname'``; links carry ``'source'``, ``'target'`` (node ids)
        and ``'weight'`` as a built-in ``float``.
    """
    n_topics = len(t2t)

    nodes = [
        {'id': index + 1, 'topicname': extended_topic_list[index]}
        for index in range(n_topics)
    ]

    links = []
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            # Convert the NumPy scalar to a built-in float so the graph can be
            # serialised (json, repr) without NumPy-specific output.
            weight = float(t2t[i, j])
            if weight > threshold:
                links.append({
                    'source': nodes[i]['id'],
                    'target': nodes[j]['id'],
                    'weight': weight
                })

    return {
        'directed': False,
        'graph': 'semant_graph',
        'links': links,
        'nodes': nodes
    }

In [None]:
# Build the topic graph, using the space-joined top words of each topic as node names.
t2t_graph = create_topic2topic_graph(topic2topic, etl)

print(f"got a graph with {len(t2t_graph['nodes'])} nodes.")
print('the first node:')
print(t2t_graph['nodes'][0])

In [None]:
def get_graph_template(graph, properties):
    """Render a graph as a standalone HTML page using the force-graph library.

    Parameters
    ----------
    graph : dict
        Must provide ``'nodes'`` and ``'links'``; both are embedded into the
        page as JSON.
    properties : dict
        Rendering options, all required:
        ``'nodelabel'`` - node attribute passed to ``nodeLabel``;
        ``'nodecoloring'`` - node attribute passed to ``nodeAutoColorBy``;
        ``'edgevisibility'`` - when truthy, the call that hides links is
        commented out;
        ``'particles'`` - when truthy, the directional-particle calls are active;
        ``'darkmode'`` - when truthy, the dark background/link colour calls
        are active.

    Returns
    -------
    dict
        ``{'graph': <html source>}``.

    Raises
    ------
    KeyError
        If a required key is missing from ``graph`` or ``properties``.
    TypeError
        If the graph contains a value that cannot be converted to JSON.
    """
    node_label = properties['nodelabel']
    node_coloring = properties['nodecoloring']

    # Each switch is either '' (call active) or '//' (call turned into a JS comment).
    lv = '//' if properties['edgevisibility'] else ''
    parts = '//' if not properties['particles'] else ''
    dm = '//' if not properties['darkmode'] else ''

    def _to_builtin(value):
        # NumPy scalars and arrays expose tolist(), which yields built-in types.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    # Serialise as JSON instead of relying on Python's repr(): repr() emits
    # True/False/None and, for NumPy scalars, text such as np.float64(0.5),
    # none of which is valid JavaScript. '</' is escaped so that text inside
    # the data cannot close the surrounding <script> element.
    d3graph = json.dumps(
        {
            'nodes': graph['nodes'],
            'links': graph['links']
        },
        default=_to_builtin,
    ).replace('</', '<\\/')

    htmlcode = f"""<head>
        <style>
            body {{
                margin: 0;
                font-family: Arial;
            }}
            h3 {{text-align: center;}}
            .center {{
              display: block;
              margin-left: auto;
              margin-right: auto;
            }}
        </style>
        <script src="https://unpkg.com/force-graph"></script>
        <meta charset="UTF-8">
    </head>
    <body>
    <img src="Logo.png" height="150" width="300" class="center">
    <h3>DebSearch ist eine statistische Website, welche die aktuelle Legislaturperiode</h3>
    <h3>in verschiedenen Kategorieren auswertet und visualisiert.</h3>
    <div id="graph"></div>
    <script>
        var data = {d3graph};
        const elem = document.getElementById('graph');
        const Graph = ForceGraph()(elem)
            .graphData(data)
            .nodeLabel('{node_label}')
            .nodeRelSize(3)
            .nodeVal('nReden')
            .nodeAutoColorBy('{node_coloring}')
            {dm}.backgroundColor('#000000')
            {dm}.linkColor(() => 'rgba(255,255,255,0.2)')
            {lv}.linkVisibility('false')
            {parts}.linkDirectionalParticles(2)
            {parts}.linkDirectionalParticleWidth(1.4)
            .onNodeClick (node => {{window.open(`wordnet.html`, '_blank')}})
            //.onNodeHover(node => elem.style.cursor = node ? 'pointer' : null)
            .onNodeRightClick(node => {{
                // Center/zoom on node
                Graph.centerAt(node.x, node.y, 1000);
                Graph.zoom(4, 2000);
            }});
    </script>
    </body>
    """

    return { 'graph': htmlcode }

In [None]:
# Rendering options read by get_graph_template:
#   nodecoloring / nodelabel - node attribute used for colouring / labels
#   darkmode                 - False comments out the dark background and link colour calls
#   edgevisibility           - True comments out the call that hides links
#   particles                - False comments out the directional-particle calls
properties = {
    'nodecoloring':'topicname',
    'nodelabel': 'topicname',
    "darkmode": False,
    "edgevisibility": True,
    "particles": False
}

html_graph = get_graph_template(t2t_graph, properties)

In [None]:
def write_html_file(html, path='./topic_network.html'):
    """Write a rendered graph page to disk.

    Parameters
    ----------
    html : dict
        Must contain the page source under the key ``'graph'``.
    path : str, optional
        Output file. Defaults to ``'./topic_network.html'``; an existing
        file is overwritten.

    Raises
    ------
    KeyError
        If ``html`` has no ``'graph'`` entry.
    OSError
        If the file cannot be written.
    """
    # Write UTF-8 explicitly: the page declares <meta charset="UTF-8">, while
    # the platform default encoding used by a bare open() may differ.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html['graph'])

In [None]:
# Persist the rendered topic network as ./topic_network.html.
write_html_file(html_graph)