### most common words in topic

In [2]:
# local utilities
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utilities.BTTools import filter_for
from utilities.BTTools import groupSpeechesByDiscussionTitle

# other stuff needed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import tqdm

import json

with open('../../data/speeches_20.jsonl', 'r', encoding='utf8') as fp:
    data = list(fp)
speeches_original = []
for line in data:
    speeches_original.append(json.loads(line))
    
speeches = speeches_original.copy()    

In [3]:
groupedby_discussion = groupSpeechesByDiscussionTitle(speeches)
len(groupedby_discussion)

208

In [46]:
corpus = []
for top in groupedby_discussion.values():
    #print(grouped[top])
    aggregate= []
    for ele in top:
        aggregate.append(ele['text'])
    corpus.append(' '.join(aggregate))

len(corpus)    


208

In [47]:
vectorizer_tfidf = TfidfVectorizer(max_df=0.8, min_df=0.01,lowercase = False)
tf_idf_matrix  = vectorizer_tfidf.fit_transform(corpus)
feature_names = vectorizer_tfidf.get_feature_names()
vectorizer_tf = TfidfVectorizer(vocabulary=feature_names,use_idf=False, norm="l1")
#vectorizer_counts = CountVectorizer(vocabulary=feature_names)
tf_matrix = vectorizer_tf.fit_transform(corpus)

tf_idf_matrix



<208x19567 sparse matrix of type '<class 'numpy.float64'>'
	with 300671 stored elements in Compressed Sparse Row format>

In [48]:
# Use NMF to look for 15 topics
n_topics = 10
model = NMF(n_components=n_topics)
model.fit(tf_idf_matrix)
#model.fit(tf_matrix)

# Print the top 10 words
n_words = 10
n_words_features = 100

topic_list = []
topic_list_ext = []
topic_words = []
for topic_idx, topic in enumerate(model.components_):
    top_n = [feature_names[i]
             for i in topic.argsort()
             [-n_words:]][::-1]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}") 
    topic_list_ext.append(top_features)
    
    top_n = [feature_names[i]
             for i in topic.argsort()
             [-n_words_features:]][::-1]
    topic_words.append(top_n)

    print(f"Topic {topic_idx}: {top_features}")
    
topic2word = model.components_   
#topic_words

Topic 0: Energien Gas erneuerbaren EEG Ausbau Strom Umlage Energieversorgung Energiewende LNG
Topic 1: Ukraine Russland Krieg NATO Putin Europa Waffen Bundeskanzler russischen Sanktionen
Topic 2: Impfpflicht Impfung Pandemie impfen Omikron Pflege Impfen geimpft Variante Impfungen
Topic 3: Euro Milliarden Haushalt Sondervermögen Nachtragshaushalt Investitionen Pandemie Wirtschaft Unternehmen Bundeswehr
Topic 4: Mali Mandat Soldaten Soldatinnen Bundeswehr Mission Südsudan Libyen Sea MINUSMA
Topic 5: Wohnungen Bauen Wohnen Mieter Vorkaufsrecht Mieterinnen Wohnraum CO2 Wohnungsbau Kommunen
Topic 6: Landwirtschaft Landwirte Hunger Ernährung Bauern Lebensmittel Millionen Flächen Welt Ukraine
Topic 7: Inflation Euro Einkommen Entlastung Antrag entlasten Progression Preise Rentner Energiepreispauschale
Topic 8: Antrag Demokratie Opfer Sicherheitsbehörden Hanau Rechtsextremismus Frauen 219a Antidiskriminierungsstelle Rechtsstaat
Topic 9: Kinder Kommunen Ganztagsbetreuung Frauen Schulen Bund Bil

In [49]:
topic_list_ext

['Energien Gas erneuerbaren EEG Ausbau Strom Umlage Energieversorgung Energiewende LNG',
 'Ukraine Russland Krieg NATO Putin Europa Waffen Bundeskanzler russischen Sanktionen',
 'Impfpflicht Impfung Pandemie impfen Omikron Pflege Impfen geimpft Variante Impfungen',
 'Euro Milliarden Haushalt Sondervermögen Nachtragshaushalt Investitionen Pandemie Wirtschaft Unternehmen Bundeswehr',
 'Mali Mandat Soldaten Soldatinnen Bundeswehr Mission Südsudan Libyen Sea MINUSMA',
 'Wohnungen Bauen Wohnen Mieter Vorkaufsrecht Mieterinnen Wohnraum CO2 Wohnungsbau Kommunen',
 'Landwirtschaft Landwirte Hunger Ernährung Bauern Lebensmittel Millionen Flächen Welt Ukraine',
 'Inflation Euro Einkommen Entlastung Antrag entlasten Progression Preise Rentner Energiepreispauschale',
 'Antrag Demokratie Opfer Sicherheitsbehörden Hanau Rechtsextremismus Frauen 219a Antidiskriminierungsstelle Rechtsstaat',
 'Kinder Kommunen Ganztagsbetreuung Frauen Schulen Bund Bildung Familien Länder Ministerin']

In [50]:
print(tf_matrix.shape)
print(topic2word.shape)
doc2topic = (tf_matrix * topic2word.T)
print(doc2topic.shape)

(208, 19567)
(10, 19567)
(208, 10)


In [51]:
topic2word.T[0]

array([0.0324893 , 0.02277637, 0.04251865, 0.02831381, 0.03472841,
       0.10979025, 0.03810056, 0.035589  , 0.03160826, 0.02825281])

In [52]:
feature_topic_num = []
feature_topic_name = []
for wx,word in enumerate(feature_names):
    if topic2word.T[wx].max() > 0:
        topic_num = topic2word.T[wx].argmax()
        topic_name = topic_list_ext[topic_num]
    else:
        topic_num = n_topics
        topic_name = 'not defined'
    
    feature_topic_num.append(topic_num)
    feature_topic_name.append(topic_name)
#feature_topic_num

top_topic_num = []
top_topic_name = []
for tx,top in enumerate(groupedby_discussion):
    if doc2topic[tx].max() > 0:
        topic_num = doc2topic[tx].argmax()
        topic_name = topic_list_ext[topic_num]
    else:
        topic_num = n_topics
        topic_name = 'not defined'
    
    top_topic_num.append(topic_num)
    top_topic_name.append(topic_name)
    
#feature_topic_num

In [54]:

topic_index = 0

for topic_index, topic_name in enumerate(topic_list):

    # select TOPs for max. relevant topic
    #topic_selection = []
    #for tx,top in enumerate(groupedby_discussion):
    #    if top_topic_num[tx] == topic_index:
    #        topic_selection.append(groupedby_discussion[top])
    #        #print(top)

    # Alternative using the corpus
    topic_selection = []
    top_indices = []
    top_names = []
    for tx,top in enumerate(corpus):
        if top_topic_num[tx] == topic_index:
            topic_selection.append(top)
            top_indices.append(tx)
            top_names.append(list(groupedby_discussion.keys())[tx])
            #print(top)
    
    print(f'Topic {topic_index} has {len(topic_selection)} TOPs')
    
    # remove words that are max. relevant for topic        
    topic_features = []
    for wx,word in enumerate(feature_names):
        if feature_topic_num[wx] != topic_index:
            topic_features.append(word)

    #len(topic_features)

    # compute semantic similarity for selection
    if len(topic_selection) == 0:
        print(f'Topic {topic_index} cannot be constructed')
    else:
        vectorizer_topic = TfidfVectorizer(vocabulary=topic_features,lowercase = False)
        tf_idf_matrix_topic  = vectorizer_topic.fit_transform(topic_selection)
        pairwise_similarity_topic = tf_idf_matrix_topic * tf_idf_matrix_topic.T 
        similarity_topic = pairwise_similarity_topic.toarray()

        print(similarity_topic.shape)


        # build network
        # nodes:
        nodes = []
        count = 1
        for tx,top_ix in enumerate(top_indices):
            node_dict = {
                'id' : count,
                'top' : top_ix,
                'date' : groupedby_discussion[ top_names[tx] ][0]['date'],
                'nReden' : len( groupedby_discussion[ top_names[tx] ] )
            }
            nodes.append(node_dict)
            count += 1

        # graph construct
        graph = {
            'directed': False,
            'graph': 'semant_graph',
            'links': [],
            'nodes': nodes,
        }   

        # edges:
        min_weight = 0.15
        for ix,nodeI in enumerate(graph['nodes']):
            for jx,nodeJ in enumerate(graph['nodes']):
                if ix < jx:
                    source = nodeI['id']
                    target = nodeJ['id']
                    weight = similarity_topic[ix,jx]
                    if weight > min_weight:
                        link_dict = {
                            'source':source,
                            'target':target,
                            'weight':weight       
                        }
                        graph['links'].append(link_dict)


        nn = len(graph['nodes'])
        ne = len(graph['links'])
        print( f"This graph has {nn} nodes and {ne} links.")


        # write to HTML

        data = graph
        d3graph = {"nodes": [], "links": []}
        d3graph["nodes"] = data["nodes"]
        d3graph["links"] = data["links"]

        htmlcode = f"""<head>
            <style> body {{margin: 0;}} </style>
            <script src="https://unpkg.com/force-graph"></script>
            <meta charset="UTF-8">
        </head>
        <body>
        <div id="graph"></div>
        <script>
            var data = {d3graph};
            const elem = document.getElementById('graph');
            const Graph = ForceGraph()(elem)
                .graphData(data)
                .nodeLabel('top')
                .nodeRelSize(1)
                .nodeVal('nReden')
                //.linkVisibility('true')
                //.onNodeClick (node => {{window.open(`wordnet.html`, '_blank')}})
                //.onNodeHover(node => elem.style.cursor = node ? 'pointer' : null)
                .onNodeRightClick(node => {{
                    // Center/zoom on node
                    Graph.centerAt(node.x, node.y, 1000);
                    Graph.zoom(4, 2000);
                }});
        </script>
        </body>
        """

        with open (f"./TOPnets/TOPnet4topic{topic_index}.html", "w") as f:
            f.write(htmlcode)

Topic 0 has 152 TOPs
(152, 152)
This graph has 152 nodes and 1562 links.


FileNotFoundError: [Errno 2] No such file or directory: './TOPnets/TOPnet4topic0.html'

In [4]:
groupedby_discussion.keys()

dict_keys(['Tagesordnungspunkt 2 2021-10-26', 'Tagesordnungspunkt 3 2021-10-26', 'Tagesordnungspunkt 5 2021-10-26', 'Tagesordnungspunkt 6 2021-10-26', 'Tagesordnungspunkt 1 2021-11-11', 'Tagesordnungspunkt 3 2021-11-11', 'Tagesordnungspunkt 4 2021-11-11', 'Tagesordnungspunkt 5 2021-11-11', 'Tagesordnungspunkt 6 2021-11-11', 'Tagesordnungspunkt 7 2021-11-11', 'Zusatzpunkt 2 2021-11-11', 'Tagesordnungspunkt 1 2021-11-18', 'Zusatzpunkt 1 2021-11-18', 'Tagesordnungspunkt 3 2021-11-18', 'Tagesordnungspunkt 4 2021-11-18', 'Tagesordnungspunkt 5 2021-11-18', 'Zusatzpunkt 2 2021-11-18', 'Tagesordnungspunkt 1 2021-12-07', 'Tagesordnungspunkt 2 2021-12-08', 'Tagesordnungspunkt 3 2021-12-08', 'Tagesordnungspunkt 5 2021-12-08', 'Tagesordnungspunkt 8 2021-12-09', 'Tagesordnungspunkt 9 2021-12-09', 'Tagesordnungspunkt 10 2021-12-09', 'Tagesordnungspunkt 11 2021-12-09', 'Zusatzpunkt 1 2021-12-09', 'Zusatzpunkt 2 2021-12-09', 'Tagesordnungspunkt 13 2021-12-10', 'Tagesordnungspunkt 1 2021-12-15', 'Tages

In [31]:
selection=['Tagesordnungspunkt 2 2021-10-26', 'Tagesordnungspunkt 3 2021-10-26', 'Tagesordnungspunkt 5 2021-10-26', 'Tagesordnungspunkt 6 2021-10-26']

In [37]:
mydata=[]
for ele in selection:
    top=groupedby_discussion[ele]
    #print(top)
    for rede in top:
        #print(rede['text'])
        mydata.extend(rede['text'].split())
    

In [39]:
mydata[0:5]

['Sehr', 'geehrter', 'Herr', 'Alterspräsident!', 'So']

In [45]:
import collections
counter = collections.Counter(mydata).most_common()[:15]
counter

[('die', 143),
 ('der', 127),
 ('und', 100),
 ('das', 74),
 ('wir', 68),
 ('in', 65),
 ('ist', 54),
 ('für', 49),
 ('dass', 49),
 ('zu', 46),
 ('–', 43),
 ('auch', 42),
 ('es', 41),
 ('Sie', 41),
 ('nicht', 36)]