# Generate semantic similarity networks of all speeches

This is a pipeline to create the semantic similarity network of all speeches.


#### 1. Import the basic stuff.

In [1]:
import json
import requests
from collections import Counter
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### 2. Load the preprocessed data.

In [2]:
with open('../data/speeches_preprocessed.json', 'r') as fp:
    data = json.load(fp)

In [3]:
reden = data.copy()

This step of data cleaning can be skipped!

In [4]:
# clean party labels
for rede in reden:
    rede['party']=rede['party'].replace(u'\xa0', u' ')
    if rede['party']=='Bündnis 90/Die Grünen':
        rede['party']='BÜNDNIS 90/DIE GRÜNEN'
    if rede['party']=='Fraktionslos':
        rede['party']='fraktionslos'

### (Optional) Data selection

Now select, if desired, a subset of speeches by party, parlamentarians, date, etc. Note that it does not work for the 'text'-field, but 'text_lem' should work.

To be checked.

In [None]:
#set(reden[0]['name'])


In [None]:

def filter_for(what, search_terms, speeches):
    filtered_speeches = []
    for speech in speeches:
        if ( speech[what] in set(search_terms) ):
            filtered_speeches.append(speech)
    filtered_speeches.sort(key = lambda x:x['date']) 
    return filtered_speeches


In [None]:

members = ['Hansjörg Durz','Birke Bull-Bischoff','Stefan Kaufmann','Ernst Dieter Rossmann','Götz Frömming','Katja Suding','Kai Gehring','Tankred Schipanski','Saskia Esken','Oliver Kaczmarek','Nicola Beer','Anke Domscheit-Berg','Tabea Rößner','Manuel Höferlin','Sven Lehmann','Karamba Diaby','Susann Rüthrich','Katarina Barley','Sylvia Pantel','Johannes Huber','Katrin Werner','Grigorios Aggelidis','Katja Dörner','Martin Reichardt','Nadine Schön','Nicole Höchst','Stefan Schwartze','Norbert Müller','Uwe Schulz','Maik Beermann','Josephine Ortleb','Cornelia Möhring','Ulle Schauws','Silke Launert','Wiebke Esdar','Gülistan Yüksel','Matthias Seestern-Pauly','Marcus Weinberg','Martin Patzelt','Dagmar Schmidt','Anna Christmann','Uwe Kamann','Silvia Breher','Nicole Bauer','Leni Breymaier','Katrin Helling-Plahr','Annalena Baerbock','Petra Sitte','Mariana Iris Harder-Kühnel','Katja Mast','Roman Müller-Böhm','Doris Achelwilm','Yvonne Magwas','Sönke Rix','Ronja Kemmer','Margit Stumpp','Manja Schüle','Jens Brandenburg','Nicole Gohlke','Katrin Staffler','Beate Walter-Rosenheimer','Bettina Margarethe Wiesmann','Ulrike Bahr','Franziska Giffey','Anja Karliczek','Michaela Noll','Yasmin Fahimi','Melanie Bernstein','Stephan Albani','Marja-Liisa Völlers','Thomas Sattelberger','Dietlind Tiemann','René Röspel','Albert Rupprecht','Michael Espendiller','Joana Cotar','Mario Brandenburg','Volker Münz','Astrid Mannes','Ekin Deligöz','Stefan Sauer','Svenja Stadler','Swen Schulz','Kerstin Radomski','Johannes Steiniger','Caren Marks','Andreas Steier','Dieter Janecek','Sybille Benning','Thomas Rachel','Dorothee Bär','Frank Pasemann','Lars Klingbeil','Ingrid Pahlmann','Markus Paschke','Elvan Korkmaz-Emre','Charlotte Schneidewind-Hartnagel']



In [None]:
reden_selection = filter_for('name', members, reden)



In [None]:

reden = reden_selection
len(reden)

In [None]:
print(len(reden[5102]['text_lem']))
print(reden[5102]['text'])

## Build corpus and count word frequencies per party

In [5]:
corpus = []
allwords = []

minlength = -20;

# prepare for per party counts
allwordsperparty = {
    'SPD':[],
    'FDP':[],
    'CDU/CSU':[],
    'DIE LINKE':[],
    'BÜNDNIS 90/DIE GRÜNEN':[],
    'AfD':[],
    'fraktionslos':[],
    'Bremen':[]   
}
        
#consider = ['PROPN']
#consider = ['ADJ']
consider = ['NOUN']
for rede in tqdm.tqdm(reden):
    if(len(rede['text_lem']) > minlength):
    #rel_lemmata = [ ele for ex,ele in enumerate(rede['text_lem']) if (rede['text_pos'][ex] in consider and len(ele) > 7 and len(ele) < 16)]
        rel_lemmata = [ ele for ex,ele in enumerate(rede['text_lem']) if rede['text_pos'][ex] in consider ]
        allwords.extend(rel_lemmata)
        allwordsperparty[rede['party']].extend(rel_lemmata)
        corpus.append( " ".join(rel_lemmata ))
   
len(corpus)

100%|██████████| 24666/24666 [00:02<00:00, 8914.12it/s] 


24666

### Compute Semantic Similarity Matrix

In [None]:
#vectorizer_selection = CountVectorizer(vocabulary = features02, decode_error='ignore' , lowercase=False, ngram_range=(1, 1))
vectorizer = CountVectorizer(decode_error='ignore' , lowercase=False , max_df=0.8, min_df=2, ngram_range=(1, 1))
mm = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(f'We now consider {len(features)} different words.')
print(mm.get_shape())


In [None]:
# note that via TFIDF this is way faster!
SemSimMat = cosine_similarity(mm)
print(SemSimMat.shape)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer(lowercase=False,max_df=0.8, min_df=2/len(corpus))
mm_tfidf  = vectorizer_tfidf.fit_transform(corpus)
features = vectorizer_tfidf.get_feature_names()
mm_tfidf.get_shape()


(24666, 60262)

In [8]:
SemSimMat_tfidf = mm_tfidf * mm_tfidf.T
SemSimMat_tfidf.get_shape()

(24666, 24666)

## Construct Graph

In [None]:
#reden[0] 
print(SemSimMat_tfidf[10,2003])
print(SemSimMat[10,2003])

In [27]:

nodes = []
for rx,rede in enumerate(reden):
    node = {
        'id' :  rede['id'],
        'name' : rede['name'],
        'date' : rede['date'],
        'discussion_title' : rede['discussion_title'],
        'party' : rede['party'],
        'length' : len(rede['text_lem']),
        #'text' : rede['text']
    }
    
    nodes.append(node)
    

graph = {
    'directed': False,
    'graph': 'word_graph',
    'links': [],
    'nodes': nodes
}

In [28]:
nodes[0]
#SemSimMat_tfidf.mean()
SimMat = SemSimMat_tfidf.todense()
vectorizer = vectorizer_tfidf
mm = mm_tfidf

In [29]:
links = []

min_weight = 0.2222
for ix,nodeI in tqdm.tqdm(enumerate(graph['nodes'])):
    for jx,nodeJ in enumerate(graph['nodes']):
        if nodeI['id'] < nodeJ['id']:          
            source = nodeI['id']
            target = nodeJ['id']
            weight = SimMat[ix,jx]
            if weight > min_weight:
                #links.append([source,target,weight])
                link_dict = {
                    'source':source,
                    'target':target,
                    'weight':weight       
                }
                graph['links'].append(link_dict)

24666it [08:57, 45.92it/s]


In [30]:
nn = len(graph['nodes'])
ne = len(graph['links'])
print( f"This graph has {nn} nodes and {ne} links.")

#import matplotlib.pyplot as plt

#weights=[]
#for link in graph['links']:
#    weights.append(link['weight'])
#print(sum(weights))

#plt.hist(weights, bins=25)
#plt.title("Distribution of Weights")
#plt.xlabel("Wert")
#plt.ylabel("Häufigkeit")
#plt.show()

This graph has 24666 nodes and 313120 links.


In [None]:
# that's too much.

#newlinks = []
#for link in graph['links']:
#    if link['weight'] > 0.3:
#        newlinks.append(link)
#graph['links'] = newlinks 
#len(graph['links'])



In [None]:
print(f'The graph has {len(graph["links"])} links.')

## Append Information about relevant words

In [31]:


for count,node in tqdm.tqdm(enumerate(graph['nodes'])):
    vec_numbers = np.array(mm.getrow(count).toarray()[0])
    #maxWX = np.argmax(vec_numbers)
    #hilf.update({'vec_numbers': vec_numbers})
    msw = list(vectorizer.vocabulary_.keys())[list(vectorizer.vocabulary_.values()).index(np.argmax(vec_numbers))]
    #hilf.update({'maxTFIDF': msw})
    node.update({'msw' : msw})


24666it [02:02, 200.66it/s]


In [32]:
import networkx as nx

graphforgephi = nx.Graph()
for node in tqdm.tqdm(graph['nodes']):
    graphforgephi.add_node(node['id'],name = node['name'],date = node['date'],discussion_title = node['discussion_title'],party = node['party'], length = node['length'],msw = node['msw']);
print('nodes done')
for link in tqdm.tqdm(graph['links']):   
    graphforgephi.add_edge(link['source'],link['target'],weight = link['weight'])
print('links done')   
nx.write_gexf(graphforgephi, "../private/graphforgephi.gexf")
print('save done')

100%|██████████| 24666/24666 [00:00<00:00, 346954.75it/s]
 33%|███▎      | 102383/313120 [00:00<00:00, 503200.88it/s]

nodes done


100%|██████████| 313120/313120 [00:00<00:00, 508066.72it/s]


links done
save done
