# Search for sentences that contain certain words

## Import and load data

In [1]:
import requests
import json
import collections

import re

import spacy
nlp = spacy.load('de') #load spacy model

# for WordClouds 
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
with open("../../data/speeches_20.jsonl",'r',encoding = "utf8") as fp:
    data = list(fp)
speeches = []
for line in data:
    speeches.append(json.loads(line))
    

In [3]:
len(speeches)
#speeches[2776]

5318

## Search function

In [5]:

def filter_for(what, search_terms, speeches):
    filtered_speeches = []
    if what == 'text':
        search_terms_low = []
        for st in search_terms:
            search_terms_low.append(st.lower())
        for speech in speeches:
            match = [st in speech[what].lower() for st in search_terms_low]
            if all(st == True for st in match):
            #if any(st in speech[what] for st in search_terms):
                #print(match)
            #if ( search_terms in speech[what] ):
                filtered_speeches.append(speech)
    else:
        for speech in speeches:
            if ( speech[what].lower() in set(search_terms_low) ):
                filtered_speeches.append(speech)
        
    filtered_speeches.sort(key = lambda x:x['date'])   
    return filtered_speeches



## Find speeches with focal term

In [6]:
#focal_terms = ['Digitalisierung','Zusammenhalt','Demokratie']
#focal_terms = ['Digitalisierung']
#focal_terms = ['Plattform','Demokratie']
#focal_terms = ['extrem','Plattform']
#focal_terms = ['plattform','demokratie']
#focal_terms = ['Plattform','Meinung']
#focal_terms = ['sozial','Netzwerk','Meinung']
#focal_terms = ['social']
focal_terms = ['Grundeinkommen']
#subset = filter_for('party', ['CDU/CSU'], speeches)
subset = filter_for('text', focal_terms, speeches)
len(subset)

19


## Find sentences with focal term

Note: this requires spacy.

In [8]:
sentences = []
#focus = 'social'
focus = 'Grundeinkommen'

for rede in subset:
    doc = nlp(rede["text"])
    print(rede['name'],rede['party'])
    for sent in doc.sents:
        #print(sent.text)
        #if all(ft.lower() in sent.text.lower() for ft in focal_terms):
        #if focal_terms[0].lower() in sent.text.lower() and focal_terms[1].lower() in sent.text.lower():
        #for focus in focal_terms:    
        if focus.lower() in sent.text.lower():        
            sentences.append(sent)
            print(sent)
        
#len(sentences)

Ralph Brinkhaus CDU/CSU
Das ist die Vorstufe zum bedingungslosen Grundeinkommen.
Hermann Gröhe CDU/CSU
einen schrittweisen Weg in ein bedingungsloses Grundeinkommen lehnen wir entschieden ab.
Johannes Huber Fraktionslos
In Wahrheit versucht die Ampel, hier den ersten Schritt zum bedingungslosen Grundeinkommen zu unternehmen.
Hubertus Heil SPD
Frau Klose, ich danke Ihnen für die Frage, weil mir das die Gelegenheit gibt, dem Kollegen vielleicht noch einmal zu sagen: Wir führen kein bedingungsloses Grundeinkommen ein, sondern wir führen ein soziales Bürgergeld ein.
Jens Teutrine FDP
„Warum es kein bedingungsloses Grundeinkommen light gibt“.
Markus Reichel CDU/CSU
Wir als Union stehen zu genau dieser Leistungsgerechtigkeit und lehnen deswegen auch ein Sanktionsmoratorium und damit quasi den Einstieg in ein bedingungsloses Grundeinkommen ab.
Jens Teutrine FDP
Sie sagen, wir würden ein bedingungsloses Grundeinkommen einführen, ohne selbst einen Vorschlag zu machen.
Sie behaupten immer wieder

In [41]:
#sentences

# Advanced Section

## Construct a network of word-cooccurrence

In [9]:
relevantPOS = ['NOUN','ADJ','PROPN']
sentencesNN = []
words = []
for sen in sentences:
    lem = []
    for token in sen:
        if token.pos_ in relevantPOS:
            lem.append(token.lemma_)
    sentencesNN.append(lem)
    words.extend(lem)
#sentencesNN
#words

In [10]:
nodes=[]
curid=1
for word in set(words):
    node = {
    'id' :  curid,
    'name' : word
    }
    nodes.append(node)
    curid=curid+1
    
graph = {
    'directed': False,
    'graph': 'word_graph',
    'links': [],
    'nodes': nodes
}

links = []
linkedwords =[]
linkedids =[]
lx = 0;
for wx1,w1 in enumerate(nodes):
    #print(wx1)
    for wx2,w2 in enumerate(nodes):
        if(w2['id'] > w1['id']):
            for sen in sentencesNN:
                if w1['name'] in sen and w2['name'] in sen:
                    weight = len([ele for ele in linkedwords if ele == (' '.join([w1['name'],w2['name']]))])
                    #    links[]
                    #else:
                    #print(weight)
                    linkedwords.append(' '.join([w1['name'],w2['name']]))
                    #linkedids[ w1['id'] ] , w2['id'] )
                    link_dict = {
                    'source':w1['id'],
                    'target':w2['id'],
                    'sourceWD':w1['name'],
                    'targetWD':w2['name'],
                    'weight': weight+1      
                    }
                    #print(link_dict)
                    links.append(link_dict)
                    graph['links'].append(link_dict)
               
                    #for link in links:

#linkedwords
zähler = collections.Counter(linkedwords).most_common()
print(len(linkedwords),len(links))
#graph['links']=links
#print(zähler)

306 306


In [11]:
for link in links:
    if link['weight']>0:
        graph['links'].append(link)

In [12]:
print(len(graph['links']),
len(graph['nodes']))
#for link in graph['links']:
#    if link['weight']>1:
        #print(link)
    

612 66


### Store as HTML graph

In [13]:
json = {'data':graph, 
        'nodecoloring':'party', 
        'nodelabel': 'name', 
        #'nodelabel': 'mfic',
        "darkmode": False,
        "edgevisibility": True,
        "particles": False
       }
result = requests.post('https://penelope.vub.be/network-components/visualiser', json=json)

with open (f"./wordnet.html", "w") as f:
    f.write(result.json()['graph'])

### Store as Gephi graph

In [33]:
import networkx as nx
import tqdm

graphforgephi = nx.Graph()
for node in tqdm.tqdm(graph['nodes']):
    graphforgephi.add_node(node['id'],name = node['name']);
print('nodes done')
for link in tqdm.tqdm(graph['links']):
    #weight = all((' '.join([w1['name'],w2['name']]) in linkedwords)
    #print(weight)         
    graphforgephi.add_edge(link['source'],link['target'],weight=link['weight'])
print('links done')   
nx.write_gexf(graphforgephi, "graphforgephi.gexf")
print('save done')

100%|██████████| 219/219 [00:00<00:00, 320611.72it/s]
100%|██████████| 3550/3550 [00:00<00:00, 509993.81it/s]

nodes done
links done
save done



