In [2]:
import json
import requests
#import collections
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [3]:
with open('../../data/speeches_preprocessed.json', 'r') as fp:
    data = json.load(fp)

In [4]:
alleReden = data.copy()

In [5]:
reden_red = []
for rede in alleReden:
    if len(rede['text_lem']) > 300:
        reden_red.append(rede)
alleReden = reden_red
        

In [6]:

consider = ['NOUN']
for rede in tqdm(alleReden):
    rel_lemmata = [ ele for ex,ele in enumerate(rede['text_lem']) if rede['text_pos'][ex] in consider ]
    rede['text_lem'] = rel_lemmata
    rede['text_pos'] = [ele for ele in rede['text_pos'] if ele in consider]

100%|██████████████████████████████████████████████████████████████████████████| 19657/19657 [00:03<00:00, 4934.01it/s]


In [8]:
reden_clean = alleReden

In [9]:
def get_names_and_parties(reden_clean):
    names = []
    parties = []
    for rede in reden_clean:
        if rede['name'] not in names:
            names.append(rede['name'])
            parties.append(rede['party'])
    return names, parties

In [10]:
names, parties = get_names_and_parties(reden_clean)

In [11]:
def get_text_clean(name, reden):
    text = ''
    nReden = 0
    for rede in reden:
        if rede['name'] == name:
            nReden += 1
            text += ' '.join(rede['text_lem'])
    return text, nReden
   

In [12]:
def liste_von_parla_mit_dict_text(reden, names, parties):
    
    parlamentarier = []

    for count,name in enumerate(names):
        hilf = {
            'id':count+1,
            'name': names[count],
            'party': parties[count],
            'type': 'parla'
        }
        
        text, nReden = get_text_clean(name, reden) 
            
        hilf.update({'text_lem': text, 'nReden': nReden})
        
        parlamentarier.append(hilf)
    
    return parlamentarier

In [13]:
parlamentarier = liste_von_parla_mit_dict_text(reden_clean, names, parties)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [ parla['text_lem'] for parla in parlamentarier ]
vectorizer = CountVectorizer(max_df=0.8, min_df=(2/781))
f_word = vectorizer.fit_transform(corpus)

In [15]:
f_word

<777x56118 sparse matrix of type '<class 'numpy.int64'>'
	with 867760 stored elements in Compressed Sparse Row format>

In [16]:
corpus = [ parla['text_lem'] for parla in parlamentarier ]
vectorizer = TfidfVectorizer(max_df=0.8, min_df=(2/781))
tf_idf_matrix  = vectorizer.fit_transform(corpus)

In [17]:
tf_idf_matrix

<777x56118 sparse matrix of type '<class 'numpy.float64'>'
	with 867760 stored elements in Compressed Sparse Row format>

In [18]:
def liste_von_parla_mit_dict_vec(reden, parlamentarier, X_csr, vectorizer, f_word):
    
    parlamentarier_vec = []

    for count, parla in tqdm(enumerate(parlamentarier)):
        hilf = {
            'id':count+1,
            'name': parla['name'],
            'party': parla['party'],
            'nReden': parla['nReden'],
            'type': parla['type'],
            'text': parla['text_lem'],
        }
        
        vec_numbers = np.array(X_csr.getrow(count).toarray()[0])
        vec_numbers2 = np.array(f_word.getrow(count).toarray()[0])
        
        
        maxWX = np.argmax(vec_numbers)
          
        #vec_numbers = vec_numbers/np.linalg.norm(vec_numbers)
        
        hilf.update({'vec_numbers': vec_numbers})
        mfw = list(vectorizer.vocabulary_.keys())[list(vectorizer.vocabulary_.values()).index(np.argmax(vec_numbers2))]
        msw = list(vectorizer.vocabulary_.keys())[list(vectorizer.vocabulary_.values()).index(np.argmax(vec_numbers))]
        hilf.update({'maxTFIDF': msw, 'mfw': mfw})
        
        
        parlamentarier_vec.append(hilf)
    
    return parlamentarier_vec

In [19]:
parlamentarier_vec = liste_von_parla_mit_dict_vec(reden_clean, parlamentarier, tf_idf_matrix.copy(), vectorizer, f_word)

777it [00:08, 93.16it/s] 


In [20]:
pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T 

similarity = pairwise_similarity.toarray()

In [21]:
def cotop_graph_erstellen(parlamentarier, min_weight, similarity):

    graph = {
        'directed': False,
        'graph': 'semant_graph',
        'links': [],
        'nodes': parlamentarier,
    }

    for ix,nodeI in enumerate(graph['nodes']):
        for jx,nodeJ in enumerate(graph['nodes']):
            if nodeI['id'] < nodeJ['id']:
                source = nodeI['id']
                target = nodeJ['id']
                #weight = cos_sim(nodeI['vec_numbers'], nodeJ['vec_numbers'])
                #r = np.corrcoef(nodeI['vec_numbers'], nodeJ['vec_numbers'])
                #weight = r[0,1]
                weight = similarity[ix,jx]
                if weight > min_weight:
                    link_dict = {
                        'source':source,
                        'target':target,
                        'weight':weight,
                        'type': 'parla_parla'
                    }
                    graph['links'].append(link_dict)
    return graph

In [22]:
graph_parla = cotop_graph_erstellen(parlamentarier_vec, 0, similarity)

### Erstellen des redengraphs

In [23]:
reden_ohne_text = []
for rede in tqdm(alleReden):
    hilf = {
        'id':rede['id'],
        'name':rede['name'],
        'party':rede['party'],
        'type':'speech'}
    reden_ohne_text.append(hilf)

100%|████████████████████████████████████████████████████████████████████████| 19657/19657 [00:00<00:00, 620937.30it/s]


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [' '.join(rede['text_lem']) for rede in alleReden]
vectorizer = CountVectorizer(max_df=0.8, min_df=(2/len(corpus)))
f_word_speech = vectorizer.fit_transform(corpus)

In [25]:
f_word_speech

<19657x58189 sparse matrix of type '<class 'numpy.int64'>'
	with 1861821 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_tfidf(corpus):
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=(2/len(corpus)))
    tf_idf_matrix  = vectorizer.fit_transform(corpus)
    return tf_idf_matrix, vectorizer.vocabulary_

In [27]:
tfidf, vocab = get_tfidf([' '.join(rede['text_lem']) for rede in alleReden])

In [28]:
tfidf

<19657x58189 sparse matrix of type '<class 'numpy.float64'>'
	with 1861821 stored elements in Compressed Sparse Row format>

In [29]:
similarity = cosine_similarity(tfidf, Y=None)

In [30]:
def cos_sim(vec_a, vec_b):
    
    return np.dot(vec_a, vec_b) /(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

In [31]:
def cotop_graph_erstellen(reden_clean, min_weight, similarity, tf_idf_matrix, f_word_speech):
    graph = {
        'directed': False,
        'graph': 'semant_graph_reden_knoten',
        'links': [],
        'nodes': reden_ohne_text,
    }
    links = []
    n=0
    for ix,nodeI in tqdm(enumerate(graph['nodes'])):
        msw = list(vocab.keys())[list(vocab.values()).index(np.argmax(tf_idf_matrix.getrow(ix).toarray()[0]))]
        mfw = list(vocab.keys())[list(vocab.values()).index(np.argmax(f_word_speech.getrow(ix).toarray()[0]))]
        nodeI.update({'msw':msw, 'mfw': mfw})
        for jx,nodeJ in enumerate(graph['nodes']):
            if ix < jx:
                source = nodeI['id']
                target = nodeJ['id']
                #weight = cos_sim(nodeI['vec_numbers'], nodeJ['vec_numbers'])
                #r = np.corrcoef(nodeI['vec_numbers'], nodeJ['vec_numbers'])
                #weight = r[0,1]
                #weight = similarity.getrow(ix).toarray()[jx]
                weight = similarity[ix,jx]
                if weight > min_weight:
                    link_dict = {
                        'source':source,
                        'target':target,
                        'weight':weight, 
                        'type':'rede_rede'
                    }
                    graph['links'].append(link_dict)
                else:
                    n+=1
                    
    
    return graph, n

In [32]:
graph_reden, nicht_drinne = cotop_graph_erstellen(alleReden, 0.2, similarity, tfidf, f_word_speech)

19657it [06:52, 47.68it/s]


## kombinieren der beiden graphen

In [33]:
links = []
for node in tqdm(graph_parla['nodes']):
    for rede in reden_ohne_text:
        if node['name']==rede['name']:
            link_dict = {
                        'source':node['id'],
                        'target':rede['id'],
                        'weight':1, 
                        'type':'parla_rede'
                    }
            links.append(link_dict)
            

100%|███████████████████████████████████████████████████████████████████████████████| 777/777 [00:05<00:00, 136.45it/s]


In [34]:
len(links)

19657

In [35]:
import networkx as nx

In [36]:
graphforgephi = nx.Graph()
for node in graph_reden['nodes']:
    node['type'] = 1
    graphforgephi.add_node(node['id'], name=node['name'], party=node['party'], typ=node['type'], msw=node['msw'], mfw=node['mfw']);

for node in graph_parla['nodes']:
    node['type'] = 10
    graphforgephi.add_node(node['id'],name = node['name'], party=node['party'], msw=node['maxTFIDF'], mfw = node['mfw'], nReden = node['nReden'], typ=node['type']);

print('nodes done')

for link in graph_reden['links']:
    if link['weight'] > 0: #0.22285057620424778:
        graphforgephi.add_edge(link['source'],link['target'],weight = link['weight'], typ=link['type'])
        
#for link in graph_parla['links']:
    #if link['weight'] > 0: #0.22285057620424778:
        #graphforgephi.add_edge(link['source'],link['target'],weight = link['weight'], typ=link['type'])
        
for link in links:
    if link['weight'] > 0: #0.22285057620424778:
        graphforgephi.add_edge(link['source'],link['target'],weight = link['weight'], typ=link['type'])
    

nodes done


In [37]:
nx.write_gexf(graphforgephi, "graphforgephi_reden_parla.gexf")
print('save done')

save done
