In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from scipy.stats import entropy
import os
from helpers import style
from helpers.afa import adaptive_filter
from itertools import groupby
from tqdm import tqdm
from collections import Counter
from itertools import combinations
from collections import defaultdict
import cdlib
import networkx as nx
import os

from helpers.linkage import mutual_information_smooth
from helpers.pathtools import *
from helpers.metrics import *
from helpers.visuals import add_cabinet_periods

style.load_style()

from helpers.dataloader import load
from config import *

In [3]:
k = 250
di_path = os.path.join(DATA_DIR_REVISIONS, f"doc-topics-1945-1994-{k}.tsv")
da_path = os.path.join(DATA_DIR_REVISIONS, "data.tsv")
ks_path = os.path.join(DATA_DIR_REVISIONS, f"topic-keys-1945-1994-{k}.tsv")

dists, dat, keys = load(
    dist_path=di_path,
    dat_path=da_path,
    keys_path=ks_path,
    quick_return=True
)

In [4]:
dists.columns = dists.columns.astype(int).map(dict(zip(keys.ix, keys['keys'].str[:50])))

In [5]:
# Average Distributions on Topics + Members

dists = dists.groupby(dat[['topic_id','member-ref']].astype(str).agg('_'.join,axis=1)).mean()

# Map Topic Dates
topic_dates = dict(zip(dat.topic_id,dat.date))
topic_dates = {topic:pd.Timestamp(year = _.year, month = 1 if _.month < 7 else 6, day = 1) for topic,_ in topic_dates.items()}

# Filter Non-Thematic (Policy) topics
dists = dists[sorted(keys[keys.policy_label==1]['keys'].str[:50].tolist())]
dists = dists.div(dists.sum(axis=1), axis=0)

# Select topic-member pairs based on above-average topic use
tm_dates = {i:(topic_dates[i.split('_')[0]],i.split('_')[1]) for i in dists.index}
mb_per_top = dists.groupby(dists.index.map(tm_dates)).mean().apply(zscore,axis=0)
mb_per_top_d = mb_per_top.apply(lambda row: row[row > 0].index.tolist(), axis=1).to_dict()

In [6]:
# Generate Dict. with relevant topics per member
mb_per_top_d = {}

for date, _ in dists.groupby(dists.index.str.split('_').str[0].map(topic_dates)):
    p_topic = _.mean()
    for mb,dd in _.groupby(_.index.str.split('_').str[1]):
        dd = dd.mean()
        dd = zscore(dd / p_topic)
        dd = dd[dd > 1].index.tolist()
        mb_per_top_d[(date,mb)] = dd

In [7]:
# Generate Linkage Networks

networks = {}
size_dicts = {}

for cd,(date, _) in enumerate(dists.groupby(dists.index.str.split('_').str[0].map(topic_dates))):
    rij, ri, m = mutual_information_smooth(theta = _.to_numpy())
    rij = pd.DataFrame(rij, index=_.columns.astype(str), columns=_.columns.astype(str))
    rij = rij.stack().reset_index().rename(columns={"level_0":"s","level_1":"t",0:"pmi"})
    rij = rij[(rij.s != rij.t) & (rij.pmi > 0)]

    sizes = _.mean()
    sizes.index = sizes.index
    sizes = sizes.to_dict()
    g = nx.from_pandas_edgelist(df = rij, source='s', target='t', edge_attr=['pmi'])
    networks[date] = g
    size_dicts[cd] = list(sizes.items())

In [41]:
# Dynamic Community Detection (Identify Paths of Communities)
tc, g, sizes, paths, metadata = get_tc(networks,
                                       size_dicts,
                                       overlap_threshold=.45,
                                       louvain_res=2.5,
                                       min_chain_len=4,
                                       verbose=False)

cid_to_path_index = {}
for idx, path in enumerate(paths):
    for cid in path:
        cid_to_path_index[cid] = idx

In [51]:
# Match relevant topics per member to paths

network_periods = list(sorted(networks.keys()))
mbs = set([mb for (date,mb),k in mb_per_top_d.items() if 'nl' in mb])

r = []

for mb_ in mbs:
    mbd = {date:k for (date, mb), k in mb_per_top_d.items() if mb == mb_}

    for c,p in enumerate(network_periods):

        if p in mbd:
            cms_period = tc.get_clustering_at(c).named_communities
            tps_mb = mbd[p]
            for cid,cmsp in cms_period.items():
                o = overlap_coefficient(set(cmsp), set(tps_mb))
                r.append({"date":network_periods[c],
                          "mb": mb_,
                          "cid":cid,
                          "overlap":o,
                          "len_member_topics":len(tps_mb),
                          "len_community":len(cmsp),
                          "path_id":cid_to_path_index.get(cid),
                          "topics_community":' | '.join(cmsp)})

rd = pd.DataFrame(r)

In [52]:
rd[rd.len_member_topics>3].nlargest(15,'overlap')

Unnamed: 0,date,mb,cid,overlap,len_member_topics,len_community,path_id,topics_community
1059,1969-01-01,nl.m.01815,47_12,1.0,6,6,33.0,stadsvernieuwing verbetering gemeente monument...
1078,1969-06-01,nl.m.01815,48_14,1.0,12,5,33.0,stadsvernieuwing verbetering gemeente monument...
1096,1970-01-01,nl.m.01815,49_14,1.0,9,5,33.0,zorg voorziening bejaardenoord gehandicapt bej...
2346,1992-01-01,nl.m.02857,93_17,1.0,7,3,102.0,onderwijs instelling wetenschappelijk hoog uni...
2564,1982-01-01,nl.m.02606,73_0,1.0,7,13,71.0,organisatie publiekrechtelijk bedrijfsleven be...
2865,1990-01-01,nl.m.02606,89_16,1.0,9,3,,ambtenaar salaris personeel hoog overleg overh...
4890,1954-01-01,nl.m.01326,17_13,1.0,10,4,9.0,kind gezin kinderbijslag groep inkomen klein o...
5561,1955-01-01,nl.m.00572,19_18,1.0,7,2,,lid functie benoeming koninklijk benoemen koni...
6196,1976-01-01,nl.m.01741,61_17,1.0,4,2,58.0,consument economisch reclame produkt verkoop v...
6774,1992-01-01,nl.m.02815,93_16,1.0,9,4,,belasting fiscaal inkomstenbelasting belasting...


In [66]:
Counter(' | '.join(rd[rd.path_id==15].topics_community).split(' | ')).most_common(5)

[('sociaal verzekering premie uitkering voorziening z', 864),
 ('werknemer ondernemingsraad werkgever onderneming o', 729),
 ('loon loonpolitiek bedrijfsleven bedrijfstak werkne', 714),
 ('uitkering minimumloon sociaal minima inkomen kabin', 675),
 ('inkomen partner gelijk ongehuwd vrouw uitkering be', 671)]

In [55]:
rd.groupby('path_id').overlap.mean().nlargest()

path_id
4.0     0.108811
73.0    0.097473
43.0    0.095880
15.0    0.095229
95.0    0.092392
Name: overlap, dtype: float64