In [None]:
%matplotlib inline
import pandas as pd
import os
import networkx as nx
import igraph as ig
import numpy as np
import louvain
import matplotlib.pyplot as plt

In [None]:
# Directory holding the derived LiDO CSV files. The original hard-coded location
# is kept as the default; set the CASELAW_DATA_DIR environment variable to run
# the notebook against a copy of the data stored elsewhere.
filepath = os.environ.get('CASELAW_DATA_DIR', '/media/sf_VBox_Shared/CaseLaw/2018-01-29-lido/derived/')

# Edge list linking cases (column `source`) to article versions (column `target`).
links_df = pd.read_csv(os.path.join(filepath, 'case-to-article-links-unique.csv'))

In [None]:
# Load the article and case node tables from the data directory.
art_nodes, case_nodes = (
    pd.read_csv(os.path.join(filepath, csv_name))
    for csv_name in ('article_nodes_nodup.csv', 'case_nodes_simple.csv')
)

In [None]:
# Compare three row counts: all article rows, distinct titles, and rows that
# are distinct once the `id` column is ignored.
distinct_titles = art_nodes['title'].drop_duplicates()
distinct_without_id = art_nodes.drop('id', axis=1).drop_duplicates()
for table in (art_nodes, distinct_titles, distinct_without_id):
    print(table.shape)

In [None]:
# Shape of the link table before and after removing exact duplicate rows.
shape_all_links = links_df.shape
shape_unique_links = links_df.drop_duplicates().shape
print(shape_all_links, shape_unique_links)

In [None]:
# Show every article row whose title differs from its label; an empty result
# means the two columns always agree.
title_differs = art_nodes['title'].ne(art_nodes['label'])
art_nodes.loc[title_differs]

In [None]:
# Which (title, label) pairs occur with more than one distinct authority?
authority_by_article = art_nodes.groupby(['title', 'label'])['authority']
nr_authorities = authority_by_article.nunique()
nr_authorities.loc[nr_authorities.gt(1)].head()

In [None]:
# Look at all rows sharing one example title.
example_title = 'Verordening op het bestuur, Artikel 1'
art_nodes.loc[art_nodes['title'].eq(example_title)]

In [None]:
# Attach the article attributes to every link by matching the link `target`
# against the article `id`; links without a matching article are kept
# (left join).
links_merged = pd.merge(
    links_df,
    art_nodes,
    how='left',
    left_on='target',
    right_on='id',
)
links_merged.head()

In [None]:
# Per (case, article title) pair, count the links that have a non-null
# article id.
ids_per_case_title = links_merged.groupby(['source', 'title'])['id']
links_titles = ids_per_case_title.count()

In [None]:
# Does a single case ever link to more than one article row with the same title?
has_multiple_links = links_titles.gt(1)
links_titles.loc[has_multiple_links].head()

In [None]:
# It does happen; list every target linked from one example case.
example_case = 'http://linkeddata.overheid.nl/terms/jurisprudentie/id/ECLI:NL:CBB:2001:AB1986'
example_targets = links_df.loc[links_df['source'] == example_case, 'target']
list(example_targets)

In [None]:
# Inspect the column names obtained when the (source, title) index of the
# grouped counts is turned back into ordinary columns.
links_titles.reset_index().columns

In [None]:
# Edge list at title level: one row per (case, article title) pair, with the
# title column renamed to `target`.
links_case_title = (
    links_titles
    .reset_index()[['source', 'title']]
    .rename(columns={'title': 'target'})
)
links_case_title.head()

In [None]:
# Node table at title level: one row per distinct (title, label) pair, with the
# title serving as the node `id`.
nodes_articles_titles = (
    art_nodes[['title', 'label']]
    .drop_duplicates()
    .rename(columns={'title': 'id'})
)

In [None]:
# The book is the part of the label before the first comma.
# `.str[0]` is used instead of `.map(lambda l: l[0])` so that a missing label
# yields a missing book rather than raising a TypeError.
nodes_articles_titles['book'] = nodes_articles_titles['label'].str.split(',').str[0]

In [None]:
# Write the title-level edge list and node table to the data directory.
title_level_outputs = (
    (links_case_title, 'case_to_article_title_links.csv'),
    (nodes_articles_titles, 'article_title_nodes.csv'),
)
for table, csv_name in title_level_outputs:
    table.to_csv(os.path.join(filepath, csv_name), index=False)

In [None]:
# Build a NetworkX graph (default graph type) from the case -> article-title
# edge list; every distinct `source` and `target` value becomes a node.
g = nx.from_pandas_edgelist(links_case_title, source='source', target='target')

In [None]:
# Label every node with its side of the bipartite graph.
# Iterating over a DataFrame yields its column names, not its rows, so the node
# ids are taken from the edge list the graph was built from: every `source` is
# a case and every `target` is an article title. This guarantees that each node
# in `g` receives a `type`.
nx.set_node_attributes(g, {n: 'case' for n in links_case_title['source']}, name='type')
nx.set_node_attributes(g, {n: 'article' for n in links_case_title['target']}, name='type')

In [None]:
# nx.info() was removed in NetworkX 3.0; report the same key figures using
# methods that exist in every NetworkX release.
n_nodes = g.number_of_nodes()
n_edges = g.number_of_edges()
print("Type:", type(g).__name__)
print("Number of nodes:", n_nodes)
print("Number of edges:", n_edges)
if n_nodes > 0:
    # Each edge of an undirected graph adds one to the degree of two nodes.
    print("Average degree: %.4f" % (2.0 * n_edges / n_nodes))

In [None]:
# Convert to iGraph: build the graph from the edge tuples (vertex ids end up in
# the `name` vertex attribute), then copy the node attributes across.
g_ig = ig.Graph.TupleList(g.edges())

# Union of all attribute names used by any node. A set comprehension is used
# because stacking the per-node key lists into a NumPy array only works when
# every node has exactly the same number of attributes.
att_list = {att for _, node_data in g.nodes(data=True) for att in node_data}

for att in att_list:
    att_dict = nx.get_node_attributes(g, att)
    # .get() leaves the value as None for nodes that lack this attribute
    # instead of aborting the whole conversion with a KeyError.
    g_ig.vs[att] = [att_dict.get(n) for n in g_ig.vs['name']]

In [None]:
# Materialise the connected components of g as a list, one entry per component.
ccs = list(nx.connected_components(g))

In [None]:
# Number of nodes in each connected component, and the share of all nodes that
# falls inside the largest one.
ccs_sizes = np.array(list(map(len, ccs)))
n_components = len(ccs)
largest_share = ccs_sizes.max() / ccs_sizes.sum()
print("Number of connected components:", n_components)
print("Relative size of largest component:", largest_share)

## Community detection

In [None]:
# Create the bipartite CPM partitions; the call returns three partition
# objects, unpacked here as p_01, p_0 and p_1.
resolution_01 = 0.01
p_01, p_0, p_1 = louvain.CPMVertexPartition.Bipartite(
    g_ig,
    resolution_parameter_01=resolution_01,
)
p_01.summary()

In [None]:
# Optimise the three partitions jointly as layers of one multiplex problem,
# weighting p_01 with +1 and both p_0 and p_1 with -1.
partition_layers = [p_01, p_0, p_1]
partition_weights = [1, -1, -1]
optimiser = louvain.Optimiser()
diff = optimiser.optimise_partition_multiplex(
    partition_layers,
    layer_weights=partition_weights,
)

In [None]:
# Store the membership of p_01 as the `community` attribute of every vertex.
g_ig.vs['community'] = p_01.membership

In [None]:
# Summary of p_01 again, now that the optimiser has been run on it.
p_01.summary()

In [None]:
# Summary of p_0, for comparison with p_01.
p_0.summary()

In [None]:
# Length of the membership list of p_0.
len(p_0.membership)

In [None]:
# One row per vertex, holding its name, assigned community and type.
vertex_columns = ('name', 'community', 'type')
cluster_df = pd.DataFrame({column: g_ig.vs[column] for column in vertex_columns})

In [None]:
# Save the vertex-to-community assignment in the data directory.
clusters_csv_path = os.path.join(filepath, 'bimodal_clusters.csv')
cluster_df.to_csv(clusters_csv_path, index=False)

In [None]:
# Number of vertices per community, split by type: rows are communities,
# columns are the vertex types.
names_by_community_type = cluster_df.groupby(['community', 'type'])['name']
comm_counts = names_by_community_type.count().unstack()
comm_counts.head(20)

In [None]:
# Bar charts of the first `n` rows of the community counts: both types
# together, then articles only, then cases only.
n = 50

fig, axes = plt.subplots(3, 1, figsize=(15, 20))
ax_both, ax_articles, ax_cases = axes

comm_counts.head(n).plot.bar(ax=ax_both, title='Cases and articles')
comm_counts['article'].head(n).plot.bar(ax=ax_articles, title='Articles')
comm_counts['case'].head(n).plot.bar(ax=ax_cases, title='Cases')

In [None]:
# Reload the node tables that the community assignment will be merged into.
# Note that `art_nodes` now comes from the `_min5` article file, not the file
# loaded at the top of the notebook.
art_nodes_path = os.path.join(filepath, 'article_nodes_nodup_min5.csv')
case_nodes_path = os.path.join(filepath, 'case_nodes_simple.csv')
art_nodes = pd.read_csv(art_nodes_path)
case_nodes = pd.read_csv(case_nodes_path)

In [None]:
# Attach the bimodal community to each case: match the case `lido_id` against
# the vertex name, then drop the helper `name` column and store the community
# under `community_bimodal`.
case_clusters = cluster_df[cluster_df['type'] == 'case'].drop('type', axis=1)
case_nodes = (
    case_nodes
    .merge(
        case_clusters,
        how='left',
        left_on='lido_id',
        right_on='name',
        suffixes=('', '_bimodal'),
    )
    .drop('name', axis=1)
    .rename(columns={'community': 'community_bimodal'})
)

In [None]:
# Preview the case table after the community column has been merged in.
case_nodes.head()

In [None]:
# Attach the bimodal community to each article by matching the article `id`
# against the vertex name; a clashing column coming from the cluster table gets
# the `_bimodal` suffix.
# NOTE(review): article vertices in the graph are named by article *title*
# (the `target` column of the title-level edge list) -- confirm that `id` in
# the `_min5` article file holds those same values, otherwise nothing matches.
article_clusters = cluster_df[cluster_df['type'] == 'article'].drop('type', axis=1)
art_nodes = art_nodes.merge(
    article_clusters,
    how='left',
    left_on='id',
    right_on='name',
    suffixes=('', '_bimodal'),
)
art_nodes = art_nodes.drop('name', axis=1)

In [None]:
# Save both node tables, now carrying the bimodal community, to the data
# directory.
bimodal_outputs = (
    (art_nodes, 'article_nodes_nodup_min5_bimodal.csv'),
    (case_nodes, 'case_nodes_simple_bimodal.csv'),
)
for node_table, csv_name in bimodal_outputs:
    node_table.to_csv(os.path.join(filepath, csv_name), index=False)

In [None]:
# Cross-tabulate the existing `community` column of the articles against the
# bimodal community: rows follow `community`, columns `community_bimodal`.
ctab = pd.crosstab(
    index=art_nodes['community'],
    columns=art_nodes['community_bimodal'],
)
ctab.shape

In [None]:
# Show the cross-tabulation as an image. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0; both return the
# underlying NumPy array.
im = plt.imshow(ctab.values, aspect='equal')

In [None]:
# Which articles fall in bimodal community 0?
# (presumably the largest community -- TODO confirm against the counts above)
in_community_zero = art_nodes['community_bimodal'] == 0
art_nodes.loc[in_community_zero, ['label', 'community']]