In [None]:
%matplotlib inline
import pandas as pd
import os
import networkx as nx
import igraph as ig
import numpy as np
import louvain
import matplotlib.pyplot as plt

In [None]:
# Root directory of the derived CSV files read by the cells below.
# The default is the original hard-coded location; set the CASELAW_DATA_DIR
# environment variable to run the notebook against a different directory
# without editing this cell.
filepath = os.environ.get(
    'CASELAW_DATA_DIR',
    '/media/sf_VBox_Shared/CaseLaw/2018-01-29-lido/derived/')

In [None]:
# Load the links file 'case_to_article_title_links.csv'; later cells use its
# 'source' and 'target' columns.
links_csv = os.path.join(filepath, 'case_to_article_title_links.csv')
links_df = pd.read_csv(links_csv)

In [None]:
# Load the links file 'case_to_case_lx_links.csv'; later cells use its
# 'source' and 'target' columns.
case_links_csv = os.path.join(filepath, 'case_to_case_lx_links.csv')
case_to_case_links = pd.read_csv(case_links_csv)

In [None]:
# Load the two node tables (articles first, then cases) from the data directory.
art_nodes, case_nodes = (
    pd.read_csv(os.path.join(filepath, node_file))
    for node_file in ('article_title_nodes.csv', 'case_nodes_simple.csv')
)

In [None]:
def _ecli_label(parts):
    """Build a short label from the ':'-split pieces of an ECLI.

    Parameters
    ----------
    parts : list of str, or any other value
        Result of ``str.split(':')`` on the 'ecli' column. Non-list values
        (e.g. NaN for a missing ECLI) are passed through unchanged.

    Returns
    -------
    The third and fourth pieces joined by ':' (presumably court code and
    year -- confirm against the ECLI format), or the pieces re-joined as-is
    when there are fewer than four of them.
    """
    if not isinstance(parts, list):
        return parts
    if len(parts) < 4:
        # Malformed identifier: keep what we have instead of raising IndexError.
        return ':'.join(parts)
    return '{}:{}'.format(parts[2], parts[3])


# Use the 'lido_id' column as the generic node id and derive a compact display label.
case_nodes['id'] = case_nodes['lido_id']
case_nodes['label'] = case_nodes['ecli'].str.split(':').map(_ecli_label)

In [None]:
# Preview the first five article nodes.
art_nodes.head(n=5)

In [None]:
# Preview the first five case nodes (including the derived 'id' and 'label').
case_nodes.head(n=5)

In [None]:
# Seed articles: labels that start (case-insensitively) with
# "Burgerlijk Wetboek Boek 7, Artikel 611" or "... Artikel 658".
# str.match anchors only at the start, so longer labels with this prefix match too.
seed_pattern = 'Burgerlijk Wetboek Boek 7, Artikel ((611)|(658))'
is_seed_article = art_nodes['label'].str.match(seed_pattern, case=False)
seed_articles = art_nodes[is_seed_article]
seed_articles

In [None]:
# Distinct link sources whose target is one of the seed articles.
cites_seed = links_df['target'].isin(seed_articles['id'])
linking_cases = links_df[cites_seed]['source'].unique()
len(linking_cases)

In [None]:
# Keep every case->article link whose source is one of the seed-citing cases
# (not only the links that point at the seed articles themselves).
from_linking_case = links_df['source'].isin(linking_cases)
links_sub = links_df[from_linking_case]
links_sub.shape

In [None]:
# Case->case links where both endpoints are seed-citing cases.
source_in_sub = case_to_case_links['source'].isin(linking_cases)
target_in_sub = case_to_case_links['target'].isin(linking_cases)
case_to_case_sub = case_to_case_links[source_in_sub & target_in_sub]
case_to_case_sub.shape

In [None]:
# Node tables restricted to the subnetwork: articles that are the target of a
# retained link, and cases whose lido_id is among the seed-citing cases.
cited_article_ids = links_sub['target'].unique()
art_nodes_sub = art_nodes[art_nodes['id'].isin(cited_article_ids)]
case_nodes_sub = case_nodes[case_nodes['lido_id'].isin(linking_cases)]
print(art_nodes_sub.shape, case_nodes_sub.shape)

In [None]:
# Tag each node table with a numeric node type (0 = case, 1 = article).
# The *_sub frames were produced by boolean filtering, so assigning a column
# in place triggers pandas' SettingWithCopyWarning; .assign returns a new
# frame with the extra column and keeps the cell idempotent.
case_nodes_sub = case_nodes_sub.assign(node_type=0)
art_nodes_sub = art_nodes_sub.assign(node_type=1)

In [None]:
# Tag each link table with its link type.
# The *_sub frames were produced by boolean filtering, so assigning a column
# in place triggers pandas' SettingWithCopyWarning; .assign returns a new
# frame with the extra column and keeps the cell idempotent.
links_sub = links_sub.assign(link_type='case-article')
case_to_case_sub = case_to_case_sub.assign(link_type='case-case')

In [None]:
# Persist the subnetwork tables as CSV (without the pandas index).
# to_csv does not create missing directories, so make sure the output
# folder exists before writing on a fresh checkout / machine.
subnetwork_dir = os.path.join(filepath, 'subnetwork')
os.makedirs(subnetwork_dir, exist_ok=True)

case_to_case_sub.to_csv(os.path.join(subnetwork_dir, 'case-to-case-links.csv'), index=False)
links_sub.to_csv(os.path.join(subnetwork_dir, 'case-to-article-links.csv'), index=False)
art_nodes_sub.to_csv(os.path.join(subnetwork_dir, 'article-nodes.csv'), index=False)
case_nodes_sub.to_csv(os.path.join(subnetwork_dir, 'case-nodes-sub.csv'), index=False)

In [None]:
# Re-load the four subnetwork tables from disk so the cells below can be run
# without recomputing the filtering above.
subnetwork_files = (
    'case-to-case-links.csv',
    'case-to-article-links.csv',
    'article-nodes.csv',
    'case-nodes-sub.csv',
)
case_to_case_sub, links_sub, art_nodes_sub, case_nodes_sub = [
    pd.read_csv(os.path.join(filepath, 'subnetwork', csv_name))
    for csv_name in subnetwork_files
]

In [None]:
# Smaller subnetwork: only cases whose 'court' column equals 'HR', the
# articles those cases link to, and case->case links between those cases.
is_hr_case = case_nodes_sub['court'] == 'HR'
case_nodes_hr = case_nodes_sub[is_hr_case]

links_sub_hr = links_sub[links_sub['source'].isin(case_nodes_hr['id'])]

hr_article_ids = links_sub_hr['target'].unique()
art_nodes_hr = art_nodes_sub[art_nodes_sub['id'].isin(hr_article_ids)]

hr_source = case_to_case_sub['source'].isin(case_nodes_hr['id'])
hr_target = case_to_case_sub['target'].isin(case_nodes_hr['id'])
case_to_case_hr = case_to_case_sub[hr_source & hr_target]

(case_nodes_hr.shape, links_sub_hr.shape, art_nodes_hr.shape, case_to_case_hr.shape)

In [None]:
# Write the HR-only tables next to the full subnetwork files (no index column).
hr_output_dir = os.path.join(filepath, 'subnetwork')
case_to_case_hr.to_csv(os.path.join(hr_output_dir, 'case-to-case-links-hr.csv'), index=False)
links_sub_hr.to_csv(os.path.join(hr_output_dir, 'case-to-article-links-hr.csv'), index=False)
art_nodes_hr.to_csv(os.path.join(hr_output_dir, 'article-nodes-hr.csv'), index=False)
case_nodes_hr.to_csv(os.path.join(hr_output_dir, 'case-nodes-hr.csv'), index=False)

In [None]:
## cocitation network
import scipy.sparse

# One entry with value 1 per distinct (source, target) pair, indexed by a
# sorted two-level MultiIndex. This is what the previous
# `groupby(...).nunique()['source']` produced, but that form raises KeyError
# on pandas versions where the grouping columns are excluded from the result.
links_series = links_sub.groupby(['source', 'target']).size().clip(upper=1)

# Integer positions of each pair within the index levels.
# `MultiIndex.labels` was renamed to `codes` (and later removed); fall back to
# the old attribute only on pandas versions that predate `codes`.
pair_index = links_series.index
level_codes = getattr(pair_index, 'codes', None)
if level_codes is None:
    level_codes = pair_index.labels

# Binary case x article incidence matrix. The shape is passed explicitly so
# it always agrees with the level lengths used to map positions back to ids.
mat_sparse = scipy.sparse.coo_matrix(
    (links_series.values, (level_codes[0], level_codes[1])),
    shape=(len(pair_index.levels[0]), len(pair_index.levels[1])))

In [None]:
# Row / column labels of the incidence matrix: level 0 holds the link
# sources, level 1 the link targets.
case_ids, article_ids = links_series.index.levels
print(len(case_ids), len(article_ids))

In [None]:
# Article x article product of the incidence matrix with itself
# (transpose times original).
incidence_transposed = mat_sparse.transpose()
mat_cocitation = incidence_transposed.dot(mat_sparse)
mat_cocitation.shape

In [None]:
# Flatten the non-zero entries of the co-citation matrix into an edge list,
# mapping matrix positions back to article ids.
x_index, y_index, data = scipy.sparse.find(mat_cocitation)
cocitation_columns = {
    'source': article_ids[x_index],
    'target': article_ids[y_index],
    'weight': data,
}
df_cocitation = pd.DataFrame(cocitation_columns)
df_cocitation.shape

In [None]:
# The matrix is symmetric: keep each unordered pair once (source < target),
# which also drops the diagonal (an article paired with itself).
source_before_target = df_cocitation['source'].lt(df_cocitation['target'])
df_cocitation = df_cocitation[source_before_target]
df_cocitation.shape

In [None]:
# Keep only pairs with a weight of at least 5.
has_min_weight = df_cocitation['weight'].ge(5)
df_cocitation_min5 = df_cocitation[has_min_weight]
df_cocitation_min5.shape

In [None]:
# Save the thresholded co-citation edge list (no index column).
cocitation_csv = os.path.join(filepath, 'subnetwork', 'legislation-cociation-min5.csv')
df_cocitation_min5.to_csv(cocitation_csv, index=False)

## Enrich
Use `caselawnet` to enrich the case nodes and case-to-case links of this subnetwork.

In [None]:
import caselawnet

In [None]:
# Reduce each case->case link to the last path segment of its source and
# target identifiers (everything after the final '/').
links_dict = [
    {'source': source_uri.split('/')[-1], 'target': target_uri.split('/')[-1]}
    for source_uri, target_uri in zip(case_to_case_sub['source'], case_to_case_sub['target'])
]

In [None]:
# Set to True to re-run the caselawnet enrichment and refresh the cached CSV;
# otherwise the cached result is loaded from disk.
enrich_again = False

rich_nodes_csv = os.path.join(filepath, 'subnetwork', 'nodes-cases-rich.csv')

if not enrich_again:
    # Cached path: missing values are replaced by empty strings on load.
    nodes_rich = pd.read_csv(rich_nodes_csv).fillna('')
    nodes_rich_list = nodes_rich.to_dict(orient='records')
else:
    # Enrichment path: pass the last path segment of every case id to caselawnet.
    ecli_ids = [s.split('/')[-1] for s in case_nodes_sub.id]
    nodes_rich_list = caselawnet.enrich_eclis(ecli_ids, rootpath='/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/')
    nodes_rich = pd.DataFrame(nodes_rich_list)
    nodes_rich.to_csv(rich_nodes_csv, index=False)

In [None]:
# Preview the first five enriched case nodes.
nodes_rich.head(n=5)

In [None]:
# Pass the simplified source/target link dicts through caselawnet.enrich_links;
# the result is used as the link list for the network cells below.
links_list = caselawnet.enrich_links(links_dict)

In [None]:
# Build the network from the enriched node records and links.
# Note that links_list is rebound to the second element returned by get_network.
# NOTE(review): the returned nodes presumably carry network statistics such as
# 'community' (used a few cells below) -- confirm against caselawnet.
nodes_network, links_list = caselawnet.get_network(nodes_rich_list, links_list)

In [None]:
# Export the network under the title 'Employer liability' to
# subnetwork/network.json via caselawnet.to_sigma_json
# (presumably a sigma.js-compatible JSON file -- confirm against caselawnet).
caselawnet.to_sigma_json(nodes_network, links_list, 'Employer liability', os.path.join(filepath, 'subnetwork', 'network.json'))

In [None]:
# Tabulate the network nodes and show the five most frequent 'community' values.
nodes_network_df = pd.DataFrame(nodes_network)
community_sizes = nodes_network_df['community'].value_counts()
community_sizes.head(n=5)

In [None]:
# Build a graph object from the enriched nodes and links using
# caselawnet.network_analysis.get_network (presumably a networkx graph -- confirm).
graph = caselawnet.network_analysis.get_network(nodes_rich_list, links_list)

In [None]:
# Size of the graph as reported by len() (for a networkx graph this is the
# number of nodes).
len(graph)

In [None]:
# Rebuild `graph` directly with networkx from node-link data: a directed,
# non-multi graph whose nodes are the enriched records and whose edges are
# links_list. This replaces the `graph` created by caselawnet above.
node_link_data = {'nodes': nodes_rich_list, 'links': links_list}
graph = nx.readwrite.json_graph.node_link_graph(
    node_link_data, directed=True, multigraph=False)

In [None]:
# Node and edge counts of the current `graph`.
graph.number_of_nodes(), graph.number_of_edges()

In [None]:
def add_network_statistics(nodes, links):
    """Attach graph statistics to every node dict and return the node list.

    The node list is modified in place: it is sorted by date (newest first)
    and each node dict receives one key per computed statistic, plus
    'rel_in_degree' whenever the node has an 'in_degree' entry.

    NOTE(review): get_network, get_hits, get_pagerank and get_community are
    not defined in the visible cells of this notebook; presumably they come
    from caselawnet.network_analysis -- confirm before calling this function.

    Parameters
    ----------
    nodes : list of dict
        Node records; each needs 'id', 'date' and (when 'date' is the empty
        string) 'year'.
    links : list
        Link records, passed on to get_network.

    Returns
    -------
    list of dict
        The same list object as `nodes`. An empty list is returned untouched.
    """
    if len(nodes) == 0:
        return nodes

    graph = get_network(nodes, links)
    degree = nx.degree(graph)

    # Statistics are only computed when at least one node has a non-zero degree.
    statistics = {}
    any_connected = max(dict(degree).values()) > 0
    if any_connected:
        hubs, authorities = get_hits(graph)
        statistics = {
            'degree': degree,
            'in_degree': graph.in_degree(),
            'out_degree': graph.out_degree(),

            'degree_centrality': nx.degree_centrality(graph),
            'in_degree_centrality': nx.in_degree_centrality(graph),
            'out_degree_centrality': nx.out_degree_centrality(graph),
            'betweenness_centrality': nx.betweenness_centrality(graph),
            'closeness_centrality': nx.closeness_centrality(graph),
            'pagerank': get_pagerank(graph),
            'hubs': hubs,
            'authorities': authorities
        }

    def date_sort_key(node):
        # Fall back to January 1st of the node's year when no date is recorded.
        if node['date'] != '':
            return node['date']
        return '{}-01-01'.format(node['year'])

    # Relative in-degree depends on the node's position in date-descending order.
    nodes.sort(key=date_sort_key, reverse=True)
    for position, node in enumerate(nodes):
        node_id = node['id']
        for stat_name, stat_values in statistics.items():
            node[stat_name] = stat_values[node_id]
        if 'in_degree' in node:
            # max(position, 1) avoids dividing by zero for the first node.
            node['rel_in_degree'] = node['in_degree'] / float(max(position, 1))
    get_community(graph, nodes)
    return nodes

In [None]:
# Number of subnetwork cases per value of the 'court' column.
case_nodes_sub['court'].value_counts()

In [None]:
# Number of subnetwork articles per value of the 'book' column.
art_nodes_sub['book'].value_counts()