In [None]:
import os
import caselawnet
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json
import networkx as nx
import community

In [None]:
# Directory holding the derived LiDO article network files.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
filepath = '/media/sf_VBox_Shared/CaseLaw/2018-01-29-lido/derived/'

# Node table is indexed by the 'id' column; the edge table keeps its default index.
nodes_path = os.path.join(filepath, 'article_nodes_nodup_min5.csv')
edges_path = os.path.join(filepath, 'article_to_article_min5.csv')
nodes_df = pd.read_csv(nodes_path, index_col='id')
edges_df = pd.read_csv(edges_path)

In [None]:
# Quick look at the first rows of the node table.
nodes_df.head(n=5)

In [None]:
# Undirected graph built from the edge table; each edge carries its 'weight'.
graph = nx.from_pandas_edgelist(
    edges_df,
    source='source',
    target='target',
    edge_attr='weight',
)

# Copy the selected node-table columns onto the graph nodes, keyed by node id.
# NOTE(review): ids that never occur in the edge table are not nodes of `graph`;
# confirm how networkx treats those extra keys if that matters for the analysis.
attributes = ['title', 'authority', 'book', 'community']
for att in attributes:
    nx.set_node_attributes(graph, values=nodes_df[att].to_dict(), name=att)

## Assortativity

In [None]:
# 'title' (attributes[0]) is skipped; every remaining attribute is treated as a
# grouping of the nodes and scored in two ways.
for att in attributes[1:]:
    assortativity = nx.attribute_assortativity_coefficient(graph, att)
    # The node -> attribute-value mapping doubles as the partition for modularity.
    partition = nodes_df[att].to_dict()
    modularity = community.modularity(partition, graph)
    print(att, 'assortativity:', assortativity, 'modularity:', modularity)

## Degree distributions

In [None]:
# Degree histogram of the graph; used as the bar heights in the degree plots.
# networkx documents the result as a list whose index is the degree value and
# whose entry is the number of nodes with that degree.
degree_hist = nx.degree_histogram(graph)

In [None]:
# Degree distribution drawn three times, stacked vertically:
# linear axes, logarithmic x-axis, and log-log axes.
degree_values = range(len(degree_hist))

# Panel 1: linear scales.
ax = plt.subplot(311)
ax.bar(degree_values, degree_hist);

# Panel 2: logarithmic x-axis.
ax = plt.subplot(312)
ax.bar(degree_values, degree_hist);
# The axis-specific `nonposx` keyword was deprecated in Matplotlib 3.3 and later
# removed; `nonpositive` is its replacement and keeps the same 'clip' behaviour
# for non-positive x values (such as degree 0).
ax.set_xscale("log", nonpositive='clip')

# Panel 3: both axes logarithmic.
ax = plt.subplot(313)
ax.bar(degree_values, degree_hist);
ax.set_xscale("log")
ax.set_yscale("log")

## Connected components

In [None]:
import numpy as np

In [None]:
# Materialise the connected components (one set of node ids each) so they can
# be indexed and iterated over more than once.
ccs = [component for component in nx.connected_components(graph)]

In [None]:
# Components with more than one node, and the node counts of both collections.
ccs_multiple = [component for component in ccs if len(component) > 1]
ccs_sizes = np.array(list(map(len, ccs)))
ccs_multiple_sizes = np.array(list(map(len, ccs_multiple)))

# Share of nodes in the largest component, over all components and over the
# non-singleton components only.
print("Number of connected components:", len(ccs))
print("Relative size of largest component:", ccs_sizes.max() / ccs_sizes.sum())
print("Number of non-singleton components:", len(ccs_multiple_sizes))
print(
    "Relative size of largest component without singletons:",
    ccs_multiple_sizes.max() / ccs_multiple_sizes.sum(),
)

In [None]:
# Component sizes ranked from largest to smallest, drawn on log-log axes.
sizes_descending = np.sort(ccs_sizes)[::-1]
plt.bar(range(len(ccs)), sizes_descending)
plt.xscale('log')
plt.yscale('log')

In [None]:
# Save the largest connected component: its node rows and the edges whose two
# endpoints both belong to it are written next to the input files.
largest_position = np.argmax(ccs_sizes)
gcc_ids = list(ccs[largest_position])
nodes_gcc = nodes_df.loc[gcc_ids]

source_in_gcc = edges_df['source'].isin(gcc_ids)
target_in_gcc = edges_df['target'].isin(gcc_ids)
edges_gcc = edges_df[source_in_gcc & target_in_gcc]
print(len(nodes_gcc), len(edges_gcc))

# The node file keeps its index column; the edge file is written without one.
nodes_gcc.to_csv(os.path.join(filepath, 'article_nodes_nodup_min5_gcc.csv'))
edges_gcc.to_csv(os.path.join(filepath, 'article_to_article_min5_gcc.csv'), index=False)

## Communities

In [None]:
# Number of node rows per value of the 'community' column.
community_sizes = nodes_df.groupby(by='community').size()
print("Number of communities:", community_sizes.shape[0])
print("Average size of community", community_sizes.mean())