In [None]:
%matplotlib inline
import pandas as pd
import os
import scipy.sparse
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory that every CSV read or written in this notebook is resolved against.
# Defaults to the original shared-folder location; set the CASELAW_DATA_DIR
# environment variable to run the notebook against a different directory.
fp = os.environ.get('CASELAW_DATA_DIR', '/media/sf_VBox_Shared/CaseLaw/2018-01-29-lido/derived/')

In [None]:
# Load the case -> article-title link table from the data directory
links_csv_path = os.path.join(fp, 'case_to_article_title_links.csv')
case_to_article_links = pd.read_csv(links_csv_path)

In [None]:
# (rows, columns) of the raw link table
case_to_article_links.shape

In [None]:
# Preview the first rows of the raw link table
case_to_article_links.head()

In [None]:
# Select only one entry for each source,target combination.
# groupby(...).size() gives a Series on a (source, target) MultiIndex holding the
# number of rows per pair; clipping at 1 turns that into one unweighted link per
# distinct pair. This replaces `.nunique()['source']`, which selects a grouping
# column that newer pandas releases no longer include in the nunique() result.
case_to_article_links_distinct = (
    case_to_article_links
    .groupby(['source', 'target'])
    .size()
    .clip(upper=1)
)

In [None]:
# Remember the ordered identifiers behind the two index levels:
# level 0 holds the 'source' keys (cases), level 1 the 'target' keys (articles)
case_ids, article_ids = case_to_article_links_distinct.index.levels

In [None]:
# Number of distinct cases and distinct articles in the link table
len(case_ids), len(article_ids)

In [None]:
# Put the edge list in sparse matrix format (rows: cases, columns: articles).
# MultiIndex.codes gives, for every row, the integer position of its key within
# the corresponding level; it replaces the `labels` attribute that has been
# removed from pandas. The shape is passed explicitly so the matrix dimensions
# always equal the lengths of the two index levels.
link_index = case_to_article_links_distinct.index
mat_sparse = scipy.sparse.coo_matrix(
    (case_to_article_links_distinct.values,
     (link_index.codes[0], link_index.codes[1])),
    shape=(len(link_index.levels[0]), len(link_index.levels[1])))

In [None]:
# Display the repr of the case x article sparse matrix
mat_sparse

In [None]:
# Co-citation matrix: the product of the transposed link matrix with itself,
# giving an article x article matrix
articles_by_cases = mat_sparse.transpose()
mat_cocitation = articles_by_cases.dot(mat_sparse)

In [None]:
# Display the repr of the article x article co-citation matrix
mat_cocitation

In [None]:
# Retrieve the non-zero entries of the co-citation matrix:
# row indices, column indices and the corresponding values
x_index, y_index, data = scipy.sparse.find(mat_cocitation)

In [None]:
# One row per non-zero matrix entry; matrix positions are translated back to
# article identifiers through article_ids
cocitation_columns = {
    'source': article_ids[x_index],
    'target': article_ids[y_index],
    'weight': data,
}
df_cocitation = pd.DataFrame(cocitation_columns)

In [None]:
# Keep only rows with source < target: this removes self-loops (source == target)
# and, of each mirrored pair (a, b) / (b, a), keeps a single row
is_ordered_pair = df_cocitation['source'] < df_cocitation['target']
df_cocitation = df_cocitation.loc[is_ordered_pair]

In [None]:
# Label the row index 'id'
df_cocitation.index = df_cocitation.index.rename('id')

In [None]:
# (number of co-citation edges, number of columns)
df_cocitation.shape

In [None]:
# Write to database.
# if_exists='replace' makes this cell re-runnable: with the default ('fail') a
# second run raises because the table already exists.
# NOTE: the table is dropped and recreated on every run.
import sqlalchemy
engine = sqlalchemy.create_engine('mysql+pymysql://dafne@localhost/caselaw?charset=utf8')
df_cocitation.to_sql('links_article_to_article_title', engine, if_exists='replace')

In [None]:
# Save the complete co-citation edge list as CSV in the data directory
cocitation_csv_path = os.path.join(fp, 'article_to_article_title.csv')
df_cocitation.to_csv(cocitation_csv_path)

In [None]:
# Largest co-citation weight of any edge
df_cocitation['weight'].max()

In [None]:
# Look at the weight distribution (bin edges at 0, 1, ..., 49; larger weights fall outside the bins)
df_cocitation['weight'].hist(bins=range(50));

In [None]:
# What part of the network remains if we cut off beyond a certain weight value?
# `density=True` replaces the `normed` keyword, which Matplotlib no longer accepts.
# NOTE: the bin edges stop at 19, so the cumulative fraction is relative to the
# edges that fall inside the bins, not to all edges.
df_cocitation['weight'].hist(bins=range(20), cumulative=True, histtype='step', density=True);

In [None]:
# Number of distinct articles that occur as source or target of at least one edge
len(set(df_cocitation['source'].unique()) | set(df_cocitation['target'].unique()))

In [None]:
# How large is the network (nodes and edges) for each cut-off value?
# For every cut-off c in 1..w_max only the edges with weight >= c are kept;
# n[c-1] then holds the number of distinct endpoints and m[c-1] the number of edges.
w_max = 20

n = np.zeros(w_max)
m = np.zeros(w_max)

for i, min_weight in enumerate(range(1, w_max + 1)):
    df_sub = df_cocitation.loc[df_cocitation['weight'] >= min_weight]
    endpoints = set(df_sub['source'].unique()) | set(df_sub['target'].unique())
    n[i] = len(endpoints)
    m[i] = len(df_sub)

In [None]:
# Network size against the weight cut-off, on a logarithmic y-axis.
# Axis labels are set so the figure can be read without the surrounding code.
cutoff_values = np.arange(1, w_max+1)
plt.plot(cutoff_values, n, label='# nodes')
plt.plot(cutoff_values, m, label='# edges')
plt.xlabel('minimum edge weight (cut-off)')
plt.ylabel('count')
plt.legend()
plt.yscale('log')
plt.show()

In [None]:
# Network size against the weight cut-off, on a linear y-axis.
# Axis labels are set so the figure can be read without the surrounding code.
cutoff_values = np.arange(1, w_max+1)
plt.plot(cutoff_values, n, label='# nodes')
plt.plot(cutoff_values, m, label='# edges')
plt.xlabel('minimum edge weight (cut-off)')
plt.ylabel('count')
plt.legend()
plt.show()

In [None]:
# So if we cut off at 5 (position 4 of n and m corresponds to weight >= 5):
# number of nodes and number of edges that remain
n[4], m[4]

In [None]:
# Edges with a co-citation weight of at least 5, as an independent copy
has_min_weight = df_cocitation['weight'] >= 5
df_min5 = df_cocitation.loc[has_min_weight].copy()

In [None]:
# Save the thresholded edge list as CSV in the data directory
min5_csv_path = os.path.join(fp, 'article_to_article_title_min5.csv')
df_min5.to_csv(min5_csv_path)

In [None]:
# Show the edges with the highest co-citation weight
df_min5.sort_values('weight', ascending=False).head()

## Create nodes

In [None]:
# Every article that is an endpoint of at least one remaining edge.
# NOTE: this rebinds article_ids, which earlier in the notebook held the article
# level of the link index.
article_ids = set(df_min5['source'].unique()) | set(df_min5['target'].unique())
len(article_ids)

In [None]:
# Build the node table. article_ids is a set, whose iteration order is not
# defined, so the ids are sorted once to give the rows (and any file written
# from them) the same order on every run.
node_ids = sorted(article_ids)
nodes_min5 = pd.DataFrame({'id': node_ids, 'title': node_ids, 'label': node_ids})
# 'book' is the part of the title before the first comma
nodes_min5['book'] = nodes_min5['title'].str.split(',').str[0]

In [None]:
# Save the node table without its index column. The same file is written again
# further down, after the 'community' column has been added.
nodes_min5.to_csv(os.path.join(fp, 'article_title_nodes_min5.csv'), index=False)

In [None]:
# Report the size of the thresholded network
n_nodes_min5 = len(nodes_min5)
n_edges_min5 = len(df_min5)
print('nodes: {}, edges:{}'.format(n_nodes_min5, n_edges_min5))

## Network statistics

In [None]:
import networkx as nx
import community

In [None]:
# Build the graph from the thresholded edge list; edge_attr=True attaches the
# remaining columns of the frame as edge attributes
graph = nx.from_pandas_edgelist(
    df_min5,
    source='source',
    target='target',
    edge_attr=True,
)

In [None]:
# Find the connected component containing the most nodes
ccs = nx.connected_components(graph)
largest_cc = max(ccs, key=len)

In [None]:
# Number of nodes in the largest connected component
len(largest_cc)

In [None]:
# Per-node measures of the thresholded graph, keyed by measure name
statistics = dict(
    degree=graph.degree(),
    degree_centrality=nx.degree_centrality(graph),
    betweenness_centrality=nx.betweenness_centrality(graph),
    closeness_centrality=nx.closeness_centrality(graph),
)

In [None]:
# Detect communities with the Louvain method. The algorithm visits nodes in a
# randomised order, so a fixed random_state is passed to obtain the same
# partition on every run of the notebook.
partition = community.best_partition(graph, random_state=0)

In [None]:
# Number of distinct communities in the partition
len(set(partition.values()))

In [None]:
# Same community detection with resolution=0.1, to compare the number of
# communities found. A fixed random_state keeps the result identical across runs.
partition01 = community.best_partition(graph, resolution=0.1, random_state=0)
len(set(partition01.values()))

In [None]:
# Modularity of the default-resolution partition on the thresholded graph
modularity = community.modularity(partition, graph)
modularity

In [None]:
# Attach to every node the label of its community, stored as a string
nodes_min5['community'] = nodes_min5['id'].map(lambda node_id: str(partition[node_id]))

In [None]:
# Write the node table, now including the 'community' column, without its index
nodes_csv_path = os.path.join(fp, 'article_title_nodes_min5.csv')
nodes_min5.to_csv(nodes_csv_path, index=False)

In [None]:
# Compute the full Louvain hierarchy (one partition per level). A fixed
# random_state makes the dendrogram, and the plots derived from it, the same on
# every run of the notebook.
partition_dendograms = community.generate_dendrogram(graph, random_state=0)

In [None]:
# For every level of the dendrogram: the partition at that level, its number of
# communities and its modularity on the graph
n_levels = len(partition_dendograms)
partitions = [
    community.partition_at_level(partition_dendograms, level)
    for level in range(n_levels)
]
partition_sizes = [len(set(level_partition.values())) for level_partition in partitions]
modularities = [community.modularity(level_partition, graph) for level_partition in partitions]

In [None]:
# One bar chart per quantity, with one bar per dendrogram level
x = np.arange(len(partitions))
for heights, title in ((partition_sizes, 'nr of communities'), (modularities, 'modularity')):
    plt.bar(x, heights)
    plt.title(title)
    plt.show()