In [78]:
import numpy as np
import networkx as nx
from sklearn.cluster import SpectralClustering, DBSCAN, AffinityPropagation
from sklearn import metrics
import json
import pandas as pd

### JSON Node/Edge to Adj Matrix

In [63]:
# Load the node-link graph export. The encoding is given explicitly because
# JSON text is UTF-8, while open() would otherwise fall back to the
# platform's locale encoding.
with open('g2000.json', encoding='utf-8') as f:
    data = json.load(f)
nodes = data['nodes']  # list of node records
edges = data['links']  # list of link records

In [64]:
# Start from a square, all-zero matrix with one row and one column per node
num_nodes = len(nodes)
adj_mat = np.zeros((num_nodes, num_nodes))
adj_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [65]:
# Map each node's 'id' to its position in `nodes`, i.e. its row/column in adj_mat
ix = dict((node['id'], position) for position, node in enumerate(nodes))

In [66]:
# Flag every link in both directions so the matrix stays symmetric
for link in edges:
    a, b = ix[link['source']], ix[link['target']]
    adj_mat[a, b] = adj_mat[b, a] = 1
adj_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [129]:
# Spectral clustering into 7 groups, using the adjacency matrix directly as a
# precomputed affinity. random_state pins the k-means initialisations so that
# re-running the notebook reproduces the same labels (and the same exported
# grouping); without it every run may label the nodes differently.
sc = SpectralClustering(
    n_clusters=7,
    affinity='precomputed',
    n_init=100,
    random_state=0,
)
cluster_labels = sc.fit_predict(adj_mat)
cluster_labels

array([3, 5, 5, ..., 6, 0, 3], dtype=int32)

#### Experimenting to improve clustering
Idea: boost the weight of an edge when both of its endpoints fall in the same cluster.

In [135]:
# Boolean mask selecting the nodes that were assigned to cluster 3
np.equal(cluster_labels, 3)

array([ True, False, False, ..., False, False,  True])

### Update the groupings in the JSON

In [130]:
# Store each node's cluster label on the node record under 'group'.
# `nodes` is the same list object held in `data`, so this updates `data` too.
for i, n in enumerate(nodes):
    n['group'] = int(cluster_labels[i])  # JSON doesn't recognise numpy data types e.g. int32

In [131]:
# Write the graph (with the per-node 'group' field) back out as JSON
with open('g2000_clustered.json', 'w') as fh:
    fh.write(json.dumps(data))

### Let's try DBSCAN

This should be handy for removing noisy data points that don't sit nicely in any category, rather than trying to force them into a cluster.

In [92]:
# Turn the 0/1 adjacency matrix into a distance matrix:
# linked pairs get distance 1, unlinked pairs get distance 11.
adj_matrix_dist = (adj_mat - 1) * -10 + 1
# A node is at distance 0 from itself. Without this, the diagonal of adj_mat
# would be converted like any other entry (to 11 where it is 0), so a
# radius-based method such as DBSCAN with eps=10 would not count a point
# as part of its own neighbourhood.
np.fill_diagonal(adj_matrix_dist, 0)

In [118]:
# DBSCAN over the precomputed distance matrix; eps=10 sits between the
# "linked" distance (1) and the "unlinked" distance (11).
dbscan = DBSCAN(eps=10, min_samples=5, metric='precomputed')
clusters = dbscan.fit(adj_matrix_dist).labels_
clusters

array([0, 0, 0, ..., 0, 0, 0])

In [119]:
# Number of distinct labels found, not counting the -1 label
len(set(clusters) - {-1})

1

In [120]:
# Smallest label present; a value of -1 means some samples were left unclustered
clusters.min()

-1

### Affinity propagation

In [37]:
# Affinity propagation, again feeding the adjacency matrix as a precomputed affinity
clustering = AffinityPropagation(affinity='precomputed')
clustering.fit(adj_mat)
clustering

AffinityPropagation(affinity='precomputed', convergence_iter=15, copy=True,
          damping=0.5, max_iter=200, preference=7, verbose=False)

In [38]:
# Show the label assigned to each sample by the fitted AffinityPropagation model
clustering.labels_

array([   0,    1,    2, ..., 1984, 1985, 1986])