In [1]:
import networkx as nx
from urllib import request
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.svm import SVC
from sklearn.metrics import *

In [2]:
# Build the citation graph from the edge list; node ids are parsed as ints.
G = nx.read_edgelist("cora.cites", nodetype=int)

node_labels = {}    # node id -> integer class id
node_features = {}  # node id -> list of numeric feature values
# class name (last column) -> integer id, assigned in first-seen order.
# NOTE: the name `labels` is rebound to a plain list by a later cell, so this
# cell must be re-run before that one whenever the notebook is re-executed.
labels = {}
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('cora.content', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        if not row:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with an IndexError on row[-1].
            continue

        node_id = int(row[0])
        class_name = row[-1]
        if class_name not in labels:
            labels[class_name] = len(labels)

        node_labels[node_id] = labels[class_name]
        # csv yields strings; convert explicitly so downstream estimators get
        # numeric input rather than relying on implicit string-to-float casts.
        # Assumes every middle column is numeric -- TODO confirm for other data.
        node_features[node_id] = [float(value) for value in row[1:-1]]

In [3]:
 from sklearn.cluster import SpectralClustering

In [4]:
# Order the connected components from largest to smallest node count and keep
# a subgraph view of the biggest one for the clustering experiments.
connComps = list(nx.connected_components(G))
connComps.sort(key=len, reverse=True)
lConnComp = G.subgraph(connComps[0])

In [5]:
# Ground-truth class ids and feature rows for the nodes of the largest
# component, both listed in lConnComp.nodes() iteration order.
labels = [node_labels[node] for node in lConnComp.nodes()]
features = [node_features[node] for node in lConnComp.nodes()]

In [11]:
# Spectral clustering on node features only, 7 clusters. No affinity is passed,
# so the estimator's default is used; random_state fixes the seed.
featureClusters = SpectralClustering(
    n_clusters=7,
    assign_labels='kmeans',
    random_state=0,
).fit(features)

[1.69795259e-13 4.14114604e-06 1.41721735e-05 1.11840982e-05
 7.57386610e-06 7.73250037e-06 6.45270474e-06 2.48318983e-05]
not reaching the requested tolerance 1e-05.
Use iteration 1958 instead with accuracy 
7.582774199101469e-06.

  _, diffusion_map = lobpcg(
[1.71111985e-13 4.04507142e-06 1.41485154e-05 1.05844850e-05
 7.10142232e-06 7.28997308e-06 5.21304967e-06 1.22797830e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(


In [12]:
# Per-node cluster assignment from the feature-based clustering (7 clusters).
featureOut = featureClusters.labels_

In [6]:
# Spectral clustering on graph topology only, 7 clusters: the adjacency matrix
# of the largest component is passed as a precomputed affinity matrix.
# .toarray() is used rather than .todense(): on a scipy sparse *matrix*,
# .todense() returns np.matrix, which recent scikit-learn input validation
# rejects, while .toarray() always returns a plain ndarray with equal values.
topologicalClusters = SpectralClustering(
    n_clusters=7,
    assign_labels='kmeans',
    random_state=0,
    affinity='precomputed',
).fit(nx.adjacency_matrix(lConnComp).toarray())

In [7]:
# Per-node cluster assignment from the topology-based clustering (7 clusters).
topologicalOut = topologicalClusters.labels_

In [9]:
from sklearn import metrics

In [13]:
# Rand score: ground-truth classes vs. feature-based clusters (7 clusters).
metrics.rand_score(labels_true=labels, labels_pred=featureOut)

0.18724229434578485

In [10]:
# Rand score: ground-truth classes vs. topology-based clusters (7 clusters).
metrics.rand_score(labels_true=labels, labels_pred=topologicalOut)

0.3032520404228916

### Attempting with 5 clusters

In [14]:
# Feature-based spectral clustering repeated with 5 clusters (same seed).
featureClusters5 = SpectralClustering(
    n_clusters=5,
    assign_labels='kmeans',
    random_state=0,
).fit(features)
featureOut5 = featureClusters5.labels_

In [15]:
# Topology-based spectral clustering repeated with 5 clusters (same seed).
# .toarray() replaces .todense() so the affinity is a plain ndarray rather than
# np.matrix, which recent scikit-learn input validation rejects.
topologicalClusters5 = SpectralClustering(
    n_clusters=5,
    assign_labels='kmeans',
    random_state=0,
    affinity='precomputed',
).fit(nx.adjacency_matrix(lConnComp).toarray())
topologicalOut5 = topologicalClusters5.labels_

In [16]:
# Rand score: ground-truth classes vs. feature-based clusters (5 clusters).
metrics.rand_score(labels_true=labels, labels_pred=featureOut5)

0.18746196988695457

In [17]:
# Rand score: ground-truth classes vs. topology-based clusters (5 clusters).
metrics.rand_score(labels_true=labels, labels_pred=topologicalOut5)

0.2888205237868434

Compared to the node features, the adjacency matrix provided more useful information about the graph, as evidenced by the higher Rand score achieved when clustering on it (about 0.30 vs. 0.19 with 7 clusters, and 0.29 vs. 0.19 with 5 clusters). I believe that combining the topological information with the node features could yield even better clustering results. One way to do this is to concatenate each node's adjacency (neighbor) information with its feature vector before clustering.

#### USING ADJUSTED RAND METRIC

In [18]:
# Adjusted Rand score: ground-truth classes vs. feature-based clusters (7 clusters).
metrics.adjusted_rand_score(labels_true=labels, labels_pred=featureOut)

-7.966724644120632e-05

In [19]:
# Adjusted Rand score: ground-truth classes vs. topology-based clusters (7 clusters).
metrics.adjusted_rand_score(labels_true=labels, labels_pred=topologicalOut)

-0.013354426626133611

In [20]:
# Feature-based spectral clustering repeated with 10 clusters (same seed).
featureClusters10 = SpectralClustering(
    n_clusters=10,
    assign_labels='kmeans',
    random_state=0,
).fit(features)
featureOut10 = featureClusters10.labels_

[2.57467275e-13 4.17338966e-06 2.99362592e-06 2.13794636e-06
 4.15551102e-06 8.13243757e-06 5.18725903e-06 1.10897395e-05
 1.96276999e-05 4.36888637e-05 3.18772176e-05]
not reaching the requested tolerance 1e-05.
Use iteration 1589 instead with accuracy 
4.459938423839388e-06.

  _, diffusion_map = lobpcg(
[2.68793322e-13 1.62463413e-06 1.45675669e-06 1.72706340e-06
 2.12154874e-06 1.83478151e-06 2.30839512e-06 2.04603215e-06
 7.38003674e-06 1.33417634e-05 1.52185500e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(


In [21]:
# Topology-based spectral clustering repeated with 10 clusters (same seed).
# .toarray() replaces .todense() so the affinity is a plain ndarray rather than
# np.matrix, which recent scikit-learn input validation rejects.
topologicalClusters10 = SpectralClustering(
    n_clusters=10,
    assign_labels='kmeans',
    random_state=0,
    affinity='precomputed',
).fit(nx.adjacency_matrix(lConnComp).toarray())
topologicalOut10 = topologicalClusters10.labels_

In [23]:
# Adjusted Rand score: ground-truth classes vs. feature-based clusters (10 clusters).
metrics.adjusted_rand_score(labels_true=labels, labels_pred=featureOut10)

0.0009751241414141551

In [24]:
# Adjusted Rand score: ground-truth classes vs. topology-based clusters (10 clusters).
metrics.adjusted_rand_score(labels_true=labels, labels_pred=topologicalOut10)

0.17551980888044477

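Using the adjusted Rand score resulted in significantly lower scores than the (unadjusted) Rand score. With 7 clusters, both adjusted scores are approximately zero (about -0.0001 for the features and -0.013 for the adjacency matrix), meaning that neither clustering agrees with the true labels better than chance. Despite this, I believe that the adjacency matrix provides better information, as indicated by the consistently higher Rand scores obtained using it. Furthermore, with the number of clusters set to 10, the adjusted Rand score was clearly higher when using the adjacency matrix (about 0.176 vs. 0.001 for the features).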