In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle as pkl

import scipy as sp
import matplotlib.pyplot as plt
import math

from sklearn.cluster import SpectralClustering

In [3]:
with open('../graphs/exports_graphs.pkl', 'rb') as file:
    export_graphs = pkl.load(file)

In [4]:
from sklearn.metrics import silhouette_score

silhouettes = []
for c in range(2,20):
    train_clusters = []

    for g in export_graphs:
        # Compute the normalized Laplacian matrix of the graph
        laplacian = nx.normalized_laplacian_matrix(g)

        # Perform spectral clustering
        clustering = SpectralClustering(n_clusters=c, assign_labels='discretize', random_state=0).fit(laplacian.toarray())

        train_clusters.append(clustering.labels_)

        silhouette = silhouette_score(laplacian.toarray(), clustering.labels_)

    silhouettes.append(np.mean(silhouette))



In [5]:
silhouettes

[0.0250025745252816,
 0.019876355904485072,
 0.021412634273147883,
 0.022639744053378177,
 0.024204332431567492,
 0.024483526125772877,
 0.017783850156443896,
 0.016019558289040895,
 0.010952748969405084,
 0.008188840695440803,
 0.00940068137535393,
 0.008790045016909333,
 0.007747684920472309,
 0.00941848552662331,
 0.006354600951699029,
 0.00252628438558454,
 0.00046027944557227457,
 0.0011489467568314012]

In [6]:
train_clusters = []

for g in export_graphs:
    # Compute the normalized Laplacian matrix of the graph
    laplacian = nx.normalized_laplacian_matrix(g)

    # Perform spectral clustering
    clustering = SpectralClustering(n_clusters=2, assign_labels='discretize', random_state=0).fit(laplacian.toarray())

    train_clusters.append(clustering.labels_)



In [7]:
from sklearn.metrics import adjusted_rand_score

ari_scores_train = []

# Compute the Adjusted Rand Index
for c_1 in train_clusters:
    current_ari_scores = []
    for c_2 in train_clusters:
        ari = adjusted_rand_score(c_1, c_2)
        current_ari_scores.append(ari)
    ari_scores_train.append(current_ari_scores)

In [8]:
silhouettes = []
for c in range(10,25):
    train_graph_clustering = SpectralClustering(n_clusters=c, assign_labels='discretize', random_state=0).fit(ari_scores_train)

    silhouette = silhouette_score(ari_scores_train, train_graph_clustering.labels_)
    silhouettes.append(silhouette)



In [9]:
silhouettes

[0.36432700275396473,
 0.3610210689680595,
 0.3757965162225819,
 0.3831242557788831,
 0.3890737458351662,
 0.36832132087699293,
 0.36746634561618285,
 0.36449246422459874,
 0.36652079485162403,
 0.3687517482690087,
 0.37504879912847505,
 0.36491745755295324,
 0.36378572957118893,
 0.3654412615234035,
 0.36702567239992495]

In [10]:
train_graph_clustering = SpectralClustering(n_clusters=14, assign_labels='discretize', random_state=0).fit(ari_scores_train)



In [11]:
labels = train_graph_clustering.labels_

In [12]:
labels

array([ 6,  9,  6,  5,  9,  5,  5,  1, 13, 13,  1,  1, 13,  1,  1,  1, 13,
        0,  1,  1,  1,  0,  0,  0, 11, 13,  7,  7, 11,  7, 12, 10, 10, 13,
       12, 12, 12, 12,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  8,  8,
        4,  4,  4,  4,  4,  1,  4,  1,  8])

In [13]:
centroids = []
for i in np.unique(labels):
    positions = []
    for t, j in enumerate(labels):
        if j == i:
            positions.append(t)
    mean_position = np.mean(positions)

    differences = np.abs(positions - mean_position)

    # Find the value in the array closest to the value
    closest_value = positions[np.argmin(differences)]

    centroids.append(closest_value)


In [14]:
centroids

[21, 20, 46, 40, 54, 5, 0, 27, 50, 1, 31, 24, 34, 16]

In [15]:
change_point_years = []
for i in centroids:
    change_point_years.append(i + 1962)

In [16]:
change_point_years

[1983,
 1982,
 2008,
 2002,
 2016,
 1967,
 1962,
 1989,
 2012,
 1963,
 1993,
 1986,
 1996,
 1978]