In [72]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [51]:
def read_graph_from_file(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = tuple(map(int, line.strip().split()))
            edges.append(edge)

    num_nodes = max(max(edge) for edge in edges) + 1
    adj_matrix = np.zeros((num_nodes, num_nodes))

    for edge in edges:
        adj_matrix[edge[0], edge[1]] = 1

    return adj_matrix

In [52]:
def pagerank(adj_matrix, damping_factor=0.85, max_iterations=100, tol=1e-6):
    num_nodes = len(adj_matrix)

    # Check for zero-sum rows and replace them with a uniform distribution
    row_sums = adj_matrix.sum(axis=1, keepdims=True)
    zero_sum_rows = row_sums.flatten() == 0
    non_zero_sum_rows = ~zero_sum_rows

    # Handle zero-sum rows separately to avoid division by zero
    if np.any(zero_sum_rows):
        adj_matrix[zero_sum_rows, :] = 1 / num_nodes
        row_sums[zero_sum_rows] = 1

    # Normalize the adjacency matrix to make it a stochastic matrix
    transition_matrix = adj_matrix / row_sums

    pagerank_vector = np.ones(num_nodes) / num_nodes

    for iteration in range(max_iterations):
        new_pagerank_vector = (1 - damping_factor) / num_nodes + \
                              damping_factor * transition_matrix.T.dot(pagerank_vector)

        if np.linalg.norm(new_pagerank_vector - pagerank_vector, 1) < tol:
            break

        pagerank_vector = new_pagerank_vector

    return pagerank_vector


In [65]:
# Example usage with a file
file_path = 'v2v.txt'
adj_matrix = read_graph_from_file(file_path)
pagerank_scores = pagerank(adj_matrix)

# Print PageRank scores for each node
for node, score in enumerate(pagerank_scores):
    print(f"Node {node}: PageRank = {score}")

Node 0: PageRank = 0.00010643489039990465
Node 1: PageRank = 0.00016612176773607635
Node 2: PageRank = 7.894344877641445e-05
Node 3: PageRank = 0.00011019126518506193
Node 4: PageRank = 7.894344877641445e-05
Node 5: PageRank = 0.00014058727357847293
Node 6: PageRank = 8.554478024626848e-05
Node 7: PageRank = 0.0002486221318806854
Node 8: PageRank = 0.00016113612975635267
Node 9: PageRank = 0.00013921820515074967
Node 10: PageRank = 0.00013103155238346355
Node 11: PageRank = 0.00021806549793062445
Node 12: PageRank = 0.0001245993899234451
Node 13: PageRank = 0.00015856060558518505
Node 14: PageRank = 0.0001254893354079383
Node 15: PageRank = 8.843203483885142e-05
Node 16: PageRank = 0.0001940952140629226
Node 17: PageRank = 0.00020112217888436677
Node 18: PageRank = 8.843203483885142e-05
Node 19: PageRank = 0.00016066963671608104
Node 20: PageRank = 0.00037959807279742367
Node 21: PageRank = 0.00010581671035666508
Node 22: PageRank = 0.00011945117250279393
Node 23: PageRank = 9.02536971

In [70]:
# Specify the nodes of interest
nodes_of_interest = [1777, 999, 876, 277, 387, 688, 6319]

# Find neighbors for each node
neighbors_dict = {}
for node in nodes_of_interest:
    neighbors = np.where(adj_matrix[node] == 1)[0]
    neighbors_dict[node] = neighbors

# Calculate cosine similarity between nodes based on PageRank scores
similarity_matrix = cosine_similarity(pagerank_scores.reshape(1, -1), adj_matrix.T)

# Find and print the 10 most similar nodes for each node of interest
for node in nodes_of_interest:
    neighbor_similarities = similarity_matrix[0, neighbors_dict[node]]
    most_similar_nodes = neighbors_dict[node][np.argsort(neighbor_similarities)[-10:]][::-1]

    print(f"\nTop 10 most similar nodes to Node {node}:")
    for idx, similar_node in enumerate(most_similar_nodes):
        similarity_score = neighbor_similarities[np.where(neighbors_dict[node] == similar_node)[0][0]]
        print(f"{idx + 1}. Node {similar_node} (Similarity Score: {similarity_score:.4f})")


Top 10 most similar nodes to Node 1777:
1. Node 509 (Similarity Score: 0.0730)
2. Node 2066 (Similarity Score: 0.0518)
3. Node 3035 (Similarity Score: 0.0455)
4. Node 4200 (Similarity Score: 0.0352)
5. Node 269 (Similarity Score: 0.0346)
6. Node 2923 (Similarity Score: 0.0322)
7. Node 3220 (Similarity Score: 0.0310)
8. Node 1587 (Similarity Score: 0.0280)
9. Node 4718 (Similarity Score: 0.0271)
10. Node 2248 (Similarity Score: 0.0255)

Top 10 most similar nodes to Node 999:
1. Node 2303 (Similarity Score: 0.0503)
2. Node 17 (Similarity Score: 0.0495)
3. Node 917 (Similarity Score: 0.0483)
4. Node 3246 (Similarity Score: 0.0479)
5. Node 3806 (Similarity Score: 0.0473)
6. Node 6377 (Similarity Score: 0.0466)
7. Node 6301 (Similarity Score: 0.0451)
8. Node 4298 (Similarity Score: 0.0434)
9. Node 5079 (Similarity Score: 0.0427)
10. Node 1689 (Similarity Score: 0.0396)

Top 10 most similar nodes to Node 876:
1. Node 886 (Similarity Score: 0.0767)
2. Node 881 (Similarity Score: 0.0767)
3. N

In [74]:
# Reshape the pagerank_scores to be a column vector
pagerank_features = pagerank_scores.reshape(-1, 1)

# Specify the number of clusters (you may adjust this)
num_clusters = 10

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(pagerank_features)

# Print the cluster assignments
for node, cluster in enumerate(clusters):
    print(f"Node {node}: Cluster {cluster}")

Node 0: Cluster 7
Node 1: Cluster 5
Node 2: Cluster 3
Node 3: Cluster 7
Node 4: Cluster 3
Node 5: Cluster 0
Node 6: Cluster 3
Node 7: Cluster 6
Node 8: Cluster 5
Node 9: Cluster 0
Node 10: Cluster 0
Node 11: Cluster 2
Node 12: Cluster 0
Node 13: Cluster 5
Node 14: Cluster 0
Node 15: Cluster 3
Node 16: Cluster 2
Node 17: Cluster 2
Node 18: Cluster 3
Node 19: Cluster 5
Node 20: Cluster 1
Node 21: Cluster 7
Node 22: Cluster 0
Node 23: Cluster 7
Node 24: Cluster 6
Node 25: Cluster 5
Node 26: Cluster 2
Node 27: Cluster 3
Node 28: Cluster 2
Node 29: Cluster 3
Node 30: Cluster 5
Node 31: Cluster 4
Node 32: Cluster 5
Node 33: Cluster 7
Node 34: Cluster 0
Node 35: Cluster 7
Node 36: Cluster 6
Node 37: Cluster 5
Node 38: Cluster 6
Node 39: Cluster 7
Node 40: Cluster 0
Node 41: Cluster 3
Node 42: Cluster 3
Node 43: Cluster 7
Node 44: Cluster 5
Node 45: Cluster 0
Node 46: Cluster 0
Node 47: Cluster 4
Node 48: Cluster 9
Node 49: Cluster 5
Node 50: Cluster 0
Node 51: Cluster 3
Node 52: Cluster 0
Nod

  super()._check_params_vs_input(X, default_n_init=10)
