# Page Rank 
##### Authors: Chiara Frizzarin, Leonardo Gusson, Luca Rao
_____
Power Iteration implementation with with Sparse matrix moltiplication

## Helper function
Used to identify the running hardware and load corresponding libraries

In [60]:
import sys
import platform 
import scipy.sparse as sp
import numpy as np 


def get_platform():
    """
    Detects hardware in use and returns (platform_name, library_to_use).
    """
    if sys.platform == "darwin" and platform.processor() == "arm":
        try:
            import mlx.core as mx
            return "mlx", mx
        except ImportError:
            pass
    try:
        import torch
        if torch.cuda.is_available():
            return "gpu", torch
    except ImportError:
        pass
    return "cpu", np

___

## Main Function

To efficiently compute PageRank we implemented a custom version of the Power Method. 
This implementation uses sparse matrix operations to enable hardware acceleration (GPU/MLX) and handles **dangling nodes** (nodes with no outgoing edges) to ensure numerical stability.

####  Mathematical Formulation

We implement the algebraic formulation of PageRank where the score vector $\mathbf{r}$ is the stationary distribution of a modified Markov chain.

The standard transition calculation is adjusted to account for "leaked" probability mass from dangling nodes (sinks). Instead of constructing a dense stochastic matrix, we use a rank-one update strategy (to use sparse matrix). 
The iterative update rule implemented in our code is:

$$
\mathbf{r}_{next} = \alpha \mathbf{P}^T \mathbf{r} + (\alpha \cdot \text{sink\_mass} + (1-\alpha)) \mathbf{v}
$$

Where the variables correspond to:

* $\mathbf{r}$ (Code: `pr`): The PageRank vector at the current iteration.
* $\mathbf{r}_{next}$ (Code: `pr_next`): The updated PageRank vector for the next iteration.
* $\alpha$ (Code: `alpha`): The damping factor (default 0.85).
* $\mathbf{P}^T$ (Code: `P_matrix`): The transpose of the row-normalized adjacency matrix. This is stored as a sparse matrix (CSC/COO) to optimize memory. It represents the probability flow from non-dangling nodes.
* $\text{sink\_mass}$ (Code: `sink_mass`): The scalar sum of the current PageRank scores of all dangling nodes ($\sum_{j \in \text{sinks}} r_j$). This represents the total probability mass that "enters" a sink and must be redistributed.
* $\mathbf{v}$ (Code: `teleport_v`): The teleportation vector, representing the probability of jumping to any specific node. In our implementation, `teleport_v` stores the constant term $\frac{1-\alpha}{N}$.


In [61]:
def get_pagerank (G, alpha = 0.85, tol= 1e-5, max_iter =1000, force_cpu = False):
    '''
    Compute PageRank Score usign Power iteration method

    Args:  
        G           is the loaded largest connected component of the graph (loaded pickle file)
        alpha       (dumping factor)the probability that a random surfer continues clicking on links rather than jumping to a random page
        tol         (tolerance) determines when the iterative calculation stops (how little the new score vector should change from the 
                    previus computation to determine we have a solution)
        max_iter    maximum number of iteration (default = 1000)
        force_cpu   is a flag to force computation on cpu ignoring platform (ignore mlx or gpu)
    
    Returns:
        a standard Python dictionary where:
        - Keys (node_labels): node identifiers  (ASIN strings like '0827229534')
        - Values (final_ranks): the computed PageRank scores (floats)
    '''
    
    # Extract Adjacency Matrix
    if hasattr(G, 'adjacency'):
        node_labels = list(G.nodes())
        # Convert to Scipy CSC matrix 
        adj = nx.to_scipy_sparse_array(G, format='csc', dtype=np.float32)
    else:
        adj = G.tocsc()
        node_labels = list(range(adj.shape))
    
    #number of nodes 
    n_nodes = adj.shape[0]
    
    if __name__ == "__main__":
        display(n_nodes)
    
    #we now can calcualate the out degree of each node (that given the adjaceny matrix 
    # is simply the sum along colums )âˆš
    out_degrees = np.array(adj.sum(axis=1)).flatten()
    is_sink = (out_degrees == 0)                 #another check if a node is a sink
    
    # Normalize transition probabilities: P_ij = 1 / out_degree(j) (ignoring sinks)
    norm_out_degrees = np.where(is_sink, 1.0, out_degrees)
    adj.data = adj.data / norm_out_degrees[np.repeat(np.arange(n_nodes), np.diff(adj.indptr))]
    
    if not force_cpu: 
        platform_name, engine = get_platform()
        print(platform_name)
    else:
        platform_name = 'cpu'
        print("Forcing CPU computing")
    
    
    P_matrix = adj.T.tocsc()
    
    if platform_name == "mlx":
        import mlx.core as mx
        
        counts = np.diff(P_matrix.indptr)
        targets_np = np.repeat(np.arange(n_nodes), counts)
        
        # Move arrays to Unified Memory
        indices = mx.array(P_matrix.indices)
        data = mx.array(P_matrix.data)
        targets = mx.array(targets_np)
        sink_mask = mx.array(is_sink.astype(np.float32))
        
        pr = mx.full((n_nodes,), 1.0 / n_nodes)
        teleport_v = (1.0 - alpha) / n_nodes

        @mx.compile
        def update_step(r_prev):
            source_ranks = r_prev[targets]
            # Weighted values to sum
            weighted = data * source_ranks
            
            res = mx.zeros((n_nodes,))
            res = res.at[indices].add(weighted)
            
            # Sink correction
            sink_mass = mx.sum(r_prev * sink_mask)
            return (alpha * (res + sink_mass / n_nodes)) + teleport_v

        for i in range(max_iter):
            pr_next = update_step(pr)
            mx.eval(pr_next) 
            
            if mx.sum(mx.abs(pr_next - pr)) < tol:
                print(f"Converged at iteration {i}")
                break
            pr = pr_next
    
        pr = pr / mx.sum(pr)
        final_ranks = np.array(pr)
    
    
    elif platform_name == "gpu ":
        import torch
        device = torch.device("cuda")
        P_torch = torch.sparse_csc_tensor(
            torch.from_numpy(P_matrix.indptr).to(torch.int64),
            torch.from_numpy(P_matrix.indices).to(torch.int64),
            torch.from_numpy(P_matrix.data).to(torch.float32),
            size=(n_nodes, n_nodes)
        ).to(device)
        
        sinks = torch.from_numpy(is_sink).to(device)
        pr = torch.full((n_nodes, 1), 1.0 / n_nodes, device=device)
        teleport_v = (1.0 - alpha) / n_nodes

        for i in range(max_iter):
            #Sparse MAtrix_vect multiplication (more efficient)
            pr_next = torch.sparse.mm(P_torch, r)
            sink_mass = torch.sum(r[sinks])
            pr_next = alpha * (pr_next + sink_mass / n_nodes) + teleport_v
            
            if torch.norm(pr_next - pr, p=1) < tol:
                print(f"Converged at iteration {i}")
                break
            pr = pr_next
        #NOTE cpu() transfer from VRAM ----> RAM     
        final_ranks = pr.cpu().numpy().flatten()
    
    else:
        #we create the pagerank vector (initialized as every node as probability 1/n)
        pr = np.full(n_nodes, 1.0/n_nodes)
        teleport_const = (1.0 - alpha) / n_nodes
        for i in range(max_iter):
            
            pr_next = P_matrix.dot(pr)
            sink_mass = np.sum(pr[is_sink])
            pr_next = alpha*(pr_next +sink_mass / n_nodes)+teleport_const
            
            # check if maximum difference is lower than tol, if yes not much imprvement so we break
            if np.linalg.norm(pr_next - pr,1)<tol:
                break
            #else we update the pagerank vector 
            pr = pr_next
            
        final_ranks = pr
        
        
    return dict(zip(node_labels, final_ranks))

In [62]:
#Load of oicke Graph 
import pickle
import sys, os
from pathlib import Path

DATA_DIR = Path("../data")
PROCESSED_DATA_DIR = DATA_DIR / "processed"
PICKLE_PATH = PROCESSED_DATA_DIR / "amazon_graph.pickle"

In [63]:
with open(PICKLE_PATH, "rb") as f:
    G = pickle.load(f)

In [64]:
import pandas as pd
handpagerank_dict = get_pagerank(G, force_cpu=False)
df_handpagerank = pd.DataFrame.from_dict(handpagerank_dict,orient='index', columns=['HandPageRank'])
df_handpagerank.index.name = "ASIN"
df_handpagerank.head()

#the sum must be as close as possibile to one, some error are possible due to float errors even if i tried to fix those
display(df_handpagerank['HandPageRank'].sum())

334843

mlx
Converged at iteration 45


np.float32(1.0)

In [65]:
import networkx as nx 

#we set the same params as the hand pagerank and we save it directly in the 
df_nxpagerank= pd.DataFrame.from_dict(nx.pagerank(G,alpha=0.85, max_iter=1000),orient='index', columns=['NxPageRank'])
df_nxpagerank.index.name = "ASIN"
df_nxpagerank.head()


Unnamed: 0_level_0,NxPageRank
ASIN,Unnamed: 1_level_1
827229534,7.860731e-06
738700797,7.739031e-06
842328327,8.426177e-07
1577943082,1.608055e-06
486220125,4.739314e-07


In [67]:
df_pageranks = df_handpagerank.join(df_nxpagerank)
display(df_pageranks.head(10))

display(df_pageranks.corr(method="spearman"))

Unnamed: 0_level_0,HandPageRank,NxPageRank
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
0827229534,5.411765e-06,7.860731e-06
0738700797,9.57772e-06,7.739031e-06
0842328327,7.027543e-07,8.426177e-07
1577943082,1.421435e-06,1.608055e-06
0486220125,4.74084e-07,4.739314e-07
B00000AU3R,5.48329e-06,7.002084e-06
0231118597,3.837248e-06,6.672506e-06
0375709363,4.74084e-07,4.739314e-07
0871318237,4.74084e-07,4.739314e-07
1590770218,5.627329e-06,4.291182e-06


Unnamed: 0,HandPageRank,NxPageRank
HandPageRank,1.0,0.983531
NxPageRank,0.983531,1.0
