In [None]:
import gzip
import os
import shutil
import urllib
import urllib.request

import numpy as np

In [None]:
# Source of the gzip-compressed edge list.
url = "https://snap.stanford.edu/data/web-Google.txt.gz"
# Local filename for the downloaded archive; later cells read the data from `f`.
f = 'web-Google.txt.gz'

# Download only when no local copy exists, so re-running this cell is cheap.
if not os.path.exists(f):
    # Stream into a temporary sibling file and rename it once the copy has
    # finished. An interrupted download therefore never leaves a truncated
    # archive under the final name, which the existence check above would
    # otherwise mistake for a complete file on the next run.
    partial = f + '.part'
    try:
        # Context managers close both the HTTP response and the output file,
        # even when the copy fails part-way through.
        with urllib.request.urlopen(url) as r, open(partial, 'wb') as fo:
            shutil.copyfileobj(r, fo)
        os.replace(partial, f)
    finally:
        # After a successful rename the temporary file is gone; this only
        # cleans up after a failed or interrupted download.
        if os.path.exists(partial):
            os.remove(partial)


In [None]:


def load_Gdata(name):
    """Read a gzip-compressed edge list into an adjacency dictionary.

    Each data line is expected to hold at least two whitespace-separated
    integers, ``source target``; any further tokens are ignored. Lines
    starting with ``#`` are treated as comments. Lines with fewer than two
    tokens, or whose first two tokens are not integers, are reported on
    stdout and skipped so that one malformed line does not discard the rest
    of the file.

    Parameters
    ----------
    name : str or path-like
        Path of the gzip file to read (opened in text mode).

    Returns
    -------
    dict[int, list[int]]
        Maps every source id to the list of its targets, in file order.
        Ids that only ever appear as a target do not get a key of their own.
        If the file is missing or cannot be read, the problem is printed and
        whatever was collected so far (possibly an empty dict) is returned;
        no exception is propagated to the caller.
    """
    graph = {}
    try:
        with gzip.open(name, 'rt') as file:
            for line in file:
                if line.startswith("#"):
                    continue
                parts = line.strip().split()
                if len(parts) < 2:
                    print(f"Skipping line with insufficient data: {line}")
                    continue
                try:
                    source, target = int(parts[0]), int(parts[1])
                except ValueError:
                    # Handle bad tokens per line; otherwise the outer handler
                    # would abort the whole load and return a truncated graph.
                    print(f"Skipping line with non-integer node ids: {line}")
                    continue
                graph.setdefault(source, []).append(target)
    except FileNotFoundError:
        print(f"File not found: {name}")
    except Exception as e:
        # Best-effort loader: report the problem and hand back what was read.
        print(f"An error occurred while reading the file: {e}")

    return graph


In [None]:

# Parse the local edge-list archive `f` into a {source: [targets]} adjacency dict.
graph = load_Gdata(f)

In [39]:
# Start from the uniform distribution: one entry per key of `graph`,
# each equal to 1 / num_vertices.
# NOTE(review): load_Gdata only creates keys for source nodes, so vertices that
# appear solely as edge targets are not counted here — confirm this is intended.
num_vertices = len(graph)
pagerank = np.full(num_vertices, 1.0) / num_vertices

In [40]:
# Display the initial rank vector (every entry equals 1 / num_vertices).
pagerank

array([1.35234917e-06, 1.35234917e-06, 1.35234917e-06, ...,
       1.35234917e-06, 1.35234917e-06, 1.35234917e-06])

In [48]:
def powerP(x, aPt, r=0.1, maxn=1000, tol=1e-10):
    """Iterate ``x <- (1 - r) * aPt(x) + r / len(x)`` until it stops changing.

    Parameters
    ----------
    x : array-like
        Starting vector.
    aPt : callable
        Called once per iteration with the current vector; its result is
        scaled by ``1 - r`` before the restart term is added.
    r : float, optional
        Restart weight; ``r / len(x)`` is added to every component.
    maxn : int, optional
        Maximum number of iterations.
    tol : float, optional
        Iteration stops as soon as the norm (``np.linalg.norm``) of the
        difference between two successive iterates falls below this value.

    Returns
    -------
    The most recent iterate. If ``maxn`` iterations pass without meeting
    ``tol``, the last iterate is returned without any warning; with
    ``maxn == 0`` the input ``x`` is returned unchanged.
    """
    current = x
    for _ in range(maxn):
        # One application of the operator, then blend in the uniform restart term.
        propagated = aPt(current)
        candidate = (1 - r) * propagated + r / len(current)

        # Size of this step, measured before the iterate is advanced.
        delta = np.linalg.norm(candidate - current)
        current = candidate
        if delta < tol:
            break

    return current