In [3]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

In [4]:
def get_links_from_page(url):
    """Extract links from a given web page."""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        links = {link.get('href') for link in soup.find_all('a', href=True)}
        # Filter links to avoid JavaScript, None, or malformed URLs
        return {link for link in links if link.startswith("http")}
    except Exception as e:
        print(f"Error fetching links from {url}: {e}")
        return set()

In [5]:
def build_web_graph(seed_urls, max_pages=10):
    """Build a web graph starting from seed URLs."""
    web_graph = defaultdict(set)
    visited = set()

    to_visit = seed_urls[:]
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url not in visited:
            visited.add(url)
            links = get_links_from_page(url)
            web_graph[url].update(links)
            # Add new links to the to_visit list if not already visited
            to_visit.extend(links - visited)

    return web_graph

In [6]:
def page_rank(graph, iterations=100, damping_factor=0.85):
    """Calculate PageRank scores using the iterative method."""
    # Initialize PageRank values
    num_pages = len(graph)
    ranks = {page: 1 / num_pages for page in graph}

    for _ in range(iterations):
        new_ranks = {}
        for page in graph:
            rank_sum = 0
            for other_page in graph:
                if page in graph[other_page]:
                    rank_sum += ranks[other_page] / len(graph[other_page])
            new_ranks[page] = (1 - damping_factor) / num_pages + damping_factor * rank_sum
        ranks = new_ranks

    return ranks

In [7]:
# Example usage
seed_urls = [
    "https://medium.com/@lelambonzo/implementing-pagerank-algorithm-in-python-for-web-graph-analysis-718f34c9e4fb",  # Replace with real URLs
    "https://www.geeksforgeeks.org/page-rank-algorithm-implementation/"
]
web_graph = build_web_graph(seed_urls)
ranks = page_rank(web_graph)

print("PageRank Scores:")
for page, score in ranks.items():
    print(f"{page}: {score}")


PageRank Scores:
https://medium.com/@lelambonzo/implementing-pagerank-algorithm-in-python-for-web-graph-analysis-718f34c9e4fb: 0.015000000000000003
https://www.geeksforgeeks.org/page-rank-algorithm-implementation/: 0.015000000000000003
https://github.com/lelambonzo/pagerank-graph-analysis.git: 0.016062500000000004
https://policy.medium.com/medium-privacy-policy-f03bf92035c9?source=post_page-----718f34c9e4fb--------------------------------: 0.016062500000000004
https://speechify.com/medium?source=post_page-----718f34c9e4fb--------------------------------: 0.016062500000000004
https://gist.github.com/lelambonzo/ac2b04101d45556d05f6a8d9608a8b31: 0.016062500000000004
https://blog.medium.com/?source=post_page-----718f34c9e4fb--------------------------------: 0.016062500000000004
https://medium.statuspage.io/?source=post_page-----718f34c9e4fb--------------------------------: 0.016062500000000004
https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2F718f34c9e4fb&%7Efeature=Lo