In [1]:
import os
import random
import re
import sys
import numpy as np
import pandas as pd

# Damping factor that main() passes to both PageRank routines.
DAMPING = 0.85
# Sample count that main() requests from sample_pagerank and prints in its report heading.
SAMPLES = 10000

#------------------------------------#

def main(dir):
    """
    Crawl the HTML pages found in `dir` and print their PageRank values twice:
    first as produced by sample_pagerank, then as produced by iterate_pagerank.
    Pages are listed in sorted order with four decimal places.
    """
    def report(heading, scores):
        # One heading line followed by one indented line per page.
        print(heading)
        for name in sorted(scores):
            print(f"  {name}: {scores[name]:.4f}")

    corpus = crawl(dir)
    report(
        f"PageRank Results from Sampling (n = {SAMPLES})",
        sample_pagerank(corpus, DAMPING, SAMPLES),
    )
    report("PageRank Results from Iteration", iterate_pagerank(corpus, DAMPING))

def crawl(directory):
    """
    Build the link graph of the HTML pages stored in `directory`.

    Every file whose name ends in ".html" becomes a key of the returned
    dictionary. Its value is the set of other pages of the corpus that it
    points to through an <a ... href="..."> tag; links to the page itself and
    links to anything that is not a page of the corpus are dropped.
    """
    anchor_href = re.compile(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"")

    # First pass: collect every href of every page, minus self-references.
    outgoing = {}
    for name in os.listdir(directory):
        if name.endswith(".html"):
            with open(os.path.join(directory, name)) as handle:
                html = handle.read()
            outgoing[name] = set(anchor_href.findall(html)) - {name}

    # Second pass: keep only targets that are themselves pages of the corpus.
    return {
        name: {target for target in targets if target in outgoing}
        for name, targets in outgoing.items()
    }

#--------------------------------------------#


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.

    If `page` has no outgoing links, every page in the corpus (including
    `page` itself) is returned with equal probability, so the distribution
    always sums to 1.

    corpus: dictionary mapping each page name to the set of pages it links to.
    page: key of `corpus` for the current page; a KeyError is raised otherwise.
    damping_factor: probability of following a link of `page`.

    Returns a dictionary with one entry per page of `corpus`.
    """
    num_pages = len(corpus)
    links = corpus[page]

    # A page without links would otherwise lose the `damping_factor` share of
    # its probability mass; spread it uniformly over the whole corpus instead.
    if not links:
        return {name: 1 / num_pages for name in corpus}

    random_jump = (1 - damping_factor) / num_pages
    follow_link = damping_factor / len(links)

    distribution = dict.fromkeys(corpus, random_jump)
    for target in links:
        distribution[target] += follow_link

    return distribution
    
#------------------------------------------#

def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by following the transition model
    for `n` steps, starting with a page at random.

    Only the starting page is drawn at random. From there the function does
    not draw individual pages: it propagates the full probability
    distribution of the surfer's position through the transition model and
    averages the `n` resulting distributions, i.e. it returns the expected
    fraction of visits to each page.

    corpus: dictionary mapping each page name to the set of pages it links to.
    damping_factor: forwarded unchanged to transition_model.
    n: number of steps to average over; must be at least 1.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). The values
    sum to 1 whenever every distribution returned by transition_model does.

    Raises ValueError if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # The transition model of a page does not depend on the step, so build
    # each one once instead of once per page per step.
    models = {
        page: transition_model(corpus, page, damping_factor)
        for page in corpus
    }

    start = random.choice(list(corpus))

    # Distribution of the surfer's position after the first move.
    current = dict(models[start])
    totals = dict(current)

    # Use the caller's `n`, not the module-level sample count, so that the
    # number of accumulated distributions matches the divisor below.
    for _ in range(n - 1):
        following = dict.fromkeys(corpus, 0)
        for page, weight in current.items():
            if weight > 0:
                for target, chance in models[page].items():
                    following[target] += chance * weight
        for page in corpus:
            totals[page] += following[page]
        current = following

    return {page: totals[page] / n for page in corpus}
        
#---------------------------------------------#

def iterate_pagerank(corpus, damping_factor, tolerance=0.001):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Every page starts at 1 / N (N = number of pages). On each round a page
    receives (1 - damping_factor) / N plus, from every page linking to it,
    damping_factor times that page's rank divided by its number of links.
    A page without outgoing links is treated as linking to every page of the
    corpus (itself included), so no rank is lost and the values keep
    summing to 1.

    corpus: dictionary mapping each page name to the set of pages it links to.
    damping_factor: weight given to rank received through links.
    tolerance: iteration stops once no page's rank changes by more than this
        amount between two consecutive rounds.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1. An empty corpus yields an empty
    dictionary.
    """
    if not corpus:
        return {}

    num_pages = len(corpus)
    base_rank = (1 - damping_factor) / num_pages
    ranks = dict.fromkeys(corpus, 1 / num_pages)

    while True:
        updated = dict.fromkeys(corpus, base_rank)
        for page, links in corpus.items():
            # Dangling page: distribute its rank over the whole corpus.
            targets = links if links else corpus
            share = (damping_factor * ranks[page]) / len(targets)
            for target in targets:
                updated[target] += share

        converged = all(
            abs(ranks[page] - updated[page]) <= tolerance for page in corpus
        )
        if converged:
            # The previous estimate is returned; it differs from the newest
            # one by at most `tolerance` per page.
            return ranks
        ranks = updated
    
#----------------------------------------------#

# Directory holding the HTML pages to rank: the "corpus1" folder inside the
# current working directory. Change the folder name here to rank a different
# set of pages.
path2 = os.path.join(os.path.abspath(os.getcwd()), 'corpus1')

# Function call #
main(path2)


PageRank Results from Sampling (n = 10000)
  bfs.html: 0.1149
  dfs.html: 0.0806
  games.html: 0.2280
  minesweeper.html: 0.1183
  minimax.html: 0.1309
  search.html: 0.2090
  tictactoe.html: 0.1183
PageRank Results from Iteration
  bfs.html: 0.1151
  dfs.html: 0.0806
  games.html: 0.2272
  minesweeper.html: 0.1183
  minimax.html: 0.1305
  search.html: 0.2100
  tictactoe.html: 0.1183
