In [116]:
def saveFile(filename, PR, Lq, inlinks_dict):
   
    with open(filename, 'w') as file:
        print("{:<50.50} {:<25} {:<20} {:<15}\n".format("Page", "Page Rank", "No. of Outlinks", "No. of Inlinks"))
        file.write("{:<50.50} {:<25} {:<20} {:<15}\n".format("Page", "Page Rank", "No. of Outlinks", "No. of Inlinks"))
        
        for doc in sorted(PR, key=PR.get, reverse=True)[:500]:
            outlinks_count = Lq.get(doc, 0)  
            inlinks_count = len(inlinks_dict.get(doc, []))  
            
            print("{:<50.50} {:<25} {:<6} {:<6}".format(doc, PR[doc], outlinks_count, inlinks_count))
            file.write("{:<50.50} {:<25} {:<6} {:<6}\n".format(doc, PR[doc], outlinks_count, inlinks_count))
            


### Page Rank Algorithm

In [101]:
def calculatePerplexity(PR):

    entropy = 0
    for p in PR:
        p = PR[p]
        if p > 0:
            entropy -= p * math.log(p, 2)
    try:
        perplexity = math.exp(entropy)
    except OverflowError:
        perplexity = float('inf')
    return perplexity

def page_rank(inlinks_dict, P, S, d, PR, Lq, convergenceItr, filepath):
  
    i = 0
    j = 0
    for key,values in inlinks_dict.items():
        P.append(key)
        Lq[key] = 0


    new_entries = []
    for values in inlinks_dict.values():  # Finding the number of out-links for every page
        for value in values:
            if value in Lq:
                Lq[value] += 1
            else:
                new_entries.append(value)

    for entry in new_entries:
        Lq[entry] = 1
        inlinks_dict[entry] = []

    for page in Lq.keys():  
        if Lq[page] == 0:
            S.append(page)

    for p in inlinks_dict.keys():
        PR[p] = 1/len(P)
    oldPerplexity = calculatePerplexity(PR)

    while i < convergenceItr:
        sinkPR = 0
        newPR = {}
        N = len(P)
        for p in S:
            sinkPR += PR[p]
        for p in inlinks_dict.keys():
            newPR[p] = (1 - d) / N  
            newPR[p] += d * sinkPR / N  
            for q in inlinks_dict[p]:  
                newPR[p] += d * PR[q] / Lq[q] 
        for p in inlinks_dict.keys():
            PR[p] = newPR[p]
        j += 1
        newPerplexity = calculatePerplexity(PR)
        
        if abs(oldPerplexity - newPerplexity) < 1:
            i += 1
        oldPerplexity = newPerplexity  

    saveFile(filepath, PR, Lq, inlinks_dict)



## wt2g Page Rank

In [102]:
import math

P = []
S = []
d = 0.85
PR = {}
Lq = {}
convergenceItr = 4

file = open('wt2g_inlinks.txt', 'r')
inlinks_dict = {}
for line in file:
    line = line.replace('\n', '')
    data = line.split()
    key = data[0]
    values = data[1:len(data)]
    values = list(set(values))
    inlinks_dict[key] = values
page_rank(inlinks_dict, P, S, d, PR, Lq, convergenceItr, 'wt2g_pagerank.txt')

Page            Page Rank                 No. of Outlinks      No. of Inlinks 

WT21-B37-76     0.0026944708785777444     5                    2568           
WT21-B37-75     0.0015331771293438034     1                    1704           
WT25-B39-116    0.0014685087868547102     1                    169            
WT23-B21-53     0.0013735335821988344     1                    198            
WT24-B26-10     0.001276215008801935      1                    291            
WT24-B40-171    0.0012452591223336598     209                  270            
WT23-B39-340    0.0012428612869828874     395                  274            
WT23-B37-134    0.0012054273922617123     2                    207            
WT08-B18-400    0.0011447764367003175     0                    990            
WT13-B06-284    0.001136550377992955      2                    454            
WT13-B06-273    0.0010549175801714342     11                   452            
WT01-B18-225    0.0009553812196934016     0        

In [129]:
# These are out of top 50 pageranks
# 'WT13-B39-321' has 52 inlinks 
# 'WT06-B14-69' has 57 inlinks
#  'WT23-B38-87' has one inlink
for ids in inlinks_dict['WT13-B39-321']:
    print(ids)


WT13-B39-354
WT13-B39-343
WT13-B39-316
WT13-B39-351
WT13-B39-347
WT13-B39-314
WT13-B39-357
WT13-B39-333
WT13-B39-327
WT13-B39-339
WT13-B39-323
WT13-B39-325
WT13-B39-302
WT13-B39-308
WT13-B39-329
WT13-B39-322
WT13-B39-350
WT13-B39-313
WT13-B39-318
WT13-B39-359
WT13-B39-328
WT13-B39-337
WT13-B39-330
WT13-B39-324
WT13-B39-356
WT13-B39-336
WT13-B39-320
WT13-B39-326
WT13-B39-307
WT13-B40-163
WT13-B39-311
WT13-B39-300
WT13-B39-340
WT13-B39-353
WT13-B39-335
WT13-B39-298
WT13-B39-341
WT13-B39-358
WT13-B39-317
WT13-B39-346
WT13-B39-345
WT13-B39-301
WT13-B39-304
WT13-B40-364
WT13-B39-504
WT13-B39-305
WT13-B39-297
WT13-B39-332
WT13-B39-344
WT13-B39-348
WT13-B39-310
WT13-B39-342


In [130]:
for ids in inlinks_dict['WT06-B14-69']:
    print(ids)
# one of the inlinks is ranked 7 WT23-B39-340    0.0012428612869828874     

WT06-B14-3
WT06-B14-19
WT06-B16-84
WT06-B15-91
WT06-B13-194
WT06-B13-219
WT06-B15-70
WT06-B15-66
WT06-B13-221
WT06-B14-42
WT06-B14-11
WT06-B13-193
WT06-B15-160
WT06-B14-35
WT06-B13-220
WT06-B15-163
WT06-B15-158
WT06-B16-87
WT06-B15-76
WT06-B14-38
WT06-B13-222
WT06-B14-36
WT06-B14-14
WT06-B15-80
WT06-B14-69
WT06-B15-162
WT06-B13-244
WT06-B13-223
WT06-B15-164
WT06-B14-34
WT06-B13-245
WT06-B15-165
WT06-B13-192
WT06-B14-12
WT06-B13-196
WT06-B15-79
WT06-B14-2
WT06-B16-4
WT06-B15-99
WT06-B15-30
WT06-B15-159
WT06-B13-224
WT06-B16-139
WT06-B13-189
WT06-B14-80
WT06-B14-39
WT06-B15-75
WT06-B13-234
WT06-B14-20
WT06-B13-195
WT06-B16-141
WT06-B15-65
WT06-B16-140
WT06-B13-249
WT06-B14-9
WT06-B16-3
WT06-B13-235


In [131]:
for ids in inlinks_dict['WT23-B38-87']:
    print(ids)
 # WT23-B37-134 has the page rank 8 

WT23-B37-134


Out of top 50 the below pages have fewer inlinks but higher pagerank
'WT13-B39-321' has 52 inlinks 
'WT06-B14-69' has 57 inlinks
'WT23-B38-87' has one inlinks
We can look at the example of 'WT23-B38-87' having one inlink WT23-B37-134 having a pagerank of 8. Eventhough inlinks are low since the pagerank of the inlinks are high it gives a higher pagerank.


In [24]:
from elasticsearch7 import Elasticsearch
from elasticsearch7.client import IndicesClient

INDEX_NAME = 'crawler'
CLOUD_ID = "0feeb24636464a578a9c7a1ce9739181:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvOjQ0MyQyMzcyNjZmYzcwMzg0ZTA2OTM1MTJkZGIxMDgzYTRmMyQ1N2RhZjIzZTNiMWM0MjAwYjBhMDQ0MGY1ZTEyZTc2Yw=="
es = Elasticsearch(request_timeout = 10000, cloud_id= CLOUD_ID,
                   http_auth=("elastic", "pETnMazDlmfyCT2rZ2NAWh2V"))
es.ping()

True

In [None]:
from elasticsearch7.helpers import scan

scroll_size = 1000
 
# Initialize scroll
scroll = scan(es, index=INDEX_NAME, query={"query": {"match_all": {}}}, scroll='5m', size=scroll_size)
 
# Iterate through scroll
for result in tqdm(scroll):
    # Extract document ID (assuming URL is the ID)
    doc_id = result['_id']
    
    # Extract inlinks and outlinks from the current result
    inlinks = result['_source'].get('inlinks', [])
    outlinks = result['_source'].get('outlinks', [])
    
    # Store inlinks and outlinks in separate dictionaries
    inlinks_dict[doc_id] = inlinks
    outlinks_dict[doc_id] = outlinkss

In [106]:
with open('inlinks.json', 'r') as file:
    crawled_inlinks_dict = json.load(file)

In [107]:
with open('outlinks.json', 'r') as file:
    all_outlinks_dict = json.load(file)

In [108]:
with open('inlinks.json', 'r') as file:
    all_inlinks_dict = json.load(file)

In [109]:
len(crawled_inlinks_dict)

181761

In [112]:
for key, values in crawled_inlinks_dict.items():
    crawled_inlinks_dict[key] = list(set(values))

## Crawled Page Rank

In [117]:
P = []
S = []
d = 0.85
PR = {}
Lq = {}
convergenceItr = 4

page_rank(crawled_inlinks_dict, P, S, d, PR, Lq, convergenceItr, 'crawled_pagerank.txt')

Page                                               Page Rank                 No. of Outlinks No. of Inlinks

https://wikimediafoundation.org/                   0.003054477073860598      54     7096  
https://www.mediawiki.org/wiki/MediaWiki           0.0021888858035014835     80     4485  
https://developer.wikimedia.org/                   0.002080621304738688      2      4233  
https://support.apple.com/?cid=gn-ols-home-hp-tab  0.001998266388381666      21     838   
https://clinicaltrials.gov/policy/reporting-requir 0.001996959077386701      1      190   
https://oxfordmosaic.web.ox.ac.uk/                 0.001415092417026366      29     5805  
https://apps.apple.com/us/app/apple-store/id375380 0.001173776232657226      17     114   
https://github.com/                                0.0011597987047498508     21     1480  
https://proquest.libguides.com/termsofuse          0.0010454282279149833     1      260   
https://www.nih.gov/                               0.0010352003666634993

## HITS - crawl

In [49]:
# from elasticsearch7 import Elasticsearch

# INDEX_NAME = 'crawler'
# CLOUD_ID = "0feeb24636464a578a9c7a1ce9739181:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvOjQ0MyQyMzcyNjZmYzcwMzg0ZTA2OTM1MTJkZGIxMDgzYTRmMyQ1N2RhZjIzZTNiMWM0MjAwYjBhMDQ0MGY1ZTEyZTc2Yw=="
# es = Elasticsearch(request_timeout=10000, cloud_id=CLOUD_ID, http_auth=("elastic", "pETnMazDlmfyCT2rZ2NAWh2V"))

def create_root_set(topic, size=1000):
    query = {
        "query": {
            "multi_match": {
                "query": topic,
                "fields": ["title", "content"]
            }
        },
        "size": size
    }

    response = es.search(index=INDEX_NAME, body=query)

    root_set = []
    for hit in response['hits']['hits']:
        doc_id = hit['_id']
        score = hit['_score']
        root_set.append((doc_id, score))

    return root_set

# Usage
topic = "Covid 19"
root_set = create_root_set(topic)
print(f"Root set size: {len(root_set)}")
print("Root set documents:")
for doc_id, score in root_set:
    print(f"Document ID: {doc_id}, Score: {score}")

  response = es.search(index=INDEX_NAME, body=query)


Root set size: 1000
Root set documents:
Document ID: https://en.wikipedia.org/wiki/COVID-19, Score: 13.198513
Document ID: https://www.cdc.gov/coronavirus/2019-ncov/your-health/about-covid-19.html, Score: 13.198513
Document ID: https://www.npr.org/tags/804916759/covid-19, Score: 13.198513
Document ID: https://inews.co.uk/topic/covid-19?ico=in-line_link, Score: 13.198513
Document ID: https://meta.wikimedia.org/wiki/COVID-19, Score: 13.198513
Document ID: https://inews.co.uk/topic/covid-19, Score: 13.198513
Document ID: https://blog.politics.ox.ac.uk/tag/covid-19/, Score: 13.198513
Document ID: https://meta.wikimedia.org/wiki/COVID-19/fr, Score: 13.198513
Document ID: https://www.nhs.uk/conditions/covid-19/, Score: 13.198513
Document ID: https://en.wikipedia.org/wiki/Coronavirus_disease_2019, Score: 13.198513
Document ID: https://en.wikipedia.org/wiki/Category:Transmission_of_COVID-19, Score: 12.153702
Document ID: https://www.covid19treatmentguidelines.nih.gov/, Score: 12.153702
Documen

In [55]:
root_set_list = []
for url, score in root_set:
    root_set_list.append(url)

In [67]:
root_set_list = set(root_set_list)

In [98]:
import random

def expand_root_set(root_set, d=200, num_iterations=2):
    base_set = set(root_set) 

    for _ in range(num_iterations):
        new_pages = set()
        for doc_id in base_set:
            if doc_id not in all_outlinks_dict:
                continue
            doc_outlinks = all_outlinks_dict[doc_id]
            new_pages.update(doc_outlinks)
            doc_inlinks = set(all_inlinks_dict[doc_id])
            if len(doc_inlinks) <= 200:
                new_pages.update(doc_inlinks)
            else:
                random_elements = random.sample(doc_inlinks, random.randint(1, 199))
                new_pages.update(random_elements)
            if len(base_set) + len(new_pages) > 11000 :
                base_set.update(new_pages)
                return base_set
        base_set.update(new_pages)        
    return base_set

def compute_hits(base_set, num_iterations=10):
    authority_scores = {doc_id: 1.0 for doc_id in base_set}
    hub_scores = {doc_id: 1.0 for doc_id in base_set}

    for _ in range(num_iterations):
        # Update authority scores
        new_authority_scores = {doc_id: 0.0 for doc_id in authority_scores}
        for doc_id in base_set:
            if doc_id not in all_inlinks_dict:
                continue
            incoming_links = list(set(all_inlinks_dict[doc_id]))
            for incoming_doc_id in incoming_links:
                if incoming_doc_id in hub_scores:
                    new_authority_scores[doc_id] += hub_scores[incoming_doc_id]

        # Update hub scores
        new_hub_scores = {doc_id: 0.0 for doc_id in hub_scores}
        for doc_id in base_set:
            if doc_id not in all_outlinks_dict:
                continue
            outgoing_links = list(set(all_outlinks_dict[doc_id]))
            for outgoing_doc_id in outgoing_links:
                if outgoing_doc_id in authority_scores:
                    new_hub_scores[doc_id] += authority_scores[outgoing_doc_id]

        # Normalize scores
        auth_norm = sum(score ** 2 for score in new_authority_scores.values()) ** 0.5
        hub_norm = sum(score ** 2 for score in new_hub_scores.values()) ** 0.5
        authority_scores = {doc_id: score / auth_norm for doc_id, score in new_authority_scores.items()}
        hub_scores = {doc_id: score / hub_norm for doc_id, score in new_hub_scores.items()}

    return authority_scores, hub_scores



In [119]:
# Usage
#root_set = create_root_set(topic)
base_set = expand_root_set(root_set_list)
print(f"Base set size: {len(base_set)}")

Base set size: 11925


since Python 3.9 and will be removed in a subsequent version.
  random_elements = random.sample(doc_inlinks, random.randint(1, 199))


In [120]:
authority_scores, hub_scores = compute_hits(base_set)

In [121]:
with open('top_authorities.txt', 'w') as file:
    top_authorities = sorted(authority_scores.items(), key=lambda x: x[1], reverse=True)[:500]
    for doc_id, score in top_authorities:
        file.write(f"{doc_id}\t{score}\n")
        print(f"{doc_id}\t{score}")



https://www.bmj.com/company/your-privacy/	0.04953772256565161
https://journals.bmj.com/	0.04950214891282642
https://www.bmj.com/company/legal-information/	0.04949488001409312
https://authors.bmj.com/	0.04949488001409312
https://www.bmj.com/company/legal-information/accessibility/	0.04949329699453884
https://www.bmj.com/company/openaccess/	0.04949329699453884
https://www.bmj.com/company/	0.04948846469647341
https://www.bmj.com/company/americas/librarian-hub/edi-policy/	0.04948783247929487
https://www.bmj.com/company/americas/meet-the-team/	0.04948783247929487
https://www.bmj.com/company/benefits/	0.04948783247929487
https://www.bmj.com/company/americas/rights-licensing-and-permissions/	0.04948783247929487
https://www.bmj.com/company/americas/education/	0.04948783247929487
https://www.bmj.com/company/americas/contact-us/	0.04948783247929487
https://www.bmj.com/company/americas/who-we-are-americas/	0.04948783247929487
https://www.bmj.com/company/americas/ebm-resources/	0.04948783247929487

In [122]:
with open('top_hubs.txt', 'w') as file:
    top_hubs = sorted(hub_scores.items(), key=lambda x: x[1], reverse=True)[:500]
    for doc_id, score in top_hubs:
        file.write(f"{doc_id}\t{score}\n")
        print(f"{doc_id}\t{score}")

https://www.bmj.com/company/bmj-tag/	0.051559668340021216
https://www.bmj.com/company/anti-racism-at-bmj-2/	0.05155828909157505
https://www.bmj.com/company/apha2021/	0.05155818279332216
https://www.bmj.com/company/work-at-bmj-today/	0.05155818130879347
https://www.bmj.com/company/global-health-ii/climate-change-and-infectious-diseases/	0.05155818126301736
https://www.bmj.com/company/newsroom/the-world-can-learn-from-chinas-response-to-the-pandemic-say-experts/	0.051558181199064725
https://www.bmj.com/company/newsroom/male-scientists-frame-their-research-findings-more-positively-than-women/	0.051557850714690455
https://www.bmj.com/company/newsroom/the-nhs-paid-private-hospitals-2bn-in-the-pandemic-but-some-treated-more-private-patients-than-nhs-ones/	0.051557850714690455
https://www.bmj.com/company/newsroom/significant-racial-disparities-in-care-of-heart-patients-during-first-pandemic-wave/	0.051557850714690455
https://www.bmj.com/company/global-health-ii/making-the-most-of-your-time/	0