In [1]:
import numpy as np
import os
import pickle

In [2]:
# Load the list of page URLs; a page's position in this list is the index
# used to pair it with its score further down.
# NOTE(review): pickle.load can execute arbitrary code — only use trusted files.
with open('referencelist.pkl', 'rb') as ref_file:
    ref_list = pickle.load(ref_file)

# PageRank algorithm

In [3]:
def calculate_PageRank(outlinks, lamb=0.15, iterations=100, precision=4):
    """Compute PageRank scores for the graph described by an adjacency matrix.

    Every page starts with rank ``1/size``. On each sweep the rank of page
    ``j`` becomes ``(1 - lamb) * sum(rank[i] / out_degree[i]) + lamb / size``,
    where the sum runs over the pages ``i`` with ``outlinks[i][j] == 1``.
    Ranks are updated in place, so pages later in a sweep already see the
    values computed earlier in the same sweep.

    Scores are kept at full floating-point precision while iterating and are
    rounded only once, on return. Rounding inside the loop would feed the
    quantisation error back into every subsequent sweep.

    Args:
        outlinks: square 2-D array-like; ``outlinks[i][j] == 1`` means page
            ``i`` links to page ``j``. Assumed to be square (one row and one
            column per page).
        lamb: teleportation probability (``1 - lamb`` is the weight given to
            the link structure). Defaults to 0.15.
        iterations: number of full sweeps over all pages. Defaults to 100.
        precision: number of decimal places in the returned scores.
            Defaults to 4.

    Returns:
        A list of ``size`` floats, one PageRank score per page, in row order.
        An empty matrix yields an empty list.

    Note:
        Pages without any out-link pass no rank on, so the scores of a graph
        containing such pages may sum to less than 1.
    """
    adjacency = np.asarray(outlinks)
    size = adjacency.shape[0]
    if size == 0:
        return []

    # Out-degree of each page = sum of its row.
    out_degrees = adjacency.sum(axis=1).astype(float)

    # For every page j, the indices of the pages that link to it. Computed
    # once because the link structure does not change between sweeps.
    is_link = adjacency == 1
    in_neighbours = [np.flatnonzero(is_link[:, j]) for j in range(size)]

    page_ranks = np.full(size, 1.0 / size)
    teleport = lamb / size

    for _ in range(iterations):
        for j in range(size):
            sources = in_neighbours[j]
            # A source always has out-degree >= 1 for a 0/1 matrix, so the
            # division is safe; an empty selection sums to 0.0.
            incoming = np.sum(page_ranks[sources] / out_degrees[sources])
            page_ranks[j] = (1 - lamb) * incoming + teleport

    return [round(float(rank), precision) for rank in page_ranks]

In [4]:
# Load the adjacency matrix of the crawled pages (outlinks[i][j] == 1 is
# treated as "page i links to page j" by the ranking functions).
# NOTE(review): pickle.load can execute arbitrary code — only use trusted files.
with open('adjacencymatrix.pkl', 'rb') as matrix_file:
    outlinks = pickle.load(matrix_file)

In [5]:
# Score every page in the adjacency matrix; page_ranks[i] is the score of row i.
page_ranks = calculate_PageRank(outlinks)

In [6]:
# Pair each URL with its PageRank score and order the pairs best-first;
# pages with equal scores are listed in ascending URL order.
page_rank = [(ref_list[idx], score) for idx, score in enumerate(page_ranks)]
page_rank.sort(key=lambda pair: (-pair[1], pair[0]))

In [18]:
# Show the ten best-ranked pages; slicing (rather than indexing 0..9) avoids
# an IndexError when fewer than ten pages are available.
for entry in page_rank[:10]:
    print(entry)

('https://en.wikipedia.org/wiki/Cricket', 0.0431)
('https://en.wikipedia.org/wiki/International_Cricket_Council', 0.023)
('https://en.wikipedia.org/wiki/Test_cricket', 0.0214)
('https://en.wikipedia.org/wiki/Wikipedia:Protection_policy', 0.0207)
('https://en.wikipedia.org/wiki/Twenty20', 0.0204)
('https://en.wikipedia.org/wiki/Cricket_ball', 0.0181)
('https://en.wikipedia.org/wiki/Women%27s_Twenty20_International', 0.0172)
('https://en.wikipedia.org/wiki/One_Day_International', 0.0166)
('https://en.wikipedia.org/wiki/Wicket', 0.0162)
('https://en.wikipedia.org/wiki/Women%27s_Test_cricket', 0.0161)


# HITS algorithm
Query used to select the base set with the TF-IDF model: "cricket world cup"

In [8]:
def authority_hub_score(outlinks, hubs_auth_index, iterations=100):
    """Compute HITS authority and hub scores for a subset of pages.

    Only links between pages listed in ``hubs_auth_index`` are considered.
    Both score vectors start uniform at ``1/size``. Each iteration then

    1. sets every page's authority score to the sum of the hub scores of the
       base-set pages linking to it, and normalises the authorities to sum 1;
    2. sets every page's hub score to the sum of the (already updated)
       authority scores of the base-set pages it links to, and normalises
       the hubs to sum 1.

    If the base set contains no internal links, all scores are zero and the
    normalisation is skipped instead of dividing by zero.

    Args:
        outlinks: 2-D array-like adjacency matrix; ``outlinks[i][j] == 1``
            means page ``i`` links to page ``j``.
        hubs_auth_index: sequence of row/column indices into ``outlinks``
            identifying the pages to score.
        iterations: number of update rounds. Defaults to 100.

    Returns:
        A tuple ``(authority_scores, hub_scores)`` of two lists of floats,
        each aligned position-by-position with ``hubs_auth_index``.
    """
    size = len(hubs_auth_index)
    if size == 0:
        return [], []

    base = list(hubs_auth_index)
    adjacency = np.asarray(outlinks)
    # links[a, b] is 1.0 when the a-th base page links to the b-th base page.
    links = (adjacency[np.ix_(base, base)] == 1).astype(float)

    hub_scores = np.full(size, 1.0 / size)
    authority_scores = np.full(size, 1.0 / size)

    for _ in range(iterations):
        # Authority of b = sum of hub scores over pages a with a -> b.
        authority_scores = links.T @ hub_scores
        auth_sum = authority_scores.sum()
        if auth_sum != 0:
            authority_scores = authority_scores / auth_sum

        # Hub of a = sum of authority scores over pages b with a -> b.
        hub_scores = links @ authority_scores
        hub_sum = hub_scores.sum()
        if hub_sum != 0:
            hub_scores = hub_scores / hub_sum

    return authority_scores.tolist(), hub_scores.tolist()

In [9]:
# Load the adjacency matrix for the HITS computation.
# NOTE(review): pickle.load can execute arbitrary code — only use trusted files.
with open('adjacencymatrix.pkl', 'rb') as matrix_file:
    outlinks = pickle.load(matrix_file)

In [10]:
# Indices of the base-set documents obtained by applying the TF-IDF model to
# the query "cricket world cup".
# NOTE(review): pickle.load can execute arbitrary code — only use trusted files.
with open('query_hub_auth.pkl', 'rb') as base_set_file:
    hubs_auth_index = pickle.load(base_set_file)

In [11]:
# Run HITS on the base set; both result lists are aligned with hubs_auth_index.
authority_scores, hub_scores = authority_hub_score(outlinks, hubs_auth_index)

In [12]:
# Attach each base-set page's URL to its hub score and rank the pairs
# best-first; equal scores are listed in ascending URL order.
hub_score = [(ref_list[doc_id], hub_scores[pos]) for pos, doc_id in enumerate(hubs_auth_index)]
hub_score.sort(key=lambda pair: (-pair[1], pair[0]))

Hub scores
(page URL, score) pairs in descending order of score

In [22]:
# Show the ten best hubs; slicing avoids an IndexError when the base set
# holds fewer than ten pages.
for entry in hub_score[:10]:
    print(entry)

('https://en.wikipedia.org/wiki/Cricket', 0.035212398485153715)
('https://en.wikipedia.org/wiki/Test_cricket', 0.030226559846599877)
('https://en.wikipedia.org/wiki/History_of_cricket_to_1725', 0.029723298477211967)
('https://en.wikipedia.org/wiki/Twenty20', 0.029466032335733947)
('https://en.wikipedia.org/wiki/One_Day_International', 0.0294104298653599)
('https://en.wikipedia.org/wiki/Limited_overs_cricket', 0.02939412970732032)
('https://en.wikipedia.org/wiki/100-ball_cricket', 0.02915985323989019)
('https://en.wikipedia.org/wiki/History_of_cricket_(1772%E2%80%931815)', 0.029156390915217042)
('https://en.wikipedia.org/wiki/First-class_cricket', 0.02912724512726092)
('https://en.wikipedia.org/wiki/History_of_cricket', 0.029085272148994295)


In [23]:
# Attach each base-set page's URL to its authority score and rank the pairs
# best-first; equal scores are listed in ascending URL order.
auth_score = [(ref_list[doc_id], authority_scores[pos]) for pos, doc_id in enumerate(hubs_auth_index)]
auth_score.sort(key=lambda pair: (-pair[1], pair[0]))

Authority scores
(page URL, score) pairs in descending order of score

In [24]:
# Show the ten best authorities; slicing avoids an IndexError when the base
# set holds fewer than ten pages.
for entry in auth_score[:10]:
    print(entry)

('https://en.wikipedia.org/wiki/Cricket', 0.02425417114947134)
('https://en.wikipedia.org/wiki/Test_cricket', 0.021263807644114584)
('https://en.wikipedia.org/wiki/One_Day_International', 0.02053739337805246)
('https://en.wikipedia.org/wiki/Twenty20_International', 0.02004489060534341)
('https://en.wikipedia.org/wiki/Women%27s_One_Day_International', 0.019512855960840184)
('https://en.wikipedia.org/wiki/Women%27s_Test_cricket', 0.019341402349387384)
('https://en.wikipedia.org/wiki/Women%27s_Twenty20_International', 0.01927341581768351)
('https://en.wikipedia.org/wiki/Limited_overs_cricket', 0.018932508009949268)
('https://en.wikipedia.org/wiki/Twenty20', 0.01883448986406042)
('https://en.wikipedia.org/wiki/First-class_cricket', 0.018796029945375617)


PageRank scores
(page URL, score) pairs in descending order of score

In [21]:
# Show the ten best-ranked pages again for comparison with the HITS results;
# slicing avoids an IndexError when fewer than ten pages are available.
for entry in page_rank[:10]:
    print(entry)

('https://en.wikipedia.org/wiki/Cricket', 0.0431)
('https://en.wikipedia.org/wiki/International_Cricket_Council', 0.023)
('https://en.wikipedia.org/wiki/Test_cricket', 0.0214)
('https://en.wikipedia.org/wiki/Wikipedia:Protection_policy', 0.0207)
('https://en.wikipedia.org/wiki/Twenty20', 0.0204)
('https://en.wikipedia.org/wiki/Cricket_ball', 0.0181)
('https://en.wikipedia.org/wiki/Women%27s_Twenty20_International', 0.0172)
('https://en.wikipedia.org/wiki/One_Day_International', 0.0166)
('https://en.wikipedia.org/wiki/Wicket', 0.0162)
('https://en.wikipedia.org/wiki/Women%27s_Test_cricket', 0.0161)
