# PageRank

## Download data, import dependencies

In [1]:
# Download resources https://drive.google.com/drive/folders/1UTIXOfer55GTRTSR1Pwihn8aGYnTsXPF
!mkdir -p resources
%cd ./resources
!gdown 1TqgmOY7U4MGKJz01gYZrirpkZzHcEN34 # clean data
%cd ..

/content/resources
Downloading...
From: https://drive.google.com/uc?id=1TqgmOY7U4MGKJz01gYZrirpkZzHcEN34
To: /content/resources/clean_data.json
100% 23.3M/23.3M [00:00<00:00, 77.1MB/s]
/content


In [2]:
import gc
import pandas as pd
from tqdm import tqdm
import numpy as np
import json
import networkx as nx
import requests
from time import sleep
tqdm.pandas()

In [3]:
df = pd.read_json('resources/clean_data.json')
df

Unnamed: 0,paperId,title,abstract,year,referenceCount,citationCount,authors,k_references
0,63d8426ba1f51a8525dd19fd8ec92934ec71aea5,A Survey of Data Augmentation Approaches for NLP,Data augmentation has recently seen increased ...,2021,196,117,"[{'authorId': '152913678', 'name': 'Steven Y. ...",[{'paperId': '00ea88920eca898909bd8dd455df25ec...
1,33ec7eb2168e37e3007d1059aa96b9a63254b4da,Beyond Accuracy: Behavioral Testing of NLP Mod...,Although measuring held-out accuracy has been ...,2020,33,386,"[{'authorId': '78846919', 'name': 'Marco Tulio...",[{'paperId': '05dd7254b632376973f3a1b4d39485da...
2,642038c7a49caa9f0ac5b37b01fab5b2b8d981d5,ERASER: A Benchmark to Evaluate Rationalized N...,State-of-the-art models in NLP are now predomi...,2019,75,217,"[{'authorId': '48727916', 'name': 'Jay DeYoung...",[{'paperId': '0754982927fa07a6689fb0f2cbeb8e3d...
3,58ed1fbaabe027345f7bb3a6312d41c5aac63e22,Retrieval-Augmented Generation for Knowledge-I...,Large pre-trained language models have been sh...,2020,71,339,"[{'authorId': '145222654', 'name': 'Patrick Le...",[{'paperId': '016368185723d0ec99aafa4b59273005...
4,d47a682723f710395454687319bb55635e653105,Language (Technology) is Power: A Critical Sur...,We survey 146 papers analyzing “bias” in NLP s...,2020,238,324,"[{'authorId': '3422038', 'name': 'Su Lin Blodg...",[{'paperId': '00059087c954c1af6ece33115315e3e0...
...,...,...,...,...,...,...,...,...
7131,4b94eb44acd9f9989726e5fed1642b1a4af7f34f,Cyclic dependencies in modular performance ana...,The Modular Performance Analysis based on Real...,2008,12,69,"[{'authorId': '144988710', 'name': 'B. Jonsson...",[{'paperId': '02e4670310b2f22f7777e5416c258214...
7132,b1517ea8b702523fc26d87f40ddf506d28785745,Novel line conditioner with voltage up/down ca...,"In this paper, a novel pulsewidth-modulated li...",2002,9,66,"[{'authorId': '40660668', 'name': 'B. Kwon'}, ...",[{'paperId': '17336409df36d2e74018c99e29642ccc...
7133,0e7a792ef33af26c26970ffc275d0ae82ee8f5d1,A Deep Regression Architecture with Two-Stage ...,Regression based facial landmark detection met...,2017,41,180,"[{'authorId': '3489669', 'name': 'Jiang-Jing L...",[{'paperId': '04eed24e26d9e6aaf2ca434cad20facd...
7134,77b44f1985995edfb434f83c6879872c36f12507,Attention is Not All You Need: Pure Attention ...,Attention-based architectures have become ubiq...,2021,58,74,"[{'authorId': '145595795', 'name': 'Yihe Dong'...",[{'paperId': '044e13d7dd4e0655eb76f0bd00b2c1bd...


## Create matrix

In [4]:
def get_edges():
    edges = []
    for index, paper in df.iterrows():
        paperId = paper['paperId']
        references = [dic['paperId'] for dic in paper['k_references']]
        for reference in references:
            edges.append((paperId, reference))
    return edges

In [5]:
G = nx.DiGraph()
G.add_edges_from(get_edges())
print(f"Number of nodes = {G.number_of_nodes()}")
print(f"Number of edges = {G.size()}")
print(f"Maximum in_degree = {max([in_degree for node, in_degree in G.in_degree(G.nodes)])}")

Number of nodes = 39738
Number of edges = 62510
Maximum in_degree = 517


## PageRank

In [6]:
pr = nx.pagerank(G, alpha=0.9)

In [7]:
k = 20
prob = sorted(pr.items(), key = lambda x : x[1], reverse = True)
dic = {"Node":[],
       "Score":[],
       "In_degree":[]}
for i in range(len(prob)):
    dic["Node"].append(prob[i][0])
    dic["Score"].append(prob[i][1])
    dic["In_degree"].append(G.in_degree(prob[i][0]))
pd.DataFrame(dic).head(k)

Unnamed: 0,Node,Score,In_degree
0,204e3073870fae3d05bcbc2f6a8e263d9b72e776,0.001153,517
1,0b44fcbeea9415d400c5f5789d6b892b6f98daff,0.000726,280
2,05dd7254b632376973f3a1b4d39485da17814df5,0.000538,189
3,077f8329a7b6fa3b7c877a57b81eb6c18b5f87de,0.000518,227
4,0b544dfe355a5070b60986319a3f51fb45d1348e,0.000508,208
5,44d2abe2175df8153f465f6c39b68b76a0d40ab9,0.000502,203
6,1af68821518f03568f913ab03fc02080247a27ff,0.000434,170
7,330da625c15427c6e42ccfa3b747fb29e5835bf0,0.00041,176
8,2c03df8b48bf3fa39054345bafabfeff15bfd11d,0.000349,149
9,084c55d6432265785e3ff86a2e900a49d501c00a,0.000333,132


## Show PageRank result

In [8]:
def request_papers_by_id(IDs, fields):
    papers = []
    for id in tqdm(IDs):
        response = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{id}?fields={fields}')
        js = response.json()
        papers.append(js)
        # sleep(3.1)
    return papers

In [9]:
fields = 'title,url,year,fieldsOfStudy,citationCount,referenceCount'
base_papers = [prob[i][0] for i in range (k)]
papers = request_papers_by_id(base_papers, fields)

100%|██████████| 20/20 [00:04<00:00,  4.51it/s]


In [10]:
pagerank_results = pd.DataFrame(papers)
pagerank_results

Unnamed: 0,paperId,url,title,year,referenceCount,citationCount,fieldsOfStudy
0,204e3073870fae3d05bcbc2f6a8e263d9b72e776,https://www.semanticscholar.org/paper/204e3073...,Attention is All you Need,2017,44,36605,[Computer Science]
1,0b44fcbeea9415d400c5f5789d6b892b6f98daff,https://www.semanticscholar.org/paper/0b44fcbe...,Building a Large Annotated Corpus of English: ...,1993,78,8207,[Computer Science]
2,05dd7254b632376973f3a1b4d39485da17814df5,https://www.semanticscholar.org/paper/05dd7254...,"SQuAD: 100,000+ Questions for Machine Comprehe...",2016,31,4343,[Computer Science]
3,077f8329a7b6fa3b7c877a57b81eb6c18b5f87de,https://www.semanticscholar.org/paper/077f8329...,RoBERTa: A Robustly Optimized BERT Pretraining...,2019,58,7615,[Computer Science]
4,0b544dfe355a5070b60986319a3f51fb45d1348e,https://www.semanticscholar.org/paper/0b544dfe...,Learning Phrase Representations using RNN Enco...,2014,39,15281,"[Computer Science, Mathematics]"
5,44d2abe2175df8153f465f6c39b68b76a0d40ab9,https://www.semanticscholar.org/paper/44d2abe2...,Long Short-Term Memory,1997,68,52487,"[Computer Science, Medicine]"
6,1af68821518f03568f913ab03fc02080247a27ff,https://www.semanticscholar.org/paper/1af68821...,Neural Machine Translation of Rare Words with ...,2015,53,4856,[Computer Science]
7,330da625c15427c6e42ccfa3b747fb29e5835bf0,https://www.semanticscholar.org/paper/330da625...,Efficient Estimation of Word Representations i...,2013,43,22196,[Computer Science]
8,2c03df8b48bf3fa39054345bafabfeff15bfd11d,https://www.semanticscholar.org/paper/2c03df8b...,Deep Residual Learning for Image Recognition,2015,61,97909,[Computer Science]
9,084c55d6432265785e3ff86a2e900a49d501c00a,https://www.semanticscholar.org/paper/084c55d6...,Foundations of statistical natural language pr...,1999,294,7875,[Computer Science]


## HITS

In [11]:
hubs, authorities = nx.hits(G, max_iter = 50, normalized = True)
sorted_authorities = [(k, v) for k, v in authorities.items()]
sorted_authorities.sort(key = lambda x: x[1], reverse = True)
print('sorted_authorities:')
dic = {"Node":[],
       "Score":[],
       "In_degree":[]}
for i in range(len(sorted_authorities)):
    dic["Node"].append(sorted_authorities[i][0])
    dic["Score"].append(sorted_authorities[i][1])
    dic["In_degree"].append(G.in_degree(sorted_authorities[i][0]))
pd.DataFrame(dic).head(k)

sorted_authorities:


Unnamed: 0,Node,Score,In_degree
0,204e3073870fae3d05bcbc2f6a8e263d9b72e776,0.069937,517
1,077f8329a7b6fa3b7c877a57b81eb6c18b5f87de,0.028291,227
2,05dd7254b632376973f3a1b4d39485da17814df5,0.021492,189
3,1af68821518f03568f913ab03fc02080247a27ff,0.021385,170
4,2c03df8b48bf3fa39054345bafabfeff15bfd11d,0.013816,149
5,0b544dfe355a5070b60986319a3f51fb45d1348e,0.011824,208
6,3febb2bed8865945e7fddc99efd791887bb7e14f,0.01153,113
7,1e077413b25c4d34945cc2707e17e46ed4fe784a,0.01134,92
8,0e6824e137847be0599bb0032e37042ed2ef5045,0.010779,93
9,44d2abe2175df8153f465f6c39b68b76a0d40ab9,0.009105,203


In [12]:
sorted_hubs = [(k, v) for k, v in hubs.items()]
sorted_hubs.sort(key = lambda x: x[1], reverse = True)
print('sorted_hubs:')
sorted_hubs[:k]

sorted_hubs:


[('f4a8480cffa491020bdbb8c4c4e7a7e923b1c2c1', 0.0022141472045760393),
 ('17d5884215b5afa53545cd7cb6135de5478da4ec', 0.0021570089930638392),
 ('7a064df1aeada7e69e5173f7d4c8606f4470365b', 0.0021387560137760544),
 ('748629cb0b8e5a5708e1c6605f71b36eb525a3ce', 0.00213816216804431),
 ('2ffcf8352223c95ae8cef4daaec995525ecc926b', 0.002130942187652925),
 ('9f1c5777a193b2c3bb2b25e248a156348e5ba56d', 0.0020921194320356216),
 ('2bc1c8bd00bbf7401afcb5460277840fd8bab029', 0.0020878890317070236),
 ('477d66dcd2c08243dcc69822d6da7ec06393773a', 0.002073765465084217),
 ('b0b0dddb8310e01b9407a21674c2d33a23a6e967', 0.002058197605179818),
 ('80cf2a6af4200ecfca1c18fc89de16148f1cd4bf', 0.0020519261874808476),
 ('1359d2ef45f1550941e22bf046026c89f6edf315', 0.002037965636666151),
 ('f64e1d6bc13aae99aab5449fc9ae742a9ba7761e', 0.002023091644236163),
 ('a54b56af24bb4873ed0163b77df63b92bd018ddc', 0.002012886038460904),
 ('18318b10e7c2dd4ad292208f4399eb1d4dca5768', 0.0020047898803744763),
 ('4fa37d012ad0014552a6a5a03

As you can see above, the scores are close to each other because we pick the first ten references for each paper.

## Show authorities result

In [13]:
base_papers_hits = [sorted_authorities[i][0] for i in range (k)]
papers_hits = request_papers_by_id(base_papers_hits, fields)

100%|██████████| 20/20 [00:04<00:00,  4.51it/s]


In [14]:
hits_results = pd.DataFrame(papers_hits)
hits_results

Unnamed: 0,paperId,url,title,year,referenceCount,citationCount,fieldsOfStudy
0,204e3073870fae3d05bcbc2f6a8e263d9b72e776,https://www.semanticscholar.org/paper/204e3073...,Attention is All you Need,2017,44,36605,[Computer Science]
1,077f8329a7b6fa3b7c877a57b81eb6c18b5f87de,https://www.semanticscholar.org/paper/077f8329...,RoBERTa: A Robustly Optimized BERT Pretraining...,2019,58,7615,[Computer Science]
2,05dd7254b632376973f3a1b4d39485da17814df5,https://www.semanticscholar.org/paper/05dd7254...,"SQuAD: 100,000+ Questions for Machine Comprehe...",2016,31,4343,[Computer Science]
3,1af68821518f03568f913ab03fc02080247a27ff,https://www.semanticscholar.org/paper/1af68821...,Neural Machine Translation of Rare Words with ...,2015,53,4856,[Computer Science]
4,2c03df8b48bf3fa39054345bafabfeff15bfd11d,https://www.semanticscholar.org/paper/2c03df8b...,Deep Residual Learning for Image Recognition,2015,61,97909,[Computer Science]
5,0b544dfe355a5070b60986319a3f51fb45d1348e,https://www.semanticscholar.org/paper/0b544dfe...,Learning Phrase Representations using RNN Enco...,2014,39,15281,"[Computer Science, Mathematics]"
6,3febb2bed8865945e7fddc99efd791887bb7e14f,https://www.semanticscholar.org/paper/3febb2be...,Deep Contextualized Word Representations,2018,65,8123,[Computer Science]
7,1e077413b25c4d34945cc2707e17e46ed4fe784a,https://www.semanticscholar.org/paper/1e077413...,Universal Language Model Fine-tuning for Text ...,2018,57,2302,[Computer Science]
8,0e6824e137847be0599bb0032e37042ed2ef5045,https://www.semanticscholar.org/paper/0e6824e1...,Aligning Books and Movies: Towards Story-Like ...,2015,52,1454,[Computer Science]
9,44d2abe2175df8153f465f6c39b68b76a0d40ab9,https://www.semanticscholar.org/paper/44d2abe2...,Long Short-Term Memory,1997,68,52487,"[Computer Science, Medicine]"


# Show hubs result

In [17]:
base_papers_hits = [sorted_hubs[i][0] for i in range (k)]
papers_hits = request_papers_by_id(base_papers_hits, fields)

100%|██████████| 20/20 [00:04<00:00,  4.69it/s]


In [18]:
hits_results_hubs = pd.DataFrame(papers_hits)
hits_results_hubs

Unnamed: 0,paperId,url,title,year,referenceCount,citationCount,fieldsOfStudy
0,f4a8480cffa491020bdbb8c4c4e7a7e923b1c2c1,https://www.semanticscholar.org/paper/f4a8480c...,Reducing Transformer Depth on Demand with Stru...,2019,63,253,"[Computer Science, Mathematics]"
1,17d5884215b5afa53545cd7cb6135de5478da4ec,https://www.semanticscholar.org/paper/17d58842...,CERT: Contrastive Self-supervised Learning for...,2020,42,125,"[Computer Science, Mathematics]"
2,7a064df1aeada7e69e5173f7d4c8606f4470365b,https://www.semanticscholar.org/paper/7a064df1...,ALBERT: A Lite BERT for Self-supervised Learni...,2019,83,2821,[Computer Science]
3,748629cb0b8e5a5708e1c6605f71b36eb525a3ce,https://www.semanticscholar.org/paper/748629cb...,On Layer Normalization in the Transformer Arch...,2020,53,224,"[Computer Science, Mathematics]"
4,2ffcf8352223c95ae8cef4daaec995525ecc926b,https://www.semanticscholar.org/paper/2ffcf835...,Adversarial Training for Large Neural Language...,2020,71,75,[Computer Science]
5,9f1c5777a193b2c3bb2b25e248a156348e5ba56d,https://www.semanticscholar.org/paper/9f1c5777...,Cloze-driven Pretraining of Self-attention Net...,2019,43,154,[Computer Science]
6,2bc1c8bd00bbf7401afcb5460277840fd8bab029,https://www.semanticscholar.org/paper/2bc1c8bd...,Unicoder-VL: A Universal Encoder for Vision an...,2019,46,402,[Computer Science]
7,477d66dcd2c08243dcc69822d6da7ec06393773a,https://www.semanticscholar.org/paper/477d66dc...,Multilingual is not enough: BERT for Finnish,2019,52,157,[Computer Science]
8,b0b0dddb8310e01b9407a21674c2d33a23a6e967,https://www.semanticscholar.org/paper/b0b0dddb...,Byte Pair Encoding is Suboptimal for Language ...,2020,31,55,[Computer Science]
9,80cf2a6af4200ecfca1c18fc89de16148f1cd4bf,https://www.semanticscholar.org/paper/80cf2a6a...,Patient Knowledge Distillation for BERT Model ...,2019,41,381,[Computer Science]


## Compare Algorithms

In [19]:
similars = hits_results.copy(deep = True)
similars['HITS rank'] = range(1, len(hits_results)+1)
similars['PageRank rank']  = np.nan
for count, id in enumerate(similars['paperId']):
    if id in pagerank_results['paperId'].tolist():
        rank = pagerank_results.paperId[pagerank_results.paperId == id].index.tolist()[0] + 1
        similars['PageRank rank'].iloc[count] = rank

similars.dropna(inplace=True)
similars.reset_index(drop=True, inplace=True)
similars = similars.astype({'PageRank rank': int})
similars

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,paperId,url,title,year,referenceCount,citationCount,fieldsOfStudy,HITS rank,PageRank rank
0,204e3073870fae3d05bcbc2f6a8e263d9b72e776,https://www.semanticscholar.org/paper/204e3073...,Attention is All you Need,2017,44,36605,[Computer Science],1,1
1,077f8329a7b6fa3b7c877a57b81eb6c18b5f87de,https://www.semanticscholar.org/paper/077f8329...,RoBERTa: A Robustly Optimized BERT Pretraining...,2019,58,7615,[Computer Science],2,4
2,05dd7254b632376973f3a1b4d39485da17814df5,https://www.semanticscholar.org/paper/05dd7254...,"SQuAD: 100,000+ Questions for Machine Comprehe...",2016,31,4343,[Computer Science],3,3
3,1af68821518f03568f913ab03fc02080247a27ff,https://www.semanticscholar.org/paper/1af68821...,Neural Machine Translation of Rare Words with ...,2015,53,4856,[Computer Science],4,7
4,2c03df8b48bf3fa39054345bafabfeff15bfd11d,https://www.semanticscholar.org/paper/2c03df8b...,Deep Residual Learning for Image Recognition,2015,61,97909,[Computer Science],5,9
5,0b544dfe355a5070b60986319a3f51fb45d1348e,https://www.semanticscholar.org/paper/0b544dfe...,Learning Phrase Representations using RNN Enco...,2014,39,15281,"[Computer Science, Mathematics]",6,5
6,3febb2bed8865945e7fddc99efd791887bb7e14f,https://www.semanticscholar.org/paper/3febb2be...,Deep Contextualized Word Representations,2018,65,8123,[Computer Science],7,13
7,1e077413b25c4d34945cc2707e17e46ed4fe784a,https://www.semanticscholar.org/paper/1e077413...,Universal Language Model Fine-tuning for Text ...,2018,57,2302,[Computer Science],8,19
8,0e6824e137847be0599bb0032e37042ed2ef5045,https://www.semanticscholar.org/paper/0e6824e1...,Aligning Books and Movies: Towards Story-Like ...,2015,52,1454,[Computer Science],9,12
9,44d2abe2175df8153f465f6c39b68b76a0d40ab9,https://www.semanticscholar.org/paper/44d2abe2...,Long Short-Term Memory,1997,68,52487,"[Computer Science, Medicine]",10,6


In [20]:
pagerank_results.to_json('pagerank_result.json')
hits_results.to_json('hits_result.json')
hits_results_hubs.to_json('hits_results_hubs.json')