In [2]:
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import sys
import time
from typing import List
import os
import json

In [1]:
start_url = 'https://www.researchgate.net/'
curr_paper = 'Co-locating Online Workload and Offline Workload in the Cloud: An Interference Analysis'
is_first_run = False

In [138]:
# from requests_html import AsyncHTMLSession

dr = driver = webdriver.Chrome()

def get_refs_and_citations(paper_title, dr):
    dr.get(start_url)

    time.sleep(1)

    search_bar = dr.find_element(By.XPATH, "//input[@placeholder='Search publications']")

    search_bar.send_keys(paper_title)

    time.sleep(0.5)

    submit_button = dr.find_element(By.XPATH, "//div[@class='index-search-field']//button//*[name()='svg']")
    submit_button.click()
    time.sleep(1)

    # Select first one in list
    paper_tiles = dr.find_elements(By.XPATH, "//div[contains(@class, 'nova-legacy-v-publication-item__title')]")
    if len(paper_tiles) == 0:
        raise Exception(f"paper {paper_title} not found")

    paper_tile = paper_tiles[0]

    if paper_title.lower() != paper_tile.text.lower():
        raise Exception(f"Name mismatch in paper {paper_title}")

    paper_tile = paper_tile.find_elements(By.XPATH, "*")[0] # link
    paper_tile.click()

    time.sleep(1)

    dr.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    time.sleep(1)

    citations_page = dr.page_source

    time.sleep(1)

    dr.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    dr.execute_script("window.scrollTo(0, 0);")

    time.sleep(1)
    references_tab = dr.find_element(By.XPATH, "//div[contains(text(),'References')]")
    driver.execute_script("arguments[0].click();", references_tab)

    time.sleep(1)

    references_page = dr.page_source
    bs_references = BeautifulSoup(references_page,"html.parser")
    bs_citations = BeautifulSoup(citations_page,"html.parser")
    all_references = bs_references.find(id='references')
    all_citations = bs_citations.find(id = 'citations')
    # Lists of references and citations
    refs = [ t.text for t in all_references.findAll("div", {"class" : "nova-legacy-v-publication-item__title"})]
    citations = [ t.text for t in all_citations.findAll("div", {"class" : "nova-legacy-v-publication-item__title"})]
    return {"refs": refs, "citations": citations}

# For testing
r_and_c = get_refs_and_citations(curr_paper, dr)

dr.close()


## Checking the names

In [81]:


# assuming one is NOT on the home page
# checks if the name in researchgate is same as title, depending on the result the title is added to ok_and_fixed or not_found
def checkIfPaperHasSameTitle(title, dr, not_found : List, ok_and_fixed: List):
    search_bar = dr.find_element(By.XPATH, "//input[@placeholder='Search ResearchGate']")
    search_bar.clear()
    search_bar.send_keys(title)
    submit_button = dr.find_element(By.XPATH, "//button[@type='submit']")
    submit_button.click()

    paper_tiles = dr.find_elements(By.XPATH, "//div[contains(@class, 'nova-legacy-v-publication-item__title')]")
    if len(paper_tiles) != 0:
        paper_tile = paper_tiles[0]
        if paper_tile.text.lower() != title.lower():
            if paper_tile.text.lower().startswith(title.lower()[:10]) or paper_tile.text.lower().endswith(title.lower()[-5:]):
                print(f"Title OK with a slight mistmatch: {title}")
                ok_and_fixed.append(paper_tile.text)
            else:
                print(f"Title mismatch: in file: {title}, in ResearchGate: {paper_tile.text}")
                not_found.append(title)
        else:
            print(f"Title OK: {title}")
            ok_and_fixed.append(paper_tile.text)
    else:
        print(f"paper: {title} NOT FOUND")
        not_found.append(title)


    time.sleep(1)


In [3]:
# checking if all papers are present on ResearchGate (with names)
if is_first_run:
    with open("papers.txt", "r") as papers_file:
        lines = papers_file.readlines()

        not_found = []
        ok_and_fixed = []

        # first
        title = lines[0].strip()

        dr = driver = webdriver.Chrome()

        dr.get(start_url)

        time.sleep(0.5)

        search_bar = dr.find_element(By.XPATH, "//input[@placeholder='Search publications']")

        search_bar.send_keys(title)

        time.sleep(0.5)

        submit_button = dr.find_element(By.XPATH, "//div[@class='index-search-field']//button//*[name()='svg']")
        submit_button.click()
        time.sleep(0.5)

        # Select first one in list
        paper_tiles = dr.find_elements(By.XPATH, "//div[contains(@class, 'nova-legacy-v-publication-item__title')]")
        if len(paper_tiles) != 0:
            paper_tile = paper_tiles[0]
            if paper_tile.text.lower() != title.lower():
                if paper_tile.text.lower().startswith(title.lower()[:10]) or paper_tile.text.lower().endswith(title.lower()[-5:]):
                    print(f"Title OK with a slight mistmatch: {title}")
                    ok_and_fixed.append(paper_tile.text)
                else:
                    print(f"Title mismatch: in file: {title}, in ResearchGate: {paper_tile.text}")
                    not_found.append(title)
            else:
                print(f"Title OK: {title}")
                ok_and_fixed.append(paper_tile.text)
        else:
            print(f"paper: {title} NOT FOUND")
            not_found.append(title)


        # rest
        for line in lines[1:]:
            stripped = line.strip()
            checkIfPaperHasSameTitle(stripped, dr, not_found, ok_and_fixed)

        dr.close()

## Reload important vars


In [4]:
if not is_first_run:
    with open('saved_vars.json', 'r') as saved_vars:
        text = saved_vars.read()
        vars_dict = json.loads(text)
        edges = vars_dict['edges']
        ok_and_fixed = vars_dict['ok_and_fixed']
        papers_dict_list = vars_dict['papers_dict_list']


Using the one with corrected names.


## Visualizing

In [26]:
from pyvis.network import Network
# edges mean a -> references -> b
net = Network(directed=True)#, select_menu=True, filter_menu=True)

In [4]:
papers_dict_list = [ {"node_id" : p[0], "title" : p[1]} for p in  enumerate(ok_and_fixed) ]

def find_node_id(title):
    res_pairs = [ p for p in papers_dict_list if p['title'] == title ]
    if len(res_pairs) == 0:
        return -1
    else:
        return res_pairs[0]['node_id']


In [27]:
for paper_dict in papers_dict_list:
    nodeId = paper_dict['node_id']
    title = paper_dict['title']
    net.add_node(nodeId, title)

In [8]:
edges

[{'from': 32, 'to': 23, 'arrows': 'to'},
 {'from': 30, 'to': 32, 'arrows': 'to'},
 {'from': 0, 'to': 22, 'arrows': 'to'},
 {'from': 0, 'to': 15, 'arrows': 'to'},
 {'from': 0, 'to': 27, 'arrows': 'to'},
 {'from': 1, 'to': 0, 'arrows': 'to'},
 {'from': 1, 'to': 5, 'arrows': 'to'},
 {'from': 1, 'to': 9, 'arrows': 'to'},
 {'from': 1, 'to': 22, 'arrows': 'to'},
 {'from': 2, 'to': 1, 'arrows': 'to'},
 {'from': 2, 'to': 24, 'arrows': 'to'},
 {'from': 2, 'to': 5, 'arrows': 'to'},
 {'from': 2, 'to': 9, 'arrows': 'to'},
 {'from': 2, 'to': 28, 'arrows': 'to'},
 {'from': 2, 'to': 3, 'arrows': 'to'},
 {'from': 3, 'to': 29, 'arrows': 'to'},
 {'from': 3, 'to': 4, 'arrows': 'to'},
 {'from': 30, 'to': 3, 'arrows': 'to'},
 {'from': 7, 'to': 3, 'arrows': 'to'},
 {'from': 28, 'to': 4, 'arrows': 'to'},
 {'from': 21, 'to': 4, 'arrows': 'to'},
 {'from': 5, 'to': 9, 'arrows': 'to'},
 {'from': 5, 'to': 28, 'arrows': 'to'},
 {'from': 6, 'to': 5, 'arrows': 'to'},
 {'from': 6, 'to': 9, 'arrows': 'to'},
 {'from': 

In [147]:
if is_first_run:
    dr = driver = webdriver.Chrome()
    edges = []
    for paper_dict in papers_dict_list:
        title = paper_dict['title']
        src = find_node_id(title)
        print(f"Getting refs and citations for {title}")
        r_and_c = get_refs_and_citations(title, dr)
        for ref in r_and_c['refs']:
            if ref in ok_and_fixed:
                print(f"Found new edge! {title} ----> {ref}")
                dest = find_node_id(ref)
                edges.append({'from' : src, 'to': dest})
        for citation in r_and_c['citations']:
            if citation in ok_and_fixed:
                print(f"Found new edge! {citation} ----> {title}")
                src_citation = find_node_id(citation)
                edges.append({'from': src_citation, 'to': src})
    dr.close()

    # Save variables
    to_save = {}
    to_save['edges'] = edges
    to_save['ok_and_fixed'] = ok_and_fixed
    to_save['papers_dict_list'] = papers_dict_list
    with open('saved_vars.json', "w") as f:
        f.write(json.dumps(to_save))


Getting refs and citations for ServerMore: Opportunistic Execution of Serverless Functions in the Cloud
Found new edge! ServerMore: Opportunistic Execution of Serverless Functions in the Cloud ----> FnSched: An Efficient Scheduler for Serverless Functions
Found new edge! ServerMore: Opportunistic Execution of Serverless Functions in the Cloud ----> PARTIES: QoS-Aware Resource Partitioning for Multiple Interactive Services
Found new edge! ServerMore: Opportunistic Execution of Serverless Functions in the Cloud ----> PerfIso: Performance Isolation for Commercial Latency-sensitive Services
Found new edge! Locality-aware Load-Balancing For Serverless Clusters ----> ServerMore: Opportunistic Execution of Serverless Functions in the Cloud
Getting refs and citations for Locality-aware Load-Balancing For Serverless Clusters
Found new edge! Locality-aware Load-Balancing For Serverless Clusters ----> FaaSRank: Learning to Schedule Functions in Serverless Platforms
Found new edge! Locality-aware 

#### Removing duplicates from edges.

In [5]:
res = [i for n, i in enumerate(edges) if i not in edges[:n]]
edges = res

In [28]:
for edge in edges:
    net.add_edge(edge['from'], edge['to'])

In [29]:
net.toggle_physics(True)
net.show_buttons(filter_=['physics'])
net.show("test.html", notebook=False)

test.html


In [30]:
net.save_graph("graph.html")

Determining the in degree (nr of references)

In [17]:
refs_df = pd.DataFrame(net.edges).drop_duplicates()
sorted = refs_df.groupby('to').count()['from'].sort_values(ascending=False)
sorted['to'] = sorted.index
sorted_df = pd.DataFrame(sorted)
sorted_df['to'] = sorted_df.index
sorted_df.columns = ['count', 'to']
sorted_df

Unnamed: 0_level_0,count,to
to,Unnamed: 1_level_1,Unnamed: 2_level_1
15,8,15
9,5,9
14,5,14
19,5,19
22,4,22
16,3,16
3,3,3
4,3,4
5,3,5
26,3,26


In [18]:
n = 5

In [19]:
top_n = sorted_df[:n]
top_n['title'] = top_n['to'].apply(lambda x : ok_and_fixed[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_n['title'] = top_n['to'].apply(lambda x : ok_and_fixed[x])


In [20]:
top_n

Unnamed: 0_level_0,count,to,title
to,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15,8,15,PARTIES: QoS-Aware Resource Partitioning for M...
9,5,9,ENSURE: Efficient Scheduling and Autonomous Re...
14,5,14,CoPart: Coordinated Partitioning of Last-Level...
19,5,19,Alita: Comprehensive Performance Isolation thr...
22,4,22,FnSched: An Efficient Scheduler for Serverless...
