In [1]:
# %pip install transformers
import pandas as pd
from cogdl.oag import oagbert
import torch
import re
import numpy as np
import ipywidgets as widgets
import requests
import json
from dataclasses import dataclass
from typing import Dict, List, Optional
import os

In [2]:
# Parameters
max_depth = 2
ignore_related = False
ignore_referenced = False
base_works_url = "https://api.openalex.org/works"

In [3]:
@dataclass
class Article:
    # Keeping track of some needed paper details
    id: str
    title: str
    inverted_abstract: Dict[str, List[int]]
    authors: List[str]
    host_venue: str
    affiliations: List[str]
    concepts: List[str]
    references: List[str]
    related: List[str]

    def get_abstract(self) -> str:
        abstract = dict()
        for k, v in self.inverted_abstract.items():
            for i in v:
                abstract[i] = k

        final = ""
        for i in sorted(abstract.keys()):
            final += abstract[i] + " "
        return final
    
    def fetch_references_queries(self):
        # open alex only allows 50 OR joins per request
        queries = list()
        for i in range(0, len(self.references), 50):
            queries.append('|'.join(self.references[i:i+50]))
        return queries
    
    def fetch_related_queries(self):
        # open alex only allows 50 OR joins per request
        queries = list()
        for i in range(0, len(self.related), 50):
            queries.append('|'.join(self.related[i:i+50]))
        return queries
    
    def __str__(self):
        return f"{self.id}: {self.title}\n{self.get_abstract()}"

In [4]:
def fetch_article(result):
    work_id = result["id"].split('/')[-1]
    title = result["title"]
    inverted_abstract = result['abstract_inverted_index']
    authors = [authorship['author']['display_name'] for authorship in result['authorships']]
    host_venue = result['host_venue']['publisher']
    institutions = list()

    for authorship in result['authorships']:
        for institution in authorship['institutions']: 
            if institution['display_name'] not in institutions:
                institutions.append(institution['display_name'])

    concepts = [concept['display_name'] for concept in result['concepts'] if float(concept['score']) > 0.5]
    referenced_works = [work.split('/')[-1] for work in result['referenced_works']]
    related_works = [work.split('/')[-1] for work in result['related_works']]

    return Article(
        work_id,
        title if title else "",
        inverted_abstract if inverted_abstract else {"": [0]},
        authors,
        host_venue if host_venue else "",
        institutions,
        concepts,
        referenced_works,
        related_works
    )

# Search for Article Title
Edit the title variable below to search for a paper. If not exact then returns 25 most relevant papers in the OpenAlex dataset. Select the paper in the dropdown menu.

In [5]:
title = "Attention is all"
title = title.replace(" ", "%20")
req = requests.get(base_works_url+f"?filter=title.search:{title}")
response = json.loads(req.content)

relevant_titles = [result['title'] for result in response['results']]
title_selector = widgets.Dropdown(
    options=relevant_titles,
    value=relevant_titles[0],
    description="Title: "
)
display(title_selector)

Dropdown(description='Title: ', options=('Attention is All you Need', 'Attention Is All You Need', 'All That G…

In [6]:
raise Exception("Please select correct title above. If done, run all cells below this one.")

Exception: Please select correct title above. If done, run all cells below this one.

In [7]:
index = relevant_titles.index(title_selector.value)
papers = dict()
root_id = response['results'][index]['id'].split('/')[-1]

papers[root_id] = fetch_article(response['results'][index])

In [8]:
papers['W2963403868']

Article(id='W2963403868', title='Attention is All you Need', inverted_abstract={'The': [0, 19], 'dominant': [1], 'sequence': [2], 'transduction': [3], 'models': [4, 23, 59], 'are': [5], 'based': [6, 41], 'on': [7, 52], 'complex': [8], 'recurrent': [9], 'orconvolutional': [10], 'neural': [11], 'networks': [12], 'in': [13], 'an': [14, 31], 'encoder': [15, 27], 'and': [16, 28, 49, 68], 'decoder': [17, 29], 'configuration.': [18], 'best': [20, 90], 'performing': [21], 'such': [22], 'also': [24], 'connect': [25], 'the': [26, 88, 102], 'through': [30], 'attentionm': [32], 'echanisms.': [33], 'We': [34], 'propose': [35], 'a': [36, 111], 'novel,': [37], 'simple': [38], 'network': [39], 'architecture': [40], 'solely': [42], 'onan': [43], 'attention': [44], 'mechanism,': [45], 'dispensing': [46], 'with': [47, 77, 105], 'recurrence': [48], 'convolutions': [50], 'entirely.Experiments': [51], 'two': [53], 'machine': [54], 'translation': [55], 'tasks': [56], 'show': [57], 'these': [58], 'to': [60], 

In [9]:
use_references = ignore_referenced != True
use_related = ignore_related != True

related_works: Dict[int, List[Article]] = {}

def get_relevant_papers(current_depth: int, previous: List[Article]):
    related_works[current_depth] = []
    print(current_depth)
    for parent in previous:
        if use_references and len(parent.references) > 0:
            for query in parent.fetch_references_queries():         
                req = requests.get(base_works_url + f'?filter=openalex_id:{query}')
                res = json.loads(req.content)
                for result in res["results"]:
                    paper_id = result['id'].split('/')[-1]
                    if paper_id not in papers.keys():
                        temp = fetch_article(result)
                        papers[temp.id] = temp
                        related_works[current_depth].append(temp)
            
        if use_related and len(parent.related) > 0:
            for query in parent.fetch_related_queries():  
                req = requests.get(base_works_url + f'?filter=openalex_id:{query}')
                res = json.loads(req.content)
                for result in res["results"]:
                    paper_id = result['id'].split('/')[-1]
                    if paper_id not in papers.keys():
                        temp = fetch_article(result)
                        papers[temp.id] = temp
                        related_works[current_depth].append(temp)

    if current_depth < max_depth:
        get_relevant_papers(current_depth+1, related_works[current_depth])

In [10]:
get_relevant_papers(1, [papers[root_id]])

1
2


In [11]:
print(len(papers.keys()))
print(papers.keys())

422
dict_keys(['W2963403868', 'W2194775991', 'W2064675550', 'W2108598243', 'W2964121744', 'W2250539671', 'W2963341956', 'W2163605009', 'W2101105183', 'W2157331557', 'W2964308564', 'W2095705004', 'W2153579005', 'W2962739339', 'W2130942839', 'W1902237438', 'W2965373594', 'W2962784628', 'W2154652894', 'W2525778437', 'W2970597249', 'W2097117768', 'W1903029394', 'W2102605133', 'W1536680647', 'W2031489346', 'W1677182931', 'W2147800946', 'W2107878631', 'W2117812871', 'W2124509324', 'W2147238549', 'W1984309565', 'W1997542937', 'W1932847118', 'W1976921161', 'W4242212377', 'W4238404964', 'W2159979951', 'W2964103341', 'W4231990273', 'W4212915314', 'W2518214538', 'W3099850646', 'W2005051400', 'W2733060750', 'W4245792239', 'W2743258233', 'W2972435282', 'W3178127657', 'W2773120646', 'W3108696707', 'W2007431958', 'W1971129545', 'W2143503258', 'W2103452139', 'W2154890045', 'W2048060899', 'W2123716044', 'W2121553911', 'W2036317923', 'W1984375561', 'W2156960699', 'W2086756055', 'W2114471683', 'W19822911

In [12]:
print(papers['W4210794429'])

W4210794429: On Automating Hyperparameter Optimization for Deep Learning Applications
Given a large amount of data and appropriate hyperparameters, deep learning techniques can deliver impressive performance if several challenging issues with training, such as vanishing gradients, can be overcome. Often, deep learning training techniques produce suboptimal results because the parameter search space is large and populated with many less-than-ideal solutions. Automatic hyperparameter tuning algorithms, known as autotuners, offer an attractive alternative for automating the training process, though they can be computationally expensive. Additionally, autotuners democratize state-of-the-art machine learning approaches and increase the accessibility of deep learning technology to different scientific communities and novice users. In this paper, we investigate the efficacy of autotuning using Keras Tuner on both synthetic and real-world datasets. We show that autotuning performed well on syn

In [13]:
tokenizer, model = oagbert("oagbert-v2")

In [20]:
if not os.path.exists("./embeddings/"):
    os.mkdir("./embeddings/")

files = os.listdir("./embeddings/")
files = [file.split('.')[0] for file in files]

for key in papers.keys():
    if key not in files:
        curr_paper = papers[key]
        input_ids, input_masks, token_type_ids, masked_lm_labels, position_ids, position_ids_second, masked_positions, num_spans = model.build_inputs(
            title=curr_paper.title, 
            abstract=curr_paper.get_abstract(), 
            venue=curr_paper.host_venue, 
            authors=curr_paper.authors, 
            concepts=curr_paper.concepts, 
            affiliations=curr_paper.affiliations
        )

        sequence_output, pooled_output = model.bert.forward(
            input_ids=torch.LongTensor(input_ids).unsqueeze(0),
            token_type_ids=torch.LongTensor(token_type_ids).unsqueeze(0),
            attention_mask=torch.LongTensor(input_masks).unsqueeze(0),
            output_all_encoded_layers=False,
            checkpoint_activations=False,
            position_ids=torch.LongTensor(position_ids).unsqueeze(0),
            position_ids_second=torch.LongTensor(position_ids_second).unsqueeze(0)
        )

        torch.save(pooled_output, f"./embeddings/{curr_paper.id}.pt")
        

In [22]:
root_paper_embeddings = torch.load(f"./embeddings/{root_id}.pt")
comparison_batch_size = 25

paper_keys = list(papers.keys())
paper_keys.remove(root_id)

for i in range(0, len(paper_keys), 25):
    key_slice = paper_keys[i:i+25]
    for key in key_slice:

422


421