In [1]:
# %pip install transformers
import pandas as pd
from cogdl.oag import oagbert
import torch
import re
import numpy as np
import ipywidgets as widgets
import requests
import json
from dataclasses import dataclass
from typing import Dict, List, Optional
import os
import pymilvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection
)

In [2]:
# Parameters
max_depth = 2
ignore_related = True
ignore_referenced = False
base_works_url = "https://api.openalex.org/works"

In [3]:
@dataclass
class Article:
    # Keeping track of some needed paper details
    id: str
    title: str
    inverted_abstract: Dict[str, List[int]]
    authors: List[str]
    host_venue: str
    affiliations: List[str]
    concepts: List[str]
    references: List[str]
    related: List[str]

    def get_abstract(self) -> str:
        abstract = dict()
        for k, v in self.inverted_abstract.items():
            for i in v:
                abstract[i] = k

        final = ""
        for i in sorted(abstract.keys()):
            final += abstract[i] + " "
        return final
    
    def fetch_references_queries(self):
        # open alex only allows 50 OR joins per request
        queries = list()
        for i in range(0, len(self.references), 50):
            queries.append('|'.join(self.references[i:i+50]))
        return queries
    
    def fetch_related_queries(self):
        # open alex only allows 50 OR joins per request
        queries = list()
        for i in range(0, len(self.related), 50):
            queries.append('|'.join(self.related[i:i+50]))
        return queries
    
    def __str__(self):
        return f"{self.id}: {self.title}\n{self.get_abstract()}"

In [4]:
def fetch_article(result):
    work_id = result["id"].split('/')[-1]
    title = result["title"]
    inverted_abstract = result['abstract_inverted_index']
    authors = [authorship['author']['display_name'] for authorship in result['authorships']]
    host_venue = result['host_venue']['publisher']
    institutions = list()

    for authorship in result['authorships']:
        for institution in authorship['institutions']: 
            if institution['display_name'] not in institutions:
                institutions.append(institution['display_name'])

    concepts = [concept['display_name'] for concept in result['concepts'] if float(concept['score']) > 0.5]
    referenced_works = [work.split('/')[-1] for work in result['referenced_works']]
    related_works = [work.split('/')[-1] for work in result['related_works']]

    return Article(
        work_id,
        title if title else "",
        inverted_abstract if inverted_abstract else {"": [0]},
        authors,
        host_venue if host_venue else "",
        institutions,
        concepts,
        referenced_works,
        related_works
    )

In [5]:
connection = pymilvus.connections.connect(
    alias='default',
    host='localhost',
    port='19530'
)

In [17]:
fields = [
    FieldSchema(name='pk', dtype=DataType.VARCHAR, max_length=32, is_primary=True),
    FieldSchema(name='embeddings', dtype=DataType.FLOAT_VECTOR, dim=768)
]
collection_name = 'paper_trail_test'
schema = CollectionSchema(fields, "Testing")
paper_trail_collection = Collection(collection_name, schema)

In [16]:
utility.drop_collection(collection_name)

# Search for Article Title
Edit the title variable below to search for a paper. If not exact then returns 25 most relevant papers in the OpenAlex dataset. Select the paper in the dropdown menu.

In [10]:
title = "Attention is all you need"
title = title.replace(" ", "%20")
req = requests.get(base_works_url+f"?filter=title.search:{title}")
response = json.loads(req.content)

relevant_titles = [result['title'] for result in response['results']]
title_selector = widgets.Dropdown(
    options=relevant_titles,
    value=relevant_titles[0],
    description="Title: "
)
display(title_selector)

Dropdown(description='Title: ', options=('Attention is All you Need', 'Attention Is All You Need', 'Channel At…

In [81]:
raise Exception("Please select correct title above. If done, run all cells below this one.")

Exception: Please select correct title above. If done, run all cells below this one.

In [11]:
index = relevant_titles.index(title_selector.value)
papers = dict()
root_id = response['results'][index]['id'].split('/')[-1]

papers[root_id] = fetch_article(response['results'][index])

In [12]:
use_references = ignore_referenced != True
use_related = ignore_related != True

related_works: Dict[int, List[Article]] = {}

def get_relevant_papers(current_depth: int, previous: List[Article]):
    related_works[current_depth] = []
    print(current_depth)
    for parent in previous:
        if use_references and len(parent.references) > 0:
            for query in parent.fetch_references_queries():         
                req = requests.get(base_works_url + f'?filter=openalex_id:{query}')
                res = json.loads(req.content)
                for result in res["results"]:
                    paper_id = result['id'].split('/')[-1]
                    if paper_id not in papers.keys():
                        temp = fetch_article(result)
                        papers[temp.id] = temp
                        related_works[current_depth].append(temp)
            
        if (use_related and len(parent.related) > 0) or len(parent.references) == 0:
            for query in parent.fetch_related_queries():  
                req = requests.get(base_works_url + f'?filter=openalex_id:{query}')
                res = json.loads(req.content)
                for result in res["results"]:
                    paper_id = result['id'].split('/')[-1]
                    if paper_id not in papers.keys():
                        temp = fetch_article(result)
                        papers[temp.id] = temp
                        related_works[current_depth].append(temp)

    if current_depth < max_depth:
        get_relevant_papers(current_depth+1, related_works[current_depth])

In [13]:
get_relevant_papers(1, [papers[root_id]])

1
2


In [14]:
tokenizer, model = oagbert("oagbert-v2")

In [18]:
if not os.path.exists("./embeddings/"):
    os.mkdir("./embeddings/")

files = os.listdir("./embeddings/")
files = [file.split('.')[0] for file in files]

for key in papers.keys():
    curr_paper = papers[key]
    input_ids, input_masks, token_type_ids, masked_lm_labels, position_ids, position_ids_second, masked_positions, num_spans = model.build_inputs(
        title=curr_paper.title, 
        abstract=curr_paper.get_abstract(), 
        venue=curr_paper.host_venue, 
        authors=curr_paper.authors, 
        concepts=curr_paper.concepts, 
        affiliations=curr_paper.affiliations
    )

    sequence_output, pooled_output = model.bert.forward(
        input_ids=torch.LongTensor(input_ids).unsqueeze(0),
        token_type_ids=torch.LongTensor(token_type_ids).unsqueeze(0),
        attention_mask=torch.LongTensor(input_masks).unsqueeze(0),
        output_all_encoded_layers=False,
        checkpoint_activations=False,
        position_ids=torch.LongTensor(position_ids).unsqueeze(0),
        position_ids_second=torch.LongTensor(position_ids_second).unsqueeze(0)
    )

    pooled_normalized = torch.nn.functional.normalize(pooled_output, p=2, dim=1)

    paper_trail_collection.insert([
            [key], 
            [pooled_normalized.tolist()[0]]
        ])
    
paper_trail_collection.flush()

In [19]:
print(paper_trail_collection.num_entities)

300


In [30]:
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}

paper_trail_collection.create_index(field_name="embeddings", index_params=index_params)

Status(code=0, message=)

In [25]:
paper_trail_collection.load()
root_paper_embeddings = paper_trail_collection.query(
    expr = f'pk == "{root_id}"',
    output_fields=['embeddings']
)
root_paper_embeddings = torch.Tensor(root_paper_embeddings[0]['embeddings'])
root_paper_embeddings.shape

torch.Size([768])

In [47]:
paper_trail_collection.load()
root_paper_embeddings = paper_trail_collection.query(
    expr = f'pk == "{root_id}"',
    output_fields=['embeddings']
)
root_paper_embeddings = torch.Tensor([root_paper_embeddings[0]['embeddings']])

paper_keys = list(papers.keys())
paper_keys.remove(root_id)

cols = ["id", "title", "score"]
similarities = pd.DataFrame(columns=cols)

for key in paper_keys:
    paper_embeddings = paper_trail_collection.query(
        expr = f'pk == "{key}"',
        output_fields=['embeddings']
    )
    paper_embeddings = torch.Tensor([paper_embeddings[0]['embeddings']])
    sim = torch.mm(root_paper_embeddings, paper_embeddings.transpose(0, 1))
    results = {
        "id": [key],
        "title": [papers[key].title],
        "score": [sim.detach().numpy()]
    }

    similarities = pd.concat([similarities, pd.DataFrame(results)], ignore_index=True)

In [48]:
similarities.sort_values(by="score", ascending=False).head(25)

Unnamed: 0,id,title,score
170,W2250489405,Joint Language and Translation Modeling with R...,[[0.99704975]]
208,W2131462252,A Scalable Hierarchical Distributed Language M...,[[0.9967903]]
240,W2253807446,Building high-level features using large scale...,[[0.9967419]]
168,W2964335273,How to Construct Deep Recurrent Neural Networks,[[0.99667966]]
213,W2120861206,A fast and simple algorithm for training neura...,[[0.99649143]]
233,W2127141656,Connectionist temporal classification,[[0.9964779]]
268,W2148708890,"A Simple, Fast, and Effective Reparameterizati...",[[0.9963628]]
234,W179875071,Recurrent neural network based language model,[[0.99634004]]
230,W3016169217,Neural Tree Indexers for Text Understanding,[[0.9962524]]
261,W1599016936,The Winograd schema challenge,[[0.9961974]]


# Print Root Paper Abstract and 5 most similar papers

In [49]:
print(papers[root_id])

W2963403868: Attention is All you Need
The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms. We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On English-to-French translation, we outperform the previoussingle state-of-the-art with model by 0.7 BLEU, achieving a BLEU score of 41.1. 


In [50]:
ids = similarities.sort_values(by="score", ascending=False).head(5)["id"]
for id in ids.values:
    print(papers[id])
    print("\n\n")

W2250489405: Joint Language and Translation Modeling with Recurrent Neural Networks
We present a joint language and translation model based on a recurrent neural network which predicts target words based on an unbounded history of both source and target words. The weaker independence assumptions of this model result in a vastly larger search space compared to related feedforward-based language or translation models. We tackle this issue with a new lattice rescoring algorithm and demonstrate its effectiveness empirically. Our joint model builds on a well known recurrent neural network language model (Mikolov, 2012) augmented by a layer of additional inputs from the source language. We show competitive accuracy compared to the traditional channel model features. Our best results improve the output of a system trained on WMT 2012 French-English data by up to 1.5 BLEU, and by 1.1 BLEU on average across several test sets. 



W2131462252: A Scalable Hierarchical Distributed Language Model
N

# Testing Milvus Search Functionality (cosine similarity)

In [26]:
root_paper_embeddings = root_paper_embeddings.tolist()

In [27]:
len(root_paper_embeddings)

768

In [28]:
search_params = {"metric_type": "IP", "params": {"nprobe": 10}, "offset": 5}

In [29]:
results = paper_trail_collection.search(
	data=[root_paper_embeddings], 
	anns_field="embeddings", 
	param=search_params,
	limit=10, 
	expr=None,
	consistency_level="Strong"
)

In [34]:
results[0].ids

['W1591801644', 'W2962741254', 'W1566289585', 'W2130942839', 'W2996428491', 'W2551396370', 'W2167510172', 'W2108598243', 'W2118434577', 'W2950178297']

In [35]:
results[0].distances

[0.5404819250106812, 0.5387341380119324, 0.5350673794746399, 0.5233733654022217, 0.5161704421043396, 0.5135420560836792, 0.5115315318107605, 0.5077880620956421, 0.5074553489685059, 0.5040091276168823]

In [36]:
res_list = results[0].ids[:5]
res_dist = results[0].distances[:5]

In [37]:
for i in range(4, -1, -1):
    print(papers[res_list[i]])
    print(f"Distance: {res_dist[i]}\n\n")

W2996428491: ALBERT: A Lite BERT for Self-supervised Learning of Language
  Representations
Increasing model size when pretraining natural language representations often results in improved performance on downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations and longer training times. To address these problems, we present two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows that our proposed methods lead to models that scale much better compared to the original BERT. We also use a self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and \squad benchmarks while having fewer parameters compared to BERT-large. The code and the pretrained models are available at h