In [36]:
import json
import pandas as pd
import os
import re
import string

from collections import Counter
from time import time
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer


# Path to the arXiv metadata snapshot; it is read line by line, one JSON record per line.
DATA_PATH = "arxiv-metadata-oai-snapshot.json"
# Papers whose extracted publication year is below this value are discarded.
YEAR_CUTOFF = 2020
# A standalone four-digit year beginning with 19 or 20.
# - The century prefix is grouped with (?:...) so that "19" alone never matches
#   (the previous `19|20[0-9]{2}` parsed as "19" OR "20dd").
# - The digit lookarounds stop the pattern from matching inside longer numbers
#   such as article ids or page numbers (e.g. "201301").
# - Exactly one capturing group, so re.findall returns the year strings.
YEAR_PATTERN = r"(?<![0-9])((?:19|20)[0-9]{2})(?![0-9])"

In [37]:
def process(paper: str, min_year: int = 1991, max_year: int = 2022) -> dict:
    """Parse one JSON-encoded metadata record and extract a publication year.

    Args:
        paper: A single JSON document (one line of the snapshot file) as text.
        min_year: Smallest year accepted as a plausible publication year.
        max_year: Largest year accepted as a plausible publication year.

    Returns:
        A dict with the keys ``id``, ``title``, ``year``, ``authors``,
        ``categories`` and ``abstract``. ``year`` is the earliest in-range year
        found in the record's ``journal-ref`` field, or ``None`` when the field
        is missing, empty, or contains no in-range year. ``categories`` is the
        original space-separated string re-joined with commas.

    Raises:
        json.JSONDecodeError: If ``paper`` is not valid JSON.
        KeyError: If any of the other required fields is absent.
    """
    record = json.loads(paper)

    # .get() so that a record without a 'journal-ref' key is treated like one
    # whose journal reference is null, instead of raising KeyError.
    journal_ref = record.get('journal-ref')

    year = None
    if journal_ref:
        candidates = [int(match) for match in re.findall(YEAR_PATTERN, journal_ref)]
        plausible = [value for value in candidates if min_year <= value <= max_year]
        if plausible:
            # A reference may mention several years; keep the earliest one.
            year = min(plausible)

    return {
        'id': record['id'],
        'title': record['title'],
        'year': year,
        'authors': record['authors'],
        'categories': ','.join(record['categories'].split(' ')),
        'abstract': record['abstract']
    }

def papers():
    """Lazily yield processed records whose year is at or after YEAR_CUTOFF.

    Reads DATA_PATH one line at a time, so the whole snapshot is never held in
    memory. Each non-blank line is passed to ``process``; records with no
    extracted year, or with a year earlier than YEAR_CUTOFF, are skipped.

    Yields:
        dict: The record produced by ``process`` for each retained paper.
    """
    # Explicit encoding: the default depends on the platform/locale, which can
    # make the same file decode differently (or fail) on another machine.
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty document.
            if not line.strip():
                continue
            paper = process(line)
            year = paper['year']
            if year is not None and year >= YEAR_CUTOFF:
                yield paper

In [38]:
# Drain the papers() generator into a list of record dicts and build one
# DataFrame from it in a single step (one row per retained paper).
df = pd.DataFrame(data=list(papers()))

In [39]:
# Preview the first ten rows as the cell's rich output.
df.head(n=10)

Unnamed: 0,id,title,year,authors,categories,abstract
0,707.094,Sobolev regularity of solutions of the cohomol...,2021,Giovanni Forni,"math.DS,math.AP",We refine the theory of the cohomological eq...
1,712.1975,Reentrant spin glass transition in LuFe2O4,2020,"Fan Wang, Jungho Kim, G. D. Gu, and Young-June...","cond-mat.str-el,cond-mat.mtrl-sci",We have carried out a comprehensive investig...
2,803.0437,On finiteness of odd superperfect numbers,2020,Tomohiro Yamada,math.NT,Some new results concerning the equation $\s...
3,804.3104,"Teichm\""uller Structures and Dual Geometric Gi...",2020,Yunping Jiang,"math.DS,math.CV",The Gibbs measure theory for smooth potentia...
4,805.3666,On the derivation of exact eigenstates of the ...,2021,Andrey Pereverzev and Eric R. Bittner,quant-ph,We construct the states that are invariant u...
5,807.4709,Bosonic Fields in Causal Set Theory,2021,Roman Sverdlov,physics.gen-ph,In this paper we will define a Lagrangian fo...
6,809.1832,Glimpses on the micro black hole Planck phase,2020,Fabio Scardigli,"hep-th,gr-qc","Mass thresholds, lifetimes, entropy and heat..."
7,810.3615,Exact results for the Wigner transform phase s...,2020,K. Bencheikh and L.M. Nieto,physics.atom-ph,Closed form analytical expressions are obtai...
8,810.484,The Pursuit of Uniqueness: Extending Valiant-V...,2022,"Dorit Aharonov, Michael Ben-Or, Fernando G.S.L...","quant-ph,cs.CC",Valiant-Vazirani showed in 1985 [VV85] that ...
9,904.0276,Linear Operators and Operator Functions Associ...,2020,Vladimir Ryzhov,"math-ph,math.MP",The paper develops a theory of spectral boun...


In [40]:
def clean_description(description: str) -> str:
    """Normalize free text (title/abstract) into lowercase, space-separated words.

    Steps, in order:
      1. Drop every non-ASCII character.
      2. Remove URLs (done before punctuation stripping, while ``://`` and
         ``.`` are still present so the whole URL can be recognized).
      3. Replace ASCII punctuation with spaces.
      4. Remove every token that contains a digit.
      5. Insert a space at lowercase-to-uppercase boundaries so camelCase
         words are separated while acronyms such as "LHC" stay intact.
      6. Collapse all whitespace runs (including newlines) to one space and
         trim the ends.
      7. Lowercase the result.

    Args:
        description: The raw text; may be ``None`` or empty.

    Returns:
        The cleaned text, or ``""`` when ``description`` is falsy.
    """
    if not description:
        return ""

    # Drop non-ASCII characters (accented letters, symbols, ...).
    description = description.encode('ascii', 'ignore').decode()

    # Remove URLs first: once punctuation is stripped a URL is split into
    # ordinary-looking words and can no longer be detected.
    description = re.sub(r"https?://\S+", " ", description)

    # Replace punctuation with spaces so adjacent words do not fuse together.
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # Remove every token that contains at least one digit.
    description = re.sub(r'\w*\d+\w*', '', description)

    # Split camelCase at lower->upper transitions only; splitting before every
    # capital letter would shred acronyms into single characters.
    description = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', description)

    # Collapse whitespace (spaces, tabs, newlines) and trim the ends.
    description = re.sub(r'\s+', ' ', description).strip()

    return description.lower()

In [41]:
# Load the SciBERT (scivocab, uncased) checkpoint through sentence-transformers.
# Per the library log printed below this cell, no ready-made
# sentence-transformers model exists under this name, so the library wraps the
# plain checkpoint and adds MEAN pooling to produce one vector per input.
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
# Set the model's maximum sequence length to 512.
# NOTE(review): presumably measured in tokens, with longer inputs truncated - confirm
# against the sentence-transformers documentation.
model.max_seq_length = 512


No sentence-transformers model found with name /Users/tyler.hutcherson/.cache/torch/sentence_transformers/allenai_scibert_scivocab_uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/tyler.hutcherson/.cache/torch/sentence_transformers/allenai_scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you

In [None]:
# Embed "<title> <abstract>" for every paper.
# The cleaned texts are passed to model.encode as ONE list so the encoder can
# batch them internally; calling encode once per row (via df.apply) processes
# a single text at a time and is far slower on a corpus of this size.
# list(...) splits the returned 2-D array into one 1-D vector per row, matching
# the per-row arrays the column held before.
df['vector'] = list(
    model.encode(
        (df['title'] + ' ' + df['abstract']).map(clean_description).tolist(),
        show_progress_bar=True,
    )
)