# arXiv Paper Embedding

## On a Single GPU
This notebook utilizes an NVIDIA T4 on Saturn Cloud.

In [5]:
import cudf
import pandas as pd
import json
import os
import re
import string
import pickle


DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_PATTERN = r"(19|20[0-9]{2})"

## Step 1: Data pre processing
Before we do anything else, we need to load the papers dataset, do some basic cleaning, and get it into a workable format. Below,
we will use CuDF to house the data and apply seom transformations in a generator, loading from file.

In [6]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

In [7]:
# Generator functions that iterate through the file and process/load papers

def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        # Attempt to parse the date using Regex: this could be improved
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract'],
        'input': clean_description(paper['title'] + ' ' + paper['abstract']) # embedding model input
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            # Yield only papers that have a year I could process
            if paper['year']:
                yield paper


In [8]:
# Example
next(papers())

{'id': '0704.0001',
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'year': 2007,
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'categories': 'hep-ph',
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (L

In [12]:
# Load papers into a CuDF
cdf = cudf.DataFrame(list(papers()))

In [13]:
len(cdf)

713361

In [16]:
cdf.year.value_counts()

2020    46511
2019    44098
2021    43344
2018    41643
2017    38960
2016    37540
2015    35015
2014    33854
2010    32318
2009    31998
2013    31486
2011    31048
2012    30105
2007    28917
2006    28582
2008    28511
2005    26305
2004    24418
2003    22475
2022    20996
2002    20498
2001    18826
2000    15913
Name: year, dtype: int32

In [5]:
# Pickle the dataframe to save you time in the future

with open('cdf.pkl', 'wb') as f:
    pickle.dump(cdf, f)
    
# Load pickle
# with open('cdf.pkl', 'rb') as f:
#     cdf = pickle.load(f)

## Step 2: Create sentence embeddings
Here I use a cookie-cutter -- **out of the box** -- model from HuggingFace to transform papers abstracts + titles into vectors.

**This takes a long time**... So best to take a subset. Or use the dask cluster for multi-gpu encoding.

In [None]:
batch = cdf[:100000].copy()

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

vectors = model.encode(
    sentences = batch.input.values_host,
    normalize_embeddings = True,
    batch_size = 64,
    show_progress_bar = True
)

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [None]:
# Vectors created!
batch['vector'] = cudf.Series(vectors.tolist(), index=batch.index)

In [None]:
batch.head()

In [None]:
# Dump these to file with pickle or write them to Redis
with open('embeddings_100000.pkl', 'wb') as f:
    pickle.dump(batch, f)