In [3]:
!mv /Users/tyler.hutcherson/Downloads/arxiv-metadata-oai-snapshot.json .

In [1]:
import json
import pandas as pd
import os
import re
import string


DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [2]:
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract']
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            if paper['year']:
                if paper['year'] >= YEAR_CUTOFF and ML_CATEGORY in paper['categories']:
                    yield paper

In [3]:
df = pd.DataFrame(papers())
len(df)

12038

In [4]:
# Avg length of the abstracts - num tokens
df.abstract.apply(lambda a: len(a.split())).mean()

170.11073267984716

In [20]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description.strip()

In [21]:
texts = df.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist()

## Hugging Face Embeddings

First up, we will use the `SentenceTransformer` library from Hugging Face to create embeddings for our arXiv papers.

In [7]:
# Prep
from sentence_transformers import SentenceTransformer

provider = "huggingface"
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [9]:
# Create embeddings from the title and abstract
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True
)

Batches:   0%|          | 0/377 [00:00<?, ?it/s]

In [12]:
embeddings.shape

(12038, 768)

In [13]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = embeddings.tolist()

In [14]:
import pickle

# Export to file!
with open(f'arxiv_{provider}_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

## OpenAI Embeddings

Next, we will use OpenAI Embeddings for our arXiv papers.

In [15]:
import openai

provider = "openai"
model_name = "text-embedding-ada-002"
openai.api_key = "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"

In [25]:
import time

embeddings = []

def batchify(seq: list, size: int):
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]

In [26]:
for i, batch in enumerate(batchify(texts, size=25)):
    st = time.time()
    response = await openai.Embedding.acreate(
        input=batch,
        engine=model_name
    )
    embeddings += [r["embedding"] for r in response["data"]]
    print(f"Finished batch {i} in {time.time()-st} sec")

Finished batch 0 in 1.1912198066711426 sec
Finished batch 1 in 0.6069509983062744 sec
Finished batch 2 in 0.6274569034576416 sec
Finished batch 3 in 0.6245720386505127 sec
Finished batch 4 in 0.6610221862792969 sec
Finished batch 5 in 0.5576088428497314 sec
Finished batch 6 in 1.8995411396026611 sec
Finished batch 7 in 0.8886330127716064 sec
Finished batch 8 in 0.5986270904541016 sec
Finished batch 9 in 0.7503340244293213 sec
Finished batch 10 in 0.6709990501403809 sec
Finished batch 11 in 0.5569841861724854 sec
Finished batch 12 in 0.5603680610656738 sec
Finished batch 13 in 0.5553009510040283 sec
Finished batch 14 in 0.6193583011627197 sec
Finished batch 15 in 0.6167590618133545 sec
Finished batch 16 in 0.6474571228027344 sec
Finished batch 17 in 0.6930150985717773 sec
Finished batch 18 in 0.7281572818756104 sec
Finished batch 19 in 1.6583359241485596 sec
Finished batch 20 in 0.6751067638397217 sec
Finished batch 21 in 0.6038081645965576 sec
Finished batch 22 in 0.8647360801696777 se

In [27]:
len(embeddings)

12038

In [28]:
embeddings[:3]

[[-0.032847873866558075,
  -0.004703218583017588,
  -0.021323971450328827,
  -0.03645842894911766,
  -0.02980741113424301,
  0.01624748297035694,
  0.017306216061115265,
  0.00885671004652977,
  -0.029400205239653587,
  -0.04479255899786949,
  -0.03195202350616455,
  0.0125690633431077,
  0.004883067682385445,
  0.011306727305054665,
  0.0034985702950507402,
  -0.0031405691988766193,
  0.041344888508319855,
  0.007051434367895126,
  0.023509306833148003,
  -0.017591258510947227,
  -0.0077029624953866005,
  0.013342753052711487,
  -0.01923365332186222,
  -0.027404900640249252,
  1.0325936273147818e-05,
  0.026400461792945862,
  0.027038415893912315,
  -0.0060469950549304485,
  -0.016098173335194588,
  0.013885692693293095,
  0.008924577385187149,
  -0.003827727632597089,
  -0.02193477936089039,
  -0.017089039087295532,
  -0.004961115308105946,
  -0.015270190313458443,
  0.014184310100972652,
  0.00481859315186739,
  0.0021038928534835577,
  0.005487088114023209,
  0.011523903347551823,


In [29]:
df['vector'] = embeddings

In [30]:
df.head()

Unnamed: 0,id,title,year,authors,categories,abstract,vector
0,705.4485,Mixed membership stochastic blockmodels,2014,"Edoardo M Airoldi, David M Blei, Stephen E Fie...","stat.ME,cs.LG,math.ST,physics.soc-ph,stat.ML,s...",Observations consisting of measurements on r...,"[-0.032847873866558075, -0.004703218583017588,..."
1,808.3231,Multi-Instance Multi-Label Learning,2012,"Zhi-Hua Zhou, Min-Ling Zhang, Sheng-Jun Huang,...","cs.LG,cs.AI","In this paper, we propose the MIML (Multi-In...","[-0.033958278596401215, 0.015115243382751942, ..."
2,811.4413,A Spectral Algorithm for Learning Hidden Marko...,2012,"Daniel Hsu, Sham M. Kakade, Tong Zhang","cs.LG,cs.AI",Hidden Markov Models (HMMs) are one of the m...,"[-0.018175596371293068, 0.007647213991731405, ..."
3,903.4817,An Exponential Lower Bound on the Complexity o...,2012,"Bernd G\""artner, Martin Jaggi and Cl\'ement Maria","cs.LG,cs.CG,cs.CV,math.OC,stat.ML",For a variety of regularized optimization pr...,"[-0.008872638456523418, -0.010365040972828865,..."
4,909.5175,Bounding the Sensitivity of Polynomial Thresho...,2013,"Prahladh Harsha, Adam Klivans, Raghu Meka","cs.CC,cs.LG",We give the first non-trivial upper bounds o...,"[0.007639717310667038, -0.00016936704923864454..."


In [32]:
# Export to file!
with open(f'arxiv_{provider}_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)