In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%pip install -q -r requirements.txt

You should consider upgrading via the '/home/jovyan/workspace/untitled1-vector-search/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import json
import pandas as pd
import os
import re
import string

from vecsim_app.embeddings import Embeddings
from vecsim_app.data_utils import papers


DATA_PATH = "../arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"

In [5]:
df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))
len(df)

408773

In [9]:
# Avg length of the abstracts
# df.abstract.apply(lambda a: len(a.split())).mean()

169.84534547683685

In [6]:
df.head(3)

Unnamed: 0,id,title,year,authors,categories,abstract
0,704.0304,The World as Evolving Information,2012,Carlos Gershenson,"cs.IT,cs.AI,math.IT,q-bio.PE",This paper discusses the benefits of describ...
1,704.2744,Nahm transform and parabolic minimal Laplace t...,2012,Szilard Szabo,math.AG,We prove that Nahm transform for integrable ...
2,704.2768,Heat Equations and the Weighted $\bar\partial$...,2012,Andrew Raich,"math.AP,math.CV",The purpose of this article is to establish ...


In [7]:
df['authors_clean'] = df['authors'].apply(lambda a: ' '.join(re.findall(r'\w\w+', a)).strip())
df['authors_clean'][:3]

0    Carlos Gershenson
1        Szilard Szabo
2         Andrew Raich
Name: authors_clean, dtype: object

In [8]:
df['text'] = df.apply(lambda r: Embeddings.clean_description(r['title'] + ' ' + r['abstract']), axis=1)
df['text'][:1]

0     the world as evolving information this paper ...
Name: text, dtype: object

In [9]:
from sentence_transformers import SentenceTransformer
#see models here: https://www.sbert.net/docs/pretrained_models.html
model_name = 'sentence-transformers/all-distilroberta-v1'

model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
# Create embeddings from the title and abstract
emb = model.encode(df['text'].tolist())

In [11]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = emb.tolist()

In [13]:
df.shape

(408773, 9)

In [14]:
import pickle

# Export to file!
with open(f'arxiv_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

In [17]:
!ls -lh .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 106M
-rw-rw-r-- 1 jovyan jovyan 106M Nov  6 05:23 arxiv_embeddings_10000.pkl
-rw-rw-r-- 1 jovyan jovyan  16K Nov  6 05:23 arxiv-embeddings.ipynb
-rw-rw-r-- 1 jovyan jovyan 1.9K Nov  5 23:59 requirements.txt


In [22]:
embeddings = Embeddings()

e1 = embeddings.make("text1")
e2 = embeddings.make("text2")

In [23]:
e1[:3]

array([ 0.0134814 , -0.02945524, -0.0014616 ], dtype=float32)

In [24]:
e2[:3]

array([ 0.0119531 , -0.05998396, -0.0344477 ], dtype=float32)

In [25]:
((e1+e2)/2)[:3]

array([ 0.01271725, -0.0447196 , -0.01795465], dtype=float32)

In [30]:
(0.0134814+0.01271725)/2

0.013099324999999998

In [29]:
(0.02945524+0.05998396)/2

0.0447196