In [1]:
import json
import pandas as pd
import os
import re
import string
import asyncio
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
DATA_PATH='/home/jovyan/arxiv/arxiv-metadata-oai-snapshot.json'
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [3]:
!ls -lah {DATA_PATH}

-rw-rw-r-- 1 jovyan jovyan 3.4G Nov  6 00:15 /home/jovyan/arxiv/arxiv-metadata-oai-snapshot.json


In [4]:
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract']
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            if paper['year']:
                if paper['year'] >= YEAR_CUTOFF and ML_CATEGORY in paper['categories']:
                    yield paper

In [7]:
df = pd.DataFrame(papers())
len(df)

11419

In [8]:
# sample only fraction of data
df = df.sample(frac=0.1)
len(df)

1142

In [9]:
# Avg length of the abstracts
df.abstract.apply(lambda a: len(a.split())).mean()

168.27408056042032

In [10]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

In [11]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-c7932e9d-60b8-c107-e5aa-78f001a919ed)


In [10]:
# !pip install transformers

In [5]:
from model import ColBERTModel

In [6]:
model = ColBERTModel()

using device: cuda:0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# # Create embeddings from the title and abstract
emb = []
for text in tqdm(df.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist()):
    emb.append(model.compute_document_representation(text).tolist())
    


  0%|          | 0/1142 [00:00<?, ?it/s]

In [13]:
len(emb)

1142

In [14]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector_matrix'] = emb

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1142 entries, 0 to 1141
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1142 non-null   object
 1   title          1142 non-null   object
 2   year           1142 non-null   int64 
 3   authors        1142 non-null   object
 4   categories     1142 non-null   object
 5   abstract       1142 non-null   object
 6   vector_matrix  1142 non-null   object
dtypes: int64(1), object(6)
memory usage: 62.6+ KB


In [16]:
df.memory_usage(index=True, deep=True).sum()/1024/1024

3.728963851928711

In [17]:
import pickle

# Export to file!
with open('arxiv_colbert_embeddings_1000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

In [20]:
ls -lah 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 1.7G
drwxrwsr-x  4 jovyan jovyan 4.0K Nov  7 16:18 [0m[01;34m.[0m/
drwxrwsr-x 10 jovyan jovyan 4.0K Nov  7 14:24 [01;34m..[0m/
-rw-r--r--  1 jovyan jovyan 1.7G Nov  7 16:18 arxiv_colbert_embeddings_1000.pkl
-rw-r--r--  1 jovyan jovyan  91M Nov  7 14:32 arxiv_embeddings_10000.pkl
-rw-r--r--  1 jovyan jovyan  19K Nov  7 16:18 arxiv-embeddings-colbert.ipynb
-rw-rw-r--  1 jovyan jovyan  16K Nov  7 14:33 arxiv-embeddings.ipynb
-rw-r--r--  1 jovyan jovyan 2.8K Nov  7 14:50 colbert_model.py
drwxrwsr-x  2 jovyan jovyan 4.0K Nov  7 14:51 [01;34m.ipynb_checkpoints[0m/
-rw-r--r--  1 jovyan jovyan 4.9K Nov  7 16:15 model.py
drwxr-sr-x  2 jovyan jovyan 4.0K Nov  7 16:16 [01;34m__pycache__[0m/
