## This Colab contains the code to dump the embeddings of the OGBN-Arxiv paper text produced by a pretrained language model (LM)

In [3]:
!pip install sentence-transformers
!pip install ogb
from sentence_transformers import SentenceTransformer, models
from ogb.nodeproppred import NodePropPredDataset
import pandas as pd
import numpy as np
import pickle
import math
from tqdm import tqdm

Collecting ogb
  Downloading ogb-1.3.6-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m738.9 kB/s[0m eta [36m0:00:00[0m
Collecting outdated>=0.2.0 (from ogb)
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting littleutils (from outdated>=0.2.0->ogb)
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25l[?25hdone
  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7026 sha256=53c594117c03e3c65e2c04400f614f6e5828078c0c0123fdc70c47295d78ec56
  Stored in directory: /root/.cache/pip/wheels/3d/fe/b0/27a9892da57472e538c7452a721a9cf463cc03cf7379889266
Successfully built littleutils
Installing collected packages: littleutils, outdated, ogb
Successfully installed littleutils-0.2.2 ogb-1.3.6 outdated-0.2.2


## Link the Colab to your Google Drive

We use Google Drive to load the datasets and to dump the resulting embeddings.

The dataset is available at https://drive.google.com/drive/folders/10xPY3Bv6ugkJX7pAEHYPwXex234b1uWg?usp=sharing. Please create a copy of this folder in your Google Drive and update `DATA_ROOT` to point to your copy.

In [6]:
# Colab-only step: mount Google Drive under /content/drive/ so that the files
# referenced through DATA_ROOT can be read from the notebook.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [29]:
# Name of the transformer checkpoint handed to `models.Transformer`.
MODEL = "roberta-large"
# Dataset label; used below to name the output file.
DATASET = "ogbn-arxiv"
# Which text is embedded for each paper:
#   'title'    -> title only
#   'abstract' -> abstract only
#   any other value (e.g. 'all') -> title followed by abstract
MODE = 'all'
# Folder holding nodeidx2paperid.csv and titleabs.tsv (must end with a slash
# because file names are appended to it directly).
DATA_ROOT = "/content/drive/Shareddrives/CS224W Project/"
# Destination of the pickled embeddings. An empty path would make the final
# dump cell fail in open(), so derive a descriptive default from the settings
# above; override it here if the embeddings should go somewhere else.
OUTPUT_FILE = f"{DATA_ROOT}{DATASET}_{MODEL}_{MODE}_embeddings.pkl"

## Load Node-Paper Id Mappings

In [30]:
# Read the table that pairs every graph node index with its paper id,
# then preview the first rows.
nodeidx2paperid = pd.read_csv(f"{DATA_ROOT}nodeidx2paperid.csv")
nodeidx2paperid.head()

Unnamed: 0,node idx,paper id
0,0,9657784
1,1,39886162
2,2,116214155
3,3,121432379
4,4,231147053


## Load Paper Mappings

In [31]:
# Read the tab-separated table of paper ids with their titles and abstracts,
# then preview the first rows.
titleabs = pd.read_csv(f"{DATA_ROOT}titleabs.tsv", sep="\t")
titleabs.head()

Unnamed: 0,paperid,title,abstract
0,200971.0,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074.0,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234.0,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423.0,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481.0,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...


In [18]:
# Show the column labels of the text table (same Index that DataFrame.keys() returns).
titleabs.columns

Index(['paperid', 'title', 'abstract'], dtype='object')

# Build a reverse index (paper id → row position) for the text DataFrame


In [21]:

# Map every integer paper id to the positional row of `titleabs` that holds it.
# Rows whose paper id is NaN are skipped; if an id occurs more than once the
# last occurrence wins.
paperids = titleabs["paperid"].tolist()
reverse_index = {
    int(pid): row
    for row, pid in enumerate(paperids)
    if not math.isnan(pid)
}

## Dataset Creation


In [26]:

# Build the list of texts to embed, one entry per graph node, in node-index order.
#
# Each node's paper id is translated through `reverse_index` into the row of
# `titleabs` that actually holds that paper. The row must be looked up by that
# translated position: the two tables are not in the same order, so indexing
# `titleabs` with the node index would attach the wrong title/abstract to the node.

# Pull the columns out once; per-row `.iloc` lookups on a DataFrame are slow.
node_paper_ids = nodeidx2paperid['paper id'].tolist()
all_titles = titleabs['title'].tolist()
all_abstracts = titleabs['abstract'].tolist()

mapped_text = []

for paper_id in tqdm(node_paper_ids):
    # Raises KeyError if a node's paper id is missing from the text table.
    reference_idx = reverse_index[paper_id]
    title = all_titles[reference_idx]
    abstract = all_abstracts[reference_idx]
    if (MODE == 'title'):
        mapped_text.append("Title: " + title)
    elif (MODE == 'abstract'):
        mapped_text.append(" Abstract: " + abstract)
    else:
        mapped_text.append("Title: " + title + " Abstract: " + abstract)

100%|██████████| 169343/169343 [00:45<00:00, 3743.66it/s]


## Initialize sentence embedding model


In [27]:

# Transformer module loaded from the MODEL checkpoint.
word_embedding_model = models.Transformer(MODEL)

# Pooling module configured with the transformer's word-embedding dimension
# and 'mean' pooling.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='mean',
)

# Chain the two modules into a single sentence-embedding model.
emb_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Get NumPy embeddings


In [28]:

# Encode every entry of `mapped_text` in batches of 32, asking for a NumPy
# result with normalized embeddings and a progress bar while it runs.
embeddings = emb_model.encode(
    sentences=mapped_text,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

Batches:   0%|          | 0/5292 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

# Dump Embeddings


In [None]:

# Persist the embeddings to OUTPUT_FILE with pickle.
# Fail with an explicit message when OUTPUT_FILE is empty: open("") raises a
# FileNotFoundError that gives no hint that the setting was never filled in.
if not OUTPUT_FILE:
    raise ValueError(
        "OUTPUT_FILE is empty; set it in the configuration cell to the path "
        "where the embeddings should be saved."
    )

with open(OUTPUT_FILE, 'wb') as f:
    pickle.dump(embeddings, f)