## This colab contains the code to pretrain an LM model on OGBN Products and dump the resultant embeddings

In [1]:
!pip install sentence-transformers
!pip install ogb
from sentence_transformers import SentenceTransformer, models
from ogb.nodeproppred import NodePropPredDataset
import pandas as pd
import numpy as np
import pickle
import math
from tqdm import tqdm

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transf

## Link the Colab to your Google Drive

We use Google Drive to load the datasets and dump the trained model.

The dataset is available at https://drive.google.com/drive/folders/10xPY3Bv6ugkJX7pAEHYPwXex234b1uWg?usp=sharing. Please create a copy of this folder in your google drive and update the DATA_ROOT to point to this

In [2]:
MODEL = "roberta-large"
DATASET = "ogbn-products"
OUTPUT_FILE = ""
MODE = 'all'
DATA_ROOT = "/content/drive/Shareddrives/CS224W Project/"

In [3]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


## Load Node-Paper Id Mappings

In [8]:
nodeidx2productid = pd.read_csv(DATA_ROOT+'ogbn-products_subset.csv')
nodeidx2productid.head()

Unnamed: 0,uid,nid,title,content
0,B00JXMX35K,1324660,Aimee Gowns Original Bra-less Nursing Gown (X-...,The one that started it all! The nightgown's m...
1,B000XQHYFU,1670226,Casio DR-270TM 2-Color Professional Desktop Pr...,Heavy duty printing
2,B00B67RO9Q,1780321,,
3,B005JN9CKM,2447868,Muay Thai Shorts-Black,"for Size Chart, please see attached photos. Im..."
4,B00CU9YBVS,1940019,Smatree&reg; Chest Belt/Strap Harness Mount+ A...,1. Smatree chest strap is fully adjustable to ...


## Load Paper Mappings

In [5]:
titleabs = pd.read_csv(DATA_ROOT+'titleabs.tsv', sep='\t')
titleabs.head()

Unnamed: 0,paperid,title,abstract
0,200971.0,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074.0,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234.0,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423.0,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481.0,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...


In [6]:
titleabs.keys()

Index(['paperid', 'title', 'abstract'], dtype='object')

# Make reverse index for text df


In [9]:
reverse_index= {}

productids = nodeidx2productid["nid"].tolist()
for idx, productid in enumerate(productids):
    if (not math.isnan(productid)):
        reverse_index[int(productid)] = idx

## Dataset Creation


In [10]:

mapped_text = []

for idx, product_id in tqdm(enumerate(productids)):
    title = nodeidx2productid.iloc[idx]['title']
    content = nodeidx2productid.iloc[idx]['content']
    if (MODE == 'title'):
        mapped_text.append("Title: " + str(title))
    elif (MODE == 'abstract'):
        mapped_text.append(" Abstract: " + str(content))
    else:
        mapped_text.append("Title: " + str(title) + " Abstract: " + str(content))

54025it [00:12, 4188.46it/s]


## Initialize sentence embedding model


In [None]:

word_embedding_model = models.Transformer(MODEL)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode = 'mean')
emb_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

## Get NumPy embeddings


In [None]:

embeddings = emb_model.encode(sentences = mapped_text,
                             batch_size = 32,
                             show_progress_bar = True,
                             convert_to_numpy = True,
                             normalize_embeddings = True)

# Dump Embeddings


In [None]:

with open(OUTPUT_FILE, 'wb') as f:
    pickle.dump(embeddings, f)