# This code contains the code to train an LM on OGBN-Arxiv

---



In [1]:
!pip install ogb
!pip install datasets
!pip install transformers
!pip install evaluate
from ogb.nodeproppred import NodePropPredDataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
from ogb.nodeproppred import Evaluator
import pandas as pd
import math
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import numpy as np
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoConfig,
    DataCollatorWithPadding,
    get_scheduler,
    get_linear_schedule_with_warmup
)
import evaluate



## Link the Colab to your Google Drive

We use Google Drive to load the datasets and dump the trained model.

The dataset is available at https://drive.google.com/drive/folders/10xPY3Bv6ugkJX7pAEHYPwXex234b1uWg?usp=sharing. Please create a copy of this folder in your google drive and update the DATA_ROOT to point to this

In [2]:
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
MODEL = "roberta-large"
DATASET = "ogbn-arxiv"
OUTPUT_FILE = ""
MODE = 'all'
DATA_ROOT = "/content/drive/Shareddrives/CS224W Project/"

# Load Dataset



In [4]:

dataset = NodePropPredDataset(name = DATASET)

# Get Splits


In [5]:

split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

# Get Labels for Node Classification


In [6]:

graph, label = dataset[0]
len(label)

169343

# Load Text Label Mappings


In [7]:

labelidx2arxivcategeory = pd.read_csv(DATA_ROOT+'labelidx2arxivcategeory.csv')
print(len(labelidx2arxivcategeory))
labelidx2arxivcategeory.head()

40


Unnamed: 0,label idx,arxiv category
0,0,arxiv cs na
1,1,arxiv cs mm
2,2,arxiv cs lo
3,3,arxiv cs cy
4,4,arxiv cs cr


# Load Node-Paper Id Mappings


In [8]:

nodeidx2paperid = pd.read_csv(DATA_ROOT+'nodeidx2paperid.csv')
nodeidx2paperid.head()

Unnamed: 0,node idx,paper id
0,0,9657784
1,1,39886162
2,2,116214155
3,3,121432379
4,4,231147053


# Load Paper Mappings


In [9]:

titleabs = pd.read_csv(DATA_ROOT+'titleabs.tsv', sep='\t')
titleabs.head()

Unnamed: 0,paperid,title,abstract
0,200971.0,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074.0,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234.0,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423.0,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481.0,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...



# Make reverse index for text df




In [10]:

reverse_index= {}

paperids = titleabs["paperid"].tolist()
for idx, paperid in enumerate(paperids):
    if (not math.isnan(paperid)):
        reverse_index[int(paperid)] = idx

# Dataset Creation


In [11]:

dataset_dict = {'text': [], 'labels': []}

for idx, l in tqdm(enumerate(label)):
    dataset_dict['labels'].append(l[0])
    paper_id = nodeidx2paperid.iloc[idx]['paper id']
    reference_idx = reverse_index[paper_id]
    title = titleabs.iloc[reference_idx]['title']
    abstract = titleabs.iloc[reference_idx]['abstract']
    if (MODE == 'title'):
        dataset_dict['text'].append("Title: " + title)
    elif (MODE == 'abstract'):
        dataset_dict['text'].append(" Abstract: " + abstract)
    else:
        dataset_dict['text'].append("Title: " + title + " Abstract: " + abstract)

dataset_dict['text'] = np.array(dataset_dict['text'])
dataset_dict['labels'] = np.array(dataset_dict['labels'])

169343it [01:17, 2172.94it/s]


In [None]:
train_dataset, valid_dataset, test_dataset = {}, {}, {}

train_dataset['text'] = dataset_dict['text'][train_idx]
train_dataset['labels'] = dataset_dict['labels'][train_idx]
train_dataset = Dataset.from_dict(train_dataset)

valid_dataset['text'] = dataset_dict['text'][valid_idx]
valid_dataset['labels'] = dataset_dict['labels'][valid_idx]
valid_dataset = Dataset.from_dict(valid_dataset)

test_dataset['text'] = dataset_dict['text'][test_idx]
test_dataset['labels'] = dataset_dict['labels'][test_idx]
test_dataset = Dataset.from_dict(test_dataset)

fin_dataset = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

# Init tokenizer


In [None]:

tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenizer_helper(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

# Tokenize dataset


In [None]:

for split in fin_dataset:
    dataset = fin_dataset[split]
    dataset = dataset.map(tokenizer_helper, batched=True, batch_size=BATCH_SIZE)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    fin_dataset[split] = dataset

# Compute Metrics for HF


In [None]:

metric = evaluate.load("accuracy")

def compute_metrics(pred):
    logits, labels = pred
    return metric.compute(predictions=np.argmax(logits, axis=-1), references=labels)

# Init model


In [None]:

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=len(labelidx2arxivcategeory))

## Define hyperparameters

In [None]:
OUTPUT_ROOT=''
NUM_EPOCHS=100
BATCH_SIZE=128
LR=1e-3
WARMUP=5
LOG_STEPS=10
MAX_LEN=512

# HF Init and Training


In [None]:

training_args = TrainingArguments(
    output_dir=OUTPUT_ROOT,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_dir=f"{OUTPUT_ROOT}/logs",
    logging_steps=LOG_STEPS,
    learning_rate=LR,
    warmup_steps=WARMUP,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = fin_dataset['train'],
    eval_dataset = fin_dataset['test'],
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
tokenizer.save_pretrained(OUTPUT_ROOT + "checkpoint-2842/")

# Dump Logits


In [None]:
import pickle

OUT_MODEL = 'roberta_all/checkpoint-2842'
device = torch.device("cuda")
model = AutoModelForSequenceClassification.from_pretrained(OUT_MODEL, num_labels=len(labelidx2arxivcategeory))
dump_dataset = Dataset.from_dict(dataset_dict)
dump_dataset = dump_dataset.map(tokenizer_helper, batched=True, batch_size=BATCH_SIZE)
dump_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

dataloader = DataLoader(dump_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
from tqdm import tqdm

total_logits = []
model.to(device)
for batch in tqdm(dataloader):
    batch_gpu = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch_gpu)
        logits = list(outputs.logits.cpu().numpy())
        total_logits += logits
total_logits = np.array(total_logits)

with open("finetuned/roberta_logits_arxiv.pkl", 'wb') as f:
    pickle.dump(total_logits, f)