# This notebook contains the code to train an LM on OGBN-Products


In [1]:
# Install the Python packages this notebook relies on into the current runtime.
!pip install transformers
!pip install ogb
!pip install datasets
!pip install evaluate
!pip install torch_geometric
!pip install accelerate -U
# torch-scatter / torch-sparse are installed from the PyG wheel index page whose
# name embeds the installed torch version string, so torch is imported here only
# to read that version.
import torch
torch_version = str(torch.__version__)
# Both packages are looked up on the same index URL.
scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
!pip install torch-scatter -f $scatter_src
!pip install torch-sparse -f $sparse_src

Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html


## Restart the kernel at this step (after installation)

In [2]:
import math

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from ogb.nodeproppred import NodePropPredDataset
from ogb.nodeproppred import Evaluator
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoConfig,
    DataCollatorWithPadding,
    get_scheduler,
    get_linear_schedule_with_warmup
)

## Link the Colab to your Google Drive

We use Google Drive to load the datasets and dump the trained model.

The dataset is available at https://drive.google.com/drive/folders/10xPY3Bv6ugkJX7pAEHYPwXex234b1uWg?usp=sharing. Please create a copy of this folder in your Google Drive and update `DATA_ROOT` to point to it.

In [3]:
# Mount Google Drive at /content/drive/ so the dataset files and training
# outputs stored on Drive are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive/")
import pickle
import matplotlib.pyplot as plt

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Folder holding ogbn-products_subset.pt / ogbn-products_subset.csv.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
DATA_ROOT='drive/Shareddrives/CS224W Project/'
# Folder passed to the Trainer as output_dir (checkpoints and logs).
OUTPUT_ROOT = 'drive/Shareddrives/CS224W Project/'
# Hugging Face model identifier used for both the tokenizer and the classifier.
MODEL = "bert-base-uncased"
# Which text fields feed the model: 'title', 'abstract', or any other value
# (e.g. 'all') for title and content together.
MODE = 'all'
# Per-device train/eval batch size; also reused as the tokenization map batch
# size and the logits-dump DataLoader batch size.
BATCH_SIZE = 16
NUM_EPOCHS = 2
# Learning rate handed to TrainingArguments.
LR = 5e-5
# Number of warmup steps handed to TrainingArguments.
WARMUP = 100
# Training metrics are logged every LOG_STEPS steps.
LOG_STEPS = 100
# Every text is padded/truncated to exactly MAX_LEN tokens.
MAX_LEN = 512

In [5]:
def load_data():
    """Load the OGBN-Products subset graph and its per-product text.

    Reads ``ogbn-products_subset.pt`` and ``ogbn-products_subset.csv`` from
    ``DATA_ROOT``, formats one text string per CSV row from its ``title`` and
    ``content`` columns, and stores the symmetrized ``adj_t`` on the graph
    object as ``edge_index``.

    Returns:
        tuple: ``(data, num_classes, text)`` where ``data`` is the loaded
        graph object, ``num_classes`` is the fixed class count (47) and
        ``text`` is the list of formatted product strings.
    """
    graph = torch.load(DATA_ROOT+'ogbn-products_subset.pt')
    frame = pd.read_csv(DATA_ROOT+'ogbn-products_subset.csv')

    # One "Product:...; Description: ..." line per CSV row, in row order.
    formatted = []
    for ti, cont in zip(frame['title'], frame['content']):
        formatted.append(f'Product:{ti}; Description: {cont}\n')

    graph.edge_index = graph.adj_t.to_symmetric()

    class_count = 47
    return graph, class_count, formatted

In [6]:
# Load the graph object, the class count and the formatted product texts.
data, num_classes, text = load_data()

In [7]:
# Short aliases for the split selectors stored on the graph object.
# NOTE(review): despite the `_idx` names these are the `*_mask` attributes —
# presumably boolean masks over nodes rather than index lists; confirm.
train_idx = data.train_mask
val_idx = data.val_mask
test_idx = data.test_mask

In [8]:
# Table with one row per product (uid, nid, title, content); previewed below.
nodeidx2productid = pd.read_csv(DATA_ROOT+'ogbn-products_subset.csv')
nodeidx2productid.head()

Unnamed: 0,uid,nid,title,content
0,B00JXMX35K,1324660,Aimee Gowns Original Bra-less Nursing Gown (X-...,The one that started it all! The nightgown's m...
1,B000XQHYFU,1670226,Casio DR-270TM 2-Color Professional Desktop Pr...,Heavy duty printing
2,B00B67RO9Q,1780321,,
3,B005JN9CKM,2447868,Muay Thai Shorts-Black,"for Size Chart, please see attached photos. Im..."
4,B00CU9YBVS,1940019,Smatree&reg; Chest Belt/Strap Harness Mount+ A...,1. Smatree chest strap is fully adjustable to ...


In [9]:
# Make reverse index for text df: integer `nid` -> row position in the CSV.

productids = nodeidx2productid["nid"].tolist()

# Rows whose `nid` is NaN are left out; a repeated `nid` keeps its last row.
reverse_index = {
    int(productid): idx
    for idx, productid in enumerate(productids)
    if not math.isnan(productid)
}

In [10]:
# Sanity check: show the label stored for the node at position 1.
data.y[1][0]

tensor(18)

In [11]:
# Dataset Creation
#
# Build one input string and one integer label per CSV row. Row `idx` of the
# CSV is paired with `data.y[idx]`.

def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing entries to ``''``.

    A cell whose string form is ``'nan'`` or ``'None'`` is treated as missing.
    """
    as_text = str(value)
    # Membership test on purpose: `as_text == 'nan' or 'None'` is always truthy
    # (a non-empty string literal) and would blank out every single text.
    if as_text in ('nan', 'None'):
        return ''
    return as_text


dataset_dict = {'text': [], 'labels': []}

# Pull the columns out once instead of doing a `.iloc` row lookup per product.
titles = nodeidx2productid['title'].tolist()
contents = nodeidx2productid['content'].tolist()

for idx, (raw_title, raw_content) in tqdm(enumerate(zip(titles, contents)), total=len(titles)):
    # Store a plain Python int rather than a 0-d tensor.
    dataset_dict['labels'].append(int(data.y[idx][0]))
    title = _text_or_empty(raw_title)
    content = _text_or_empty(raw_content)
    if (MODE == 'title'):
        dataset_dict['text'].append("Title: " + title)
    elif (MODE == 'abstract'):
        dataset_dict['text'].append(" Abstract: " + content)
    else:
        dataset_dict['text'].append("Title: " + title + " Abstract: " + content)

# NumPy arrays so the split masks can index text and labels directly.
dataset_dict['text'] = np.array(dataset_dict['text'])
dataset_dict['labels'] = np.array(dataset_dict['labels'])

54025it [00:24, 2240.86it/s]


In [12]:
def _split_by_mask(mask):
    """Build a `Dataset` holding the text/label rows selected by `mask`."""
    return Dataset.from_dict({
        'text': dataset_dict['text'][mask],
        'labels': dataset_dict['labels'][mask],
    })


# One Dataset per split, selected from the full arrays by the split masks.
train_dataset = _split_by_mask(train_idx)
valid_dataset = _split_by_mask(val_idx)
test_dataset = _split_by_mask(test_idx)

fin_dataset = DatasetDict(
    train=train_dataset,
    valid=valid_dataset,
    test=test_dataset,
)


# Init tokenizer




In [13]:

# Tokenizer matching the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenizer_helper(batch):
    """Tokenize the ``"text"`` column of a batch.

    Every sequence is truncated and padded to exactly ``MAX_LEN`` tokens.

    Args:
        batch: mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer's output for the batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=MAX_LEN)
    return tokenizer(batch["text"], **encode_options)

# Tokenize dataset


In [14]:

# Tokenize every split and expose only the model inputs and labels as tensors.
TENSOR_COLUMNS = ["input_ids", "attention_mask", "labels"]

for split in list(fin_dataset):
    dataset = fin_dataset[split].map(
        tokenizer_helper,
        batched=True,
        batch_size=BATCH_SIZE,
    )
    dataset.set_format("torch", columns=TENSOR_COLUMNS)
    fin_dataset[split] = dataset

Map:   0%|          | 0/14708 [00:00<?, ? examples/s]

Map:   0%|          | 0/1572 [00:00<?, ? examples/s]

Map:   0%|          | 0/37745 [00:00<?, ? examples/s]

# Compute Metrics for HF


In [15]:

# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: a ``(logits, labels)`` pair.

    Returns:
        The result of the accuracy metric, comparing the arg-max class of
        each row of ``logits`` against ``labels``.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

# Init model


In [16]:

# Load the pretrained checkpoint named by MODEL with a sequence-classification
# head sized for `num_classes` labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# HF Init and Training


In [18]:

# Evaluate and checkpoint once per epoch, keep a single checkpoint on disk and
# reload the best one when training finishes.
training_args = TrainingArguments(
    output_dir=OUTPUT_ROOT,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_dir=f"{OUTPUT_ROOT}/logs",
    logging_steps=LOG_STEPS,
    learning_rate=LR,
    warmup_steps=WARMUP,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = fin_dataset['train'],
    # Checkpoint selection (load_best_model_at_end) must use the validation
    # split; evaluating on 'test' here would leak test data into model choice.
    eval_dataset = fin_dataset['valid'],
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-5a2143417196>", line 26, in <cell line: 26>
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1865, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occu

TypeError: ignored

In [None]:
# Write the tokenizer files into a checkpoint folder under OUTPUT_ROOT.
# NOTE(review): the step number 2842 is hard-coded — confirm it matches the
# checkpoint directory the Trainer actually wrote for this run.
tokenizer.save_pretrained(OUTPUT_ROOT + "checkpoint-2842/")

# Dump Logits


In [None]:
import pickle

# NOTE(review): this path is relative to the working directory rather than
# OUTPUT_ROOT (the Trainer's output_dir), and it says "roberta" while MODEL is
# "bert-base-uncased" — confirm it points at the checkpoint to dump.
OUT_MODEL = 'roberta_all/checkpoint-2842'
# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(OUT_MODEL, num_labels=num_classes)

# Tokenize ALL products (every split) in their original row order.
dump_dataset = Dataset.from_dict(dataset_dict)
dump_dataset = dump_dataset.map(tokenizer_helper, batched=True, batch_size=BATCH_SIZE)
dump_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# shuffle=False keeps the dumped logits aligned with the dataset row order.
dataloader = DataLoader(dump_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
import os

from tqdm import tqdm

# Destination of the pickled logits matrix (one row of logits per product).
logits_path = "finetuned/roberta_logits_products_arxiv.pkl"

total_logits = []
model.to(device)
# Inference only: no gradients are needed for any batch.
with torch.no_grad():
    for batch in tqdm(dataloader):
        batch_gpu = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch_gpu)
        # Move each batch back to host memory and collect its rows in order.
        total_logits.extend(outputs.logits.cpu().numpy())
total_logits = np.array(total_logits)

# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(logits_path), exist_ok=True)
with open(logits_path, 'wb') as f:
    pickle.dump(total_logits, f)