In [1]:
import torch
import pickle
from tqdm import tqdm
import json
from transformers import AutoModel, AutoTokenizer
import argparse
import os
import numpy as np

def check_path(path):
    """Make sure the directory that will contain ``path`` exists.

    Only the directory component of ``path`` (``os.path.dirname(path)``) is
    created; ``path`` itself is never created. A ``path`` ending in a
    separator therefore has that directory created.

    Args:
        path: File path whose parent directory should exist. A bare file
            name (no directory component) is accepted and creates nothing.
    """
    d = os.path.dirname(path)
    # dirname is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so there is nothing to create in that case.
    if d:
        # exist_ok=True replaces the separate exists() check, so a directory
        # that already exists (or appears concurrently) is not an error.
        os.makedirs(d, exist_ok=True)

# Relation labels in their unspaced spelling. The position of a label in this
# list is used as the relation id, and spaced_merged_relations holds the
# human-readable spelling at the same index, so the order must not change.
merged_relations = [
    'antonym', 'atlocation', 'capableof', 'causes', 'createdby', 'isa',
    'desires', 'hassubevent', 'partof', 'hascontext', 'hasproperty', 'madeof',
    'notcapableof', 'notdesires', 'receivesaction', 'relatedto', 'usedfor',
]

# Natural-language spelling of each relation, index-aligned with
# merged_relations; these strings are what gets tokenized for the relation
# part of a triple.
spaced_merged_relations = [
    'antonym', 'at location', 'capable of', 'causes', 'created by', 'is a',
    'desires', 'has sub-event', 'part of', 'has context', 'has property',
    'made of', 'not capable of', 'not desires', 'receives action',
    'related to', 'used for',
]

In [None]:
# Load the pretrained encoder and its tokenizer used to embed ConceptNet
# triples. The Auto* classes pick the concrete architecture from `model_name`.
model_class = AutoModel
tokenizer_class = AutoTokenizer
model_name = 'bert-large-uncased'
# model_name = 'roberta-large'
tokenizer = tokenizer_class.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which the feature-extraction cell indexes by layer.
model = model_class.from_pretrained(model_name, output_hidden_states=True)
model.eval()  # inference only
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The feature-extraction cell moves every input batch to `device`; the model
# has to live on the same device or the forward pass fails with a device
# mismatch whenever CUDA is available.
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the encoder inputs for every unique ConceptNet triple and cache them.
#
# Each CSV row (tab-separated: relation, subject, object, ...) becomes one
# sequence "[CLS] subject relation object [SEP]", padded to max_seq_length,
# together with the inclusive token spans of its three parts.
prune = False
cpnet_csv_path = '../data/cpnet/conceptnet.en.csv'
cpnet_vocab_path = '../data/cpnet/concept.txt'
blacklist = set(["uk", "us", "take", "make", "object", "person", "people"])  # issue: mismatch with the blacklist in grounding.py

# Concept vocabulary: one concept per line, the line index is the concept id.
with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
    id2concept = [w.strip() for w in fin]
concept2id = {w: i for i, w in enumerate(id2concept)}

id2relation = merged_relations
relation2id = {r: i for i, r in enumerate(id2relation)}

max_seq_length = 128
# Pad with the tokenizer's own padding id instead of a hard-coded 0: the two
# coincide for BERT vocabularies but not for every tokenizer (e.g. the
# 'roberta-large' alternative). Fall back to 0 if the tokenizer defines none.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

# BERT inputs
all_input_ids, all_input_mask, all_segment_ids, all_sub_span, all_rel_span, all_obj_span = [], [], [], [], [], []

# Count the rows once so tqdm can show a total; the with-block closes the
# handle instead of leaking it.
with open(cpnet_csv_path, 'r', encoding='utf-8') as fin:
    nrow = sum(1 for _ in fin)
# keep track of input entity ids
entity_ids = []  # stores [sub, rel, obj] for each triple


def not_save(cpt):
    """Return True if concept string ``cpt`` is in the blacklist."""
    if cpt in blacklist:
        return True
    # originally phrases like "branch out" would not be kept in the graph
    # for t in cpt.split("_"):
    #     if t in nltk_stopwords:
    #         return True
    return False


attrs = set()  # (subj, obj, rel) id triples already emitted; used to drop duplicates

with open(cpnet_csv_path, "r", encoding="utf8") as fin:
    for line in tqdm(fin, total=nrow):
        ls = line.strip().split('\t')
        rel = relation2id[ls[0]]
        subj = concept2id[ls[1]]
        obj = concept2id[ls[2]]

        if prune and (not_save(ls[1]) or not_save(ls[2]) or id2relation[rel] == "hascontext"):
            continue

        if subj == obj:  # delete loops
            continue

        if (subj, obj, rel) in attrs:  # duplicate triple
            continue
        attrs.add((subj, obj, rel))
        entity_ids.append([subj, rel, obj])

        # tokenize inputs and format input data for BERT
        sub_tokens = tokenizer.tokenize(ls[1].replace('_', ' '))
        rel_tokens = tokenizer.tokenize(spaced_merged_relations[rel])
        obj_tokens = tokenizer.tokenize(ls[2].replace('_', ' '))
        triple_tokens = [tokenizer.cls_token] + sub_tokens + rel_tokens + obj_tokens + [tokenizer.sep_token]
        input_ids = tokenizer.convert_tokens_to_ids(triple_tokens)

        assert len(input_ids) <= max_seq_length, 'triple is longer than max_seq_length'
        pad_len = max_seq_length - len(input_ids)
        input_mask = [1] * len(input_ids) + [0] * pad_len
        input_ids += [pad_token_id] * pad_len
        segment_ids = [0] * max_seq_length  # all just one sentence (no sentence pair)

        # Inclusive [first, last] token positions of sub, rel and obj;
        # position 0 is the CLS token, so the subject starts at 1.
        sub_span = [1, len(sub_tokens)]
        rel_span = [sub_span[-1] + 1, sub_span[-1] + len(rel_tokens)]
        obj_span = [rel_span[-1] + 1, rel_span[-1] + len(obj_tokens)]

        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
        all_sub_span.append(sub_span)
        all_rel_span.append(rel_span)
        all_obj_span.append(obj_span)

entity_ids = np.array(entity_ids)

# Cache everything so feature extraction can run without re-tokenizing.
cache_path = '../data/cpnet/encoder_inputs/'+model_name+'.pkl'
check_path(cache_path)

with open(cache_path, 'wb') as fout:
    pickle.dump((all_input_ids, all_input_mask, all_segment_ids, all_sub_span, all_rel_span, all_obj_span, entity_ids), fout)
print('Inputs dumped')

# return BERT encodings of subj, obj, rel using tokens
# add embeddings to each entity

100%|██████████| 2487810/2487810 [03:06<00:00, 13364.25it/s]


Inputs dumped


In [None]:
# Reload the tokenized triples written by the preprocessing cell so feature
# extraction can start from the cache. The tuple order below mirrors the order
# used when the cache was dumped.
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# produced by this notebook.
cache_path = '../data/cpnet/encoder_inputs/' + model_name + '.pkl'
with open(cache_path, 'rb') as fin:
    (
        all_input_ids,
        all_input_mask,
        all_segment_ids,
        all_sub_span,
        all_rel_span,
        all_obj_span,
        entity_ids,
    ) = pickle.load(fin)

In [None]:
# Encode every cached triple, mean-pool the hidden states over the subject,
# relation and object token spans, and average those pooled vectors per
# concept / per relation over all triples they occur in.
cpnet_csv_path = '../data/cpnet/conceptnet.en.csv'
cpnet_vocab_path = '../data/cpnet/concept.txt'

# Concept vocabulary: one concept per line, the line index is the concept id.
with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
    id2concept = [w.strip() for w in fin]
concept2id = {w: i for i, w in enumerate(id2concept)}

id2relation = merged_relations
relation2id = {r: i for i, r in enumerate(id2relation)}

# as_tensor converts the cached Python lists and is a no-op on tensors, so
# this cell can be re-run without reloading the cache.
all_input_ids, all_input_mask, all_segment_ids, all_sub_span, all_rel_span, all_obj_span = [torch.as_tensor(x, dtype=torch.long) for x in [all_input_ids, all_input_mask, all_segment_ids, all_sub_span, all_rel_span, all_obj_span]]

n = entity_ids.shape[0]
batch_size = 256

assert n == all_input_ids.shape[0]

# which layer of BERT to use for embeddings
layer = -1
# Take the embedding width from the loaded model instead of hard-coding 1024,
# so switching model_name to a different-sized encoder keeps working.
emb_dim = model.config.hidden_size
max_seq_length = 128

# Every batch is moved to `device` below; make sure the model is there too.
model.to(device)

cpnet_concept_emb = torch.zeros((len(concept2id), emb_dim), device=device)
cpnet_rel_emb = torch.zeros((len(relation2id), emb_dim), device=device)


def _span_mean(hidden_states, span, positions):
    """Mean of ``hidden_states`` over the inclusive token range of each row.

    Args:
        hidden_states: tensor indexed as (row, token position, feature).
        span: integer tensor with one ``[first, last]`` pair per row.
        positions: ``arange`` over token positions with a leading axis of 1.

    Returns:
        One pooled vector per row. A row whose span is empty (last < first)
        yields a zero vector rather than NaN.
    """
    in_span = (positions >= span[:, 0, None]) & (positions <= span[:, 1, None])
    # clamp(min=1) only matters for an empty span, where the masked sum is 0
    # and the original 0/0 division would have produced NaN.
    span_len = (span[:, 1] - span[:, 0] + 1).clamp(min=1).float().unsqueeze(1)
    return (hidden_states * in_span.float().unsqueeze(-1)).sum(1) / span_len


# Ceil division: `n // batch_size + 1` over-counts by one batch whenever n is
# an exact multiple of batch_size.
num_batches = (n + batch_size - 1) // batch_size

with torch.no_grad():
    # token positions 0..max_seq_length-1, compared against the span bounds
    mask = torch.arange(max_seq_length, device=device)[None, :]

    for a in tqdm(range(0, n, batch_size), total=num_batches, desc='Extracting features'):
        b = min(a + batch_size, n)
        *batch, sub_span, rel_span, obj_span = [x.to(device) for x in [all_input_ids[a:b], all_input_mask[a:b], all_segment_ids[a:b], all_sub_span[a:b], all_rel_span[a:b], all_obj_span[a:b]]]
        # positional order: input ids, attention mask, segment (token type) ids
        outputs = model(*batch)

        # the last output element holds the per-layer hidden states
        # (the model was loaded with output_hidden_states=True)
        hidden_states = outputs[-1][layer]

        # apply mask using the spans, and average the token rep by dividing by span length
        sub_pooled = _span_mean(hidden_states, sub_span, mask)
        rel_pooled = _span_mean(hidden_states, rel_span, mask)
        obj_pooled = _span_mean(hidden_states, obj_span, mask)

        sub_ids = torch.as_tensor(entity_ids[a:b, 0], dtype=torch.long, device=device)
        rel_ids = torch.as_tensor(entity_ids[a:b, 1], dtype=torch.long, device=device)
        obj_ids = torch.as_tensor(entity_ids[a:b, 2], dtype=torch.long, device=device)
        # index_add_ accumulates rows that share an id, like the former
        # per-row `+=` loop, but in one call per tensor instead of
        # 3 * batch_size tiny updates. Floating-point summation order may
        # differ slightly from the sequential loop.
        cpnet_concept_emb.index_add_(0, sub_ids, sub_pooled)
        cpnet_concept_emb.index_add_(0, obj_ids, obj_pooled)
        cpnet_rel_emb.index_add_(0, rel_ids, rel_pooled)

# Number of pooled vectors accumulated into each concept / relation row.
sub_unique, sub_counts = np.unique(entity_ids[:,0], return_counts=True)
rel_unique, rel_counts_unordered = np.unique(entity_ids[:,1], return_counts=True)
obj_unique, obj_counts = np.unique(entity_ids[:,2], return_counts=True)
rel_counts = np.zeros(len(merged_relations))
rel_counts[rel_unique] += rel_counts_unordered
concept_counts = np.zeros(len(concept2id))
concept_counts[sub_unique] += sub_counts
concept_counts[obj_unique] += obj_counts

cpnet_concept_emb = cpnet_concept_emb.to('cpu').numpy()
cpnet_rel_emb = cpnet_rel_emb.to('cpu').numpy()
# Turn sums into means; rows that never occurred keep an all-zero embedding.
cpnet_concept_emb = np.divide(cpnet_concept_emb, concept_counts[:,np.newaxis], out=np.zeros_like(cpnet_concept_emb), where=concept_counts[:,np.newaxis]!=0)
# Same guard for relations: a relation with no triples (e.g. 'hascontext'
# when pruning is enabled) would otherwise become a row of NaNs. The float64
# output buffer keeps the dtype the plain division produced.
cpnet_rel_emb = np.divide(cpnet_rel_emb, rel_counts[:,np.newaxis], out=np.zeros(cpnet_rel_emb.shape, dtype=np.float64), where=rel_counts[:,np.newaxis]!=0)
output_dir = '../data/cpnet/encoder_embs/'
# Create the directory of the actual output files, so a model_name containing
# a path separator also gets its sub-directory.
check_path(output_dir + model_name + '_concept_emb')
np.save(output_dir + model_name + '_concept_emb', cpnet_concept_emb)
np.save(output_dir + model_name + '_rel_emb', cpnet_rel_emb)

Extracting features: 100%|██████████| 9228/9228 [2:07:37<00:00,  1.21it/s]  
