In [1]:
import sys
import os
import random
import ujson
import argparse
import pandas as pd
import numpy as np
import pickle
import json
import csv
from collections import defaultdict, OrderedDict
import jsonlines
# Show full cell contents when displaying frames. `None` means "no truncation";
# the older `-1` sentinel is deprecated and rejected by recent pandas versions.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm



In [5]:
def load_mentions(file):
    """Read a JSON-lines file of mention records into a DataFrame.

    Only the fields named in ``kept_fields`` are copied from each record;
    any other keys are dropped. A record missing one of the fields raises
    ``KeyError``.

    Args:
        file: path handed to ``jsonlines.open``.

    Returns:
        ``pd.DataFrame`` with one row per record and one column per kept field.
    """
    kept_fields = (
        'id',
        'sentence',
        'aliases',
        'spans',
        'gold',
        'cand_probs',
        'qids',
        'sent_idx_unq',
        'probs',
        'ctx_emb_ids',
        'entity_ids',
    )
    with jsonlines.open(file) as reader:
        records = [
            {field: record[field] for field in kept_fields}
            for record in reader
        ]
    return pd.DataFrame(records)

# Load bootleg labels, mapping of labeled sentences to tacred ids, and tacred splits

In [41]:
# Both directories must be filled in before the rest of the notebook can run.
bootleg_directory = ""  # FILL IN DIRECTORY OF BOOTLEG EMBS AND LABELS
tacred_diretory = ""  # FILL IN DIRECTORY OF TACRED DATA
if not bootleg_directory or not tacred_diretory:
    raise ValueError(
        "Set bootleg_directory and tacred_diretory at the top of this cell before running it"
    )

# `base_data` is the name under which the split-loading code has referred to
# the TACRED directory; keep it defined so a fresh-kernel run does not hit a
# NameError.
base_data = tacred_diretory

# os.path.join gives the same path whether or not bootleg_directory ends with
# a path separator (plain string concatenation required the trailing slash).
boot_labels_file = os.path.join(bootleg_directory, "bootleg_labels.jsonl")
bootleg_labels_df = load_mentions(boot_labels_file)
print(bootleg_labels_df.columns)
print(bootleg_labels_df.shape)

Index(['id', 'sentence', 'aliases', 'spans', 'gold', 'cand_probs', 'qids',
       'sent_idx_unq', 'probs', 'ctx_emb_ids', 'entity_ids'],
      dtype='object')
(106264, 11)


In [42]:
def _load_tacred_split(path):
    """Load one TACRED split file into a DataFrame.

    Args:
        path: path to a JSON file (presumably a JSON array of example
            objects -- confirm against the data).

    Returns:
        ``pd.DataFrame`` built from the parsed JSON with ``orient='columns'``.
    """
    with open(path) as split_file:
        examples = json.load(split_file)
    return pd.DataFrame.from_dict(examples, orient='columns')


# The TACRED directory is configured as `tacred_diretory`; the previously used
# `base_data` is not defined anywhere in this notebook and raised NameError on
# a fresh kernel.
train_file = "{}/train.json".format(tacred_diretory)
df_train = _load_tacred_split(train_file)
print(df_train.shape)

dev_file = "{}/dev_rev.json".format(tacred_diretory)
df_dev = _load_tacred_split(dev_file)
print(df_dev.shape)

test_file = "{}/test_rev.json".format(tacred_diretory)
df_test = _load_tacred_split(test_file)
print(df_test.shape)

(68124, 14)
(22631, 14)
(15509, 14)


# Build the per-token entity features (contextual embedding ids and QIDs) to be used in TACRED

In [23]:
# Token-level features keyed by example id.
#   ctx_emb_id_dict / qid_dict             : every token of a span gets the span's value
#   ctx_emb_id_dict_first / qid_dict_first : only the span's first token gets the value
# Tokens outside any span keep the fill values -1 (embedding id) and 'UNK' (qid).
ctx_emb_id_dict, ctx_emb_id_dict_first = {}, {}
qid_dict, qid_dict_first = {}, {}

for _, mention_row in bootleg_labels_df.iterrows():
    span_ctx_emb_ids = mention_row['ctx_emb_ids']
    span_qids = mention_row['qids']
    mention_spans = mention_row['spans']

    # The feature lists are sized by splitting the sentence on single spaces.
    n_tokens = len(mention_row['sentence'].split(' '))

    ctx_emb_id_all = [-1] * n_tokens
    qid_all = ['UNK'] * n_tokens

    ctx_emb_id_first = [-1] * n_tokens
    qid_first = ['UNK'] * n_tokens

    for span_pos, mention_span in enumerate(mention_spans):
        start, end = mention_span[0], mention_span[1]
        width = end - start

        # contextual embedding id: fill the whole span, then mark its first token
        span_ctx_emb_id = span_ctx_emb_ids[span_pos]
        ctx_emb_id_all[start:end] = [span_ctx_emb_id] * width
        ctx_emb_id_first[start] = span_ctx_emb_id

        # qid: same layout as above
        span_qid = span_qids[span_pos]
        qid_all[start:end] = [span_qid] * width
        qid_first[start] = span_qid

    example_id = mention_row['id']
    # Each id may appear only once in the label file.
    if example_id in ctx_emb_id_dict:
        raise ValueError('duplicate indices!')

    ctx_emb_id_dict[example_id] = ctx_emb_id_all
    qid_dict[example_id] = qid_all

    ctx_emb_id_dict_first[example_id] = ctx_emb_id_first
    qid_dict_first[example_id] = qid_first


# Add the features to the tacred data

In [None]:
# The three TACRED splits; the list holds references, so columns added while
# iterating over `dfs` land on df_train / df_dev / df_test themselves.
dfs = [df_train, df_dev, df_test]

In [25]:
def _features_for_split(df, features_by_id, fill_value):
    """Map every example id in ``df`` to its per-token feature list.

    Args:
        df: a split DataFrame with an ``id`` column and a ``token`` column
            holding each example's token list.
        features_by_id: dict from example id to a precomputed feature list.
        fill_value: value repeated once per token for ids that are missing
            from ``features_by_id``.

    Returns:
        dict from example id to feature list, with one entry per row of ``df``.

    Raises:
        AssertionError: if the number of distinct ids differs from the number
            of rows (i.e. ``id`` is not unique within the split).
    """
    per_example = {}
    for example_id, tokens in zip(df['id'], df['token']):
        if example_id in features_by_id:
            per_example[example_id] = features_by_id[example_id]
        else:
            # A fresh list per example and per column, so columns never share
            # the same default list object.
            per_example[example_id] = [fill_value] * len(tokens)

    assert len(per_example) == df.shape[0], (
        "expected one feature list per row: {} distinct ids for {} rows".format(
            len(per_example), df.shape[0]
        )
    )
    return per_example


# (output column, precomputed features, filler for examples without features)
_FEATURE_COLUMNS = (
    ('entity_emb_id', ctx_emb_id_dict, -1),
    ('entity_emb_id_first', ctx_emb_id_dict_first, -1),
    ('ent_id', qid_dict, 'UNK'),
    ('ent_id_first', qid_dict_first, 'UNK'),
)

for df in dfs:
    for column, features_by_id, fill_value in _FEATURE_COLUMNS:
        # NOTE(review): lists taken from `features_by_id` were sized from the
        # whitespace-split sentence, not from `token`; the two lengths are not
        # compared here -- confirm they always agree.
        df[column] = df['id'].map(_features_for_split(df, features_by_id, fill_value))
    
    

# Save the tacred data

In [36]:
# Output directory: the dirname of the Bootleg directory with a '/' appended.
bootleg_dir_with_sep = '{}/'.format(bootleg_directory)
out_dir = os.path.dirname(bootleg_dir_with_sep)
print(out_dir)

/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_iclr_model/bootleg_092620/basic_full_sentences//filt_1015_v1


In [38]:
# Write the train split, including the added entity columns, to out_dir.
train_out_path = '{}/train_ent.json'.format(out_dir)
train_out = df_train.to_json(train_out_path, orient='records')

In [39]:
# Write the dev split, including the added entity columns, to out_dir.
dev_out_path = '{}/dev_rev_ent.json'.format(out_dir)
dev_out = df_dev.to_json(dev_out_path, orient='records')

In [40]:
# Write the test split, including the added entity columns, to out_dir.
test_out_path = '{}/test_rev_ent.json'.format(out_dir)
test_out = df_test.to_json(test_out_path, orient='records')