In [7]:
import pickle
import re

In [8]:
# Load the pickled TACRED splits. The context manager closes the file handle
# as soon as the data has been read (the original left it open).
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# this file if it comes from a trusted source.
with open('TACRED/TACRED.pkl', 'rb') as fin:
    tacred = pickle.load(fin)

# The pickle is indexed by split name.
tacred_train = tacred['train']
tacred_test = tacred['test']
tacred_dev = tacred['dev']

In [9]:
# Show the field names available on the first training record.
tacred_train[0].keys()

dict_keys(['ans_type', 'relation', 'tokens', 'ans_end', 'id', 'deprel', 'pos', 'key_end', 'ans_start', 'ner', 'head', 'key_type', 'sentence', 'key_start'])

In [10]:
def word_format(tokens, lower, zeros):
    """Return a formatted copy of ``tokens``.

    Args:
        tokens: iterable of token strings.
        lower: if truthy, lowercase every token.
        zeros: if truthy, replace every digit character with ``'0'``.

    Returns:
        A new list with one formatted string per input token. The input is
        not modified.

    Each token is formatted on its own (rather than joining the sentence and
    splitting it again on spaces) so the result always has exactly as many
    entries as the input: a token that contains a space is not split in two,
    and an empty input yields an empty list.
    """
    formatted = []
    for token in tokens:
        if zeros:
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            token = re.sub(r'\d', '0', token)
        if lower:
            token = token.lower()
        formatted.append(token)
    return formatted

In [19]:
def prepare_dataset(dataset,
                    # options related to the word format
                    lower = False,
                    zeros = True,
                    # options for the features
                    #pos_feature = True,
                    #positional_feature = True,
                    # options for the (subj, obj) representation
                    position_indicator = False,
                    subj_obj_normalization = True,
                    **kwargs):
    """Rewrite the ``'tokens'`` field of every record in ``dataset`` in place.

    Args:
        dataset: iterable of dict records. Each record must provide
            ``'tokens'`` and, when ``subj_obj_normalization`` is set,
            ``'key_start'``, ``'key_end'``, ``'key_type'``, ``'ans_start'``,
            ``'ans_end'`` and ``'ans_type'``.
        lower: forwarded to ``word_format``; lowercase the tokens.
        zeros: forwarded to ``word_format``; replace digits with ``'0'``.
        position_indicator: alternative (subj, obj) representation.
            TODO: not implemented yet - when selected, only the word
            formatting is applied.
        subj_obj_normalization: replace every token whose index lies in
            ``[key_start, key_end]`` with ``"SUBJ_" + key_type`` and every
            token whose index lies in ``[ans_start, ans_end]`` with
            ``"OBJ_" + ans_type``. Both ranges are inclusive; the subject
            span wins if the two overlap.
        **kwargs: accepted and ignored.

    Returns:
        None. The records are updated in place.

    Raises:
        AssertionError: unless exactly one of ``position_indicator`` and
            ``subj_obj_normalization`` is truthy.
    """
    assert bool(position_indicator) != bool(subj_obj_normalization), \
        '(position_indicator, subj_obj_normalization) should be (True, False) or (False, True)'

    for data in dataset:
        tokens = data['tokens']

        if zeros or lower:
            # word_format returns a new list, so it has to be stored back on
            # the record below for the change to be visible to the caller.
            tokens = word_format(tokens, lower, zeros)

        if subj_obj_normalization:
            key_start, key_end = data['key_start'], data['key_end']
            ans_start, ans_end = data['ans_start'], data['ans_end']
            subj_label = "SUBJ_" + data['key_type']
            obj_label = "OBJ_" + data['ans_type']

            for i in range(len(tokens)):
                if key_start <= i <= key_end:
                    tokens[i] = subj_label
                elif ans_start <= i <= ans_end:
                    tokens[i] = obj_label
        # else: position_indicator representation - TODO, not implemented.

        # Write back inside the loop so that every record is updated. The
        # original did this after the loop, which only touched the last
        # record and failed on an empty dataset. 'tokens' is the only field
        # this function changes.
        data['tokens'] = tokens

In [20]:
# Preprocess the training split with the default options (digits replaced by
# '0', subject/object normalisation). The function writes to the records
# in place and has no return statement.
# NOTE(review): prepare_dataset contains no print call, so any sentence/token
# text shown beneath this cell presumably comes from an earlier version of
# the function - re-run to confirm.
prepare_dataset(tacred_train)

Her chief spokesman in the governor 's office , Bill McAllister , said her aggressive role in the presidential campaign reflected the job she was given , not a change of character .
['SUBJ_PERSON', 'chief', 'spokesman', 'in', 'the', 'governor', "'s", 'office', ',', 'OBJ_PERSON', 'OBJ_PERSON', ',', 'said', 'her', 'aggressive', 'role', 'in', 'the', 'presidential', 'campaign', 'reflected', 'the', 'job', 'she', 'was', 'given', ',', 'not', 'a', 'change', 'of', 'character', '.']
