In [1]:
from pathlib import Path
import json

In [2]:
# Base directories: the original TACRED JSON splits and the OpenNRE benchmark folder.
tacred_json_dir = Path('/media/discoD/Corpora/TACRED/tacred/data/json')
opennre_tacred_dir = Path('/media/discoD/repositorios/OpenNRE/benchmark/tacred')

# Input splits in the original TACRED format.
tacred_train_path = tacred_json_dir / 'train.json'
tacred_dev_path = tacred_json_dir / 'dev.json'
tacred_test_path = tacred_json_dir / 'test.json'

# Reference CoNLL04 training file, used below for comparison.
conll04_train_path = Path('../data/datasets/conll04') / 'conll04_train.json'

# Output locations for the CoNLL04-style conversion (written next to the inputs).
tacred_train_path_conll04 = tacred_json_dir / 'train_conll04.json'
tacred_dev_path_conll04 = tacred_json_dir / 'dev_conll04.json'
tacred_test_path_conll04 = tacred_json_dir / 'test_conll04.json'

# Output locations for the SemEval-style conversion and its relation-to-id map.
tacred_train_path_semeval = opennre_tacred_dir / 'train.json'
tacred_dev_path_semeval = opennre_tacred_dir / 'dev.json'
tacred_test_path_semeval = opennre_tacred_dir / 'test.json'
tacred_rel2id_path_semeval = opennre_tacred_dir / 'rel2id.json'

In [7]:
def read_json(input_file):
    """Parse the file at ``input_file`` as one UTF-8 JSON document and return it."""
    with open(input_file, mode="r", encoding='utf-8') as handle:
        return json.load(handle)

def save_json(json_path, examples):
    """Serialize ``examples`` as a single JSON document written to ``json_path``.

    The file is opened as UTF-8 explicitly, mirroring ``read_json``, so the
    write does not depend on the platform's default text encoding.

    Args:
        json_path: Destination file path; an existing file is overwritten.
        examples: Any JSON-serializable object.
    """
    with open(json_path, 'w', encoding='utf-8') as outfile:
        json.dump(examples, outfile)
        
def save_json_list(json_path, examples):
    """Write ``examples`` to ``json_path`` with one JSON document per line.

    Documents are separated by a single newline and no trailing newline is
    written. The file is opened as UTF-8 explicitly, mirroring ``read_json``.

    Args:
        json_path: Destination file path; an existing file is overwritten.
        examples: Iterable of JSON-serializable objects.
    """
    with open(json_path, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(json.dumps(example) for example in examples))

def convert_token(token):
    """Replace a PTB bracket escape (such as ``-LRB-``) with the bracket it stands for.

    The comparison ignores case; a token that is not one of the six bracket
    escapes is returned unchanged.
    """
    ptb_brackets = {
        '-lrb-': '(',
        '-rrb-': ')',
        '-lsb-': '[',
        '-rsb-': ']',
        '-lcb-': '{',
        '-rcb-': '}',
    }
    return ptb_brackets.get(token.lower(), token)
        
def convert_tacred_to_conll04(tacred_example):
    """Convert one TACRED example into the CoNLL04-style dict used in this notebook.

    The result has two entities, always in the order [subject, object], and a
    single relation whose ``head`` is the subject (index 0) and whose ``tail``
    is the object (index 1). The indices refer to positions in ``entities``,
    not to sentence order, so they must not depend on which mention appears
    first in the text; otherwise the relation would point from object to
    subject whenever the object precedes the subject.

    Args:
        tacred_example: dict with the keys ``token``, ``subj_start``,
            ``subj_end``, ``obj_start``, ``obj_end``, ``subj_type``,
            ``obj_type``, ``relation`` and ``id``. The ``*_end`` offsets are
            inclusive.

    Returns:
        dict with ``tokens`` (PTB bracket escapes converted), ``entities``
        (``end`` is exclusive), ``relations`` and ``orig_id``.
    """
    tokens = [convert_token(token) for token in tacred_example['token']]
    # TACRED end offsets are inclusive; the target format uses exclusive ends.
    subject_entity = {'type': tacred_example['subj_type'],
                      'start': tacred_example['subj_start'],
                      'end': int(tacred_example['subj_end']) + 1}
    object_entity = {'type': tacred_example['obj_type'],
                     'start': tacred_example['obj_start'],
                     'end': int(tacred_example['obj_end']) + 1}
    return {'tokens': tokens,
            'entities': [subject_entity, object_entity],
            'relations': [{'type': tacred_example['relation'], 'head': 0, 'tail': 1}],
            'orig_id': tacred_example['id']
           }

# {
#     "token":[ "we", "dissolved", "the", "contents", "of", "one", "packet", "in", "a", "carafe", "of", "water", "and", "ran", "the", "brew", "cycle", "." ],
#     "h":{
#         "name":"carafe",
#         "pos":[
#             9,
#             10
#         ]
#     },
#     "t":{
#         "name":"water",
#         "pos":[
#             11,
#             12
#         ]
#     },
#     "relation":"Content-Container(e2,e1)"
# }

# tacred_example = {'id': "e7798fb926b9403cfcd2",
#     'docid': "APW_ENG_20101103.0539",
#     'relation': "per:title",
#     'token': ["At", "the", "same", "time", ",", "Chief", "Financial", "Officer", "Douglas", "Flint", "will",
#         "become", "chairman", ",", "succeeding", "Stephen", "Green", "who", "is", "leaving", "to", "take", 
#         "a", "government", "job", "." ],
#     'subj_start': 8,
#     'subj_end': 9,
#     'obj_start': 12,
#     'obj_end': 12,
#     'subj_type': "PERSON",
#     'obj_type': "TITLE"}

def convert_tacred_to_semeval(tacred_example):
    """Convert one TACRED example into the SemEval-style dict shown in the comment above.

    The subject becomes ``h`` and the object becomes ``t``. Each ``pos`` is
    ``[start, end)`` with an exclusive end (TACRED's end offsets are
    inclusive). Each ``name`` is built from the converted tokens, so it always
    equals ``' '.join(token[pos[0]:pos[1]])`` for the returned ``token`` list,
    including when the mention contains PTB bracket escapes.

    Args:
        tacred_example: dict with the keys ``token``, ``subj_start``,
            ``subj_end``, ``obj_start``, ``obj_end`` and ``relation``.

    Returns:
        dict with the keys ``token``, ``h``, ``t`` and ``relation``.
    """
    sentence = [convert_token(token) for token in tacred_example['token']]

    head_start = tacred_example['subj_start']
    head_end = tacred_example['subj_end'] + 1
    tail_start = tacred_example['obj_start']
    tail_end = tacred_example['obj_end'] + 1

    return {'token': sentence,
            'h': {
                # Slice the converted sentence so the name matches `token`.
                'name': ' '.join(sentence[head_start:head_end]),
                'pos': [head_start, head_end]
            },
            't': {
                'name': ' '.join(sentence[tail_start:tail_end]),
                'pos': [tail_start, tail_end]
            },
            'relation': tacred_example['relation']}

def convert_tacred_dataset_to_conll04_format(in_file_path, out_file_path):
    """Convert every example of a TACRED JSON file and save the result as one JSON array.

    Reads ``in_file_path`` with ``read_json``, maps each example through
    ``convert_tacred_to_conll04`` and writes the list with ``save_json``.
    """
    converted = [convert_tacred_to_conll04(item) for item in read_json(in_file_path)]
    save_json(out_file_path, converted)

def convert_tacred_benchmark_to_conll04_format():
    """Convert the train, dev and test TACRED splits to the CoNLL04-style files."""
    split_paths = (
        (tacred_train_path, tacred_train_path_conll04),
        (tacred_dev_path, tacred_dev_path_conll04),
        (tacred_test_path, tacred_test_path_conll04),
    )
    for source_path, target_path in split_paths:
        convert_tacred_dataset_to_conll04_format(source_path, target_path)
    
def convert_tacred_dataset_to_semeval_format(in_file_path, out_file_path):
    """Convert every example of a TACRED JSON file and save one JSON object per line.

    Reads ``in_file_path`` with ``read_json``, maps each example through
    ``convert_tacred_to_semeval`` and writes the list with ``save_json_list``.
    """
    converted = [convert_tacred_to_semeval(item) for item in read_json(in_file_path)]
    save_json_list(out_file_path, converted)

def convert_tacred_benchmark_to_semeval_format():
    """Convert the train, dev and test TACRED splits to the SemEval-style files."""
    split_paths = (
        (tacred_train_path, tacred_train_path_semeval),
        (tacred_dev_path, tacred_dev_path_semeval),
        (tacred_test_path, tacred_test_path_semeval),
    )
    for source_path, target_path in split_paths:
        convert_tacred_dataset_to_semeval_format(source_path, target_path)

In [None]:
# Hand-written sample in the SemEval-style target format; `pos` is [start, end).
semeval_example = dict(
    token="we dissolved the contents of one packet in a carafe of water and ran the brew cycle .".split(),
    h=dict(name="carafe", pos=[9, 10]),
    t=dict(name="water", pos=[11, 12]),
    relation="Content-Container(e2,e1)",
)

In [None]:
# Sanity check: slicing `token` with an entity's [start, end) `pos` should give
# back that entity's `name`. Both slices are returned as one tuple because a
# cell only displays the value of its last expression.
(semeval_example['token'][semeval_example['h']['pos'][0]:semeval_example['h']['pos'][1]],
 semeval_example['token'][semeval_example['t']['pos'][0]:semeval_example['t']['pos'][1]])

In [None]:
# Sample TACRED example in the CoNLL04-style target format; entity `end` is exclusive.
tacred_conll_example = dict(
    tokens=[
        "Tom", "Thabane", "resigned", "in", "October", "last", "year", "to",
        "form", "the", "All", "Basotho", "Convention", "(", "ABC", ")", ",",
        "crossing", "the", "floor", "with", "17", "members", "of",
        "parliament", ",", "causing", "constitutional", "monarch", "King",
        "Letsie", "III", "to", "dissolve", "parliament", "and", "call", "the",
        "snap", "election", ".",
    ],
    entities=[
        dict(type="ORGANIZATION", start=10, end=13),
        dict(type="PERSON", start=0, end=2),
    ],
    relations=[dict(type="org:founded_by", head=0, tail=1)],
    orig_id="61b3a5c8c9a882dcfcd2",
)

In [None]:
# Print the token span covered by each of the two entities.
for entity_index in (0, 1):
    entity = tacred_conll_example['entities'][entity_index]
    print(tacred_conll_example['tokens'][entity['start']:entity['end']])

In [5]:
# Sample example in the original TACRED format; `*_end` offsets are inclusive.
tacred_example = dict(
    id="e7798fb926b9403cfcd2",
    docid="APW_ENG_20101103.0539",
    relation="per:title",
    token=(
        "At the same time , Chief Financial Officer Douglas Flint will "
        "become chairman , succeeding Stephen Green who is leaving to take "
        "a government job ."
    ).split(),
    subj_start=8,
    subj_end=9,
    obj_start=12,
    obj_end=12,
    subj_type="PERSON",
    obj_type="TITLE",
)

In [8]:
# Convert the hand-written TACRED sample and display the CoNLL04-style result.
convert_tacred_to_conll04(tacred_example)

{'tokens': ['At',
  'the',
  'same',
  'time',
  ',',
  'Chief',
  'Financial',
  'Officer',
  'Douglas',
  'Flint',
  'will',
  'become',
  'chairman',
  ',',
  'succeeding',
  'Stephen',
  'Green',
  'who',
  'is',
  'leaving',
  'to',
  'take',
  'a',
  'government',
  'job',
  '.'],
 'entities': [{'type': 'PERSON', 'start': 8, 'end': 10},
  {'type': 'TITLE', 'start': 12, 'end': 13}],
 'relations': [{'type': 'per:title', 'head': 0, 'tail': 1}],
 'orig_id': 'e7798fb926b9403cfcd2'}

In [None]:
# Print the subject and object spans; TACRED end offsets are inclusive, hence the +1.
for role in ('subj', 'obj'):
    span_start = tacred_example[role + '_start']
    span_stop = tacred_example[role + '_end'] + 1
    print(tacred_example['token'][span_start:span_stop])

In [9]:
# Convert the train/dev/test TACRED splits and write the *_conll04.json files.
convert_tacred_benchmark_to_conll04_format()

In [None]:
tacred_train_examples = read_json(tacred_train_path)
# Preview a few examples only; echoing the whole split floods the cell output.
tacred_train_examples[:3]

In [None]:
conll04_train_examples = read_json(conll04_train_path)
# Preview a few examples only; echoing the whole file floods the cell output.
conll04_train_examples[:3]

In [None]:
# Convert the first loaded training example to check the converter on real data.
convert_tacred_to_conll04(tacred_train_examples[0])

In [None]:
# Load the three TACRED splits and pool them into one list (train, dev, test order).
tacred_train_examples, tacred_dev_examples, tacred_test_examples = [
    read_json(split_path)
    for split_path in (tacred_train_path, tacred_dev_path, tacred_test_path)
]
tacred_examples = [*tacred_train_examples, *tacred_dev_examples, *tacred_test_examples]

In [None]:
# Report how many examples were pooled across the splits.
print('Loaded {} examples'.format(len(tacred_examples)))

In [None]:
# Display the first pooled example to inspect the raw TACRED fields.
tacred_examples[0]

In [None]:
# Collect the distinct entity types (subject and object) and relation labels.
relation_types = {example['relation'] for example in tacred_examples}
entity_types = {
    entity_label
    for example in tacred_examples
    for entity_label in (example['subj_type'], example['obj_type'])
}
print(entity_types)
print(relation_types)

In [None]:
# (type, short label, verbose label) for each entity type, in declaration order.
_entity_labels = (
    ("MISC", "Misc", "Miscellaneous"),
    ("RELIGION", "Relig", "Religion"),
    ("ORGANIZATION", "Org", "Organization"),
    ("LOCATION", "Loc", "Location"),
    ("DURATION", "Dur", "Duration"),
    ("COUNTRY", "Cntr", "Country"),
    ("NATIONALITY", "Nation", "Nationality"),
    ("CAUSE_OF_DEATH", "CoD", "Cause of Death"),
    ("IDEOLOGY", "Ideo", "Ideology"),
    ("URL", "url", "URL"),
    ("STATE_OR_PROVINCE", "SoP", "State or Province"),
    ("NUMBER", "Num", "Number"),
    ("CRIMINAL_CHARGE", "Crim", "Criminal Charge"),
    ("TITLE", "Tit", "Title"),
    ("PERSON", "Per", "Person"),
    ("DATE", "Dat", "Date"),
    ("CITY", "Cit", "City"),
)

# Entity-type table: each type maps to its short and verbose display labels.
entities = {
    "entities": {
        entity_type: {"short": short_label, "verbose": verbose_label}
        for entity_type, short_label, verbose_label in _entity_labels
    }
}

In [None]:
# Display label for each relation type, in declaration order. Every relation
# uses the same text for its short and verbose label.
_relation_labels = {
    "org:city_of_headquarters": "City of Headquarters",
    "per:spouse": "Spouse",
    "per:cities_of_residence": "Cities of Residence",
    "per:city_of_death": "City of Death",
    "org:country_of_headquarters": "Country of Headquarters",
    "per:country_of_birth": "Country of Birth",
    "per:siblings": "Siblings",
    "org:shareholders": "Shareholders",
    "org:member_of": "Member of",
    "org:dissolved": "Dissolved",
    "per:schools_attended": "Schools Attended",
    "org:political/religious_affiliation": "Political/Religious affiliation",
    "per:city_of_birth": "City of Birth",
    "per:children": "Children",
    "org:top_members/employees": "Top Members / Employees",
    "per:stateorprovince_of_birth": "State or Province of Birth",
    "per:stateorprovince_of_death": "State or Province of Death",
    "org:number_of_employees/members": "Number of Employees/Members",
    "per:countries_of_residence": "Countries of Residence",
    "org:founded": "Founded",
    "per:country_of_death": "Country of Death",
    "per:title": "Title",
    "org:stateorprovince_of_headquarters": "State or Province of Headquarters",
    "per:religion": "Religion",
    "org:founded_by": "Founded by",
    "per:age": "Age",
    "no_relation": "No Relation",
    "per:stateorprovinces_of_residence": "State or Provinces of Residence",
    "org:website": "Website",
    "per:employee_of": "Employee Of",
    "org:parents": "Organization Parents",
    "per:parents": "Person Parents",
    "org:alternate_names": "Organization Alternate Names",
    "org:subsidiaries": "Subsidiaries",
    "per:alternate_names": "Person Alternate Names",
    "per:cause_of_death": "Cause of Death",
    "per:date_of_death": "Date of Death",
    "org:members": "Members",
    "per:date_of_birth": "Date of Birth",
    "per:charges": "Charges",
    "per:origin": "Origin",
    "per:other_family": "Other Family",
}

# Relation-type table. `symmetric` uses the Python literal False: the JSON
# spelling `false` is not a defined name in Python and raises NameError.
relations = {
    "relations": {
        relation_type: {"short": label, "verbose": label, "symmetric": False}
        for relation_type, label in _relation_labels.items()
    }
}

In [45]:
# Convert the train/dev/test TACRED splits and write the SemEval-style files.
convert_tacred_benchmark_to_semeval_format()

In [26]:
# The SemEval-style files are written by save_json_list as one JSON object per
# line, so json.load on the whole file (what read_json does) cannot parse them.
# Parse the file line by line instead, skipping blank lines.
with open(tacred_train_path_semeval, 'r', encoding='utf-8') as reader:
    tacred_semeval_examples = [json.loads(line) for line in reader if line.strip()]

In [27]:
# Distinct relation labels found in the converted training split.
relations = {example['relation'] for example in tacred_semeval_examples}
print(relations)

{'org:dissolved', 'org:city_of_headquarters', 'per:title', 'org:founded_by', 'org:alternate_names', 'no_relation', 'per:children', 'org:members', 'per:date_of_death', 'per:religion', 'per:other_family', 'per:origin', 'org:country_of_headquarters', 'org:member_of', 'per:countries_of_residence', 'per:cause_of_death', 'org:number_of_employees/members', 'org:founded', 'org:subsidiaries', 'per:cities_of_residence', 'org:top_members/employees', 'per:parents', 'per:city_of_birth', 'org:shareholders', 'per:employee_of', 'per:country_of_birth', 'per:age', 'per:country_of_death', 'per:alternate_names', 'org:website', 'per:stateorprovince_of_birth', 'per:stateorprovinces_of_residence', 'org:political/religious_affiliation', 'per:siblings', 'per:date_of_birth', 'org:parents', 'per:charges', 'per:stateorprovince_of_death', 'per:spouse', 'per:city_of_death', 'org:stateorprovince_of_headquarters', 'per:schools_attended'}


In [30]:
# Build the relation -> id map. Sort before numbering: iterating a set of
# strings has no stable order across interpreter runs, so numbering the set
# directly would make the saved ids irreproducible.
relations_list = {relation: idx for idx, relation in enumerate(sorted(relations))}
save_json(tacred_rel2id_path_semeval, relations_list)

In [32]:
# Scratch check of save_json_list: writes the sample example twice to
# 'test.json' in the current working directory.
save_json_list('test.json', [semeval_example, semeval_example])

TypeError: dumps() takes 1 positional argument but 2 were given