In [2]:
from pathlib import Path
import json

In [13]:
# Directory that holds the TACRED JSON splits. Every TACRED input and output
# path below is derived from it, so pointing the notebook at another checkout
# only requires changing this one line.
TACRED_JSON_DIR = Path('/media/discoD/Corpora/TACRED/tacred/data/json')

# Original TACRED splits (inputs to the conversion).
tacred_train_path = TACRED_JSON_DIR / 'train.json'
tacred_dev_path = TACRED_JSON_DIR / 'dev.json'
tacred_test_path = TACRED_JSON_DIR / 'test.json'

# CoNLL04 training split, loaded later in the notebook to inspect the target format.
conll04_train_path = Path('../data/datasets/conll04/conll04_train.json')

# Destination files for the TACRED splits once converted to the CoNLL04 layout.
tacred_train_path_conll04 = TACRED_JSON_DIR / 'train_conll04.json'
tacred_dev_path_conll04 = TACRED_JSON_DIR / 'dev_conll04.json'
tacred_test_path_conll04 = TACRED_JSON_DIR / 'test_conll04.json'

In [27]:
def read_json(input_file):
    """Parse the UTF-8 encoded JSON file at ``input_file`` and return its content.

    Args:
        input_file: path (``str`` or ``Path``) of the JSON file to read.

    Returns:
        Whatever Python object ``json.load`` builds from the file.
    """
    with open(input_file, mode="r", encoding='utf-8') as handle:
        return json.load(handle)

def save_json(json_path, examples):
    """Serialize ``examples`` as JSON into the file at ``json_path``.

    The file is opened explicitly as UTF-8 so the written file does not depend
    on the platform's default encoding and matches the encoding ``read_json``
    uses when reading.

    Args:
        json_path: destination path (``str`` or ``Path``); an existing file is
            overwritten.
        examples: JSON-serializable object to write.
    """
    with open(json_path, 'w', encoding='utf-8') as outfile:
        json.dump(examples, outfile)

def convert_token(token):
    """Map a PTB bracket escape to the literal bracket character.

    The comparison is case-insensitive. Any token that is not one of the six
    bracket escapes is returned unchanged.
    """
    ptb_brackets = {
        '-lrb-': '(',
        '-rrb-': ')',
        '-lsb-': '[',
        '-rsb-': ']',
        '-lcb-': '{',
        '-rcb-': '}',
    }
    return ptb_brackets.get(token.lower(), token)
        
def convert_tacred_to_conll04(tacred_example):
    """Re-shape one TACRED example into a CoNLL04-style example dict.

    Args:
        tacred_example: dict providing the keys ``token``, ``subj_type``,
            ``subj_start``, ``subj_end``, ``obj_type``, ``obj_start``,
            ``obj_end``, ``relation`` and ``id``.

    Returns:
        A dict with ``tokens`` (PTB bracket escapes converted), ``entities``
        (subject at index 0, object at index 1), a single-element
        ``relations`` list whose head is the subject and whose tail is the
        object, and ``orig_id``.
    """
    tokens = [convert_token(tok) for tok in tacred_example['token']]

    # The end offsets are shifted by one: the emitted span end is one past the
    # end index stored in the TACRED example.
    subject_entity = {
        'type': tacred_example['subj_type'],
        'start': tacred_example['subj_start'],
        'end': int(tacred_example['subj_end']) + 1,
    }
    object_entity = {
        'type': tacred_example['obj_type'],
        'start': tacred_example['obj_start'],
        'end': int(tacred_example['obj_end']) + 1,
    }

    # head/tail are positions in the entities list built below.
    relation = {'type': tacred_example['relation'], 'head': 0, 'tail': 1}

    return {
        'tokens': tokens,
        'entities': [subject_entity, object_entity],
        'relations': [relation],
        'orig_id': tacred_example['id'],
    }

def convert_tacred_dataset_to_conll04_format(in_file_path, out_file_path):
    """Convert every example of one TACRED JSON file and save the result.

    Args:
        in_file_path: path of the TACRED JSON file to read.
        out_file_path: path where the list of converted examples is written.
    """
    converted = [convert_tacred_to_conll04(item) for item in read_json(in_file_path)]
    save_json(out_file_path, converted)

def convert_tacred_benchmark_to_conll04_format():
    """Convert the TACRED train, dev and test files, in that order.

    Reads the module-level ``tacred_*_path`` inputs and writes each converted
    split to the matching ``tacred_*_path_conll04`` destination.
    """
    split_paths = (
        (tacred_train_path, tacred_train_path_conll04),
        (tacred_dev_path, tacred_dev_path_conll04),
        (tacred_test_path, tacred_test_path_conll04),
    )
    for source_path, target_path in split_paths:
        convert_tacred_dataset_to_conll04_format(source_path, target_path)

In [29]:
# Convert the TACRED train/dev/test splits and write the *_conll04.json files.
convert_tacred_benchmark_to_conll04_format()

In [6]:
# Load the raw TACRED training split and display it.
# NOTE(review): the saved output below uses keys such as 'guid'/'label'/'sentence'/'span1',
# while convert_tacred_to_conll04 reads 'id'/'relation'/'token'/'subj_start' -- the output
# looks like it came from a different version of the file; re-run to confirm.
tacred_train_examples = read_json(tacred_train_path)
tacred_train_examples

[{'guid': '61b3a5c8c9a882dcfcd2',
  'label': 'org:founded_by',
  'ner1': 'ORGANIZATION',
  'ner2': 'PERSON',
  'sentence': ['Tom',
   'Thabane',
   'resigned',
   'in',
   'October',
   'last',
   'year',
   'to',
   'form',
   'the',
   'All',
   'Basotho',
   'Convention',
   '(',
   'ABC',
   ')',
   ',',
   'crossing',
   'the',
   'floor',
   'with',
   '17',
   'members',
   'of',
   'parliament',
   ',',
   'causing',
   'constitutional',
   'monarch',
   'King',
   'Letsie',
   'III',
   'to',
   'dissolve',
   'parliament',
   'and',
   'call',
   'the',
   'snap',
   'election',
   '.'],
  'span1': [10, 12],
  'span2': [0, 1]},
 {'guid': '61b3a65fb9b7111c4ca4',
  'label': 'no_relation',
  'ner1': 'PERSON',
  'ner2': 'PERSON',
  'sentence': ['In',
   '1983',
   ',',
   'a',
   'year',
   'after',
   'the',
   'rally',
   ',',
   'Forsberg',
   'received',
   'the',
   'so-called',
   '``',
   'genius',
   'award',
   "''",
   'from',
   'the',
   'John',
   'D.',
   'and',
   

In [8]:
# Load the CoNLL04 training split and display it to inspect the target layout
# (tokens / entities / relations / orig_id) that the converter emits.
conll04_train_examples = read_json(conll04_train_path)
conll04_train_examples

[{'tokens': ['Newspaper',
   '`',
   'Explains',
   "'",
   'U.S.',
   'Interests',
   'Section',
   'Events',
   'FL1402001894',
   'Havana',
   'Radio',
   'Reloj',
   'Network',
   'in',
   'Spanish',
   '2100',
   'GMT',
   '13',
   'Feb',
   '94'],
  'entities': [{'type': 'Loc', 'start': 4, 'end': 5},
   {'type': 'Loc', 'start': 9, 'end': 10},
   {'type': 'Org', 'start': 10, 'end': 13},
   {'type': 'Other', 'start': 15, 'end': 17},
   {'type': 'Other', 'start': 17, 'end': 20}],
  'relations': [{'type': 'OrgBased_In', 'head': 2, 'tail': 1}],
  'orig_id': 3255},
 {'tokens': ['`',
   '`',
   'If',
   'it',
   'does',
   'not',
   'snow',
   ',',
   'and',
   'a',
   'lot',
   ',',
   'within',
   'this',
   'month',
   'we',
   'will',
   'have',
   'no',
   'water',
   'to',
   'submerge',
   '150',
   ',',
   '000',
   'hectares',
   '(',
   '370',
   ',',
   '500',
   'acres',
   ')',
   'of',
   'rice',
   ',',
   "'",
   "'",
   'said',
   'Bruno',
   'Pusterla',
   ',',
   'a',

In [10]:
# Spot-check the converter on the first TACRED training example.
# NOTE(review): the saved output below shows 'end' values without the +1 that the current
# convert_tacred_to_conll04 applies -- it appears stale; re-run to confirm.
convert_tacred_to_conll04(tacred_train_examples[0])

{'tokens': ['Tom',
  'Thabane',
  'resigned',
  'in',
  'October',
  'last',
  'year',
  'to',
  'form',
  'the',
  'All',
  'Basotho',
  'Convention',
  '(',
  'ABC',
  ')',
  ',',
  'crossing',
  'the',
  'floor',
  'with',
  '17',
  'members',
  'of',
  'parliament',
  ',',
  'causing',
  'constitutional',
  'monarch',
  'King',
  'Letsie',
  'III',
  'to',
  'dissolve',
  'parliament',
  'and',
  'call',
  'the',
  'snap',
  'election',
  '.'],
 'entities': [{'type': 'ORGANIZATION', 'start': 10, 'end': 12},
  {'type': 'PERSON', 'start': 0, 'end': 1}],
 'relations': [{'type': 'org:founded_by', 'head': 0, 'tail': 1}],
 'orig_id': '61b3a5c8c9a882dcfcd2'}

In [20]:
# Load all three TACRED splits and concatenate them (train, then dev, then test)
# so the cells below can compute statistics over the whole corpus.
tacred_train_examples = read_json(tacred_train_path)
tacred_dev_examples = read_json(tacred_dev_path)
tacred_test_examples = read_json(tacred_test_path)
tacred_examples = tacred_train_examples + tacred_dev_examples + tacred_test_examples

In [21]:
# Report the total number of examples across the three splits.
print('Loaded %s examples' % len(tacred_examples))

Loaded 106264 examples


In [22]:
# Inspect the first raw example to see which fields a TACRED record carries.
tacred_examples[0]

{'id': '61b3a5c8c9a882dcfcd2',
 'docid': 'AFP_ENG_20070218.0019.LDC2009T13',
 'relation': 'org:founded_by',
 'token': ['Tom',
  'Thabane',
  'resigned',
  'in',
  'October',
  'last',
  'year',
  'to',
  'form',
  'the',
  'All',
  'Basotho',
  'Convention',
  '-LRB-',
  'ABC',
  '-RRB-',
  ',',
  'crossing',
  'the',
  'floor',
  'with',
  '17',
  'members',
  'of',
  'parliament',
  ',',
  'causing',
  'constitutional',
  'monarch',
  'King',
  'Letsie',
  'III',
  'to',
  'dissolve',
  'parliament',
  'and',
  'call',
  'the',
  'snap',
  'election',
  '.'],
 'subj_start': 10,
 'subj_end': 12,
 'obj_start': 0,
 'obj_end': 1,
 'subj_type': 'ORGANIZATION',
 'obj_type': 'PERSON',
 'stanford_pos': ['NNP',
  'NNP',
  'VBD',
  'IN',
  'NNP',
  'JJ',
  'NN',
  'TO',
  'VB',
  'DT',
  'DT',
  'NNP',
  'NNP',
  '-LRB-',
  'NNP',
  '-RRB-',
  ',',
  'VBG',
  'DT',
  'NN',
  'IN',
  'CD',
  'NNS',
  'IN',
  'NN',
  ',',
  'VBG',
  'JJ',
  'NN',
  'NNP',
  'NNP',
  'NNP',
  'TO',
  'VB',
  'NN'

In [23]:
# Collect the distinct entity types (from both subject and object) and the
# distinct relation labels that occur anywhere in the combined corpus.
entity_types = set()
relation_types = set()
for example in tacred_examples:
    relation_types.add(example['relation'])
    entity_types.add(example['subj_type'])
    entity_types.add(example['obj_type'])
print(entity_types)
print(relation_types)

{'MISC', 'RELIGION', 'ORGANIZATION', 'LOCATION', 'DURATION', 'COUNTRY', 'NATIONALITY', 'CAUSE_OF_DEATH', 'IDEOLOGY', 'URL', 'STATE_OR_PROVINCE', 'NUMBER', 'CRIMINAL_CHARGE', 'TITLE', 'PERSON', 'DATE', 'CITY'}
{'org:city_of_headquarters', 'per:spouse', 'per:cities_of_residence', 'per:city_of_death', 'org:country_of_headquarters', 'per:country_of_birth', 'per:siblings', 'org:shareholders', 'org:member_of', 'org:dissolved', 'per:schools_attended', 'org:political/religious_affiliation', 'per:city_of_birth', 'per:children', 'org:top_members/employees', 'per:stateorprovince_of_birth', 'per:stateorprovince_of_death', 'org:number_of_employees/members', 'per:countries_of_residence', 'org:founded', 'per:country_of_death', 'per:title', 'org:stateorprovince_of_headquarters', 'per:religion', 'org:founded_by', 'per:age', 'no_relation', 'per:stateorprovinces_of_residence', 'org:website', 'per:employee_of', 'org:parents', 'per:parents', 'org:alternate_names', 'org:subsidiaries', 'per:alternate_names',

In [24]:
# Entity type catalogue: one entry per TACRED entity type printed above, each
# with a short label and a verbose, human-readable name.
entities = {
    "entities": {
        type_name: {"short": short_label, "verbose": verbose_label}
        for type_name, short_label, verbose_label in (
            ("MISC", "Misc", "Miscellaneous"),
            ("RELIGION", "Relig", "Religion"),
            ("ORGANIZATION", "Org", "Organization"),
            ("LOCATION", "Loc", "Location"),
            ("DURATION", "Dur", "Duration"),
            ("COUNTRY", "Cntr", "Country"),
            ("NATIONALITY", "Nation", "Nationality"),
            ("CAUSE_OF_DEATH", "CoD", "Cause of Death"),
            ("IDEOLOGY", "Ideo", "Ideology"),
            ("URL", "url", "URL"),
            ("STATE_OR_PROVINCE", "SoP", "State or Province"),
            ("NUMBER", "Num", "Number"),
            ("CRIMINAL_CHARGE", "Crim", "Criminal Charge"),
            ("TITLE", "Tit", "Title"),
            ("PERSON", "Per", "Person"),
            ("DATE", "Dat", "Date"),
            ("CITY", "Cit", "City"),
        )
    }
}

In [None]:
# Relation type catalogue: one entry per TACRED relation label, each with a
# short label, a verbose name and a "symmetric" flag.
# The flag uses Python's ``False``; the JSON spelling ``false`` is an undefined
# name in Python and made this cell fail with NameError. ``json.dump`` writes
# ``False`` back out as ``false``.
relations = {
    "relations": {
        "org:city_of_headquarters": {"short": "City of Headquarters", "verbose": "City of Headquarters", "symmetric": False},
        "per:spouse": {"short": "Spouse", "verbose": "Spouse", "symmetric": False},
        "per:cities_of_residence": {"short": "Cities of Residence", "verbose": "Cities of Residence", "symmetric": False},
        "per:city_of_death": {"short": "City of Death", "verbose": "City of Death", "symmetric": False},
        "org:country_of_headquarters": {"short": "Country of Headquarters", "verbose": "Country of Headquarters", "symmetric": False},
        "per:country_of_birth": {"short": "Country of Birth", "verbose": "Country of Birth", "symmetric": False},
        "per:siblings": {"short": "Siblings", "verbose": "Siblings", "symmetric": False},
        "org:shareholders": {"short": "Shareholders", "verbose": "Shareholders", "symmetric": False},
        "org:member_of": {"short": "Member of", "verbose": "Member of", "symmetric": False},
        "org:dissolved": {"short": "Dissolved", "verbose": "Dissolved", "symmetric": False},
        "per:schools_attended": {"short": "Schools Attended", "verbose": "Schools Attended", "symmetric": False},
        "org:political/religious_affiliation": {"short": "Political/Religious affiliation", "verbose": "Political/Religious affiliation", "symmetric": False},
        "per:city_of_birth": {"short": "City of Birth", "verbose": "City of Birth", "symmetric": False},
        "per:children": {"short": "Children", "verbose": "Children", "symmetric": False},
        "org:top_members/employees": {"short": "Top Members / Employees", "verbose": "Top Members / Employees", "symmetric": False},
        "per:stateorprovince_of_birth": {"short": "State or Province of Birth", "verbose": "State or Province of Birth", "symmetric": False},
        "per:stateorprovince_of_death": {"short": "State or Province of Death", "verbose": "State or Province of Death", "symmetric": False},
        "org:number_of_employees/members": {"short": "Number of Employees/Members", "verbose": "Number of Employees/Members", "symmetric": False},
        "per:countries_of_residence": {"short": "Countries of Residence", "verbose": "Countries of Residence", "symmetric": False},
        "org:founded": {"short": "Founded", "verbose": "Founded", "symmetric": False},
        "per:country_of_death": {"short": "Country of Death", "verbose": "Country of Death", "symmetric": False},
        "per:title": {"short": "Title", "verbose": "Title", "symmetric": False},
        "org:stateorprovince_of_headquarters": {"short": "State or Province of Headquarters", "verbose": "State or Province of Headquarters", "symmetric": False},
        "per:religion": {"short": "Religion", "verbose": "Religion", "symmetric": False},
        "org:founded_by": {"short": "Founded by", "verbose": "Founded by", "symmetric": False},
        "per:age": {"short": "Age", "verbose": "Age", "symmetric": False},
        "no_relation": {"short": "No Relation", "verbose": "No Relation", "symmetric": False},
        "per:stateorprovinces_of_residence": {"short": "State or Provinces of Residence", "verbose": "State or Provinces of Residence", "symmetric": False},
        "org:website": {"short": "Website", "verbose": "Website", "symmetric": False},
        "per:employee_of": {"short": "Employee Of", "verbose": "Employee Of", "symmetric": False},
        "org:parents": {"short": "Organization Parents", "verbose": "Organization Parents", "symmetric": False},
        "per:parents": {"short": "Person Parents", "verbose": "Person Parents", "symmetric": False},
        "org:alternate_names": {"short": "Organization Alternate Names", "verbose": "Organization Alternate Names", "symmetric": False},
        "org:subsidiaries": {"short": "Subsidiaries", "verbose": "Subsidiaries", "symmetric": False},
        "per:alternate_names": {"short": "Person Alternate Names", "verbose": "Person Alternate Names", "symmetric": False},
        "per:cause_of_death": {"short": "Cause of Death", "verbose": "Cause of Death", "symmetric": False},
        "per:date_of_death": {"short": "Date of Death", "verbose": "Date of Death", "symmetric": False},
        "org:members": {"short": "Members", "verbose": "Members", "symmetric": False},
        "per:date_of_birth": {"short": "Date of Birth", "verbose": "Date of Birth", "symmetric": False},
        "per:charges": {"short": "Charges", "verbose": "Charges", "symmetric": False},
        "per:origin": {"short": "Origin", "verbose": "Origin", "symmetric": False},
        "per:other_family": {"short": "Other Family", "verbose": "Other Family", "symmetric": False},
    }
}