In [6]:
from pathlib import Path
from tqdm import tqdm
import json

In [2]:
# NOTE: absolute, machine-specific locations; adjust them for your own checkout.
# raw_nyt_path holds the raw NYT input files, spert_path receives the converted output.
raw_nyt_path = Path("/media/discoD/repositorios/CasRel/data/NYT/raw_NYT/raw_nyt")
spert_path = Path("/media/discoD/repositorios/spert/data/datasets/nyt")
# parents=True creates any missing intermediate directories; exist_ok=True keeps
# the cell safe to re-run and removes the check-then-create race.
spert_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Raw NYT splits (inputs), one JSON document per line.
nyt_train_path = raw_nyt_path.joinpath('raw_train.json')
nyt_valid_path = raw_nyt_path.joinpath('raw_valid.json')
nyt_test_path = raw_nyt_path.joinpath('raw_test.json')

# Converted splits (outputs); note the validation split is written as dev.json.
nyt_train_spert_path = spert_path.joinpath('train.json')
nyt_valid_spert_path = spert_path.joinpath('dev.json')
nyt_test_spert_path = spert_path.joinpath('test.json')
nyt_types_path = spert_path.joinpath('types.json')

In [9]:
# Peek at the first training record to inspect the raw NYT schema.
# The `with` block closes the file, and next() reads a single line instead of
# loading the whole file through readlines().
with nyt_train_path.open(mode='r', encoding='utf-8') as nyt_train_file:
    line = next(nyt_train_file, None)  # None when the file is empty
if line is not None:
    print(line)
    print(json.loads(line))

{"sentText": "Massachusetts ASTON MAGNA Great Barrington ; also at Bard College , Annandale-on-Hudson , N.Y. , July 1-Aug .", "articleId": "/m/vinci8/data1/riedel/projects/relation/kb/nyt1/docstore/nyt-2005-2006.backup/1669365.xml.pb", "relationMentions": [{"em1Text": "Annandale-on-Hudson", "em2Text": "Bard College", "label": "/location/location/contains"}], "entityMentions": [{"start": 1, "label": "ORGANIZATION", "text": "Bard College"}, {"start": 2, "label": "LOCATION", "text": "Annandale-on-Hudson"}], "sentId": "1"}

{'sentText': 'Massachusetts ASTON MAGNA Great Barrington ; also at Bard College , Annandale-on-Hudson , N.Y. , July 1-Aug .', 'articleId': '/m/vinci8/data1/riedel/projects/relation/kb/nyt1/docstore/nyt-2005-2006.backup/1669365.xml.pb', 'relationMentions': [{'em1Text': 'Annandale-on-Hudson', 'em2Text': 'Bard College', 'label': '/location/location/contains'}], 'entityMentions': [{'start': 1, 'label': 'ORGANIZATION', 'text': 'Bard College'}, {'start': 2, 'label': 'LOCATION

In [16]:
def read_json(input_file):
    """Read a JSON Lines file (one JSON document per line) into a list.

    Args:
        input_file: ``pathlib.Path`` or path string of the file to read.

    Returns:
        list: the decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # Path(...) lets callers pass either a Path or a plain string; the context
    # manager guarantees the handle is closed even if decoding fails.
    with Path(input_file).open(mode="r", encoding='utf-8') as lines:
        for line in lines:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

def save_json(json_path, examples):
    """Write `examples` to `json_path` as one JSON document, replacing the file.

    Args:
        json_path: destination path accepted by the built-in ``open``.
        examples: any JSON-serializable object.
    """
    with open(json_path, mode='w') as sink:
        json.dump(obj=examples, fp=sink)
        
def save_json_list(json_path, examples):
    """Write each item of `examples` as its own JSON line in `json_path`.

    Lines are separated by a newline; no trailing newline is written after the
    last item.

    Args:
        json_path: destination path accepted by the built-in ``open``.
        examples: iterable of JSON-serializable objects.
    """
    with open(json_path, mode='w') as sink:
        serialized_rows = [json.dumps(row) for row in examples]
        sink.write('\n'.join(serialized_rows))
        
def convert_nyt_to_spert_entities(sentence_tokens, entities_mentions):
    """Locate every entity mention in the sentence and emit token-span entities.

    A mention is matched only where *all* of its whitespace-separated tokens
    appear consecutively in ``sentence_tokens`` (matching on the first token
    alone would create bogus spans, e.g. "New York" at the start of
    "New Jersey"). Every occurrence in the sentence yields one entity.

    Args:
        sentence_tokens: sequence of the sentence's tokens.
        entities_mentions: iterable of dicts with at least the keys ``'text'``
            (mention surface form) and ``'label'`` (entity type).

    Returns:
        list[dict]: one dict per located span with keys ``type``, ``text``,
        ``tokens``, ``start`` and ``end`` (``end`` is exclusive). Mentions with
        empty text are skipped, and an identical (type, start, end) span is
        emitted only once even if the mention is listed repeatedly.
    """
    entities = []
    seen_spans = set()
    sentence_length = len(sentence_tokens)
    for entity_mention in entities_mentions:
        entity_text = entity_mention['text']
        entity_type = entity_mention['label']
        entity_tokens = entity_text.split()
        if not entity_tokens:
            # Empty / whitespace-only mention: nothing to locate.
            continue
        width = len(entity_tokens)
        # The upper bound keeps every candidate span inside the sentence.
        for start in range(sentence_length - width + 1):
            end = start + width
            if list(sentence_tokens[start:end]) != entity_tokens:
                continue
            span_key = (entity_type, start, end)
            if span_key in seen_spans:
                continue
            seen_spans.add(span_key)
            entities.append({
                "type": entity_type,
                "text": entity_text,
                "tokens": entity_tokens,
                "start": start,
                "end": end
            })
    return entities

def convert_nyt_to_spert_relations(sentence_tokens, entities, relations_mentions):
    """Translate NYT relation mentions into index-based relation records.

    For each mention, the entities whose ``'text'`` equals ``em1Text`` (head)
    and ``em2Text`` (tail) are looked up. Matches are paired positionally with
    ``zip``: the i-th head occurrence goes with the i-th tail occurrence, and
    surplus occurrences on either side are dropped.

    Args:
        sentence_tokens: accepted for signature compatibility; not used here.
        entities: list of entity dicts, each carrying a ``'text'`` key.
        relations_mentions: iterable of dicts with keys ``em1Text``,
            ``em2Text`` and ``label``.

    Returns:
        list[dict]: records with ``type`` plus ``head`` / ``tail`` given as
        indices into ``entities``.
    """

    def indices_of(mention_text):
        # Positions of every entity whose surface text equals mention_text.
        return [position for position, candidate in enumerate(entities)
                if candidate['text'] == mention_text]

    converted = []
    for mention in relations_mentions:
        head_indices = indices_of(mention["em1Text"])
        tail_indices = indices_of(mention["em2Text"])
        label = mention["label"]
        converted.extend(
            {"type": label, "head": head_index, "tail": tail_index}
            for head_index, tail_index in zip(head_indices, tail_indices)
        )
    return converted
        
def convert_nyt_to_spert(raw_example):
    """Convert one raw NYT record into a tokens/entities/relations dict.

    Args:
        raw_example: dict with keys ``sentText``, ``articleId``,
            ``entityMentions`` and ``relationMentions``.

    Returns:
        dict: ``tokens`` (whitespace split of ``sentText``), ``entities``,
        ``relations`` and ``orig_id`` (the record's ``articleId``).
    """
    sentence = raw_example['sentText'].split()
    article_id = raw_example['articleId']
    spans = convert_nyt_to_spert_entities(sentence, raw_example['entityMentions'])
    links = convert_nyt_to_spert_relations(sentence, spans, raw_example['relationMentions'])
    return dict(tokens=sentence, entities=spans, relations=links, orig_id=article_id)

# Example raw NYT record (one line of the input file):
# {
#  "sentText": "Massachusetts ASTON MAGNA Great Barrington ; also at Bard College , Annandale-on-Hudson , N.Y. , July 1-Aug .", 
#  "articleId": "/m/vinci8/data1/riedel/projects/relation/kb/nyt1/docstore/nyt-2005-2006.backup/1669365.xml.pb", 
#  "relationMentions": [
#      {"em1Text": "Annandale-on-Hudson", "em2Text": "Bard College", "label": "/location/location/contains"}
#  ], 
#  "entityMentions": [
#      {"start": 1, "label": "ORGANIZATION", "text": "Bard College"}, 
#      {"start": 2, "label": "LOCATION", "text": "Annandale-on-Hudson"}
#  ], 
#  "sentId": "1"
# }

# Example of the target layout (tokens / entities / relations / orig_id):
# {
#     "tokens": ["Newspaper", "`", "Explains", "'", "U.S.", "Interests", "Section", "Events", "FL1402001894", "Havana", "Radio", "Reloj", "Network", "in", "Spanish", "2100", "GMT", "13", "Feb", "94"], 
#     "entities": [
#         {"type": "Loc", "start": 4, "end": 5}, 
#         {"type": "Loc", "start": 9, "end": 10}, 
#         {"type": "Org", "start": 10, "end": 13}, 
#         {"type": "Other", "start": 15, "end": 17}, 
#         {"type": "Other", "start": 17, "end": 20}
#     ], 
#     "relations": [
#         {"type": "OrgBased_In", "head": 2, "tail": 1}
#     ], 
#     "orig_id": 3255
# }

def convert_nyt_dataset_to_spert_format(in_file_path, out_file_path):
    """Convert one raw NYT split file and save the result as a single JSON list.

    Args:
        in_file_path: raw NYT file, read with ``read_json`` (one JSON per line).
        out_file_path: destination written with ``save_json``.
    """
    raw_examples = read_json(in_file_path)
    progress = tqdm(raw_examples, 'Examples from %s' % in_file_path)
    spert_examples = [convert_nyt_to_spert(raw_example) for raw_example in progress]
    save_json(out_file_path, spert_examples)

def convert_nyt_benchmark_to_spert_format():
    """Convert the train, validation and test NYT splits, in that order.

    Reads the module-level ``nyt_*_path`` inputs and writes the matching
    ``nyt_*_spert_path`` outputs.
    """
    convert_nyt_dataset_to_spert_format(in_file_path=nyt_train_path,
                                        out_file_path=nyt_train_spert_path)
    convert_nyt_dataset_to_spert_format(in_file_path=nyt_valid_path,
                                        out_file_path=nyt_valid_spert_path)
    convert_nyt_dataset_to_spert_format(in_file_path=nyt_test_path,
                                        out_file_path=nyt_test_spert_path)

In [17]:
# Run the full conversion: writes train.json, dev.json and test.json under spert_path.
convert_nyt_benchmark_to_spert_format()

Examples from /media/discoD/repositorios/CasRel/data/NYT/raw_NYT/raw_nyt/raw_train.json: 100%|██████████| 56196/56196 [00:05<00:00, 10654.71it/s]
Examples from /media/discoD/repositorios/CasRel/data/NYT/raw_NYT/raw_nyt/raw_valid.json: 100%|██████████| 5000/5000 [00:00<00:00, 24995.58it/s]
Examples from /media/discoD/repositorios/CasRel/data/NYT/raw_NYT/raw_nyt/raw_test.json: 100%|██████████| 5000/5000 [00:00<00:00, 9555.61it/s] 


In [None]:
# NOTE(review): `tacred_example` is not defined anywhere in the visible notebook;
# this cell looks like a leftover from a TACRED notebook -- confirm before running.
# Prints the subject and object token spans (end indices are treated as inclusive, hence the + 1).
print(tacred_example['token'][tacred_example['subj_start']:tacred_example['subj_end'] + 1])
print(tacred_example['token'][tacred_example['obj_start']:tacred_example['obj_end'] + 1])

In [9]:
# NOTE(review): `convert_tacred_benchmark_to_conll04_format` is not defined in the
# visible notebook; this call raises NameError on a fresh kernel -- confirm or remove.
convert_tacred_benchmark_to_conll04_format()

In [None]:
# NOTE(review): `tacred_train_path` is not defined in the visible notebook -- confirm.
# The bare last expression displays the entire list, which may be very large.
tacred_train_examples = read_json(tacred_train_path)
tacred_train_examples

In [None]:
# NOTE(review): `conll04_train_path` is not defined in the visible notebook -- confirm.
# The bare last expression displays the entire list, which may be very large.
conll04_train_examples = read_json(conll04_train_path)
conll04_train_examples

In [None]:
# NOTE(review): `convert_tacred_to_conll04` is not defined in the visible notebook -- confirm.
# Converts only the first TACRED training example, as a spot check.
convert_tacred_to_conll04(tacred_train_examples[0])

In [None]:
# NOTE(review): the `tacred_*_path` names are not defined in the visible notebook -- confirm.
# Load the three TACRED splits and concatenate them (train, then dev, then test).
tacred_train_examples = read_json(tacred_train_path)
tacred_dev_examples = read_json(tacred_dev_path)
tacred_test_examples = read_json(tacred_test_path)
tacred_examples = tacred_train_examples + tacred_dev_examples + tacred_test_examples

In [None]:
# Report how many examples `tacred_examples` holds.
print('Loaded %s examples' % len(tacred_examples))

In [None]:
# Display the first element of `tacred_examples` to inspect its fields.
tacred_examples[0]

In [None]:
# Gather the distinct entity types (subject and object) and relation labels
# that occur in `tacred_examples`, then print both sets.
entity_types, relation_types = set(), set()
for example in tacred_examples:
    relation_types.add(example['relation'])
    entity_types.update((example['subj_type'], example['obj_type']))
print(entity_types)
print(relation_types)

In [None]:
# Entity type catalogue: type name -> short and verbose display labels.
# Rows are (type name, short label, verbose label); order is preserved.
entities = {
    "entities": {
        type_name: {"short": short_label, "verbose": verbose_label}
        for type_name, short_label, verbose_label in (
            ("MISC", "Misc", "Miscellaneous"),
            ("RELIGION", "Relig", "Religion"),
            ("ORGANIZATION", "Org", "Organization"),
            ("LOCATION", "Loc", "Location"),
            ("DURATION", "Dur", "Duration"),
            ("COUNTRY", "Cntr", "Country"),
            ("NATIONALITY", "Nation", "Nationality"),
            ("CAUSE_OF_DEATH", "CoD", "Cause of Death"),
            ("IDEOLOGY", "Ideo", "Ideology"),
            ("URL", "url", "URL"),
            ("STATE_OR_PROVINCE", "SoP", "State or Province"),
            ("NUMBER", "Num", "Number"),
            ("CRIMINAL_CHARGE", "Crim", "Criminal Charge"),
            ("TITLE", "Tit", "Title"),
            ("PERSON", "Per", "Person"),
            ("DATE", "Dat", "Date"),
            ("CITY", "Cit", "City"),
        )
    }
}

In [None]:
# Relation type catalogue: relation label -> display labels and symmetry flag.
# The flags must be the Python literal `False`; the JSON spelling `false` is an
# undefined name in Python and makes this cell raise NameError.
# NOTE: a later cell rebinds the name `relations` to a set of labels.
relations = {
    "relations": {
        "org:city_of_headquarters": {"short": "City of Headquarters", "verbose": "City of Headquarters", "symmetric": False},
        "per:spouse": {"short": "Spouse", "verbose": "Spouse", "symmetric": False},
        "per:cities_of_residence": {"short": "Cities of Residence", "verbose": "Cities of Residence", "symmetric": False},
        "per:city_of_death": {"short": "City of Death", "verbose": "City of Death", "symmetric": False},
        "org:country_of_headquarters": {"short": "Country of Headquarters", "verbose": "Country of Headquarters", "symmetric": False},
        "per:country_of_birth": {"short": "Country of Birth", "verbose": "Country of Birth", "symmetric": False},
        "per:siblings": {"short": "Siblings", "verbose": "Siblings", "symmetric": False},
        "org:shareholders": {"short": "Shareholders", "verbose": "Shareholders", "symmetric": False},
        "org:member_of": {"short": "Member of", "verbose": "Member of", "symmetric": False},
        "org:dissolved": {"short": "Dissolved", "verbose": "Dissolved", "symmetric": False},
        "per:schools_attended": {"short": "Schools Attended", "verbose": "Schools Attended", "symmetric": False},
        "org:political/religious_affiliation": {"short": "Political/Religious affiliation", "verbose": "Political/Religious affiliation", "symmetric": False},
        "per:city_of_birth": {"short": "City of Birth", "verbose": "City of Birth", "symmetric": False},
        "per:children": {"short": "Children", "verbose": "Children", "symmetric": False},
        "org:top_members/employees": {"short": "Top Members / Employees", "verbose": "Top Members / Employees", "symmetric": False},
        "per:stateorprovince_of_birth": {"short": "State or Province of Birth", "verbose": "State or Province of Birth", "symmetric": False},
        "per:stateorprovince_of_death": {"short": "State or Province of Death", "verbose": "State or Province of Death", "symmetric": False},
        "org:number_of_employees/members": {"short": "Number of Employees/Members", "verbose": "Number of Employees/Members", "symmetric": False},
        "per:countries_of_residence": {"short": "Countries of Residence", "verbose": "Countries of Residence", "symmetric": False},
        "org:founded": {"short": "Founded", "verbose": "Founded", "symmetric": False},
        "per:country_of_death": {"short": "Country of Death", "verbose": "Country of Death", "symmetric": False},
        "per:title": {"short": "Title", "verbose": "Title", "symmetric": False},
        "org:stateorprovince_of_headquarters": {"short": "State or Province of Headquarters", "verbose": "State or Province of Headquarters", "symmetric": False},
        "per:religion": {"short": "Religion", "verbose": "Religion", "symmetric": False},
        "org:founded_by": {"short": "Founded by", "verbose": "Founded by", "symmetric": False},
        "per:age": {"short": "Age", "verbose": "Age", "symmetric": False},
        "no_relation": {"short": "No Relation", "verbose": "No Relation", "symmetric": False},
        "per:stateorprovinces_of_residence": {"short": "State or Provinces of Residence", "verbose": "State or Provinces of Residence", "symmetric": False},
        "org:website": {"short": "Website", "verbose": "Website", "symmetric": False},
        "per:employee_of": {"short": "Employee Of", "verbose": "Employee Of", "symmetric": False},
        "org:parents": {"short": "Organization Parents", "verbose": "Organization Parents", "symmetric": False},
        "per:parents": {"short": "Person Parents", "verbose": "Person Parents", "symmetric": False},
        "org:alternate_names": {"short": "Organization Alternate Names", "verbose": "Organization Alternate Names", "symmetric": False},
        "org:subsidiaries": {"short": "Subsidiaries", "verbose": "Subsidiaries", "symmetric": False},
        "per:alternate_names": {"short": "Person Alternate Names", "verbose": "Person Alternate Names", "symmetric": False},
        "per:cause_of_death": {"short": "Cause of Death", "verbose": "Cause of Death", "symmetric": False},
        "per:date_of_death": {"short": "Date of Death", "verbose": "Date of Death", "symmetric": False},
        "org:members": {"short": "Members", "verbose": "Members", "symmetric": False},
        "per:date_of_birth": {"short": "Date of Birth", "verbose": "Date of Birth", "symmetric": False},
        "per:charges": {"short": "Charges", "verbose": "Charges", "symmetric": False},
        "per:origin": {"short": "Origin", "verbose": "Origin", "symmetric": False},
        "per:other_family": {"short": "Other Family", "verbose": "Other Family", "symmetric": False}
    }
}

In [45]:
# NOTE(review): `convert_tacred_benchmark_to_semeval_format` is not defined in the
# visible notebook; this call raises NameError on a fresh kernel -- confirm or remove.
convert_tacred_benchmark_to_semeval_format()

In [26]:
# NOTE(review): `tacred_train_path_semeval` is not defined in the visible notebook -- confirm.
tacred_semeval_examples = read_json(tacred_train_path_semeval)

In [27]:
# Collect the distinct relation labels found in `tacred_semeval_examples`.
# NOTE: this rebinds `relations`, which an earlier cell assigns to a dict of
# relation metadata; after this cell the name refers to a set of label strings.
relations = set()
for example in tacred_semeval_examples:
    relations.add(example['relation'])
print(relations)

{'org:dissolved', 'org:city_of_headquarters', 'per:title', 'org:founded_by', 'org:alternate_names', 'no_relation', 'per:children', 'org:members', 'per:date_of_death', 'per:religion', 'per:other_family', 'per:origin', 'org:country_of_headquarters', 'org:member_of', 'per:countries_of_residence', 'per:cause_of_death', 'org:number_of_employees/members', 'org:founded', 'org:subsidiaries', 'per:cities_of_residence', 'org:top_members/employees', 'per:parents', 'per:city_of_birth', 'org:shareholders', 'per:employee_of', 'per:country_of_birth', 'per:age', 'per:country_of_death', 'per:alternate_names', 'org:website', 'per:stateorprovince_of_birth', 'per:stateorprovinces_of_residence', 'org:political/religious_affiliation', 'per:siblings', 'per:date_of_birth', 'org:parents', 'per:charges', 'per:stateorprovince_of_death', 'per:spouse', 'per:city_of_death', 'org:stateorprovince_of_headquarters', 'per:schools_attended'}


In [30]:
# Build the relation-label -> integer-id mapping and save it.
# Iterating a set of strings directly yields a different order from one
# interpreter run to the next (string hash randomization), which would make the
# saved ids irreproducible; sorting the labels pins a stable assignment.
relations_list = {}
for idx, relation in enumerate(sorted(relations)):
    relations_list[relation] = idx
save_json(tacred_rel2id_path_semeval, relations_list)

In [32]:
# NOTE(review): `semeval_example` is not defined in the visible notebook -- confirm.
# Writes the same example twice to 'test.json' in the current working directory.
save_json_list('test.json', [semeval_example, semeval_example])

TypeError: dumps() takes 1 positional argument but 2 were given