In [249]:
import json
import os

import pandas as pd

In [250]:
# Training triples: tab-separated text without a header row, so the column
# names are supplied while reading.
train = pd.read_csv(
    './FB15k-237/train.txt',
    sep='\t',
    header=None,
    names=['head', 'relation', 'tail'],
)

In [251]:
# Test triples: tab-separated text without a header row, so the column
# names are supplied while reading.
test = pd.read_csv(
    './FB15k-237/test.txt',
    sep='\t',
    header=None,
    names=['head', 'relation', 'tail'],
)

In [252]:
# Validation triples: tab-separated text without a header row, so the column
# names are supplied while reading.
valid = pd.read_csv(
    './FB15k-237/valid.txt',
    sep='\t',
    header=None,
    names=['head', 'relation', 'tail'],
)

In [253]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere until this points at a local copy of entity2wikidata.json.
path_wikidata = '/Users/home/Documents/github/datasets_knowledge_embedding/FB15K-237/entity2wikidata.json'

# Parse the JSON file into `entities_wikidata`. The context manager closes
# the file handle as soon as parsing finishes, and the explicit encoding
# keeps non-ASCII labels readable regardless of the platform default.
with open(path_wikidata, encoding='utf-8') as wikidata_file:
    entities_wikidata = json.load(wikidata_file)

In [254]:
def get_entity(entity, entities_wikidata):
    """Return a normalised label for ``entity``, or ``entity`` itself.

    Parameters
    ----------
    entity
        Key to look up in ``entities_wikidata``.
    entities_wikidata
        Mapping from entity key to a record holding a ``'label'`` string.

    Returns
    -------
    The record's label lower-cased with every space replaced by an
    underscore when ``entity`` is a key of the mapping; otherwise
    ``entity`` unchanged.

    Notes
    -----
    Distinct keys whose labels normalise to the same text produce the same
    return value.
    """
    if entity not in entities_wikidata:
        return entity

    label = entities_wikidata[entity]['label']
    # Splitting on a single space and re-joining swaps each space for '_'.
    return '_'.join(label.lower().split(' '))

In [255]:
# Replace the raw identifiers in both entity columns of every split with the
# value returned by get_entity (train first, then valid, then test).
for split in (train, valid, test):
    for column in ('head', 'tail'):
        split[column] = split[column].apply(
            lambda raw_id: get_entity(raw_id, entities_wikidata)
        )

In [256]:
# Preview the first rows of `train`.
train.head()

Unnamed: 0,head,relation,tail
0,dominican_republic,/location/country/form_of_government,republic
1,mighty_morphin_power_rangers,/tv/tv_program/regular_cast./tv/regular_tv_app...,wendee_lee
2,drama_film,/media_common/netflix_genre/titles,american_history_x
3,michelle_rodriguez,/award/award_winner/awards_won./award/award_ho...,naveen_andrews
4,australia_national_association_football_team,/soccer/football_team/current_roster./sports/s...,midfielder


In [257]:
# Preview the first rows of `valid`.
valid.head()

Unnamed: 0,head,relation,tail
0,american_pie,/film/film/genre,romance_film
1,st._louis,/location/location/time_zones,central_time_zone
2,george_burns,/people/person/spouse_s./people/marriage/type_...,marriage
3,primetime_emmy_award_for_outstanding_writing_f...,/award/award_category/winners./award/award_hon...,david_chase
4,silent_hill,/film/film/release_date_s./film/film_regional_...,lithuania


In [258]:
# Preview the first rows of `test`.
test.head()

Unnamed: 0,head,relation,tail
0,zürich,/travel/travel_destination/climate./travel/tra...,october
1,autoharp,/music/performance_role/regular_performances./...,heart
2,winnie_the_pooh,/film/film/release_date_s./film/film_regional_...,france
3,england,/location/location/contains,pontefract
4,england,/location/location/contains,lancaster


In [259]:
# Stack the three splits row-wise (train, then test, then valid); the
# original row indices are kept as-is.
df = pd.concat([train, test, valid], axis=0)

# One single-column frame named 'head' listing every head value followed by
# every tail value.
head_column = df[['head']]
tail_column = df[['tail']].rename(columns={'tail': 'head'})
entities = pd.concat([head_column, tail_column], axis=0, sort=False)

In [260]:
# Keep the first occurrence of each value and renumber from 0, so every
# distinct entity gets a consecutive integer in order of first appearance.
unique_entities = entities['head'].drop_duplicates().reset_index(drop=True)

# Invert position -> value into the value -> position lookup used below.
entities = {label: position for position, label in unique_entities.items()}

In [261]:
# Create the output folder if it is missing; without it this cell (and the
# later cells writing into ./fb15k237) fails with FileNotFoundError on a
# fresh checkout.
os.makedirs('./fb15k237', exist_ok=True)

# Persist the entity -> integer id lookup as indented JSON.
with open('./fb15k237/entities.json', 'w') as output:
    json.dump(entities, output, indent=4)

In [262]:
# Distinct relation strings numbered from 0 in order of first appearance
# in `df`.
unique_relations = df['relation'].drop_duplicates().reset_index(drop=True)

# Invert position -> relation into the relation -> position lookup.
relations = {name: position for position, name in unique_relations.items()}

# Persist the relation -> integer id lookup as indented JSON.
with open('./fb15k237/relations.json', 'w') as relations_file:
    json.dump(relations, relations_file, indent=4)

In [263]:
# Replace each column of `train` with the integer ids from its lookup;
# a value missing from the lookup raises KeyError.
for column, lookup in (('head', entities), ('relation', relations), ('tail', entities)):
    train[column] = train[column].apply(lookup.__getitem__)

In [264]:
# Preview `train` after its columns were replaced via the entities/relations lookups.
train.head()

Unnamed: 0,head,relation,tail
0,0,0,13624
1,1,1,3865
2,2,2,11613
3,3,3,5093
4,4,4,1243


In [265]:
# Replace each column of `valid` with the integer ids from its lookup;
# a value missing from the lookup raises KeyError.
for column, lookup in (('head', entities), ('relation', relations), ('tail', entities)):
    valid[column] = valid[column].apply(lookup.__getitem__)

In [266]:
# Preview `valid` after its columns were replaced via the entities/relations lookups.
valid.head()

Unnamed: 0,head,relation,tail
0,4349,31,2741
1,9339,154,13672
2,6274,85,950
3,7226,47,2160
4,1404,13,596


In [267]:
# Replace each column of `test` with the integer ids from its lookup;
# a value missing from the lookup raises KeyError.
for column, lookup in (('head', entities), ('relation', relations), ('tail', entities)):
    test[column] = test[column].apply(lookup.__getitem__)

In [268]:
# Preview `test` after its columns were replaced via the entities/relations lookups.
test.head()

Unnamed: 0,head,relation,tail
0,4798,148,10648
1,838,92,12858
2,4578,13,352
3,1164,15,14139
4,1164,15,12385


In [269]:
# Write `train` as CSV with neither the index column nor a header row.
train.to_csv('./fb15k237/train.csv', header=False, index=False)

In [270]:
# Write `valid` as CSV with neither the index column nor a header row.
valid.to_csv('./fb15k237/valid.csv', header=False, index=False)

In [271]:
# Write `test` as CSV with neither the index column nor a header row.
test.to_csv('./fb15k237/test.csv', header=False, index=False)