#### This notebook aim to update FB15K237. It allows to align the common entities of WN18RR and FB15K237 which are common. We choose not to align the entities that are present in the WN18RR and FB15K237 test sets.

In [2]:
from kdmkr import utils

In [3]:
import json

In [4]:
import pandas as pd

#### FB15K-237-WN18RR_annotated - PositiveOnly refer to shared entities between fb15k237 and wn18rr. 

In [5]:
intersection = pd.read_csv('FB15K-237-WN18RR_annotated - PositiveOnly.csv', header=None)
intersection.columns = ['id', 'entities_wn']

In [6]:
intersection['entities_fb'] = intersection['entities_wn'].str.split('.').str[0]

In [7]:
intersection.head()

Unnamed: 0,id,entities_wn,entities_fb
0,/m/082pc,west_bank.n.01,west_bank
1,/m/01lyv,country_music.n.01,country_music
2,/m/0jm_,american_football.n.01,american_football
3,/m/06mz5,south_dakota.n.01,south_dakota
4,/m/0jfx,antarctica.n.01,antarctica


#### Load entities

In [8]:
entities_fb = utils.read_json('./fb15k237/entities.json')
entities_wn = utils.read_json('./wn18rr/entities.json')

In [9]:
entities_fb_prime = {value: key for key, value in entities_fb.items()}
entities_wn_prime = {value: key for key, value in entities_wn.items()}

#### Load relations

In [10]:
relations_fb = utils.read_json('./fb15k237/relations.json')
relations_wn = utils.read_json('./wn18rr/relations.json')

In [11]:
relations_fb_prime = {value: key for key, value in relations_fb.items()}
relations_wn_prime = {value: key for key, value in relations_wn.items()}

#### Load FB15K237 test set

In [12]:
test_wn = pd.read_csv('./wn18rr/test.csv', header=None)
test_wn.columns = ['head', 'relation', 'tail']

In [13]:
test_wn['head'] = test_wn['head'].apply(lambda x: entities_wn_prime[x])
test_wn['relation'] = test_wn['relation'].apply(lambda x: relations_wn_prime[x])
test_wn['tail'] = test_wn['tail'].apply(lambda x: entities_wn_prime[x])

In [14]:
test_wn.head()

Unnamed: 0,head,relation,tail
0,trade_name.n.01,_member_of_domain_usage,metharbital.n.01
1,call.v.03,_verb_group,call.v.09
2,united_kingdom.n.01,_member_of_domain_region,facer.n.01
3,blattodea.n.01,_member_meronym,cockroach.n.01
4,rickettsiaceae.n.01,_hypernym,bacteria_family.n.01


In [15]:
test_fb = pd.read_csv('./fb15k237/test.csv', header=None)
test_fb.columns = ['head', 'relation', 'tail']

In [16]:
test_fb['head'] = test_fb['head'].apply(lambda x: entities_fb_prime[x])
test_fb['relation'] = test_fb['relation'].apply(lambda x: relations_fb_prime[x])
test_fb['tail'] = test_fb['tail'].apply(lambda x: entities_fb_prime[x])

#### Gather entities of fb15k237

In [17]:
entities_test_fb = pd.concat([test_fb['head'], test_fb['tail']]).to_frame().drop_duplicates()
entities_test_fb.columns = ['entities_test']

In [18]:
entities_test_fb.head()

Unnamed: 0,entities_test
0,zürich
1,autoharp
2,winnie_the_pooh
3,england
5,brad_dourif


#### Load WN18RR test set

In [19]:
test_wn = pd.read_csv('./wn18rr/test.csv', header=None)
test_wn.columns = ['head', 'relation', 'tail']

In [20]:
test_wn['head']     = test_wn['head'].apply(lambda x: entities_wn_prime[x])
test_wn['relation'] = test_wn['relation'].apply(lambda x: relations_wn_prime[x])
test_wn['tail']     = test_wn['tail'].apply(lambda x: entities_wn_prime[x])

In [21]:
entities_test_wn = pd.concat([test_wn['head'], test_wn['tail']]).to_frame().drop_duplicates()
entities_test_wn.columns = ['entities_test']

In [22]:
entities_test_wn.head()

Unnamed: 0,entities_test
0,trade_name.n.01
1,call.v.03
2,united_kingdom.n.01
3,blattodea.n.01
4,rickettsiaceae.n.01


#### Remove POS tagging from wordnet

In [23]:
entities_test_wn['entities_test'] = entities_test_wn['entities_test'].str.split('.').str[0]

In [24]:
entities_test_wn.head()

Unnamed: 0,entities_test
0,trade_name
1,call
2,united_kingdom
3,blattodea
4,rickettsiaceae


#### Gather entities of WN18RR and FB15K237 which belong to the test set:

In [25]:
entities_test = pd.concat([entities_test_fb, entities_test_wn]).drop_duplicates()

In [26]:
entities_test.head()

Unnamed: 0,entities_test
0,zürich
1,autoharp
2,winnie_the_pooh
3,england
5,brad_dourif


#### Remove entities that are common to WN18RR and FB15K237 and that are in the test set

In [27]:
intersection = pd.merge(
    left     = intersection,
    right    = entities_test,
    left_on  = 'entities_fb',
    right_on = 'entities_test',
    how      = 'left'
)

In [28]:
intersection.head()

Unnamed: 0,id,entities_wn,entities_fb,entities_test
0,/m/082pc,west_bank.n.01,west_bank,
1,/m/01lyv,country_music.n.01,country_music,country_music
2,/m/0jm_,american_football.n.01,american_football,american_football
3,/m/06mz5,south_dakota.n.01,south_dakota,south_dakota
4,/m/0jfx,antarctica.n.01,antarctica,


In [29]:
#intersection = intersection[intersection['entities_test'].isnull()]
intersection = intersection[['entities_wn', 'entities_fb']]

#### Common entities that are not in the test set of FB15K237 or WN18RR:

In [30]:
intersection.head()

Unnamed: 0,entities_wn,entities_fb
0,west_bank.n.01,west_bank
1,country_music.n.01,country_music
2,american_football.n.01,american_football
3,south_dakota.n.01,south_dakota
4,antarctica.n.01,antarctica


In [31]:
mapping_fb15K237_wn18rr = intersection.set_index('entities_fb').to_dict()['entities_wn']

In [32]:
mapping_fb15K237_wn18rr

{'west_bank': 'west_bank.n.01',
 'country_music': 'country_music.n.01',
 'american_football': 'american_football.n.01',
 'south_dakota': 'south_dakota.n.01',
 'antarctica': 'antarctica.n.01',
 'clarinet': 'clarinet.n.01',
 'harmonica': 'harmonica.n.01',
 'papua_new_guinea': 'papua_new_guinea.n.01',
 'north_dakota': 'north_dakota.n.01',
 'cairo': 'cairo.n.02',
 'latvia': 'latvia.n.01',
 'belarus': 'belarus.n.01',
 'trondheim': 'trondheim.n.01',
 'author': 'author.v.01',
 'liguria': 'liguria.n.01',
 'trinidad_and_tobago': 'trinidad_and_tobago.n.01',
 'baku': 'baku.n.01',
 'pittsburgh': 'pittsburgh.n.01',
 'tallinn': 'tallinn.n.01',
 'dhaka': 'dhaka.n.01',
 'bolivia': 'bolivia.n.01',
 'bruges': 'bruges.n.01',
 'uzbekistan': 'uzbekistan.n.01',
 'iraq': 'iraq.n.01',
 'bremen': 'bremen.n.01',
 'oregon': 'oregon.n.01',
 'statistics': 'statistics.n.01',
 'anthropology': 'anthropology.n.01',
 'mountain': 'mountain.n.01',
 'british_isles': 'british_isles.n.01',
 'lower_saxony': 'lower_saxony.n.0

#### Add pos tagging to FB15K237 entities which are shared by WN18RR:

In [33]:
import collections
import copy

In [34]:
entities_fb = utils.read_json('./fb15k237/entities.json')

In [35]:
entities_aligned = 0
for e, key in copy.deepcopy(entities_fb).items():
    if e in mapping_fb15K237_wn18rr:
        entities_fb[mapping_fb15K237_wn18rr[e]] = entities_fb.pop(e)
        entities_aligned += 1
print(f'Number of entities aligned: {entities_aligned}')

Number of entities aligned: 754


In [36]:
entities_fb

{'mighty_morphin_power_rangers': 1,
 'drama_film': 2,
 'michelle_rodriguez': 3,
 'australia_national_association_football_team': 4,
 'maldives_national_football_team': 5,
 'bryan_singer': 6,
 'bafta_award_for_best_original_screenplay': 7,
 'danny_devito': 8,
 'academy_award_for_best_foreign_language_film': 10,
 'anonymous': 11,
 'city_of_angels': 12,
 'bridesmaids': 13,
 'serpico': 14,
 'catherine_keener': 15,
 'united_states_of_america': 16,
 'stan_lee': 17,
 'california_institute_of_the_arts': 18,
 'chiwetel_ejiofor': 19,
 'golden_raspberry_award_for_worst_supporting_actor': 20,
 'oprah_winfrey': 21,
 'washington,_d.c.': 22,
 'saeed_jaffrey': 23,
 'heartland_rock': 24,
 'hunter_s._thompson': 25,
 'avant-garde_jazz': 26,
 'yale_university': 27,
 'frankenweenie': 28,
 'the_bahamas': 29,
 'alan_burnett': 30,
 'bill_payne': 31,
 'billy_ray_cyrus': 32,
 'fantastic_four:_rise_of_the_silver_surfer': 33,
 '52nd_annual_grammy_awards': 34,
 'american_reunion': 35,
 'bafta_award_for_best_makeup

In [37]:
with open('./kdmkr_fb15K237/entities.json', 'w') as output:
    json.dump(entities_fb, output, indent=4)

#### Update relations

In [38]:
relations_fb = utils.read_json('./fb15k237/relations.json')

In [39]:
relations_wn = utils.read_json('./wn18rr/relations.json')

In [40]:
relations_wn

{'_hypernym': 0,
 '_derivationally_related_form': 1,
 '_instance_hypernym': 2,
 '_also_see': 3,
 '_member_meronym': 4,
 '_synset_domain_topic_of': 5,
 '_has_part': 6,
 '_member_of_domain_usage': 7,
 '_member_of_domain_region': 8,
 '_verb_group': 9,
 '_similar_to': 10}

#### Align `'/location/location/contains'` with `_has_part`

In [41]:
relations_fb['_has_part'] = relations_fb.pop('/location/location/contains')

In [42]:
relations_fb

{'/location/country/form_of_government': 0,
 '/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor': 1,
 '/media_common/netflix_genre/titles': 2,
 '/award/award_winner/awards_won./award/award_honor/award_winner': 3,
 '/soccer/football_team/current_roster./sports/sports_team_roster/position': 4,
 '/soccer/football_team/current_roster./soccer/football_roster_position/position': 5,
 '/film/actor/film./film/performance/film': 6,
 '/award/award_category/nominees./award/award_nomination/nominated_for': 7,
 '/award/award_nominee/award_nominations./award/award_nomination/award_nominee': 8,
 '/music/performance_role/regular_performances./music/group_membership/role': 9,
 '/award/award_category/winners./award/award_honor/ceremony': 10,
 '/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium': 11,
 '/award/award_winning_work/awards_won./award/award_honor/award_winner': 12,
 '/film/film/release_date_s./film/film_regional_release_date/film_release_region

In [43]:
with open('./kdmkr_fb15K237/relations.json', 'w') as output:
    json.dump(relations_fb, output, indent=4)

#### Extract intersection of training set of WN18RR and FB15K237

In [44]:
train_wn = pd.read_csv('./wn18rr/train.csv', header=None)
train_wn.columns = ['head', 'relation', 'tail']

In [45]:
e_wn = utils.read_json('./wn18rr/entities.json')
e_wn_prime = {value: key for key, value in e_wn.items()}

In [46]:
r_wn = utils.read_json('./wn18rr/relations.json')
r_wn_prime = {value: key for key, value in r_wn.items()}

In [47]:
train_wn['head'] = train_wn['head'].apply(lambda x: e_wn_prime[x])
train_wn['relation'] = train_wn['relation'].apply(lambda x: r_wn_prime[x])
train_wn['tail'] = train_wn['tail'].apply(lambda x: e_wn_prime[x])

In [48]:
train_wn.head()

Unnamed: 0,head,relation,tail
0,land_reform.n.01,_hypernym,reform.n.01
1,cover.v.01,_derivationally_related_form,covering.n.02
2,botany.n.02,_derivationally_related_form,botanize.v.01
3,kamet.n.01,_instance_hypernym,mountain_peak.n.01
4,question.n.01,_derivationally_related_form,ask.v.01


In [49]:
train_fb = pd.read_csv('./kdmkr_fb15k237/train.csv', header=None)
train_fb.columns = ['head', 'relation', 'tail']

In [50]:
e_fb = utils.read_json('./kdmkr_fb15k237/entities.json')
e_fb_prime = {value: key for key, value in e_fb.items()}

In [51]:
r_fb = utils.read_json('./kdmkr_fb15k237/relations.json')
r_fb_prime = {value: key for key, value in r_fb.items()}

In [52]:
train_fb['head'] = train_fb['head'].apply(lambda x: e_fb_prime[x])
train_fb['relation'] = train_fb['relation'].apply(lambda x: r_fb_prime[x])
train_fb['tail'] = train_fb['tail'].apply(lambda x: e_fb_prime[x])

In [54]:
train_fb.head()

Unnamed: 0,head,relation,tail
0,dominican_republic.n.01,/location/country/form_of_government,republic
1,mighty_morphin_power_rangers,/tv/tv_program/regular_cast./tv/regular_tv_app...,wendee_lee
2,drama_film,/media_common/netflix_genre/titles,american_history_x
3,michelle_rodriguez,/award/award_winner/awards_won./award/award_ho...,naveen_andrews
4,australia_national_association_football_team,/soccer/football_team/current_roster./sports/s...,midfielder


In [64]:
e_inter = intersection.set_index('entities_wn').to_dict()['entities_fb']

In [65]:
intersection_fb = train_fb[(train_fb['head'].isin(e_inter)) & (train_fb['relation'].isin(r_wn)) & (train_fb['tail'].isin(e_inter))]

In [66]:
intersection_wn = train_wn[(train_wn['head'].isin(e_inter)) & (train_wn['relation'].isin(r_fb)) & (train_wn['tail'].isin(e_inter))]

In [67]:
intersection = pd.concat([intersection_wn, intersection_fb])

In [68]:
intersection = intersection.drop_duplicates()
intersection

Unnamed: 0,head,relation,tail
436,israel.n.01,_has_part,jerusalem.n.01
730,europe.n.01,_has_part,switzerland.n.01
863,great_britain.n.02,_has_part,scotland.n.01
929,caribbean.n.02,_has_part,jamaica.n.01
1129,germany.n.01,_has_part,stuttgart.n.01
...,...,...,...
260616,latin_america.n.01,_has_part,paraguay.n.01
264491,europe.n.01,_has_part,alps.n.01
265946,greece.n.01,_has_part,sparta.n.01
267046,latin_america.n.01,_has_part,peru.n.01


In [69]:
intersection_reverse = copy.deepcopy(intersection)

In [70]:
intersection_reverse['head'], intersection_reverse['tail'] = intersection_reverse['tail'], intersection_reverse['head']

In [71]:
intersection_reverse.head()

Unnamed: 0,head,relation,tail
436,jerusalem.n.01,_has_part,jerusalem.n.01
730,switzerland.n.01,_has_part,switzerland.n.01
863,scotland.n.01,_has_part,scotland.n.01
929,jamaica.n.01,_has_part,jamaica.n.01
1129,stuttgart.n.01,_has_part,stuttgart.n.01


In [72]:
test = pd.concat([test_fb, test_wn]).drop_duplicates()
test['test'] = True

In [74]:
leak = pd.merge(
    left  = intersection,
    right = test,
    on    = ['head', 'relation', 'tail'],
    how   = 'inner',
)

leak.head()

Unnamed: 0,head,relation,tail,test
0,netherlands.n.01,_has_part,rotterdam.n.01,True
1,germany.n.01,_has_part,aachen.n.01,True
2,germany.n.01,_has_part,lower_saxony.n.01,True
3,europe.n.01,_has_part,romania.n.01,True
4,germany.n.01,_has_part,bonn.n.01,True


In [119]:
intersection = pd.merge(
    left = intersection,
    right = test,
    on = ['head', 'relation', 'tail'],
    how = 'left',
)

In [120]:
intersection = intersection[intersection['test'].isnull()]

In [121]:
intersection = intersection[['head', 'relation', 'tail']]

In [122]:
intersection_fb15k237 = copy.deepcopy(intersection)

In [123]:
intersection_fb15k237['head'] = intersection_fb15k237['head'].apply(lambda x: e_fb[x])
intersection_fb15k237['relation'] = intersection_fb15k237['relation'].apply(lambda x: r_fb[x])
intersection_fb15k237['tail'] = intersection_fb15k237['tail'].apply(lambda x: e_fb[x])

In [124]:
intersection_fb15k237.head()

Unnamed: 0,head,relation,tail
0,8193,15,12233
1,179,15,343
2,6884,15,6817
3,8820,15,164
4,772,15,5033


In [125]:
intersection_fb15k237.to_csv('./intersection/fb15k237.py', index=False, header=None)

In [126]:
intersection_wn18rr = copy.deepcopy(intersection)

In [127]:
intersection_wn18rr['head']     = intersection_wn18rr['head'].apply(lambda x: e_wn[x])
intersection_wn18rr['relation'] = intersection_wn18rr['relation'].apply(lambda x: r_wn[x])
intersection_wn18rr['tail']     = intersection_wn18rr['tail'].apply(lambda x: e_wn[x])

In [128]:
intersection_wn18rr.head()

Unnamed: 0,head,relation,tail
0,436,6,19120
1,56,6,1746
2,843,6,659
3,907,6,34621
4,325,6,37921


In [129]:
intersection_wn18rr.to_csv('./intersection/wn18rr.py', index=False, header=None)