In [1]:
import json
import pandas as pd

### Merge fb15k  with wn18rr:

In [2]:
relations_fb15k = json.loads(open(f'./fb15k237/relations.json').read())
relations_wn18  = json.loads(open(f'./wn18rr/relations.json').read())
relations_fb15k.update(relations_wn18)

In [3]:
# '/location/location/contains' -> '_has_part'
id_has_part = relations_fb15k.pop('/location/location/contains')
relations_fb15k['_has_part'] = id_has_part

# '/people/profession/specialization_of' -> '_hypernym'
id_hypernym = relations_fb15k.pop('/people/profession/specialization_of')
relations_fb15k['_hypernym'] = id_hypernym

#'/user/ktrueman/default_domain/international_organization/member_states' -> '_member_meronym'
id_member_meronym = relations_fb15k.pop('/user/ktrueman/default_domain/international_organization/member_states')
relations_fb15k['_member_meronym'] = id_member_meronym

In [4]:
id_has_part

15

In [149]:
relations_fb15k = {key: id for id, (key, _) in enumerate(relations_fb15k.items())}

In [150]:
mapping_new_relation = {
    id_has_part: relations_fb15k['_has_part'],
    id_hypernym: relations_fb15k['_hypernym'],
    id_member_meronym: relations_fb15k['_member_meronym'],
    
}

In [151]:
with open('./kdmkr_fb15k237/relations.json', 'w') as output_file:
    json.dump(relations_fb15k, output_file, indent=4)

In [152]:
entities_wn18 = json.loads(open(f'./wn18rr/entities.json').read())

entities_wn18 = pd.DataFrame(
    pd.Series(entities_wn18)
).reset_index().drop(0, axis = 'columns').rename(columns={'index': 'entity'})

entities_wn18['tag'] = entities_wn18['entity'].str[-5:]

entities_wn18['entity'] = entities_wn18['entity'].str.split('.').str[0]

entities_wn18 = entities_wn18.drop_duplicates('entity')

In [153]:
train = pd.read_csv('./fb15k237/train.csv', header = None)
valid = pd.read_csv('./fb15k237/valid.csv', header = None)
test  = pd.read_csv('./fb15k237/test.csv', header  = None)

train.columns = ['head', 'relation', 'tail']
valid.columns = ['head', 'relation', 'tail']
test.columns  = ['head', 'relation', 'tail']

In [154]:
train

Unnamed: 0,head,relation,tail
0,0,0,13624
1,1,1,3865
2,2,2,11613
3,3,3,5093
4,4,4,1243
...,...,...,...
272110,5476,11,13625
272111,1483,20,1445
272112,4820,170,10221
272113,2386,30,5370


In [155]:
def map_new_relation(r, mapping_new_relation):
    if r in mapping_new_relation:
        return mapping_new_relation[r]
    else:
        return r
    
train['relation'] = train['relation'].apply(lambda r: map_new_relation(r, mapping_new_relation))


valid['relation'] = valid['relation'].apply(lambda r: map_new_relation(r, mapping_new_relation))

test['relation'] = test['relation'].apply(lambda r: map_new_relation(r, mapping_new_relation))


In [156]:
entities_fb15k = json.loads(open(f'./fb15k237/entities.json').read())
entities_fb15k = {value: key for key, value in entities_fb15k.items()}

In [157]:
train['head'] = train['head'].apply(lambda x: entities_fb15k[x])
train['tail'] = train['tail'].apply(lambda x: entities_fb15k[x])

valid['head'] = valid['head'].apply(lambda x: entities_fb15k[x])
valid['tail'] = valid['tail'].apply(lambda x: entities_fb15k[x])

test['head'] = test['head'].apply(lambda x: entities_fb15k[x])
test['tail'] = test['tail'].apply(lambda x: entities_fb15k[x])

In [158]:
train.head()

Unnamed: 0,head,relation,tail
0,dominican_republic,0,republic
1,mighty_morphin_power_rangers,1,wendee_lee
2,drama_film,2,american_history_x
3,michelle_rodriguez,3,naveen_andrews
4,australia_national_association_football_team,4,midfielder


In [159]:
def update_entity_name_fb15k(on, df):

    df = pd.merge(
        left     = df,
        right    = entities_wn18,
        how      = 'left',
        left_on  = on,
        right_on = 'entity'
    )

    df['tag'] = df['tag'].fillna('')

    df[on] = df[on] + df['tag']

    df.drop(['entity', 'tag'], axis='columns', inplace=True)
    
    return df

In [160]:
train = update_entity_name_fb15k(on='head', df=train)
train = update_entity_name_fb15k(on='tail', df=train)

valid = update_entity_name_fb15k(on='head', df=valid)
valid = update_entity_name_fb15k(on='tail', df=valid)

test = update_entity_name_fb15k(on='head', df=test)
test = update_entity_name_fb15k(on='tail', df=test)

In [161]:
df = pd.concat([train, valid, test])

In [162]:
entities = pd.concat(
    [
        pd.DataFrame(df['head']).rename(columns={'head':'entity'}), 
        pd.DataFrame(df['tail']).rename(columns={'tail':'entity'}), 
    ], 
    sort=False
)

In [163]:
entities = entities.drop_duplicates().reset_index(drop=True)

In [164]:
entities = entities.to_dict()['entity']
entities = {value: key for key, value in entities.items()}

In [165]:
with open('./kdmkr_fb15k237/entities.json', 'w') as output_file:
    json.dump(entities, output_file, indent=4)

In [166]:
train['head'] = train['head'].apply(lambda x: entities[x])
train['tail'] = train['tail'].apply(lambda x: entities[x])

valid['head'] = valid['head'].apply(lambda x: entities[x])
valid['tail'] = valid['tail'].apply(lambda x: entities[x])

test['head'] = test['head'].apply(lambda x: entities[x])
test['tail'] = test['tail'].apply(lambda x: entities[x])

In [167]:
train.head()

Unnamed: 0,head,relation,tail
0,0,0,13624
1,1,1,3865
2,2,2,11613
3,3,3,5093
4,4,4,1243


In [168]:
valid.head()

Unnamed: 0,head,relation,tail
0,4349,31,2741
1,9339,154,13672
2,6274,85,950
3,7226,47,2160
4,1404,13,596


In [169]:
test.head()

Unnamed: 0,head,relation,tail
0,4798,148,10648
1,838,92,12858
2,4578,13,352
3,1164,240,14139
4,1164,240,12385


In [170]:
train.to_csv('./kdmkr_fb15k237/train.csv', header=None, index=False)
valid.to_csv('./kdmkr_fb15k237/valid.csv', header=None, index=False)
test.to_csv('./kdmkr_fb15k237/test.csv', header=None, index=False)

## Merge wn18rr with fb15k237:

In [171]:
relations_fb15k = json.loads(open(f'./fb15k237/relations.json').read())
relations_wn18  = json.loads(open(f'./wn18rr/relations.json').read())
relations_wn18.update(relations_fb15k)
relations_wn18 = {key: id for id, (key, _) in enumerate(relations_wn18.items())}

In [172]:
with open('./kdmkr_wn18rr/relations.json', 'w') as output_file:
    json.dump(relations_wn18, output_file, indent=4)

In [173]:
train = pd.read_csv('./wn18rr/train.csv', header=None)
valid = pd.read_csv('./wn18rr/valid.csv', header=None)
test  = pd.read_csv('./wn18rr/test.csv', header=None)

In [174]:
train.to_csv('./kdmkr_wn18rr/train.csv', index=False, header=None)
valid.to_csv('./kdmkr_wn18rr/valid.csv', index=False, header=None)
test.to_csv('./kdmkr_wn18rr/test.csv', index=False, header=None)

# Strategie 2

In [175]:
# 237 entities for fb15k237
# 11  entities for wn18rr

In [51]:
inverse_relations_fb15k = {id: key for key, id in relations_fb15k.items()}
df_fb_15k = df.copy(deep=True)
df_fb_15k['relation'] = df_fb_15k['relation'].apply(lambda r: inverse_relations_fb15k[r])
df_fb_15k.columns = ['head', 'relation_fb15K', 'tail']
df_fb_15k.head()

Unnamed: 0,head,relation_fb15K,tail
0,dominican_republic.n.01,/location/country/form_of_government,republic
1,mighty_morphin_power_rangers,/tv/tv_program/regular_cast./tv/regular_tv_app...,wendee_lee
2,drama_film,/media_common/netflix_genre/titles,american_history_x
3,michelle_rodriguez,/award/award_winner/awards_won./award/award_ho...,naveen_andrews
4,australia_national_association_football_team,/soccer/football_team/current_roster./sports/s...,midfielder


In [52]:
inversed_relations_wn18 = {id: key for key, id in relations_wn18.items()}
entities_wn_18 = json.loads(open(f'./wn18rr/entities.json').read())
inversed_entities_wn_18 = {id: key for key, id in entities_wn_18.items()}
wn18rr_df = pd.concat([train, valid, test], sort = False)
wn18rr_df.columns = ['head', 'relation_wn18', 'tail']
wn18rr_df['relation_wn18'] = wn18rr_df['relation_wn18'].apply(lambda r: inversed_relations_wn18[r])
wn18rr_df['head'] = wn18rr_df['head'].apply(lambda h: inversed_entities_wn_18[h])
wn18rr_df['tail'] = wn18rr_df['tail'].apply(lambda t: inversed_entities_wn_18[t])

In [53]:
wn18rr_df.head()

Unnamed: 0,head,relation_wn18,tail
0,land_reform.n.01,_hypernym,reform.n.01
1,cover.v.01,_derivationally_related_form,covering.n.02
2,botany.n.02,_derivationally_related_form,botanize.v.01
3,kamet.n.01,_instance_hypernym,mountain_peak.n.01
4,question.n.01,_derivationally_related_form,ask.v.01


In [54]:
common = pd.merge(
    left = df_fb_15k,
    right = wn18rr_df,
    on = ['head', 'tail'],
    how = 'left'
)

In [55]:
common = common[common['relation_wn18'].notnull()]

In [56]:
common_entities = pd.concat([
    common['head'].rename(columns={'head': 'entity'}),
    common['tail'].rename(columns={'tail': 'entity'}),
]).drop_duplicates()

In [77]:
common

Unnamed: 0,head,relation_fb15K,tail,relation_wn18
104,algeria.n.01,/location/country/capital,algiers.n.01,_has_part
181,europe.n.01,/base/locations/continents/countries_within,poland.n.01,_has_part
514,kyushu.n.01,/location/location/contains,nagasaki.n.01,_has_part
583,brazil.n.01,/location/location/contains,rio_de_janeiro.n.01,_has_part
638,lithuania.n.01,/location/location/contains,vilnius.n.01,_has_part
...,...,...,...,...
306485,czech.a.01,/language/human_language/countries_spoken_in,czechoslovakia.n.01,_derivationally_related_form
306595,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,denmark.n.01,_member_meronym
307514,iberian_peninsula.n.01,/location/location/partially_contains,spain.n.01,_has_part
308861,texas.n.01,/location/location/contains,laredo.n.01,_has_part


In [79]:
relation = '/user/ktrueman/default_domain/international_organization/member_states'


common[common['relation_fb15K'] == relation]

Unnamed: 0,head,relation_fb15K,tail,relation_wn18
29876,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,united_kingdom.n.01,_member_meronym
54371,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,netherlands.n.01,_member_meronym
94562,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,france.n.01,_member_meronym
102208,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,canada.n.01,_member_meronym
123101,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,belgium.n.01,_member_meronym
128759,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,norway.n.01,_member_meronym
180403,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,italy.n.01,_member_meronym
200220,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,portugal.n.01,_member_meronym
219860,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,bulgaria.n.01,_member_meronym
273146,north_atlantic_treaty_organization.n.01,/user/ktrueman/default_domain/international_or...,turkey.n.02,_member_meronym


In [57]:
pd.DataFrame(common_entities.reset_index(drop=True), columns=['entity'])

Unnamed: 0,entity
0,algeria.n.01
1,europe.n.01
2,kyushu.n.01
3,brazil.n.01
4,lithuania.n.01
...,...
633,virginia_beach.n.01
634,cedar_rapids.n.01
635,czechoslovakia.n.01
636,laredo.n.01


In [64]:
common_statistics = common[
    ['relation_wn18', 'relation_fb15K', 'head']
      ].groupby(
    ['relation_fb15K', 'relation_wn18']
).count()


common_statistics.columns = ['triplet_with_common_head_tail']

common_statistics.sort_values(by = ['triplet_with_common_head_tail', 'relation_fb15K'], ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,triplet_with_common_head_tail
relation_fb15K,relation_wn18,Unnamed: 2_level_1
/location/location/contains,_has_part,464
/base/locations/continents/countries_within,_has_part,71
/location/country/capital,_has_part,37
/location/location/partially_contains,_has_part,27
/base/aareas/schema/administrative_area/capital,_has_part,23
/people/profession/specialization_of,_hypernym,17
/user/ktrueman/default_domain/international_organization/member_states,_member_meronym,14
/location/country/second_level_divisions,_has_part,13
/location/location/contains,_member_of_domain_region,11
/location/country/official_language,_member_meronym,6


In [None]:
# _has_part -> /location/location/contains

In [66]:
total_relation = pd.DataFrame(df_fb_15k['relation_fb15K'].value_counts())

In [67]:
total_relation.reset_index()[total_relation.reset_index()['index'] == '/location/location/contains']

Unnamed: 0,index,relation_fb15K
12,/location/location/contains,5834


In [68]:
/location/location/contains -> has part

SyntaxError: invalid syntax (<ipython-input-68-ce51c3502640>, line 1)

In [69]:
common_head = pd.merge(
    left = df_fb_15k,
    right = wn18rr_df.rename(columns={'tail': 'tail_wn18_rr'}),
    on = ['head'],
    how = 'left'
)

common_head = common_head[common_head['relation_wn18'].notnull()]

common_head.drop(['relation_wn18', 'tail_wn18_rr'], axis = 'columns', inplace=True)

common_head = common_head.drop_duplicates()

In [70]:
common_head.head()

Unnamed: 0,head,relation_fb15K,tail
0,dominican_republic.n.01,/location/country/form_of_government,republic
9,harpsichord.n.01,/music/performance_role/regular_performances./...,violin.n.01
52,prime_minister.n.01,/government/government_office_category/officeh...,dominica.n.01
60,cabaret.n.01,/award/award_winning_work/awards_won./award/aw...,bafta_award_for_best_film
65,beijing.n.01,/common/topic/webpage./common/webpage/category,/m/08mbj5d


In [71]:
common_tail = pd.merge(
    left = common_head,
    right = wn18rr_df.rename(columns={'head': 'head_wn18_rr'}),
    on = ['tail'],
    how = 'left'
)


common_tail = common_tail[common_tail['relation_wn18'].notnull()]

common_tail.drop(['relation_wn18', 'head_wn18_rr'], axis = 'columns', inplace=True)

common_tail = common_tail.drop_duplicates()


In [72]:
common_tail['relation_fb15K'].value_counts().head(15)

/film/film/release_date_s./film/film_regional_release_date/film_release_region                                      1026
/location/location/contains                                                                                          865
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country                                       817
/location/location/adjoin_s./location/adjoining_relationship/adjoins                                                 734
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month                                  456
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office     442
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants                        408
/music/performance_role/track_performances./music/track_contribution/role                                            344
/location/statistical_region/rel

In [74]:
relation = '/people/profession/specialization_of'

common_tail[common_tail['relation_fb15K'] == relation]

Unnamed: 0,head,relation_fb15K,tail
360,bishop.n.01,/people/profession/specialization_of,priest.n.02
573,astronomer.n.01,/people/profession/specialization_of,scientist.n.01
3778,supermodel.n.01,/people/profession/specialization_of,model.v.06
6672,drummer.n.01,/people/profession/specialization_of,musician.n.02
7075,acting.n.01,/people/profession/specialization_of,actor.n.01
7512,advertising.n.02,/people/profession/specialization_of,businessperson.n.01
8759,organist.n.01,/people/profession/specialization_of,musician.n.02
10190,spokesperson.n.01,/people/profession/specialization_of,actor.n.01
10348,psychic.n.01,/people/profession/specialization_of,prophet.n.01
10631,fiction.n.01,/people/profession/specialization_of,writer.n.01


In [63]:
head = 'history.n.01'
wn18rr_df[wn18rr_df['relation_wn18'] == '_member_meronym']

Unnamed: 0,head,relation_wn18,tail
15,primulaceae.n.01,_member_meronym,glaux.n.01
16,proteaceae.n.01,_member_meronym,bartle_frere.n.01
30,taiwan.n.01,_member_meronym,taiwanese.n.01
38,carya.n.01,_member_meronym,pecan.n.02
91,hydrophyllaceae.n.01,_member_meronym,hydrophyllum.n.01
...,...,...,...
3022,encelia.n.01,_member_meronym,brittlebush.n.01
3037,cypriniformes.n.01,_member_meronym,catostomidae.n.01
3062,manteodea.n.01,_member_meronym,mantidae.n.01
3094,geraniaceae.n.01,_member_meronym,geranium.n.01


In [62]:
df_fb_15k[df_fb_15k['relation_fb15K'] == '/user/ktrueman/default_domain/international_organization/member_states']

Unnamed: 0,head,relation_fb15K,tail
1245,/m/02jxk,/user/ktrueman/default_domain/international_or...,germany.n.01
8151,organisation_for_economic_co-operation_and_dev...,/user/ktrueman/default_domain/international_or...,united_states_of_america
8807,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,chad.n.04
9234,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,central_african_republic.n.01
11538,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,hungary.n.01
...,...,...,...
17812,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,morocco.n.01
18495,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,barbados.n.02
19086,/m/02jxk,/user/ktrueman/default_domain/international_or...,italy.n.01
19144,world_trade_organization.n.01,/user/ktrueman/default_domain/international_or...,brunei.n.01


In [61]:
df_fb_15k[df_fb_15k['relation_fb15K'] == '/people/profession/specialization_of']

Unnamed: 0,head,relation_fb15K,tail
1468,bishop.n.01,/people/profession/specialization_of,priest.n.02
2711,astronomer.n.01,/people/profession/specialization_of,scientist.n.01
5000,research_associate,/people/profession/specialization_of,architecture.n.03
5221,preacher.n.01,/people/profession/specialization_of,clergy.n.01
15179,rabbi.n.01,/people/profession/specialization_of,clergy.n.01
...,...,...,...
16650,lyricist.n.01,/people/profession/specialization_of,writer.n.01
16860,aerospace_engineering,/people/profession/specialization_of,engineer.v.01
17721,bandleader.n.01,/people/profession/specialization_of,music_director
18615,art_director,/people/profession/specialization_of,artist.n.01


In [None]:
# Has part -> /location/location/contains