In [1]:
import pandas as pd
from tqdm import tqdm
from pykeen import datasets

### 1-to-1 mapping from the KGT5 paper (https://github.com/apoorvumang/kgt5/issues/18#issuecomment-1227189777)

In [2]:
! mkdir ../data/mappings

In [3]:
! wget -P ../data/mappings https://storage.googleapis.com/t5-kgc-colab/data/wd5m_aliases_entities_v3.txt
! wget -P ../data/mappings https://storage.googleapis.com/t5-kgc-colab/data/wd5m_aliases_relations_v3.txt

--2023-10-30 15:23:52--  https://storage.googleapis.com/t5-kgc-colab/data/wd5m_aliases_entities_v3.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.162.207, 64.233.165.207, 173.194.73.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.162.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147298212 (140M) [text/plain]
Saving to: ‘../data/mappings/wd5m_aliases_entities_v3.txt’


2023-10-30 15:24:07 (11,0 MB/s) - ‘../data/mappings/wd5m_aliases_entities_v3.txt’ saved [147298212/147298212]

--2023-10-30 15:24:07--  https://storage.googleapis.com/t5-kgc-colab/data/wd5m_aliases_relations_v3.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.73.207, 173.194.221.207, 209.85.233.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.73.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18468 (18K) [text/plain]
Saving to: ‘../data/mappings/wd5m_ali

In [4]:
# List the mappings directory to confirm that the alias files are in place.
! ls ../data/mappings

wd5m_aliases_entities_v3.txt  wd5m_aliases_relations_v3.txt


In [5]:
# Locations of the downloaded alias files (entity IDs and relation IDs).
entity_v3_path, relation_v3_path = (
    '../data/mappings/wd5m_aliases_entities_v3.txt',
    '../data/mappings/wd5m_aliases_relations_v3.txt',
)

Checking the 1-to-1 condition: every ID has a name, no ID appears twice, and no two IDs share the same name.

In [6]:
# Build the entity ID -> name mapping from the alias file and verify that it
# is one-to-one.
entity_mapping = {}
without_name = []  # IDs whose line has no second (name) column
duplicates = []    # (ID, name on the repeated line, name already stored)

# Read as UTF-8 explicitly (the same encoding this notebook uses for the
# dataset splits) instead of relying on the platform's default encoding.
with open(entity_v3_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        fields = line.strip().split('\t')

        if len(fields) < 2:
            without_name.append(fields[0])
            continue

        # Only the first two tab-separated columns are used.
        entity_id, name = fields[0], fields[1]

        if entity_id in entity_mapping:
            duplicates.append((entity_id, name, entity_mapping[entity_id]))
        else:
            entity_mapping[entity_id] = name

assert not without_name and not duplicates, (
    f"{len(without_name)} entity IDs without a name, "
    f"{len(duplicates)} duplicated entity IDs"
)
# Dict keys are unique by construction, so the mapping is one-to-one exactly
# when the number of distinct names equals the number of IDs.
assert len(set(entity_mapping.values())) == len(entity_mapping), (
    "entity names are not unique: several IDs share the same name"
)
len(entity_mapping)

4818679it [00:08, 597722.09it/s]


4818679

In [7]:
# Build the relation ID -> name mapping from the alias file and verify that
# it is one-to-one.
relation_mapping = {}
without_name = []  # IDs whose line has no second (name) column
duplicates = []    # (ID, name on the repeated line, name already stored)

# Read as UTF-8 explicitly (the same encoding this notebook uses for the
# dataset splits) instead of relying on the platform's default encoding.
with open(relation_v3_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        fields = line.strip().split('\t')

        if len(fields) < 2:
            without_name.append(fields[0])
            continue

        # Only the first two tab-separated columns are used.
        relation_id, name = fields[0], fields[1]

        if relation_id in relation_mapping:
            duplicates.append((relation_id, name, relation_mapping[relation_id]))
        else:
            relation_mapping[relation_id] = name

assert not without_name and not duplicates, (
    f"{len(without_name)} relation IDs without a name, "
    f"{len(duplicates)} duplicated relation IDs"
)
# Dict keys are unique by construction, so the mapping is one-to-one exactly
# when the number of distinct names equals the number of IDs.
assert len(set(relation_mapping.values())) == len(relation_mapping), (
    "relation names are not unique: several IDs share the same name"
)
len(relation_mapping)

828it [00:00, 715173.75it/s]


828

Checking that the mappings cover every entity and relation that appears in the dataset

In [8]:
dataset = datasets.Wikidata5M()


def _read_split(path):
    """Read one tab-separated triple file into a head/relation/tail DataFrame."""
    return pd.read_csv(
        path,
        names=["head", "relation", "tail"],
        sep="\t",
        encoding="utf-8",
    )


# The three splits are read with identical options, in the original order.
train_df = _read_split(dataset.training_path)
valid_df = _read_split(dataset.validation_path)
test_df = _read_split(dataset.testing_path)

In [9]:
# Every distinct ID that occurs as a head or a tail in any of the splits.
entities = {
    entity
    for split_df in (train_df, valid_df, test_df)
    for column in ('head', 'tail')
    for entity in split_df[column].unique()
}

# Each of those IDs must be a key of the entity mapping.
assert not any(entity not in entity_mapping for entity in entities)
len(entities)

4594485

In [10]:
# Every distinct relation ID that occurs in any of the splits.
relations = {
    relation
    for split_df in (train_df, valid_df, test_df)
    for relation in split_df['relation']
}

# Each of those IDs must be a key of the relation mapping.
assert not any(relation not in relation_mapping for relation in relations)
len(relations)

822