The goal of this notebook is to perform a sanity check and split the dataset into training, validation, and test sets.

In [16]:
import pandas as pd
from tqdm import tqdm
import os
import json
from pykeen import datasets
import time

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import argparse, sys
from loguru import logger
import urllib.request
import tarfile




In [4]:
# Define your specific path
save_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation"
os.makedirs(save_path, exist_ok=True)  # Create the directory if it doesn't exist

# File URL and destination
url = "https://www.dropbox.com/s/6sbhm0rwo4l73jq/wikidata5m_transductive.tar.gz?dl=1"
file_path = os.path.join(save_path, "wikidata5m_transductive.tar.gz")

# Download with progress
def download_with_progress(url, file_path):
    with urllib.request.urlopen(url) as response:
        total_size = int(response.info().get('Content-Length', 0))  # Total size in bytes
        block_size = 1024  # Block size in bytes
        t = tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading")
        with open(file_path, 'wb') as f:
            while True:
                buffer = response.read(block_size)
                if not buffer:
                    break
                f.write(buffer)
                t.update(len(buffer))
        t.close()

print("Downloading file...")
download_with_progress(url, file_path)
print(f"File saved to {file_path}")


Downloading file...


Downloading: 100%|██████████| 168M/168M [03:48<00:00, 736kB/s]]

File saved to /home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/wikidata5m_transductive.tar.gz





In [5]:

# Create the dataset directory
dataset_path = os.path.join(save_path, "datasets/wikidata5m")
os.makedirs(dataset_path, exist_ok=True)

# File paths
tar_file_path = os.path.join(save_path, "wikidata5m_transductive.tar.gz")

# Extract the tar.gz file
print("Extracting the tar.gz file...")
with tarfile.open(tar_file_path, "r:gz") as tar:
    tar.extractall(path=dataset_path)
print(f"Files extracted to {dataset_path}")


Extracting the tar.gz file...
Files extracted to /home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m


In [2]:
train_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/wikidata5m_transductive_train.txt"
test_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/wikidata5m_transductive_test.txt"
valid_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/wikidata5m_transductive_valid.txt"

# Load the datasets
columns = ["head", "relation", "tail"]  
train_df = pd.read_csv(train_path, sep='\t', header=None, names=columns)
test_df = pd.read_csv(test_path, sep='\t', header=None, names=columns)
valid_df = pd.read_csv(valid_path, sep='\t', header=None, names=columns)


In [3]:
train_df.head()

Unnamed: 0,head,relation,tail
0,Q29387131,P31,Q5
1,Q326660,P1412,Q652
2,Q7339549,P57,Q1365729
3,Q554335,P27,Q29999
4,Q20641639,P54,Q80955


In [4]:
print("shape of train dataset: ", train_df.shape)
print("shape of valid dataset: ", valid_df.shape)
print("shape of test dataset: ", test_df.shape)

shape of train dataset:  (20614279, 3)
shape of valid dataset:  (5163, 3)
shape of test dataset:  (5133, 3)


In [5]:
# Concatenate all datasets
all_data = pd.concat([train_df, test_df, valid_df], ignore_index=True)

# Shuffle the data
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
all_data.shape

(20624575, 3)

In [6]:
# Select 200 samples
subset_data = all_data.iloc[:700]

# Display the subset
print("Subset Data:")
print(subset_data.head())


Subset Data:
        head relation      tail
0   Q6442440      P31    Q23397
1   Q7148490      P17       Q30
2  Q16145304      P19  Q3752988
3   Q5201498     P421     Q6723
4    Q937515    P1344     Q9674


In [7]:
# Split into train, validation, and test sets
train_data = subset_data.iloc[:500]
val_data = subset_data.iloc[500:600]
test_data = subset_data.iloc[600:700]

# Display sizes of splits
print(f"Train Size: {len(train_data)}")
print(f"Validation Size: {len(val_data)}")
print(f"Test Size: {len(test_data)}")


Train Size: 500
Validation Size: 100
Test Size: 100


In [8]:
# Save the splits to files
train_data.to_csv(f"{dataset_path}/subset_train.txt", sep='\t', index=False, header=False)
val_data.to_csv(f"{dataset_path}/subset_val.txt", sep='\t', index=False, header=False)
test_data.to_csv(f"{dataset_path}/subset_test.txt", sep='\t', index=False, header=False)
print("Subsets saved!")


Subsets saved!


In [9]:
# Define the file paths
train_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_train.txt"
test_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_test.txt"
valid_path = "/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_val.txt"

# Load the datasets
columns = ["head", "relation", "tail"]  # Adjust column names if necessary
train_df = pd.read_csv(train_path, sep='\t', header=None, names=columns)
test_df = pd.read_csv(test_path, sep='\t', header=None, names=columns)
valid_df = pd.read_csv(valid_path, sep='\t', header=None, names=columns)

In [10]:
train_df

Unnamed: 0,head,relation,tail
0,Q6442440,P31,Q23397
1,Q7148490,P17,Q30
2,Q16145304,P19,Q3752988
3,Q5201498,P421,Q6723
4,Q937515,P1344,Q9674
...,...,...,...
495,Q8270170,P31,Q5
496,Q16977847,P407,Q1860
497,Q113008,P1412,Q188
498,Q24928114,P17,Q668


In [11]:


class Verbalizer:
    def __init__(self, df, similarity_matrix=None, relation2index=None, entity2text=None, relation2text=None):
        self.df = df
        self.similarity_matrix = similarity_matrix
        self.relation2index = relation2index
        self.entity2text = entity2text
        self.relation2text = relation2text
        self.sep = '[SEP]'

    def get_neighbourhood(self, node_id, relation_id=None, tail_id=None, limit=None):
        neighs = []

        if tail_id is None:
            # Neighbors for test dataset
            head_matches = self.df[self.df['head'] == node_id].copy()
            if limit:
                head_matches = head_matches.head(limit)
            neighs.extend(head_matches.to_dict('records'))

            tail_matches = self.df[self.df['tail'] == node_id].copy()
            if limit:
                tail_matches = tail_matches.head(limit)
            for _, row in tail_matches.iterrows():
                row = row.to_dict()
                row['relation'] = 'inverse of ' + row['relation']
                neighs.append(row)
        else:
            # Neighbors for train dataset
            head_matches = self.df[(self.df['head'] == node_id) &
                                   ((self.df['tail'] != tail_id) |
                                    (self.df['relation'] != relation_id))].copy()
            if limit:
                head_matches = head_matches.head(limit)
            neighs.extend(head_matches.to_dict('records'))

            tail_matches = self.df[(self.df['tail'] == node_id) &
                                   ((self.df['head'] != tail_id) |
                                    (self.df['relation'] != relation_id))].copy()
            if limit:
                tail_matches = tail_matches.head(limit)
            for _, row in tail_matches.iterrows():
                row = row.to_dict()
                row['relation'] = 'inverse of ' + row['relation']
                neighs.append(row)

        return neighs

    def verbalize(self, head, relation, tail=None, inverse=False):
        relation_prefix = 'inverse of ' if inverse else ''
        limit = 200 if inverse else None

        neighbourhood = self.get_neighbourhood(head, relation, tail, limit)
        relation_text = relation_prefix + self.relation2text[relation]

        # Sort based on similarity
        neighbourhood.sort(
            key=lambda x: self.similarity_matrix[self.relation2index[self.relation2text[x['relation']]]]
                         [self.relation2index[relation_text]],
            reverse=True
        )

        neighbourhood = neighbourhood[:512]
        verbalization = f"predict {self.sep} {self.entity2text[head]} {relation_text} {self.sep} "

        verbalization += " ".join(
            list(
                map(
                    lambda x: f"{self.relation2text[x['relation']]} " +
                              f"{self.entity2text[x['tail']] if x['head'] == head else self.entity2text[x['head']]} {self.sep}",
                    neighbourhood
                )
            )
        )

        return " ".join(verbalization.split()).strip()


In [12]:
def verbalize_dataset(input_df, verbalizer):
    docs = []

    for i, doc in tqdm(input_df.iterrows(), total=len(input_df)):
        try:
            direct_verbalization = verbalizer.verbalize(doc['head'], doc['relation'], doc['tail'])
            docs.append({
                'id': i * 2,
                'verbalization': direct_verbalization,
                'head': doc['head'],
                'tail': doc['tail'],
                'relation': doc['relation'],
                'verbalized_tail': verbalizer.entity2text[doc['tail']]
            })

            inverse_verbalization = verbalizer.verbalize(doc['tail'], doc['relation'], doc['head'], inverse=True)
            docs.append({
                'id': i * 2 + 1,
                'verbalization': inverse_verbalization,
                'head': doc['tail'],
                'tail': doc['head'],
                'relation': "inverse of " + doc['relation'],
                'verbalized_tail': verbalizer.entity2text[doc['head']]
            })

        except Exception as e:
            print(f"Exception {e} on {i}th triplet")

    return pd.DataFrame(docs)

In [18]:
parser = argparse.ArgumentParser()

parser.add_argument("--relation_vectors_path", help="path to the embeddings of verbalized relations", default="/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/data/embeddings/fasttext_vecs-wikidata5m.npy")
parser.add_argument("--rel2ind_path", help="path to the mapping of textual relations to the index of corresponding vectors", default="/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/data/relation2ind-wikidata5m.json")
parser.add_argument("--entity_mapping_path", help="path to the entity2text mapping", default="/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/data/mappings/wd5m_aliases_entities_v3.txt")
parser.add_argument("--relation_mapping_path", help="path to the relation2text mapping", default="/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/data/relation2text-wikidata5m.json")
parser.add_argument("--train_path", help="train KG path", default='/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_train.txt')
parser.add_argument("--valid_path", help="valid KG path", default='/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_val.txt')
parser.add_argument("--test_path", help="test KG path", default='/home/ahmadi/sadaf/GraphNeighborLM/Better-together/data-preparation/datasets/wikidata5m/subset_test.txt')
parser.add_argument("--train_verbalized_output", help="verbalized train KG path", default='verbalized_train')
parser.add_argument("--valid_verbalized_output", help="verbalized valid KG path", default='verbalized_valid')
parser.add_argument("--test_verbalized_output", help=" verbalized test KG path", default='verbalized_test')

args = parser.parse_args()

usage: ipykernel_launcher.py [-h]
                             [--relation_vectors_path RELATION_VECTORS_PATH]
                             [--rel2ind_path REL2IND_PATH]
                             [--entity_mapping_path ENTITY_MAPPING_PATH]
                             [--relation_mapping_path RELATION_MAPPING_PATH]
                             [--train_path TRAIN_PATH]
                             [--valid_path VALID_PATH] [--test_path TEST_PATH]
                             [--train_verbalized_output TRAIN_VERBALIZED_OUTPUT]
                             [--valid_verbalized_output VALID_VERBALIZED_OUTPUT]
                             [--test_verbalized_output TEST_VERBALIZED_OUTPUT]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/ahmadi/.local/share/jupyter/runtime/kernel-v3d7530335ff9860d7196d5148d23a01e479dfe3c3.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [13]:
vecs = np.load(args.relation_vectors_path)
similarity_matrix = cosine_similarity(vecs)

with open(args.rel2ind_path, 'r') as f:
    rel2ind = json.load(f)

entity_mapping = {}
with open(args.entity_mapping_path, 'r') as f:
    for line in f:
        _id, name = line.strip().split('\t')
        entity_mapping[_id] = name

with open(args.relation_mapping_path, 'r') as f:
    relation_mapping = json.load(f)

In [None]:
train_df = pd.read_csv(args.train_path, sep='\t', header=None, names=columns)
test_df = pd.read_csv(args.test_path, sep='\t', header=None, names=columns)
valid_df = pd.read_csv(args.valid_path, sep='\t', header=None, names=columns)

In [None]:
# Verbalize datasets
verbalizer = Verbalizer(train_df, similarity_matrix=similarity_matrix,
                        relation2index=rel2ind, entity2text=entity_mapping, relation2text=relation_mapping)

print('Verbalizing train KG...')
train_verbalized = verbalize_dataset(train_df, verbalizer)

print('Verbalizing valid KG...')
valid_verbalized = verbalize_dataset(valid_df, verbalizer)

print('Verbalizing test KG...')
test_verbalized = verbalize_dataset(test_df, verbalizer)

# Save results
train_verbalized.to_csv(args.train_verbalized_output, index=False)
valid_verbalized.to_csv(args.valid_verbalized_output, index=False)
test_verbalized.to_csv(args.test_verbalized_output, index=False)