In [None]:

import numpy as np
import random


def star_graph(degSource, pathLen, numNodes, reverse=False):
    """
    Sample a random star graph: ``degSource`` chains that share only the source node.

    One chain is the source -> goal path that gets returned; every other chain is a
    distractor made of ``pathLen - 1`` fresh nodes hanging off the source. Node ids are
    drawn from ``[0, numNodes)`` by rejection sampling with ``np.random`` and the edge
    list is shuffled with ``random.shuffle``, so seed both generators if the output has
    to be reproducible.

    Args:
        degSource: number of chains leaving the source, including the goal path.
        pathLen: number of nodes on each chain, counting the source. Values below 2
            still yield the two-node path ``[source, goal]``.
        numNodes: size of the node-id range to sample from.
        reverse: if True, the returned path is listed goal-first. The edge list and
            the returned ``source`` / ``goal`` are not affected.

    Returns:
        ``(path, edge_list, source, goal)`` where ``edge_list`` is a shuffled list of
        ``[parent, child]`` pairs pointing away from the source.

    Raises:
        ValueError: if ``numNodes`` cannot hold all the distinct nodes the graph needs.
    """
    # Every node in the graph is distinct, so the id range must be large enough;
    # otherwise the rejection loops below would spin forever.
    path_node_count = max(pathLen, 2)
    branch_node_count = max(degSource - 1, 0) * max(pathLen - 1, 0)
    required_nodes = path_node_count + branch_node_count
    if numNodes < required_nodes:
        raise ValueError(
            'star_graph needs at least ' + str(required_nodes) + ' distinct nodes for degSource='
            + str(degSource) + ' and pathLen=' + str(pathLen) + ', but numNodes=' + str(numNodes)
        )

    source = np.random.randint(0, numNodes, 1)[0]
    goal = np.random.randint(0, numNodes, 1)[0]
    while goal == source:
        goal = np.random.randint(0, numNodes, 1)[0]

    # Choose the random intermediate nodes of the source -> goal path
    path = [source]
    for _ in range(pathLen - 2):
        node = np.random.randint(0, numNodes, 1)[0]
        while node in path or node == goal:
            node = np.random.randint(0, numNodes, 1)[0]
        path.append(node)
    path.append(goal)

    # Connect the path
    edge_list = [[path[i], path[i + 1]] for i in range(len(path) - 1)]

    # Grow the remaining degSource - 1 distractor chains out of the source
    deg_nodes = set()
    for _ in range(degSource - 1):
        node = source
        next_node = np.random.randint(0, numNodes, 1)[0]
        chain_len = 1
        while chain_len < pathLen:
            # Accept only ids that are on neither the goal path nor any earlier chain
            if next_node not in deg_nodes and next_node not in path:
                edge_list.append([node, next_node])
                deg_nodes.add(next_node)
                node = next_node
                chain_len += 1
            next_node = np.random.randint(0, numNodes, 1)[0]

    random.shuffle(edge_list)
    if reverse:
        path = path[::-1]

    return path, edge_list, source, goal


def _format_graph_line(path, edge_list, start, goal, edge_sep='|'):
    """Serialise one graph as ``<edges>/<start>,<goal>=<path>`` (no trailing newline)."""
    edge_str = edge_sep.join(str(e[0]) + ',' + str(e[1]) for e in edge_list)
    path_str = ','.join(str(node) for node in path)
    return edge_str + '/' + str(start) + ',' + str(goal) + '=' + path_str


def _write_graph_file(filename, n_graphs, degSource, pathLen, numNodes, reverse, edge_sep):
    """Sample ``n_graphs`` star graphs and write them to ``filename``, one per line."""
    with open(filename, 'w') as file:
        for _ in range(n_graphs):
            path, edge_list, start, goal = star_graph(degSource, pathLen, numNodes, reverse=reverse)
            file.write(_format_graph_line(path, edge_list, start, goal, edge_sep) + '\n')


def generate_and_save(n_train, n_test, degSource, pathLen, numNodes, reverse=False, edge_sep='|'):
    """
    Generate a list of train and testing graphs and save them for reproducibility

    Two text files are written under ``./data/datasets/graphs/`` (the directory is
    created if missing), named ``deg_<degSource>_path_<pathLen>_nodes_<numNodes>_<split>_<n>.txt``.
    Each line has the form ``<edges>/<start>,<goal>=<path>`` where edges are ``a,b``
    pairs joined by ``edge_sep`` and the path is a comma-separated node list.

    Args:
        n_train: number of graphs in the train file.
        n_test: number of graphs in the test file.
        degSource, pathLen, numNodes, reverse: forwarded to ``star_graph``.
        edge_sep: separator placed between edges. Both splits use the same separator so
            that train and test lines share one format.
    """
    import os  # local import: this cell only imports numpy and random

    out_dir = './data/datasets/graphs/'
    os.makedirs(out_dir, exist_ok=True)
    stem = out_dir + 'deg_' + str(degSource) + '_path_' + str(pathLen) + '_nodes_' + str(numNodes)

    # Train is written before test, so the random draws are consumed in the same order
    # on every run.
    for split, n_graphs in (('train', n_train), ('test', n_test)):
        filename = stem + '_' + split + '_' + str(n_graphs) + '.txt'
        _write_graph_file(filename, n_graphs, degSource, pathLen, numNodes, reverse, edge_sep)


def prefix_target_list(filename=None, reverse=False):
    """
    Load graphs and split them into prefix and target and return the list

    Each non-blank line is split on ``'='``: the prefix is everything before the first
    ``'='`` with the ``'='`` put back on the end, and the target is the field right
    after it. Blank lines are skipped.

    Args:
        filename: path of the text file to read.
        reverse: if True, the comma-separated target is returned in reversed order.

    Returns:
        A list of ``(prefix, target)`` string tuples in file order.

    Raises:
        IndexError: if a non-blank line contains no ``'='``.
    """
    data_list = []
    with open(filename, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines, e.g. an extra newline at the end of the file
                continue
            fields = line.split('=')
            prefix = fields[0] + '='
            target = fields[1]
            if reverse:
                target = ','.join(target.split(',')[::-1])
            data_list.append((prefix, target))

    return data_list


def get_edge_list(x, num_nodes, path_len):
    """
    Given the tokenised input for the Transformer, map back to the edge_list

    Tokens are scanned left to right. Every token whose value lies in
    ``range(num_nodes)`` is treated as a node id and consecutive node ids are paired
    into edges. Scanning stops at the first token equal to ``num_nodes + 2``
    (presumably the token that closes the edge section; confirm against the tokenizer).

    With ``i`` the index where scanning stopped, the start node is read from ``i + 1``,
    the goal from ``i + 2`` and the path from the ``path_len`` tokens beginning at
    ``i + 4``. If the stop token never occurs, ``i`` is the last index and these lookups
    run past the end of the sequence.

    Args:
        x: token tensor; it is squeezed, moved to CPU and converted to a numpy array.
        num_nodes: number of node ids; also fixes the stop token ``num_nodes + 2``.
        path_len: number of path tokens to read back.

    Returns:
        ``(edge_list, start, goal, path)``.
    """
    tokens = x.squeeze().cpu().numpy()
    node_ids = range(num_nodes)
    stop_token = num_nodes + 2

    edge_list = []
    pending = []
    for i, tok in enumerate(tokens):
        if tok in node_ids:
            pending.append(tok)
        if len(pending) == 2:
            # Two node ids collected: that is one complete edge
            edge_list.append(pending)
            pending = []
        if tok == stop_token:
            break

    # Fixed offsets relative to the index at which the scan stopped
    start, goal = tokens[i + 1], tokens[i + 2]
    path = [tokens[i + offset] for offset in range(4, 4 + path_len)]

    return edge_list, start, goal, path


def get_edge_list_byte(x, num_nodes, path_len, decode):
    """
    Given the tokenised input for the Transformer, map back to the edge_list

    Every token id is decoded on its own with ``decode([token])``. Decoded symbols other
    than ``','``, ``'|'``, ``'='`` and ``'->'`` are collected pairwise into edges, and
    the scan stops at the first ``'->'``.

    With ``k`` the index of that ``'->'``, the goal is read from ``k + 1``, the start
    from ``k + 3`` and the path from every second symbol starting at ``k + 5``, giving
    ``path_len - 2`` entries. NOTE(review): these offsets imply a specific token layout
    after ``'->'`` that is not visible here; confirm against the tokenizer.

    Args:
        x: token tensor; it is squeezed, moved to CPU and converted to a list.
        num_nodes: accepted for signature parity with ``get_edge_list``; not used.
        path_len: controls how many path symbols are read (``path_len - 2``).
        decode: callable mapping a one-element list of token ids to its symbol.

    Returns:
        ``(edge_list, start, goal, path)`` built from decoded symbols.
    """
    separators = [',', '|', '=', '->']
    token_ids = list(x.squeeze().cpu().numpy())
    symbols = [decode([tok]) for tok in token_ids]

    edge_list = []
    current = []
    for i, sym in enumerate(symbols):
        if sym not in separators:
            current.append(sym)
        if len(current) == 2:
            # Two non-separator symbols in a row form one edge
            edge_list.append(current)
            current = []

        if sym == '->':
            break

    # All later reads are relative to two positions past where the scan stopped
    anchor = i + 2
    start = symbols[anchor + 1]
    goal = symbols[anchor - 1]
    path = [symbols[anchor + 3 + 2 * step] for step in range(path_len - 2)]

    return edge_list, start, goal, path

In [None]:
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
import os

def _graph_records_to_columns(records):
    """Split ``<graph>/<source>,<destination>=<path>`` lines into four column lists."""
    columns = {
        "graph": [],
        "source": [],
        "destination": [],
        "path": [],
    }
    for record in records:
        question, path = record.split('=')
        graph, source_and_destination = question.split("/")
        source, destination = source_and_destination.split(',')
        columns['graph'].append(graph)
        columns['path'].append(path)
        columns['source'].append(source)
        columns['destination'].append(destination)
    return columns


def push_dataset_to_hf_hub(n_train, n_test, degSource, pathLen, numNodes, reverse=False):
    """
    Load the saved train/test graph files and push them to the Hugging Face Hub.

    The files are the ones named
    ``./data/datasets/graphs/deg_<degSource>_path_<pathLen>_nodes_<numNodes>_<split>_<n>.txt``.
    Each line is split into ``graph``, ``source``, ``destination`` and ``path`` string
    columns, and the two splits are uploaded as one ``DatasetDict`` with ``"train"``
    and ``"test"`` keys. The first train example is printed as a quick sanity check.

    Args:
        n_train, n_test: number of graphs per split; only used to locate the files.
        degSource, pathLen, numNodes: used to locate the files and to name the dataset.
        reverse: accepted for call-site symmetry with ``generate_and_save``; not used.

    Returns:
        ``(train_dataset, test_dataset)`` as ``datasets.Dataset`` objects.
    """
    HF_USERNAME = ""  # Replace with your HF username # ENTER YOUR HF USERNAME
    DATASET_NAME = f"star-graph-deg-{degSource}-path-{pathLen}-nodes-{numNodes}"  # Replace with your dataset name
    # An empty username would give the invalid repo id "/<name>". A bare dataset name is
    # accepted by push_to_hub and resolves to the logged-in user's namespace.
    REPO_ID = f"{HF_USERNAME}/{DATASET_NAME}" if HF_USERNAME else DATASET_NAME

    stem = './data/datasets/graphs/' + 'deg_' + str(degSource) + '_path_' + str(pathLen) + '_nodes_' + str(numNodes)

    # Context managers close both handles even if reading fails
    with open(stem + '_train_' + str(n_train) + '.txt', 'r') as trn_file:
        trn_pts = trn_file.read().splitlines()
    with open(stem + '_test_' + str(n_test) + '.txt', 'r') as tst_file:
        tst_pts = tst_file.read().splitlines()

    # Show the first training example
    if trn_pts:
        question, path = trn_pts[0].split('=')
        print(question)
        print(path)

    train_dataset = Dataset.from_dict(_graph_records_to_columns(trn_pts))
    test_dataset = Dataset.from_dict(_graph_records_to_columns(tst_pts))
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })
    dataset_dict.push_to_hub(REPO_ID)
    return train_dataset, test_dataset



In [None]:
# Create graphs and save
# Seed both generators used by star_graph (np.random for node ids, random for the edge
# shuffle) so that re-running this cell regenerates the same files.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

n_train = 200000
n_test = 20000
deg = 128
path_len = 3
# Each graph uses path_len + (deg - 1) * (path_len - 1) = 257 distinct nodes, so
# num_nodes must be at least that large.
num_nodes = 300
reverse = False
generate_and_save(n_train=n_train, n_test=n_test, degSource=deg, pathLen=path_len, numNodes=num_nodes,
                    reverse=reverse)


In [None]:
# Upload the saved splits to the Hugging Face Hub; x is the returned train dataset and
# y the returned test dataset.
x, y = push_dataset_to_hf_hub(
    n_train=n_train,
    n_test=n_test,
    degSource=deg,
    pathLen=path_len,
    numNodes=num_nodes,
    reverse=reverse,
)