In [1]:
import torch
import torch_geometric
from torch_geometric.datasets import MoleculeNet


In [2]:
def unsqueeze_y(data):
    data.y = data.y.squeeze(1).long()
    return data

In [3]:
dataset = MoleculeNet(root="./dataset/just", name='HIV')

In [4]:
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: HIV(41127):
Number of graphs: 41127
Number of features: 9
Number of classes: 2

Data(x=[19, 9], edge_index=[2, 40], edge_attr=[40, 3], smiles='CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)=[O+]2', y=[1, 1])
Number of nodes: 19
Number of edges: 40
Average node degree: 2.11
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [5]:
import sys
sys.path.insert(1, '/home/sam/Documents/network/supernode/HIV_test/')

In [6]:
from data.transformation import AddSupernodes
from data.concepts import *

In [7]:
dataset = MoleculeNet(root="./dataset/MoleculeNety1", name='HIV', pre_transform=unsqueeze_y)

In [8]:
concepts_list = [
       {"name": "GCB", "fun": cycle_basis, "args": [], "features" : [2]},
       {"name": "GMC", "fun": max_cliques, "args": []},
    ]
data = AddSupernodes(concepts_list)(dataset[0])
data

Data(x=[21], edge_index=[2, 64], y=[1], ntype=[21], S=[21], edge_S=[64, 1])

In [9]:
data.y

tensor([0.])

# DATASET TR

# FILES

In [10]:
from collections.abc import Mapping
from typing import Any, List, Optional, Sequence, Union

import torch.utils.data
from torch.utils.data.dataloader import default_collate

from torch_geometric.data import Batch, Dataset
from torch_geometric.data.data import BaseData
from torch_geometric.data.datapipes import DatasetAdapter
from torch_geometric.typing import TensorFrame, torch_frame


class Collater:
    def __init__(
        self,
        dataset: Union[Dataset, Sequence[BaseData], DatasetAdapter],
        follow_batch: Optional[List[str]] = None,
        exclude_keys: Optional[List[str]] = None,
    ):
        print("zzzzz")
        self.dataset = dataset
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

    def __call__(self, batch: List[Any]) -> Any:
        print("xxxxx")
        elem = batch[0]
        if isinstance(elem, BaseData):
            print("0000")
            return Batch.from_data_list(
                batch,
                follow_batch=self.follow_batch,
                exclude_keys=self.exclude_keys,
            )
        elif isinstance(elem, torch.Tensor):
            print("a")
            return default_collate(batch)
        elif isinstance(elem, TensorFrame):
            print("b")
            return torch_frame.cat(batch, dim=0)
        elif isinstance(elem, float):
            print("c")
            return torch.tensor(batch, dtype=torch.float)
        elif isinstance(elem, int):
            print("d")
            return torch.tensor(batch)
        elif isinstance(elem, str):
            print("e")
            return batch
        elif isinstance(elem, Mapping):
            print("f")
            return {key: self([data[key] for data in batch]) for key in elem}
        elif isinstance(elem, tuple) and hasattr(elem, '_fields'):
            print("g")
            return type(elem)(*(self(s) for s in zip(*batch)))
        elif isinstance(elem, Sequence) and not isinstance(elem, str):
            print("h")
            return [self(s) for s in zip(*batch)]

        raise TypeError(f"DataLoader found invalid type: '{type(elem)}'")


class DataLoader(torch.utils.data.DataLoader):
    r"""A data loader which merges data objects from a
    :class:`torch_geometric.data.Dataset` to a mini-batch.
    Data objects can be either of type :class:`~torch_geometric.data.Data` or
    :class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (List[str], optional): Will exclude each key in the
            list. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`.
    """
    def __init__(
        self,
        dataset: Union[Dataset, Sequence[BaseData], DatasetAdapter],
        batch_size: int = 1,
        shuffle: bool = False,
        follow_batch: Optional[List[str]] = None,
        exclude_keys: Optional[List[str]] = None,
        **kwargs,
    ):
        # Remove for PyTorch Lightning:
        kwargs.pop('collate_fn', None)

        # Save for PyTorch Lightning < 1.6:
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

        print("eeee")

        super().__init__(
            dataset,
            batch_size,
            shuffle,
            collate_fn=Collater(dataset, follow_batch, exclude_keys),
            **kwargs,
        )

In [11]:
from data.transformation import AddSupernodes
import torch_geometric.transforms as T


def squeeze_y(data):
    data.y = data.y.squeeze(1)
    data.smiles=None
    return data

concepts_list = [
       {"name": "GMC", "fun": cycle_basis, "args": []},
    ]


dataset = MoleculeNet("./dataset/testttaaa", name="HIV",
                      pre_transform=T.Compose(squeeze_y, AddSupernodes(concepts_list))
                     )
loader = DataLoader(dataset, 100,
                    shuffle=False, num_workers=0)

TypeError: Compose.__init__() takes 2 positional arguments but 3 were given

In [None]:
print(dataset[0])
print(dataset[1])

i = iter(loader)
for data in loader:
    print(data)

In [None]:
dataset[1].y

In [None]:
import torch
a = torch.tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
a.long()

In [None]:
import torch
from torch_geometric.data import Data

# Assuming data1 and data2 are your Data objects
data1 = Data(x=torch.randn(19, 9), edge_index=torch.randint(0, 19, (2, 40)), y=torch.randn(1), ntype=torch.randn(19), S=torch.randn(19), edge_S=torch.randn(40, 1))
data2 = Data(x=torch.randn(39, 9), edge_index=torch.randint(0, 39, (2, 88)), y=torch.randn(1), ntype=torch.randn(39), S=torch.randn(39), edge_S=torch.randn(88, 1))

# Concatenate x and y attributes
concatenated_x = torch.cat([data1.x, data2.x], dim=0)
concatenated_y = torch.cat([data1.y, data2.y], dim=0)

# Create a new Data object with concatenated attributes
concatenated_data = Data(x=concatenated_x, edge_index=data1.edge_index, y=concatenated_y, ntype=data1.ntype, S=data1.S, edge_S=data1.edge_S)

# Print concatenated_data to verify
print(concatenated_data)

### DATASET


In [12]:
from torch_geometric.data import InMemoryDataset
import shutil
from data.transformation import AddSupernodesHeteroMulti
from torch_geometric.data import HeteroData

concepts_list = [
       {"name": "GMC", "fun": cycle_basis, "args": []},
]

In [13]:
class MoleculeHIV_herero_multi(InMemoryDataset):
    def __init__(self, root, concepts, transform=None, pre_transform=None):
        self.concepts = concepts
        super().__init__(root, transform, pre_transform)
        self.load(self.processed_paths[0], data_cls=HeteroData)

    def raw_file_names(self):
        return ['data.pth']        

    def processed_file_names(self):
        return ['transformed_dataset.pth']


    def download(self):
        dataset = MoleculeNet("./dataset/STEP/", name="HIV",
                              pre_transform=squeeze_y,)
        transformed_dataset = [AddSupernodesHeteroMulti(self.concepts)(data) for data in dataset]
        torch.save(transformed_dataset, f'{self.raw_dir}/data.pth')

    def process(self):
        print(self.processed_paths[0])
        data_list = torch.load(f'{self.raw_dir}/data.pth')

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, f'{self.processed_dir}/transformed_dataset.pth')


In [14]:
dataset = MoleculeHIV_herero_multi("./dataset/gg", concepts=concepts_list)

Processing...


dataset/gg/processed/transformed_dataset.pth


Done!


In [15]:
dataset[4]

HeteroData(
  normal={ x=[10, 9] },
  label={ y=[1] },
  GMC={ x=[1, 9] },
  (normal, orig, normal)={
    edge_index=[2, 18],
    edge_attr=[18, 3],
  },
  (normal, identity, normal)={ edge_index=[2, 10] },
  (normal, toSup, GMC)={ edge_index=[2, 6] },
  (GMC, toNor, normal)={ edge_index=[2, 6] },
  (GMC, identity, GMC)={ edge_index=[2, 1] }
)