# Creating graph datasets

## Creating in-memory datasets

We need four fundamental methods: 
- `raw_file_names` -> list of file names in `raw_dir` for the raw data; used to skip the download 
- `processed_file_names` -> list of file names in `processed_dir`; used to skip the processing 
- `download()` downloads raw data into `raw_dir` --> don't implement it if no download is necessary 
- `process()` processes raw data and saves it into `processed_dir` 

The `process` method is the most important one. It creates a list of `Data` objects that are then saved into `processed_dir`. The `Data` objects will be collated into one giant `Data` object.

In [None]:
import torch 
from torch_geometric.data import InMemoryDataset, download_url 

In [None]:
class MyOwnDataset(InMemoryDataset):
    """Template for a dataset whose processed graphs are loaded in one go.

    The constructor forwards its arguments to ``InMemoryDataset`` and then
    loads the single processed file listed in ``processed_file_names``.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        # Load whatever `process()` stored in the first processed path.
        processed_file = self.processed_paths[0]
        self.load(processed_file)

    @property
    def raw_file_names(self):
        """File names expected in ``raw_dir`` (placeholder values)."""
        return ['data1.pt', 'data2.pt']

    @property
    def processed_file_names(self):
        """File names expected in ``processed_dir``."""
        return ['data.pt']

    def download(self):
        """Fetch the raw archive into ``raw_dir`` (placeholder URL)."""
        download_url('https://example.com/data.zip', self.raw_dir)

    def process(self):
        """Build the list of graphs, filter/transform it, and save it."""
        # Placeholder: replace with the real list of `Data` objects.
        graphs = [...]

        # Keep only the graphs accepted by the optional pre-filter.
        if self.pre_filter is not None:
            graphs = list(filter(self.pre_filter, graphs))

        # Apply the optional pre-transform to every remaining graph.
        if self.pre_transform is not None:
            graphs = list(map(self.pre_transform, graphs))

        self.save(graphs, self.processed_paths[0])

In my case, I would need to use HDF5 or similar. Zarr could work, but Arrow would not; it is ill-suited for this purpose.

# Creating 'larger' Datasets

If the data does not fit into memory, we can use the `Dataset` class. This closely follows the concept of the torchvision datasets. It expects the methods `len()` and `get()` to be implemented: `get()` implements the logic to load a single graph, and `len()` returns the number of examples in the dataset. It works in much the same way as the Julia datasets we already have.

In [None]:
import os.path as osp 
from torch_geometric.data import Dataset 

class MyOwnOnDiskDataset(Dataset):
    """Template for a dataset that stores one processed file per graph.

    ``process()`` writes each graph to ``processed_dir`` as ``data_{idx}.pt``
    and ``get()`` loads a single graph back from disk by index.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        # The base-class constructor must run; a bare `pass` here leaves the
        # dataset uninitialised (root, transforms and filters are never set).
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """File names expected in ``raw_dir`` (placeholder values)."""
        return ['data1.pt', 'data2.pt']

    @property
    def processed_file_names(self):
        """File names expected in ``processed_dir``.

        These follow the ``data_{idx}.pt`` scheme that ``process()`` writes,
        one per raw file.
        NOTE: if ``pre_filter`` rejects a graph, ``process()`` writes fewer
        files than are listed here; adjust this list for such a filter.
        """
        return [f'data_{i}.pt' for i in range(len(self.raw_file_names))]

    def download(self):
        """Fetch the raw archive into ``raw_dir`` (placeholder URL)."""
        url = 'https://example.com/data.zip'
        download_url(url, self.raw_dir)

    def process(self):
        """Convert every raw file into one processed ``data_{idx}.pt`` file."""
        # `Data` is not imported at the top of this cell, so import it here.
        from torch_geometric.data import Data

        idx = 0
        for raw_path in self.raw_paths:
            # Read data from `raw_path`.
            # This is where the data loading happens and where the
            # performance bottlenecks will be.
            data = Data(...)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue

            if self.pre_transform is not None:
                data = self.pre_transform(data)

            # `idx` only advances for graphs that pass the filter, so the
            # written files are numbered without gaps.
            torch.save(data, osp.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1

    def len(self):
        """Return the number of graphs, i.e. the number of processed files."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored as ``data_{idx}.pt``."""
        data = torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))
        return data

Here, each graph data object gets saved individually in process(), and is manually loaded in get(). We might want to cache some for ease of use

Use HDF5 or Zarr if possible

# Loading Graphs from CSV

In [None]:
from torch_geometric.data import download_url, extract_zip

In [None]:
# Location of the MovieLens "ml-latest-small" zip archive.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

In [None]:
# Download the archive into the current directory and hand the result of
# `download_url` to `extract_zip`, which extracts into './'.
extract_zip(download_url(url, '.'), './')

In [None]:
# Paths of the two CSV files used below, relative to the working directory.
movie_path = './ml-latest-small/movies.csv'
rating_path = './ml-latest-small/ratings.csv'

In [None]:
import pandas as pd
# Preview the first rows of the movies table.
display(pd.read_csv(movie_path).head())


In [None]:
# Preview the first rows of the ratings table.
display(pd.read_csv(rating_path).head())

In [None]:
def load_node_csv(path, index_col=None, encoders=None, **kwargs):
    """Read a node table from CSV and build features plus an index mapping.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts).
        index_col: Column to use as the DataFrame index; its unique values
            become the keys of the returned mapping.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a 2-D tensor. The per-column results are
            concatenated along dimension 1.
        **kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        A tuple ``(x, mapping)``. ``x`` is the concatenated feature tensor,
        or ``None`` when no encoders are given. ``mapping`` maps each unique
        index value to a consecutive integer in order of first appearance.
    """
    df = pd.read_csv(path, index_col=index_col, **kwargs)
    mapping = {index: i for i, index in enumerate(df.index.unique())}

    x = None
    # `None` and an empty dict both mean "no features"; an empty dict would
    # otherwise make `torch.cat` fail on an empty list.
    if encoders:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=1)

    return x, mapping

Build a bunch of encoders here. This is useful in general, but might become complex code-wise.

In [None]:
import sentence_transformers as st

In [None]:
class SequenceEncoder:
    """Encode a column of text with a ``SentenceTransformer`` model."""

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.model = st.SentenceTransformer(model_name, device=device)
        self.device = device

    @torch.no_grad()
    def __call__(self, df):
        """Return the model's encoding of ``df.values`` as a CPU tensor."""
        embeddings = self.model.encode(
            df.values,
            show_progress_bar=True,
            convert_to_tensor=True,
            device=self.device,
        )
        # Move the result to the CPU regardless of where it was computed.
        return embeddings.cpu()
    

In [None]:
class GenreEncoder:
    """Multi-hot encode a column of separator-joined labels.

    Each distinct label gets one output column. Columns are ordered by the
    sorted label names so the feature layout is the same on every run
    (iterating a plain ``set`` of strings gives no such guarantee).
    """

    def __init__(self, sep='|'):
        self.sep = sep

    @torch.no_grad()
    def __call__(self, df):
        """Return a ``(len(df), num_labels)`` tensor of zeros and ones."""
        # Split every cell once and reuse the result for both passes.
        rows = [cell.split(self.sep) for cell in df.values]

        genres = sorted({genre for row in rows for genre in row})
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, row in enumerate(rows):
            for genre in row:
                x[i, mapping[genre]] = 1
        return x


In [None]:
# Build movie features by encoding the 'title' and 'genres' columns;
# `movie_mapping` maps each movieId to a consecutive integer index.
movie_x, movie_mapping = load_node_csv(
    movie_path, index_col='movieId', encoders={
        'title': SequenceEncoder(),
        'genres': GenreEncoder()
    }
)

In [None]:
# No encoders are passed, so the feature output is None and is discarded;
# only the userId -> index mapping is kept.
_, user_mapping = load_node_csv(rating_path, index_col='userId')

Build the `HeteroData` object

In [None]:
from torch_geometric.data import HeteroData

In [None]:
data = HeteroData() 

# Users get only a node count (no feature tensor was built for them);
# movies get the encoded feature tensor.
data['user'].num_nodes = len(user_mapping)
data['movie'].x = movie_x 


display(data)

connect the users to movies according to their ratings

In [None]:
def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Read an edge table from CSV and build connectivity plus attributes.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts).
        src_index_col: Column holding the source node ids.
        src_mapping: Lookup from source id to integer node index.
        dst_index_col: Column holding the destination node ids.
        dst_mapping: Lookup from destination id to integer node index.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a 2-D tensor. The per-column results are
            concatenated along dimension 1.
        **kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        A tuple ``(edge_index, edge_attr)``. ``edge_index`` is a
        ``torch.long`` tensor with the source indices in row 0 and the
        destination indices in row 1. ``edge_attr`` is the concatenated
        attribute tensor, or ``None`` when no encoders are given.

    Raises:
        KeyError: If an id in the CSV is missing from a dict mapping.
    """
    df = pd.read_csv(path, **kwargs)

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst], dtype=torch.long)

    # Always bind the returned name; previously only `edge_attrs` was
    # initialised, so calling without encoders failed on the return line.
    edge_attr = None
    if encoders:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=1)

    return edge_index, edge_attr

    


Load additional edge-level features via encoders

In [None]:
class IdentityEncoder:
    """Pass a numeric column through as a single-column tensor."""

    def __init__(self, dtype=torch.float):
        self.dtype = dtype

    def __call__(self, df):
        """Return ``df.values`` reshaped to ``(-1, 1)`` and cast to ``dtype``."""
        column = torch.from_numpy(df.values).view(-1, 1)
        return column.to(dtype=self.dtype)

In [None]:
# Map each rating row to a (user index, movie index) pair and encode the
# 'rating' column as the edge label.
# NOTE(review): casting to torch.long truncates any fractional rating
# values - confirm this is intended for ratings.csv.
edge_index, edge_label = load_edge_csv(
    rating_path,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
    encoders={'rating': IdentityEncoder(dtype=torch.long)},
)

In [None]:
# Store the connectivity under the ('user', 'rates', 'movie') edge type.
data['user', 'rates', 'movie'].edge_index = edge_index


In [None]:
# Attach the encoded ratings as labels of the same edge type.
data['user', 'rates', 'movie'].edge_label = edge_label


In [None]:
# Show the assembled object as the cell output.
data