In [None]:
# Mounting Google Drive to load dataset
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My\ Drive/Deep\ Learning/

Mounted at /content/gdrive
/content/gdrive/My Drive/Deep Learning


# Setup

In [None]:
import torch
torch.__version__

'2.0.1+cu118'

In [None]:
!pip install torch_geometric rdflib --quiet
# Optional dependencies:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.1+cu118.html --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/661.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   

In [None]:
!pip install catboost --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
## edited entities.py from PyG that also saves node_features
import logging
import os
import os.path as osp
from collections import Counter
from typing import Callable, List, Optional

import numpy as np
import torch

from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_tar,
)


class Entities(InMemoryDataset):
    url = 'https://data.dgl.ai/dataset/{}.tgz'

    def __init__(self, root: str, name: str,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        self.name = name.lower()
        assert self.name in ['aifb', 'am', 'mutag', 'bgs']
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self) -> str:
        return osp.join(self.root, self.name, 'raw')

    @property
    def processed_dir(self) -> str:
        return osp.join(self.root, self.name, 'processed')

    @property
    def num_relations(self) -> int:
        return self.data.edge_type.max().item() + 1

    @property
    def num_classes(self) -> int:
        return self.data.train_y.max().item() + 1

    @property
    def raw_file_names(self) -> List[str]:
        return [
            f'{self.name}_stripped.nt.gz',
            'completeDataset.tsv',
            'trainingSet.tsv',
            'testSet.tsv',
        ]

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self):
        path = download_url(self.url.format(self.name), self.root)
        extract_tar(path, self.raw_dir)
        os.unlink(path)

    def process(self):
        import gzip

        import pandas as pd
        import rdflib as rdf

        graph_file, task_file, train_file, test_file = self.raw_paths

        with hide_stdout():
            g = rdf.Graph()
            with gzip.open(graph_file, 'rb') as f:
                g.parse(file=f, format='nt')

        freq = Counter(g.predicates())

        relations = sorted(set(g.predicates()), key=lambda p: -freq.get(p, 0))
        subjects = set(g.subjects())
        objects = set(g.objects())
        nodes = list(subjects.union(objects))

        N = len(nodes)
        R = 2 * len(relations)

        relations_dict = {rel: i for i, rel in enumerate(relations)}
        nodes_dict = {node: i for i, node in enumerate(nodes)}

        edges = []
        node_features = {}
        for s, p, o in g.triples((None, None, None)):
            src, dst, rel = nodes_dict[s], nodes_dict[o], relations_dict[p]
            edges.append([src, dst, 2 * rel])
            edges.append([dst, src, 2 * rel + 1])

            # SAVE LITERAL VALUES
            if isinstance(o, rdf.Literal):
                literal_type = p#type(o.value)
                if literal_type not in node_features:
                    node_features[literal_type] = [[dst], [o.value]]
                else:
                    node_features[literal_type][0].append(dst)
                    node_features[literal_type][1].append(o.value)

        edges = torch.tensor(edges, dtype=torch.long).t().contiguous()
        perm = (N * R * edges[0] + R * edges[1] + edges[2]).argsort()
        edges = edges[:, perm]

        edge_index, edge_type = edges[:2], edges[2]

        if self.name == 'am':
            label_header = 'label_cateogory'
            nodes_header = 'proxy'
        elif self.name == 'aifb':
            label_header = 'label_affiliation'
            nodes_header = 'person'
        elif self.name == 'mutag':
            label_header = 'label_mutagenic'
            nodes_header = 'bond'
        elif self.name == 'bgs':
            label_header = 'label_lithogenesis'
            nodes_header = 'rock'

        labels_df = pd.read_csv(task_file, sep='\t')
        labels_set = set(labels_df[label_header].values.tolist())
        labels_dict = {lab: i for i, lab in enumerate(list(labels_set))}
        nodes_dict = {np.unicode(key): val for key, val in nodes_dict.items()}

        train_labels_df = pd.read_csv(train_file, sep='\t')
        train_indices, train_labels = [], []
        for nod, lab in zip(train_labels_df[nodes_header].values,
                            train_labels_df[label_header].values):
            train_indices.append(nodes_dict[nod])
            train_labels.append(labels_dict[lab])

        train_idx = torch.tensor(train_indices, dtype=torch.long)
        train_y = torch.tensor(train_labels, dtype=torch.long)

        test_labels_df = pd.read_csv(test_file, sep='\t')
        test_indices, test_labels = [], []
        for nod, lab in zip(test_labels_df[nodes_header].values,
                            test_labels_df[label_header].values):
            test_indices.append(nodes_dict[nod])
            test_labels.append(labels_dict[lab])

        test_idx = torch.tensor(test_indices, dtype=torch.long)
        test_y = torch.tensor(test_labels, dtype=torch.long)

        data = Data(edge_index=edge_index, edge_type=edge_type,
                    train_idx=train_idx, train_y=train_y, test_idx=test_idx,
                    test_y=test_y, num_nodes=N, node_features=node_features)

        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self) -> str:
        return f'{self.name.upper()}{self.__class__.__name__}()'


class hide_stdout(object):
    def __enter__(self):
        self.level = logging.getLogger().level
        logging.getLogger().setLevel(logging.ERROR)

    def __exit__(self, *args):
        logging.getLogger().setLevel(self.level)

In [None]:
import rdflib
dataset = Entities("./datasets", "MUTAG")
# Dataset is a list of Graphs (but with only one element??)
print(dataset[0])
data = dataset[0]
charge = rdflib.URIRef("http://dl-learner.org/carcinogenesis#charge")
data['node_features'].keys()
#data['node_features'][charge][1]

Data(
  edge_index=[2, 148454],
  edge_type=[148454],
  train_idx=[272],
  train_y=[272],
  test_idx=[68],
  test_y=[68],
  num_nodes=23644,
  node_features={
    http://dl-learner.org/carcinogenesis#charge=[2],
    http://dl-learner.org/carcinogenesis#cytogen_ca=[2],
    http://dl-learner.org/carcinogenesis#salmonella=[2],
    http://dl-learner.org/carcinogenesis#salmonella_n=[2],
    http://dl-learner.org/carcinogenesis#cytogen_sce=[2],
    http://dl-learner.org/carcinogenesis#amesTestPositive=[2],
    http://dl-learner.org/carcinogenesis#mouse_lymph=[2],
    http://dl-learner.org/carcinogenesis#chromex=[2],
    http://dl-learner.org/carcinogenesis#micronuc_m=[2],
    http://dl-learner.org/carcinogenesis#drosophila_rt=[2],
    http://dl-learner.org/carcinogenesis#drosophila_slrl=[2],
    http://dl-learner.org/carcinogenesis#chromaberr=[2],
    http://dl-learner.org/carcinogenesis#salmonella_reduc=[2],
    http://dl-learner.org/carcinogenesis#micronuc_f=[2]
  }
)


dict_keys([rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#charge'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#cytogen_ca'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#salmonella'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#salmonella_n'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#cytogen_sce'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#amesTestPositive'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#mouse_lymph'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#chromex'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#micronuc_m'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#drosophila_rt'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#drosophila_slrl'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#chromaberr'), rdflib.term.URIRef('http://dl-learner.org/carcinogenesis#salmonella_reduc'), rdflib.term.URIRef('http://dl-learner.org/carcino

In [None]:
edge_index = data.edge_index
edge_type = data.edge_type
num_relations = (edge_type.max()+1).item()
num_nodes = (edge_index.max()+1).item()

In [None]:
print(edge_index.size())
print(num_nodes)

torch.Size([2, 148454])
23644


In [None]:
import rdflib

charge = rdflib.URIRef("http://dl-learner.org/carcinogenesis#charge")
filtered_node_features = {}
filtered_node_features[0] = [None, None]
filtered_node_features[0][0] = torch.tensor(
    data.node_features[charge][0], dtype=torch.long
)
filtered_node_features[0][1] = torch.tensor(
    data.node_features[charge][1], dtype=torch.float32
).reshape(-1, 1)

In [None]:
num_features = 0
for type, (idx, features) in filtered_node_features.items():
    num_features = max(num_features, features.size(1))

torch.Size([9189])
torch.Size([9189, 1])


In [None]:
node_type = torch.zeros(num_nodes)
node_features = torch.zeros(num_nodes, num_features)
for type, (idx, features) in filtered_node_features.items():
    node_type[idx] = type
    node_features[idx,:features.size(1)] = features

In [None]:
# Dataset: predict if a cell (should be atom right?) is prone to (mute/produce ? any of that) a carcinome.
print(num_features) # number of features of a node, in here just a single one for demonstration: charge, that has a single value
print(node_type.size()) # same as above, all node in this test are picked to have the charge value, so a tensor with size of the number of nodes, for each node we have its type (what kind of features it has?)
print(node_features.size()) # for each node it shows its corresponded charge value i.e. its feature to be embedded
print(num_relations) # number of relations between nodes i.e. types of edges
print(num_nodes) # number of nodes
print(edge_index.size()) # the alternative of the adjacency matrix, each index (entry) is an edge, that shows the 2 nodes that it connects
print(edge_type.size()) # for each edge it shows its type

1
torch.Size([23644])
torch.Size([23644, 1])
46
23644
torch.Size([2, 148454])
torch.Size([148454])


# Network

In [None]:
import torch
import torch.nn as nn
from torch_geometric.nn.inits import glorot
from torch_geometric.nn.conv.rgcn_conv import masked_edge_index
#from torch_sparse import SparseTensor

class GCNEncoder(nn.Module):
    def __init__(self, input_size, num_nodes, seed = 42, device = "cpu"):
        super().__init__()
        self.input_size = input_size
        self.num_nodes = num_nodes
        self.seed = seed
        self.device = device

    def forward(self, nodes, node_type):
        #Set the random set
        torch.manual_seed(self.seed)

        #Initialize embeddings tensor with zeros
        embeddings = torch.zeros((self.num_nodes, self.input_size), device=self.device)

        #Initialize the embeddings with normal distribution
        torch.nn.init.normal_(embeddings, mean=0.0, std=1/self.input_size)

        types = node_type.unique()

        #Iterate over each node type
        for type in types:

            # Set the random seed for the current node type
            torch.manual_seed(self.seed+type)

            # Initialize node transform tensor with zeros
            node_transform = torch.zeros((nodes.size(1), self.input_size), device=self.device)

            # Initialize the node transform with normal distribution
            torch.nn.init.normal_(node_transform, mean=0.0, std=1/self.input_size)

            # Calculate the embeddings for nodes of the current type
            embeddings[node_type==type,:] = (
                nodes[node_type==type,:]/(float(
                nodes[node_type==type,:].size(1))**(1/2)))@node_transform

        return embeddings

class RGCNConvolution(nn.Module):
    def __init__(self, input_size, output_size, num_relations, seed, device="cpu"):
        super().__init__()
        self.device = device
        self.input_size = input_size
        self.output_size = output_size
        self.num_relations = num_relations
        self.seed = seed
        torch.manual_seed(seed)
        self.seeds = torch.randint(high=100*num_relations, size=(num_relations,))
        self.root_seed = torch.randint(high=100*num_relations, size=(1,)).item()

    def forward(self, node_features, edge_index, edge_type):
        size = node_features.size()

        output = torch.zeros((size[0], self.output_size), device=self.device)

        #For each relation, computes node representation as average of neighbours' node representations
        for i in range(self.num_relations):
            torch.manual_seed(self.seeds[i])
            weight_matrix = torch.zeros((self.input_size, self.output_size), device=self.device)
            glorot(weight_matrix)

            # Mask the edge index based on the current relation
            mask = masked_edge_index(edge_index, edge_type==i)

            # Create an adjacency matrix based on the masked edge index
            adj = torch.sparse_coo_tensor(edge_index, torch.ones(edge_index.shape[-1], device=edge_index.device), size=(size[0], size[0]))

            # Check if adjacency matrix or node features are None
            if adj is None:
                print("adj is none")
            if node_features is None:
                print("node_features is none")
            if weight_matrix is None:
                print("weight_matrix is none")

            # Compute the average of neighbors' node representations
            neig_avg = adj @ node_features @ weight_matrix
            neig_avg /= adj.sum(1).to_dense().reshape(-1,1)

            # Accumulate the neighbor averages to the output
            output += neig_avg
            del weight_matrix

        torch.manual_seed(self.root_seed)
        weight_root = torch.zeros((self.input_size, self.output_size), device=self.device)
        glorot(weight_root)

        # Compute the contribution of the node itself
        if node_features.dtype == torch.long:
            output += weight_root[node_features]
        else:
            output += node_features @ weight_root

        return output

class RRGCN(nn.Module):
    def __init__(self, num_nodes, num_layers, num_relations, input_size, seed=42, device = "cpu"):
        super().__init__()
        self.device = device
        self.num_nodes = num_nodes
        self.num_relations = num_relations
        self.input_size = input_size
        self.seed = seed
        self.relu = nn.ReLU()

        # Initialize RGCN layers
        self.layers = [RGCNConvolution(input_size, input_size, num_relations, seed, device=self.device) for _ in range(num_layers)]

        #Initialize GCN encoder
        self.encoder = GCNEncoder(input_size, num_nodes, seed, device = self.device)

    def ppv(self, nodes, edge_index):

        #Create of tensors adj and x_positive
        adjacency = torch.sparse_coo_tensor(edge_index, torch.ones(edge_index.size(1)))
        nodes = (nodes>0).float()

        #Calculate the proportion of positive values for each representation dimension
        ppv_output = adjacency @ (nodes/nodes.sum(0).reshape(1,-1))

        return ppv_output

    def forward(self, nodes, node_type, edge_index, edge_type):

        #Pass nodes through the GCN encoder to obtain initial node representations
        output = self.encoder(nodes, node_type)

        # Apply the first RGCN layer with ReLU activation function
        output = self.relu(self.layers[0](output, edge_index, edge_type))
        ppv_output = self.ppv(output, edge_index)

        # Iterate through the remaining RGCN layers
        for layer in self.layers[1:]:
            output = self.relu(layer(output, edge_index, edge_type))
            ppv_output = layer(ppv_output, edge_index, edge_type)
            ppv_output = self.ppv(ppv_output, edge_index)

        # Concatenate the final output with the ppv_output
        return torch.hstack([output, ppv_output])

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Data computation will be on ", device)

#Create an istance of the RRGCN model
model = RRGCN(num_nodes, 2, num_relations, 16, device=device)

Data computation will be on  cpu


In [None]:
X = model(node_features, node_type, edge_index, edge_type)

In [None]:
X_train = X[data.train_idx]
y_train = data.train_y
X_test = X[data.test_idx]
y_test = data.test_y

In [None]:
print("X_train size: ", X_train.size())
print("y_train size: ", y_train.size())
print("X_test size: ", X_test.size())
print("y_test size: ", y_test.size())

X_train size:  torch.Size([272, 32])
y_train size:  torch.Size([272])
X_test size:  torch.Size([68, 32])
y_test size:  torch.Size([68])


# Metrics

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Perform a grid search using cross-validation to find the optimal n_estimators value
grid_search = GridSearchCV(GradientBoostingClassifier(), {'n_estimators':[100]})
grid_search.fit(X_train, y_train)

In [None]:
# Use the best estimator to make predictions on the test data
y_test_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        45
           1       0.64      0.61      0.62        23

    accuracy                           0.75        68
   macro avg       0.72      0.72      0.72        68
weighted avg       0.75      0.75      0.75        68



In [None]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(
    iterations=1000,
    early_stopping_rounds=100,
    task_type="CPU",
    random_seed=42,
    use_best_model=True,
    auto_class_weights="Balanced",
)
clf = clf.fit(
    X_train.cpu().numpy(),
    y_train.cpu().numpy(),
    eval_set=(X_test.cpu().numpy(), y_test.cpu().numpy()),
)

Learning rate set to 0.022975
0:	learn: 0.6832749	test: 0.6859207	best: 0.6859207 (0)	total: 51.8ms	remaining: 51.7s
1:	learn: 0.6733097	test: 0.6777481	best: 0.6777481 (1)	total: 55.4ms	remaining: 27.6s
2:	learn: 0.6648642	test: 0.6718404	best: 0.6718404 (2)	total: 60.9ms	remaining: 20.2s
3:	learn: 0.6563535	test: 0.6670743	best: 0.6670743 (3)	total: 64.5ms	remaining: 16.1s
4:	learn: 0.6471435	test: 0.6617362	best: 0.6617362 (4)	total: 68.4ms	remaining: 13.6s
5:	learn: 0.6402463	test: 0.6576264	best: 0.6576264 (5)	total: 72ms	remaining: 11.9s
6:	learn: 0.6344184	test: 0.6528449	best: 0.6528449 (6)	total: 75.6ms	remaining: 10.7s
7:	learn: 0.6278410	test: 0.6477428	best: 0.6477428 (7)	total: 79.2ms	remaining: 9.82s
8:	learn: 0.6214929	test: 0.6416257	best: 0.6416257 (8)	total: 82.6ms	remaining: 9.09s
9:	learn: 0.6145423	test: 0.6387231	best: 0.6387231 (9)	total: 86.5ms	remaining: 8.56s
10:	learn: 0.6071049	test: 0.6341614	best: 0.6341614 (10)	total: 89.9ms	remaining: 8.08s
11:	learn: 0.

In [None]:
print(classification_report(clf.predict(X_test.cpu().numpy()), data.test_y.cpu().numpy()))

              precision    recall  f1-score   support

           0       0.84      0.79      0.82        48
           1       0.57      0.65      0.60        20

    accuracy                           0.75        68
   macro avg       0.70      0.72      0.71        68
weighted avg       0.76      0.75      0.75        68

