# Visualization of Dataset Properties

In [6]:
"""
    IMPORTING LIBS
"""
import numpy as np
import socket
import time

from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

In [7]:
import torch
from data.data import LoadData # import dataset
# Report whether PyTorch can see a CUDA device in this session.
print(torch.cuda.is_available())

True


## 1. TU Datasets Visualization Script

In [8]:
def visualize_TUs_data(DATASET_NAME):
    """Print and plot graph-size / label statistics for a TU dataset.

    The dataset is loaded with ``LoadData`` and element 0 of each of its
    ``train``, ``test`` and ``val`` splits is inspected (presumably the first
    cross-validation fold -- confirm against ``LoadData``).

    Steps:
      1. Print max / min / mean / s.d. of the node counts of all graphs.
      2. Keep only graphs with strictly fewer than ``int(mean + std)`` nodes.
      3. Show a histogram of the kept graph sizes and a 2-D histogram of
         graph label versus graph size.
      4. Print the correlation coefficient between label and size.

    Args:
        DATASET_NAME: Dataset identifier forwarded unchanged to ``LoadData``.
    """
    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    print("[I] Finished loading.")

    # Slice every split once and count nodes once per graph; both passes
    # below reuse these cached (sizes, labels) pairs.
    sized_splits = []
    for split in [dataset.train, dataset.test, dataset.val]:
        sliced = split[0][:]
        graphs, labels = sliced[0], sliced[1]
        sized_splits.append(([g.number_of_nodes() for g in graphs], labels))

    # Original Statistics
    all_sizes = [size for sizes, _ in sized_splits for size in sizes]
    orig_mean, orig_std = np.mean(all_sizes), np.std(all_sizes)
    orig_max, orig_min = np.max(all_sizes), np.min(all_sizes)

    # Size cut-off: one standard deviation above the mean, truncated to int.
    max_nodes = int(orig_mean + orig_std)
    print("Original Dataset Statistics:\n")
    print("Max nodes {}, Min nodes {}\n".format(orig_max, orig_min))
    print("Mean no. of nodes {}, S.d. {}\n".format(orig_mean, orig_std))

    # Drop the large outlier graphs so they do not dominate the plots.
    num_nodes, graph_labels = [], []
    for sizes, labels in sized_splits:
        for size, label in zip(sizes, labels):
            if size < max_nodes:
                num_nodes.append(size)
                graph_labels.append(label)
    label_bins = len(np.unique(graph_labels))

    print("VISUALIZATIONS:\nMax nodes in consideration: {}".format(max_nodes))
    plt.figure(figsize=(12, 5))

    # Left panel: one histogram bin per distinct graph size.
    plt.subplot(121)
    plt.hist(num_nodes, bins=len(np.unique(num_nodes)))
    plt.xlabel('Number of Nodes in Graph', fontsize=12)
    plt.ylabel('Count', fontsize=12)

    # Right panel: joint distribution of label (one bin per class) and size.
    plt.subplot(122)
    plt.hist2d(graph_labels, num_nodes, bins=[label_bins, 20])
    plt.xlabel(r'Graph label', fontsize=12)
    plt.ylabel(r'Graph size (number of nodes)', fontsize=12)
    plt.colorbar()
    plt.show()

    print("Correlation between graph size (number of nodes) and labels: %.2f" % np.corrcoef(graph_labels, num_nodes)[0,1])

### a. TU Dataset ENZYMES

In [9]:
# Run the TU visualization on ENZYMES.
# NOTE(review): the recorded output for this cell ends in
# "NameError: name 'format_dataset' is not defined" during loading; that name
# is not defined in this notebook -- presumably it belongs to the data module.
visualize_TUs_data('ENZYMES')

[I] Loading data (notebook) ...
[!] Dataset:  ENZYMES
[!] Splitting the data into train/val/test ...


NameError: name 'format_dataset' is not defined

### b. TU Dataset DD

In [None]:
# Run the TU visualization on DD.
visualize_TUs_data('DD')

### c. TU Dataset PROTEINS_full

In [None]:
# Run the TU visualization on PROTEINS_full.
visualize_TUs_data('PROTEINS_full')

## 2. MNIST/CIFAR10 Superpixels Dataset Visualization Script

In [None]:
def visualize_superpixels_data(DATASET_NAME):
    """Print and plot graph-size / label statistics for a superpixels dataset.

    The dataset is loaded with ``LoadData`` and its ``train``, ``test`` and
    ``val`` splits are inspected.

    Steps:
      1. Print max / min / mean / s.d. of the node counts of all graphs.
      2. Keep only graphs with strictly fewer than ``int(mean + std)`` nodes.
      3. Show a histogram of the kept graph sizes and a 2-D histogram of
         graph label versus graph size.

    Args:
        DATASET_NAME: Dataset identifier forwarded unchanged to ``LoadData``.
    """
    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    print("[I] Finished loading.")

    # Slice every split once and count nodes once per graph; both passes
    # below reuse these cached (sizes, labels) pairs.
    sized_splits = []
    for split in [dataset.train, dataset.test, dataset.val]:
        sliced = split[:]
        graphs, labels = sliced[0], sliced[1]
        sized_splits.append(([g.number_of_nodes() for g in graphs], labels))

    # Original Statistics
    all_sizes = [size for sizes, _ in sized_splits for size in sizes]
    orig_mean, orig_std = np.mean(all_sizes), np.std(all_sizes)
    orig_max, orig_min = np.max(all_sizes), np.min(all_sizes)

    # Size cut-off: one standard deviation above the mean, truncated to int.
    max_nodes = int(orig_mean + orig_std)
    print("Original Dataset Statistics:\n")
    print("Max nodes {}, Min nodes {}\n".format(orig_max, orig_min))
    print("Mean no. of nodes {}, S.d. {}\n".format(orig_mean, orig_std))

    # Drop the large outlier graphs; each label is unwrapped with .item()
    # so the plotting code receives plain Python scalars.
    num_nodes, graph_labels = [], []
    for sizes, labels in sized_splits:
        for size, label in zip(sizes, labels):
            if size < max_nodes:
                num_nodes.append(size)
                graph_labels.append(label.item())
    label_bins = len(np.unique(graph_labels))

    print("VISUALIZATIONS:\nMax nodes in consideration: {}".format(max_nodes))
    plt.figure(figsize=(12, 5))

    # Left panel: one histogram bin per distinct graph size.
    plt.subplot(121)
    plt.hist(num_nodes, bins=len(np.unique(num_nodes)))
    plt.xlabel('Number of Nodes in Graph', fontsize=12)
    plt.ylabel('Count', fontsize=12)

    # Right panel: joint distribution of label (one bin per class) and size.
    plt.subplot(122)
    plt.hist2d(graph_labels, num_nodes, bins=[label_bins, 20])
    plt.xlabel(r'Graph label', fontsize=12)
    plt.ylabel(r'Graph size (number of nodes)', fontsize=12)
    plt.colorbar()
    plt.show()

In [None]:
# Load the MNIST dataset at notebook scope and expose its three splits as
# separate globals for interactive inspection.
DATASET_NAME = 'MNIST'

print("[I] Loading data (notebook) ...")
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test
print("[I] Finished loading.")

### Superpixels Dataset MNIST

In [None]:
# Run the superpixels visualization on MNIST.
visualize_superpixels_data('MNIST')

In [None]:
# Run the superpixels visualization on CIFAR10.
visualize_superpixels_data('CIFAR10')

## 3. ZINC Molecules Dataset Visualization Script

In [None]:
def visualize_molecules_data(DATASET_NAME):
    """Print and plot graph-size / regression-score statistics for a dataset.

    The dataset is loaded with ``LoadData`` and its ``train``, ``test`` and
    ``val`` splits are inspected.

    Steps:
      1. Print max / min / mean / s.d. of the node counts of all graphs.
      2. Keep only graphs with strictly fewer than ``int(mean + std)`` nodes.
      3. Show a histogram of the kept graph sizes.
      4. Print the mean and s.d. of the kept graphs' regression scores.

    Args:
        DATASET_NAME: Dataset identifier forwarded unchanged to ``LoadData``.
    """
    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    print("[I] Finished loading.")

    # Slice every split once and count nodes once per graph; both passes
    # below reuse these cached (sizes, scores) pairs.
    sized_splits = []
    for split in [dataset.train, dataset.test, dataset.val]:
        sliced = split[:]
        graphs, scores = sliced[0], sliced[1]
        sized_splits.append(([g.number_of_nodes() for g in graphs], scores))

    # Original Statistics
    all_sizes = [size for sizes, _ in sized_splits for size in sizes]
    orig_mean, orig_std = np.mean(all_sizes), np.std(all_sizes)
    orig_max, orig_min = np.max(all_sizes), np.min(all_sizes)

    # Size cut-off: one standard deviation above the mean, truncated to int.
    max_nodes = int(orig_mean + orig_std)
    print("Original Dataset Statistics:\n")
    print("Max nodes {}, Min nodes {}\n".format(orig_max, orig_min))
    print("Mean no. of nodes {}, S.d. {}\n".format(orig_mean, orig_std))

    # Drop the large outlier graphs; each score is unwrapped with .item()
    # so np.mean / np.std below receive plain Python scalars.
    num_nodes, graph_scores = [], []
    for sizes, scores in sized_splits:
        for size, score in zip(sizes, scores):
            if size < max_nodes:
                num_nodes.append(size)
                graph_scores.append(score.item())

    print("VISUALIZATIONS:\nMax nodes in consideration: {}".format(max_nodes))
    plt.figure(figsize=(12, 5))

    # Only the left half of the 1x2 grid is drawn: one bin per distinct size.
    plt.subplot(121)
    plt.hist(num_nodes, bins=len(np.unique(num_nodes)))
    plt.xlabel('Number of Nodes in Graph', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.show()

    print("Mean of graph regression scores and s.d. [{:.4f}] (+/-) [{:.4f}]".format(np.mean(graph_scores), np.std(graph_scores)))

In [None]:
# Run the molecules visualization on ZINC.
visualize_molecules_data('ZINC')

## 4. VOC Superpixels Dataset Visualization Script

In [None]:
def visualize_superpixels_node_data(DATASET_NAME):
    """Print and plot graph-size / label-diversity statistics for a dataset.

    The dataset is loaded with ``LoadData`` and its ``train``, ``test`` and
    ``val`` splits are inspected.

    Steps:
      1. Print max / min / mean / s.d. of the node counts of all graphs.
      2. Keep only graphs with strictly fewer than ``int(mean + std)`` nodes.
      3. Show a histogram of the kept graph sizes and a histogram of the
         number of distinct values in each kept graph's label entry
         (presumably per-node class labels -- confirm against ``LoadData``).

    Args:
        DATASET_NAME: Dataset identifier forwarded unchanged to ``LoadData``.
    """
    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    print("[I] Finished loading.")

    # Slice every split once and count nodes once per graph; both passes
    # below reuse these cached (sizes, labels) pairs.
    sized_splits = []
    for split in [dataset.train, dataset.test, dataset.val]:
        sliced = split[:]
        graphs, labels = sliced[0], sliced[1]
        sized_splits.append(([g.number_of_nodes() for g in graphs], labels))

    # Original Statistics
    all_sizes = [size for sizes, _ in sized_splits for size in sizes]
    orig_mean, orig_std = np.mean(all_sizes), np.std(all_sizes)
    orig_max, orig_min = np.max(all_sizes), np.min(all_sizes)

    # Size cut-off: one standard deviation above the mean, truncated to int.
    max_nodes = int(orig_mean + orig_std)
    print("Original Dataset Statistics:\n")
    print("Max nodes {}, Min nodes {}\n".format(orig_max, orig_min))
    print("Mean no. of nodes {}, S.d. {}\n".format(orig_mean, orig_std))

    # Drop the large outlier graphs and record, per kept graph, how many
    # distinct label values it contains.
    num_nodes, graph_node_labels = [], []
    for sizes, labels in sized_splits:
        for size, label in zip(sizes, labels):
            if size < max_nodes:
                num_nodes.append(size)
                graph_node_labels.append(len(np.unique(label)))

    print("VISUALIZATIONS:\nMax nodes in consideration: {}".format(max_nodes))
    plt.figure(figsize=(12, 5))

    # Left panel: one histogram bin per distinct graph size.
    plt.subplot(121)
    plt.hist(num_nodes, bins=len(np.unique(num_nodes)))
    plt.xlabel('Number of Nodes in Graph', fontsize=12)
    plt.ylabel('Count', fontsize=12)

    # Right panel: one histogram bin per distinct unique-label count.
    plt.subplot(122)
    plt.hist(graph_node_labels, bins=len(np.unique(graph_node_labels)))
    plt.xlabel('Number Unique Node Labels in Graph', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.show()

In [None]:
# Run the node-label superpixels visualization on VOC.
visualize_superpixels_node_data('VOC')