# Data Generation

This script generates the data used to evaluate the model. Each generated sample is scored with the rating system defined by Nobre et al. [1].

[1] Nobre, C., Meyer, M., Streit, M., & Lex, A. (2019, June). The state of the art in visualizing multivariate networks. In Computer Graphics Forum (Vol. 38, No. 3, pp. 807-832).

In [17]:
import json
import random
import numpy as np
from collections import Counter

In [18]:
def _score_table(levels, rows):
    """Turn labelled rating rows into ``{visualization: {level: rating}}``.

    ``levels`` is the column header; each row is the visualization name
    followed by one rating per level. Row order is preserved as the key order.
    """
    return {vis: dict(zip(levels, ratings)) for vis, *ratings in rows}


# Ratings (0-3 in this table) of each visualization technique for every level
# of six dataset criteria, after the scoring system of Nobre et al. [1].
# generate_samples() adds up one rating per criterion for each technique.
# The row order is the same in every criterion and is significant: it is the
# order in which generate_samples() iterates the techniques and breaks ties.
# NOTE(review): 0 presumably marks "not applicable" - confirm against [1].
nobre_scores = {
    'size': _score_table(
        ('small', 'medium', 'large'),
        [
            ('nodelink topology driven', 3, 2, 1),
            ('nodelink attribute driven faceting', 3, 1, 1),
            ('nodelink attribute driven positioning', 3, 1, 1),
            ('adjacency matrix', 3, 1, 1),
            ('quilts', 3, 1, 1),
            ('biofabric', 3, 1, 1),
            ('sunburst', 3, 2, 1),
            ('treemap', 3, 2, 2),
        ],
    ),
    'type': _score_table(
        ('sparse', 'dense', 'k-partite', 'tree'),
        [
            ('nodelink topology driven', 3, 1, 3, 3),
            ('nodelink attribute driven faceting', 3, 1, 3, 1),
            ('nodelink attribute driven positioning', 3, 1, 1, 1),
            ('adjacency matrix', 2, 3, 2, 1),
            ('quilts', 3, 1, 3, 3),
            ('biofabric', 3, 3, 2, 1),
            ('sunburst', 0, 0, 0, 3),
            ('treemap', 0, 0, 0, 3),
        ],
    ),
    'node_types': _score_table(
        ('homogeneous', 'heterogeneous'),
        [
            ('nodelink topology driven', 3, 2),
            ('nodelink attribute driven faceting', 3, 3),
            ('nodelink attribute driven positioning', 3, 1),
            ('adjacency matrix', 3, 2),
            ('quilts', 3, 3),
            ('biofabric', 3, 3),
            ('sunburst', 3, 1),
            ('treemap', 3, 1),
        ],
    ),
    'node_attributes': _score_table(
        ('few', 'several'),
        [
            ('nodelink topology driven', 2, 1),
            ('nodelink attribute driven faceting', 3, 1),
            ('nodelink attribute driven positioning', 3, 1),
            ('adjacency matrix', 2, 3),
            ('quilts', 3, 3),
            ('biofabric', 3, 3),
            ('sunburst', 3, 1),
            ('treemap', 3, 1),
        ],
    ),
    'edge_types': _score_table(
        ('homogeneous', 'heterogeneous'),
        [
            ('nodelink topology driven', 3, 1),
            ('nodelink attribute driven faceting', 2, 1),
            ('nodelink attribute driven positioning', 2, 1),
            ('adjacency matrix', 3, 2),
            ('quilts', 3, 2),
            ('biofabric', 3, 3),
            ('sunburst', 0, 0),
            ('treemap', 0, 0),
        ],
    ),
    'edge_attributes': _score_table(
        ('few', 'several'),
        [
            ('nodelink topology driven', 2, 1),
            ('nodelink attribute driven faceting', 2, 1),
            ('nodelink attribute driven positioning', 2, 1),
            ('adjacency matrix', 3, 2),
            ('quilts', 3, 3),
            ('biofabric', 3, 3),
            ('sunburst', 0, 0),
            ('treemap', 0, 0),
        ],
    ),
}

In [19]:
def generate_samples(n_samples, seed=None, testing=False):
    """Generate random network descriptions paired with a visualization.

    For every sample a level is drawn for each criterion in ``nobre_scores``
    (size, type, node/edge types, node/edge attributes), concrete statistics
    are drawn to match those levels, and every visualization is scored by
    summing its ``nobre_scores`` ratings for the drawn levels.

    Args:
        n_samples: Number of samples to generate.
        seed: If not ``None``, seeds both the global ``random`` and the global
            ``np.random`` generators before sampling.
        testing: If ``False`` (default), the recorded visualization is picked
            uniformly at random and ``user_feedback`` maps its score rank
            linearly onto [-5, 5] (top rank -> 5, bottom rank -> -5). If
            ``True``, the visualization is picked at random among the
            top-scoring ones, ``user_feedback`` is 5 and
            ``sorted_visualizations`` is empty.

    Returns:
        A list of ``n_samples`` dicts with the keys ``domain``,
        ``statistics``, ``visualization``, ``user_feedback`` and
        ``sorted_visualizations``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # The visualization order comes from the score table; it fixes the
    # iteration order and how ties are broken when ranking.
    visualizations = list(nobre_scores['size'])
    max_rank = len(visualizations) - 1
    nobre_samples = []

    for _ in range(n_samples):
        size_category = random.choice(['small', 'medium', 'large'])
        type_category = random.choice(['sparse', 'dense', 'tree', 'k-partite'])
        node_type_category = random.choice(['homogeneous', 'heterogeneous'])
        node_attributes_category = random.choice(['few', 'several'])
        edge_types_category = random.choice(['homogeneous', 'heterogeneous'])
        edge_attributes_category = random.choice(['few', 'several'])

        # np.random.randint excludes the upper bound, so the node counts are
        # 1-99 (small), 100-999 (medium) and 1000-1999 (large).
        if size_category == 'small':
            size = np.random.randint(1, 100)
        elif size_category == 'medium':
            size = np.random.randint(100, 1000)
        else:
            size = np.random.randint(1000, 2000)

        if node_type_category == 'homogeneous':
            node_types = 1
        else:
            node_types = np.random.randint(2, 4)  # 2 or 3

        if node_attributes_category == 'few':
            node_attributes = np.random.randint(1, 5)  # 1-4
        else:
            node_attributes = np.random.randint(6, 9)  # 6-8

        if edge_types_category == 'homogeneous':
            edge_types = 1
        else:
            edge_types = np.random.randint(2, 4)  # 2 or 3

        if edge_attributes_category == 'few':
            # NOTE(review): with an exclusive upper bound this always yields
            # 1 - confirm whether 1-2 was intended.
            edge_attributes = np.random.randint(1, 2)
        else:
            edge_attributes = np.random.randint(3, 5)  # 3 or 4

        # Edge count of a complete undirected graph on `size` nodes.
        max_edges = size * (size - 1) // 2

        if type_category == 'dense':
            lower_bound = max(1, size * (size - 1) // 4)
            upper_bound = max(lower_bound + 1, max_edges)
            volume = np.random.randint(lower_bound, upper_bound)
        elif type_category == 'sparse':
            upper_bound = max(2, size * (size - 1) // 4)
            volume = np.random.randint(1, upper_bound)
        elif type_category == 'k-partite':
            upper_bound = max(2, max_edges)
            volume = np.random.randint(1, upper_bound)
        else:  # 'tree'
            volume = size - 1

        # The bounds above are floored so randint always has a non-empty
        # range, which let a single-node graph end up with one edge. Cap the
        # count afterwards; the draw still happens, so the np.random stream
        # (and every other seeded sample) is unaffected.
        volume = min(volume, max_edges)

        if size > 1:
            density = round(volume / max_edges, 2)
        else:
            density = 0

        levels = {
            'size': size_category,
            'type': type_category,
            'node_types': node_type_category,
            'node_attributes': node_attributes_category,
            'edge_types': edge_types_category,
            'edge_attributes': edge_attributes_category,
        }
        scores = {
            vis: sum(
                nobre_scores[criterion][vis][level]
                for criterion, level in levels.items()
            )
            for vis in visualizations
        }

        if not testing:
            best_visualization = random.choice(visualizations)
            # sorted() is stable, so equally scored visualizations keep their
            # table order and therefore receive distinct ranks.
            sorted_visualizations = sorted(
                visualizations, key=scores.get, reverse=True
            )
            rank = sorted_visualizations.index(best_visualization)
            user_feedback = round(-5 + 10 * (max_rank - rank) / max_rank, 2)
        else:
            top_score = max(scores.values())
            best_visualization = random.choice(
                [vis for vis, score in scores.items() if score == top_score]
            )
            sorted_visualizations = []
            user_feedback = 5

        # Kept as a random.choice call (rather than a constant) so the
        # sequence of calls on the `random` generator is unchanged.
        domain = random.choice(['general'])

        statistics = {
            "size": size,
            "density": density,
            "volume": volume,
            "node_types": node_types,
            "node_attributes": node_attributes,
            "edge_types": edge_types,
            "edge_attributes": edge_attributes
        }

        nobre_samples.append({
            "domain": domain,
            "statistics": statistics,
            "visualization": best_visualization,
            "user_feedback": user_feedback,
            "sorted_visualizations": sorted_visualizations
        })

    return nobre_samples


In [20]:
# Build the dataset: 80,000 samples in the default (non-testing) mode, with a
# fixed seed so the run is repeatable.
interactions = generate_samples(n_samples=80_000, seed=42)

In [21]:
# Write the generated samples as indented JSON. The encoding is given
# explicitly so the file does not depend on the platform's locale default.
with open('../data/interactions.json', 'w', encoding='utf-8') as out_file:
    json.dump(interactions, out_file, indent=4)