# Generating data for evaluation

In the evaluation, we model three types of users:
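- Uninformed user (always picks the opposite of the Nobre ranking, i.e. the lowest-scored visualization)
- 50-50 user (picks the top Nobre-ranked visualization 50% of the time and a random visualization otherwise)
- Informed user (fully aligned with the Nobre scores, i.e. always picks the highest-scored visualization)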

Given an initial set of system predicates, we will observe how the system responds to each user's feedback. This notebook generates the data for that evaluation.

In [1]:
import json
import random
import numpy as np
from vigor.visualization_types import VisualizationType

In [2]:
# Suitability scores used to rank visualizations for a generated graph.
# Layout: nobre_scores[criterion][VisualizationType][category] -> int score.
# Values in this table range from 0 to 3; generate_samples() sums the six
# per-criterion scores for each visualization and ranks by the total.
# NOTE(review): the scores presumably come from the Nobre et al. survey of
# multivariate network visualization, and 0 presumably marks "not applicable"
# (e.g. SUNBURST/TREEMAP for non-tree graphs) — confirm against the source.
nobre_scores = {
    # Graph size category ('small' / 'medium' / 'large').
    'size': {
        VisualizationType.NODE_LINK: {'small': 3, 'medium': 2, 'large': 1},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'small': 3, 'medium': 1, 'large': 1},
        VisualizationType.ADJACENCY_MATRIX: {'small': 3, 'medium': 1, 'large': 1},
        VisualizationType.QUILTS: {'small': 3, 'medium': 1, 'large': 1},
        VisualizationType.BIOFABRIC: {'small': 3, 'medium': 1, 'large': 1},
        VisualizationType.SUNBURST: {'small': 3, 'medium': 2, 'large': 1},
        VisualizationType.TREEMAP: {'small': 3, 'medium': 2, 'large': 2},
        VisualizationType.SEMANTIC_SUBSTRATES: {'small': 2, 'medium': 1, 'large': 1},
    },
    # Structural graph type ('sparse' / 'dense' / 'k-partite' / 'tree').
    'type': {
        VisualizationType.NODE_LINK: {'sparse': 3, 'dense': 1, 'k-partite': 3, 'tree': 3},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'sparse': 3, 'dense': 1, 'k-partite': 1, 'tree': 1},
        VisualizationType.ADJACENCY_MATRIX: {'sparse': 2, 'dense': 3, 'k-partite': 2, 'tree': 1},
        VisualizationType.QUILTS: {'sparse': 3, 'dense': 1, 'k-partite': 3, 'tree': 3},
        VisualizationType.BIOFABRIC: {'sparse': 3, 'dense': 3, 'k-partite': 2, 'tree': 1},
        VisualizationType.SUNBURST: {'sparse': 0, 'dense': 0, 'k-partite': 0, 'tree': 3},
        VisualizationType.TREEMAP: {'sparse': 0, 'dense': 0, 'k-partite': 0, 'tree': 3},
        VisualizationType.SEMANTIC_SUBSTRATES: {'sparse': 2, 'dense': 2, 'k-partite': 2, 'tree': 1},
    },
    # Whether all nodes share one type or there are several node types.
    'node_types': {
        VisualizationType.NODE_LINK: {'homogeneous': 3, 'heterogeneous': 2},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'homogeneous': 3, 'heterogeneous': 1},
        VisualizationType.ADJACENCY_MATRIX: {'homogeneous': 3, 'heterogeneous': 2},
        VisualizationType.QUILTS: {'homogeneous': 3, 'heterogeneous': 3},
        VisualizationType.BIOFABRIC: {'homogeneous': 3, 'heterogeneous': 3},
        VisualizationType.SUNBURST: {'homogeneous': 3, 'heterogeneous': 1},
        VisualizationType.TREEMAP: {'homogeneous': 3, 'heterogeneous': 1},
        VisualizationType.SEMANTIC_SUBSTRATES: {'homogeneous': 3, 'heterogeneous': 2},
    },
    # Number of attributes per node ('few' / 'several').
    'node_attributes': {
        VisualizationType.NODE_LINK: {'few': 2, 'several': 1},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'few': 3, 'several': 1},
        VisualizationType.ADJACENCY_MATRIX: {'few': 2, 'several': 3},
        VisualizationType.QUILTS: {'few': 3, 'several': 3},
        VisualizationType.BIOFABRIC: {'few': 3, 'several': 3},
        VisualizationType.SUNBURST: {'few': 3, 'several': 1},
        VisualizationType.TREEMAP: {'few': 3, 'several': 1},
        VisualizationType.SEMANTIC_SUBSTRATES: {'few': 2, 'several': 1},
    },
    # Whether all edges share one type or there are several edge types.
    'edge_types': {
        VisualizationType.NODE_LINK: {'homogeneous': 3, 'heterogeneous': 1},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'homogeneous': 2, 'heterogeneous': 1},
        VisualizationType.ADJACENCY_MATRIX: {'homogeneous': 3, 'heterogeneous': 2},
        VisualizationType.QUILTS: {'homogeneous': 3, 'heterogeneous': 2},
        VisualizationType.BIOFABRIC: {'homogeneous': 3, 'heterogeneous': 3},
        VisualizationType.SUNBURST: {'homogeneous': 0, 'heterogeneous': 0},
        VisualizationType.TREEMAP: {'homogeneous': 0, 'heterogeneous': 0},
        VisualizationType.SEMANTIC_SUBSTRATES: {'homogeneous': 2, 'heterogeneous': 1},
    },
    # Number of attributes per edge ('few' / 'several').
    'edge_attributes': {
        VisualizationType.NODE_LINK: {'few': 2, 'several': 1},
        VisualizationType.ATTRIBUTE_DRIVEN_NODELINK: {'few': 2, 'several': 1},
        VisualizationType.ADJACENCY_MATRIX: {'few': 3, 'several': 2},
        VisualizationType.QUILTS: {'few': 3, 'several': 3},
        VisualizationType.BIOFABRIC: {'few': 3, 'several': 3},
        VisualizationType.SUNBURST: {'few': 0, 'several': 0},
        VisualizationType.TREEMAP: {'few': 0, 'several': 0},
        VisualizationType.SEMANTIC_SUBSTRATES: {'few': 2, 'several': 1},
    },
}

In [3]:
def generate_samples(n_samples, user_type='smart', seed=None, testing=False):
    """Generate synthetic graph samples labelled with a simulated user's choice.

    For every sample a random graph profile is drawn (size, structural type,
    node/edge type and attribute counts), each visualization is scored by
    summing its ``nobre_scores`` entries for the drawn categories, and the
    visualizations are ranked by that total (ties keep ``VisualizationType``
    iteration order).

    Args:
        n_samples: Number of samples to generate.
        user_type: Simulated user behaviour.
            ``'smart'``    - picks the top-ranked visualization.
            ``'dumb'``     - ranking is reversed; picks the lowest-ranked one.
            ``'balanced'`` - with probability 0.5 picks the top-ranked
            visualization, otherwise the ranking is shuffled and the first
            entry of the shuffled ranking is picked.
        seed: Optional seed applied to both ``random`` and ``np.random`` so
            that the generated data is reproducible.
        testing: Unused; kept only for backward compatibility with callers.

    Returns:
        A list of dicts, one per sample, with keys ``"statistics"`` (the graph
        profile), ``"best_visualization"`` (enum member name) and
        ``"sorted_visualizations"`` (enum member names in the order the
        simulated user ranks them).

    Raises:
        ValueError: If ``user_type`` is not one of the supported values.
    """
    valid_user_types = ('smart', 'dumb', 'balanced')
    if user_type not in valid_user_types:
        # Fail early with a clear message instead of an UnboundLocalError on
        # ``best_visualization`` deep inside the sampling loop.
        raise ValueError(
            f"Unknown user_type {user_type!r}; expected one of {valid_user_types}"
        )

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    nobre_samples = []

    for _ in range(n_samples):
        # NOTE: the order of the random draws below is kept stable so that a
        # given seed keeps producing the same data.
        size_category = random.choice(['small', 'medium', 'large'])
        type_category = random.choice(['sparse', 'dense', 'tree', 'k-partite'])
        node_type_category = random.choice(['homogeneous', 'heterogeneous'])
        node_attributes_category = random.choice(['few', 'several'])
        edge_types_category = random.choice(['homogeneous', 'heterogeneous'])
        edge_attributes_category = random.choice(['few', 'several'])

        # Number of nodes; np.random.randint's upper bound is exclusive.
        if size_category == 'small':
            size = np.random.randint(1, 100)
        elif size_category == 'medium':
            size = np.random.randint(100, 1000)
        else:
            size = np.random.randint(1000, 2000)

        # Node and edge characteristics matching the drawn categories.
        node_types = 1 if node_type_category == 'homogeneous' else np.random.randint(2, 4)
        node_attributes = np.random.randint(1, 5) if node_attributes_category == 'few' else np.random.randint(6, 9)
        edge_types = 1 if edge_types_category == 'homogeneous' else np.random.randint(2, 4)
        # NOTE(review): randint(1, 2) always yields 1 because the upper bound
        # is exclusive — confirm whether 'few' edge attributes should span 1-2.
        edge_attributes = np.random.randint(1, 2) if edge_attributes_category == 'few' else np.random.randint(3, 5)

        # Number of edges ("volume"), bounded by the n*(n-1)/2 possible edges.
        max_edges = size * (size - 1) // 2
        if type_category == 'dense':
            # At least half of the possible edges (lower bound clamped to 1).
            lower_bound = max(1, size * (size - 1) // 4)
            upper_bound = max(lower_bound + 1, max_edges)
            volume = np.random.randint(lower_bound, upper_bound)
        elif type_category == 'sparse':
            # Fewer than half of the possible edges.
            upper_bound = max(2, size * (size - 1) // 4)
            volume = np.random.randint(1, upper_bound)
        elif type_category == 'k-partite':
            upper_bound = max(2, max_edges)
            volume = np.random.randint(1, upper_bound)
        else:
            # 'tree': a tree on n nodes has exactly n - 1 edges.
            volume = size - 1

        # A single-node graph has no possible edges; avoid dividing by zero.
        density = round(volume / max_edges, 2) if size > 1 else 0

        # Tree volume is always size - 1, so the category alone decides is_tree.
        is_tree = 1 if type_category == 'tree' else 0
        is_bipartite = 1 if type_category == 'k-partite' and node_types == 2 else 0

        # Total score per visualization: sum of its score for each criterion.
        chosen_categories = {
            'size': size_category,
            'type': type_category,
            'node_types': node_type_category,
            'node_attributes': node_attributes_category,
            'edge_types': edge_types_category,
            'edge_attributes': edge_attributes_category,
        }
        scores = {
            vis: sum(
                nobre_scores[criterion][vis][category]
                for criterion, category in chosen_categories.items()
            )
            for vis in VisualizationType
        }

        # Best first; sorted() is stable, so ties keep enum iteration order.
        sorted_visualizations = sorted(scores, key=scores.get, reverse=True)

        # Apply the simulated user's behaviour to the ranking.
        if user_type == 'dumb':
            # The uninformed user ranks everything in the opposite order.
            sorted_visualizations.reverse()
        elif user_type == 'balanced' and random.random() >= 0.5:
            # Half of the time the 50-50 user ranks at random.
            random.shuffle(sorted_visualizations)
        best_visualization = sorted_visualizations[0]

        statistics = {
            "is_bipartite": is_bipartite,
            "is_tree": is_tree,
            "size": size,
            "density": density,
            "node_types": node_types,
            "node_attributes": node_attributes,
            "edge_types": edge_types,
            "edge_attributes": edge_attributes
        }

        # Enum members are stored by name so the sample is JSON-serializable.
        nobre_samples.append({
            "statistics": statistics,
            "best_visualization": best_visualization.name,
            "sorted_visualizations": [vis.name for vis in sorted_visualizations]
        })

    return nobre_samples

In [4]:
# Generate 5000 samples for each simulated user, in the order
# informed ('smart'), uninformed ('dumb'), 50-50 ('balanced').
smart_samples, dumb_samples, fifty_fifty_samples = (
    generate_samples(5000, user_type=kind)
    for kind in ('smart', 'dumb', 'balanced')
)

In [5]:
# Persist the informed-user samples as pretty-printed JSON.
with open('../data/evaluation/informed_user.json', 'w') as f:
    f.write(json.dumps(smart_samples, indent=4))

In [6]:
# Persist the uninformed-user samples as pretty-printed JSON.
with open('../data/evaluation/uninformed_user.json', 'w') as f:
    f.write(json.dumps(dumb_samples, indent=4))

In [7]:
# Persist the 50-50-user samples as pretty-printed JSON.
with open('../data/evaluation/fifty_fifty_user.json', 'w') as f:
    f.write(json.dumps(fifty_fifty_samples, indent=4))