# Simulation of User Preferences

In [1]:
import os
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from vigor import generate_graphs, nobre_predicates as predicates, compute_metrics, Predicate, VIGOR, label_graphs, learn_predicates

## Generate graphs

1. Generate graphs using the fast_gnp_random_graph function from networkx
2. Calculate statistics for each graph

In [2]:
file_path = '../data/generated_graphs_example.csv'
# all_graphs = generate_graphs(1000, 2, 200, file_path=file_path)
all_graphs = pd.read_csv(f'{file_path}')

In [3]:
graphs = all_graphs.sample(frac=.8, random_state=0)
test_graphs = all_graphs.loc[~all_graphs.index.isin(graphs.index)]

In [4]:
graphs.head()

Unnamed: 0,graph_type,is_directed_int,has_spatial_attributes,has_temporal_attributes,is_bipartite,n_components,avg_betweenness_centrality,avg_closeness_centrality,avg_eigenvector_centrality,avg_degree,...,eccentricity_avg,n_nodes,node_types,node_attributes,number_of_isolates,density,edge_types,edge_attributes,n_parallel_edges,n_self_loops
1144,4,0,1,0,1,6,0.003506,0.613409,0.073731,66.769231,...,2.0,182,2,12,0,0.368891,5,4,1,1
990,4,0,1,0,0,2,0.004779,0.60125,0.083575,47.007092,...,2.0,141,5,11,0,0.335765,3,12,5,5
478,2,0,0,1,0,8,0.227273,0.305556,0.288675,2.0,...,6.0,12,1,14,0,0.181818,1,9,3,4
161,4,0,0,1,0,4,0.018649,0.49649,0.125289,9.192982,...,3.035088,57,1,10,0,0.16416,2,14,5,3
1153,4,0,0,0,0,1,0.004753,0.560658,0.076448,35.856287,...,2.035928,167,3,5,0,0.216002,1,2,4,3


## Sample designers

We evaluate the ability of VIGOR to recover the rules that were used to represent simulated users. We design 3 versions of this simulated user: bob_informed, who follows the rules 100\% of the time; bob_semi_informed, who follows the rules 75\% of the time and chooses other visualizations randomly the other 25\% of the time; and bob_uninformed who follows the rules 50\% of the time and chooses other visualizations randomly the other 50\% of the time.

In [5]:
informed = label_graphs(graphs, predicates, conformance=1)
semi_informed = label_graphs(graphs, predicates, conformance=0.75)
uninformed = label_graphs(graphs, predicates, conformance=0.5)

In [6]:
informed

1144    BIOFABRIC
990     BIOFABRIC
478        QUILTS
161     BIOFABRIC
1153       QUILTS
          ...    
414     BIOFABRIC
662     BIOFABRIC
1206    BIOFABRIC
1187    BIOFABRIC
1027       MATRIX
Length: 974, dtype: object

### Learning predicates from labeled data

In [7]:
graphs['graph_type'].value_counts()

graph_type
4    657
3    160
2     83
1     74
Name: count, dtype: int64

In [8]:
graph_types = graphs['graph_type'].unique()
graphs['graph_type_' + pd.Series(graph_types).astype(str)] = (graphs['graph_type'].values[None] == graph_types[:,None]).astype(int).T
graphs = graphs.drop('graph_type', axis=1)

In [9]:
nan_counts = graphs.isna().sum()
graphs = graphs.drop('assortativity', axis=1)

In [10]:
learned_predicates_informed = learn_predicates(graphs, informed, 1000)
learned_predicates_semi_informed = learn_predicates(graphs, semi_informed, 1000)
learned_predicates_uninformed = learn_predicates(graphs, uninformed, 1000)

Learning predicates for BIOFABRIC
[   0] loss 17.0614070892334
[ 100] loss 15.621204376220703
[ 200] loss 14.260096549987793
[ 300] loss 12.80773639678955
[ 400] loss 11.236026763916016
[ 500] loss 9.845702171325684
[ 600] loss 8.477493286132812
[ 700] loss 7.0778703689575195
[ 800] loss 5.5895609855651855
[ 900] loss 3.8505022525787354
[   0] loss 4.501351833343506
[ 100] loss 4.122861862182617
[ 200] loss 3.7623157501220703
[ 300] loss 3.403964042663574
[ 400] loss 3.040868043899536
[ 500] loss 2.6698856353759766
[ 600] loss 2.2911109924316406
[ 700] loss 1.909831166267395
[ 800] loss 1.5332194566726685
[ 900] loss 1.161286473274231
Learning predicates for QUILTS
[   0] loss 4.221377372741699
[ 100] loss 3.863262176513672
[ 200] loss 3.524844169616699
[ 300] loss 3.1887197494506836
[ 400] loss 2.847761631011963
[ 500] loss 2.499769926071167
[ 600] loss 2.1457884311676025
[ 700] loss 1.7909520864486694
[ 800] loss 1.4409523010253906
[ 900] loss 1.096062183380127
[   0] loss 17.4466457

### Comparing learned predicates to initial predicates

#### Informed User

In [11]:
test_informed = label_graphs(test_graphs, predicates, conformance=1)
evaluation_informed = compute_metrics(predicates, learned_predicates_informed, graphs, test_graphs, informed, test_informed)
print(evaluation_informed)

{'BIOFABRIC': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_2': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}, 'graph_type_1': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.2063655, 0.       ]), 'recall': array([1., 0.]), 'f1': array([0.34212766, 0.        ]), 'accuracy': np.float64(0.20636550308008214)}, 'generalize': {'precision': array([0.19672131, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.32876712, 0.        ]), 'accuracy': np.float64(0.19672131147540983)}}, 'QUILTS': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_4': {'iou': 0.0, 'deviation': 0.25, 'inclusion': 1}, 'graph_type_3': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}, 'graph_type_1': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.80698152, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.89318182, 0.        ]), 'accuracy':

#### Semi-Informed User

In [12]:
test_semi_informed = label_graphs(test_graphs, predicates, conformance=0.75)
evaluation_semi_informed = compute_metrics(predicates, learned_predicates_semi_informed, graphs, test_graphs, semi_informed, test_semi_informed)
print(evaluation_semi_informed)

{'BIOFABRIC': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_2': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}, 'graph_type_1': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.38193018, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.55274889, 0.        ]), 'accuracy': np.float64(0.38193018480492813)}, 'generalize': {'precision': array([0.33606557, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.50306748, 0.        ]), 'accuracy': np.float64(0.3360655737704918)}}, 'MATRIX': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_2': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.95995893, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.97957046, 0.        ]), 'accuracy': np.float64(0.9599589322381931)}, 'generalize': {'precision': array([0.95491803, 0.        ]), 'recall': array([1., 0.]), 'f1': a

#### Unformed User

In [13]:
test_uninformed = label_graphs(test_graphs, predicates, conformance=0.5)
evaluation_uninformed = compute_metrics(predicates, learned_predicates_uninformed, graphs, test_graphs, uninformed, test_uninformed)
print(evaluation_uninformed)

{'MATRIX': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_2': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.92402464, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.96051227, 0.        ]), 'accuracy': np.float64(0.9240246406570842)}, 'generalize': {'precision': array([0.93442623, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.96610169, 0.        ]), 'accuracy': np.float64(0.9344262295081968)}}, 'BIOFABRIC': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 111.49999999991832, 'inclusion': 0}, 'graph_type_2': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}, 'graph_type_1': {'iou': 0.0, 'deviation': 0.75, 'inclusion': 0}}, 'describe': {'precision': array([0.54106776, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.70219853, 0.        ]), 'accuracy': np.float64(0.5410677618069816)}, 'generalize': {'precision': array([0.54918033, 0.        ]), 'recall': array([1., 0.]), 'f1': ar

### Visualizing Results

In [14]:
data = [evaluation_informed, evaluation_semi_informed, evaluation_uninformed]

#### Extract

In [15]:
evaluations = ['informed', 'semi-informed', 'uninformed']
visualizations = set(vis for d in data for vis in d.keys())

avg_deviation = {vis: {eval: 0 for eval in evaluations} for vis in visualizations}

for i, eval in enumerate(data):
    for vis in visualizations:
        deviations = []
        avg_deviation_round = []
        for entry in eval[vis]['exact'].values():
            avg_deviation_round.append(entry['deviation'])
        avg_deviation[vis][evaluations[i]] = np.mean(avg_deviation_round) if avg_deviation_round else 0

KeyError: 'SUNBURST'

In [None]:
# Prepare data for plotting
x_labels = ["Informed", "Semi-Informed", "Uninformed"]
plot_data = []

for i, vis in enumerate(visualizations):
    for j, var in enumerate(evaluations):
        plot_data.append({
            'variable': var,
            'deviation': avg_deviation[vis][var],
            'visualization': vis
        })

# Convert to DataFrame
df = pd.DataFrame(plot_data)

# Create the Altair chart
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('variable:N', axis=alt.Axis(title='Variables')),
    y=alt.Y('deviation:Q', axis=alt.Axis(title='Average Deviation')),
    color='visualization:N',
    column='visualization:N',
    tooltip=['variable:N', 'deviation:Q', 'visualization:N']
).properties(
    title='Average Deviations per Variable for Each Visualization',
    width=150,
    height=300
)

# Configure chart appearance
chart.configure_view(
    stroke='transparent'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_title(
    fontSize=16,
    anchor='middle'
)

# Display the chart
chart.show()

#### Describe

In [None]:
describe = {eval: {"prec": 0, "recall": 0, 'acc': 0} for eval in evaluations}

# Iterate over each eval in data
for i, eval in enumerate(data):
    # Temporary lists for the current eval
    temp_accuracy = []
    temp_precision = []
    temp_recall = []

    # Iterate over each visualization (vis) in eval
    for vis in eval:
        print(i, vis, eval[vis]['describe'])
        # Get the values for accuracy, precision, and recall
        metr = eval[vis]['describe']
        acc = metr['accuracy']
        prec = metr['precision'][0]
        rec = metr['recall'][0]

        # Append the values to the temporary lists
        temp_accuracy.append(acc)
        temp_precision.append(prec)
        temp_recall.append(rec)

    # Store the averages in the res dictionary
    describe[evaluations[i]]['acc'] = np.mean(temp_accuracy)
    describe[evaluations[i]]['prec'] = np.mean(temp_precision)
    describe[evaluations[i]]['recall'] = np.mean(temp_recall)

print(describe)

[np.float64(0.8), np.float64(0.875), np.float64(0.8750000000000001), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]


In [None]:
# Extract values and create lists
accuracy = [describe['informed']['acc'], describe['semi-informed']['acc'], describe['uninformed']['acc']]
precision = [describe['informed']['prec'], describe['semi-informed']['prec'], describe['uninformed']['prec']]
recall = [describe['informed']['recall'], describe['semi-informed']['recall'], describe['uninformed']['recall']]

# Combine lists into a single list for the DataFrame
result = accuracy + precision + recall

describe = pd.DataFrame({
    'Metric': ['Accuracy', 'Accuracy', 'Accuracy', 
               'Precision', 'Precision', 'Precision',
               'Recall', 'Recall', 'Recall'],
    'Evaluation': ['Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed'],
    'Value': result
})

# Create the Altair chart
chart = alt.Chart(describe).mark_bar().encode(
    x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Value:Q', title='Value'),
    color='Evaluation:N',
    column='Metric:N',
    tooltip=['Evaluation:N', 'Value:Q']
).properties(
    title='Metric Evaluation Comparison',
    width=100
).configure_facet(
    spacing=10
)

# Show the chart
chart.show()

### Generalize

In [None]:
generalize = {eval: {"prec": 0, "recall": 0, 'acc': 0} for eval in evaluations}

# Iterate over each eval in data
for i, eval in enumerate(data):
    # Temporary lists for the current eval
    temp_accuracy = []
    temp_precision = []
    temp_recall = []

    # Iterate over each visualization (vis) in eval
    for vis in eval:
        print(i, eval[vis]['generalize'])
        # Get the values for accuracy, precision, and recall
        acc = eval[vis]['generalize']['accuracy']
        prec = eval[vis]['generalize']['precision']
        rec = eval[vis]['generalize']['recall']

        # Check if the values are arrays and take the first element if they are
        if isinstance(acc, (list, np.ndarray)):
            acc = acc[0]
        if isinstance(prec, (list, np.ndarray)):
            prec = prec[0]
        if isinstance(rec, (list, np.ndarray)):
            rec = rec[0]

        # Append the values to the temporary lists
        temp_accuracy.append(acc)
        temp_precision.append(prec)
        temp_recall.append(rec)

    # Store the averages in the res dictionary
    generalize[evaluations[i]]['acc'] = sum(temp_accuracy) / len(temp_accuracy) if temp_accuracy else 0
    generalize[evaluations[i]]['prec'] = sum(temp_precision) / len(temp_precision) if temp_precision else 0
    generalize[evaluations[i]]['recall'] = sum(temp_recall) / len(temp_recall) if temp_recall else 0

print(generalize)

[np.float64(0.8), np.float64(0.875), np.float64(0.8749999999999999), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]


In [None]:
generalize = pd.DataFrame({
    'Metric': ['Accuracy', 'Accuracy', 'Accuracy', 
               'Precision', 'Precision', 'Precision',
               'Recall', 'Recall', 'Recall'],
    'Evaluation': ['Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed'],
    'Value': result
})

# Create the Altair chart
chart = alt.Chart(generalize).mark_bar().encode(
    x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Value:Q', title='Value'),
    color='Evaluation:N',
    column='Metric:N',
).properties(
    title='Metric Evaluation Comparison',
    width=100
).configure_facet(
    spacing=10
)

# Show the chart
chart.show()