# Simulation of User Preferences

In [1]:
import os
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from vigor import generate_graphs, nobre_predicates, predicates, compute_metrics, Predicate, VIGOR, label_graphs, learn_predicates

## Generate graphs

1. Generate graphs using the fast_gnp_random_graph function from networkx
2. Calculate statistics for each graph

In [2]:
file_path = '../data/generated_graphs_example.csv'
# all_graphs = generate_graphs(1000, 2, 200, file_path=file_path)
all_graphs = pd.read_csv(f'{file_path}')

In [3]:
graphs = all_graphs.sample(frac=.8, random_state=0)
test_graphs = all_graphs.loc[~all_graphs.index.isin(graphs.index)]

In [4]:
graphs.head()

Unnamed: 0,graph_type,is_directed_int,has_spatial_attributes,has_temporal_attributes,is_bipartite,n_components,avg_betweenness_centrality,avg_closeness_centrality,avg_eigenvector_centrality,avg_degree,...,eccentricity_avg,n_nodes,node_types,node_attributes,number_of_isolates,density,edge_types,edge_attributes,n_parallel_edges,n_self_loops
1144,4,0,1,0,1,6,0.003506,0.613409,0.073731,66.769231,...,2.0,182,2,12,0,0.368891,5,4,1,1
990,4,0,1,0,0,2,0.004779,0.60125,0.083575,47.007092,...,2.0,141,5,11,0,0.335765,3,12,5,5
478,2,0,0,1,0,8,0.227273,0.305556,0.288675,2.0,...,6.0,12,1,14,0,0.181818,1,9,3,4
161,4,0,0,1,0,4,0.018649,0.49649,0.125289,9.192982,...,3.035088,57,1,10,0,0.16416,2,14,5,3
1153,4,0,0,0,0,1,0.004753,0.560658,0.076448,35.856287,...,2.035928,167,3,5,0,0.216002,1,2,4,3


## Sample designers

We evaluate the ability of VIGOR to recover the rules that were used to represent simulated users. We design 3 versions of this simulated user: bob_informed, who follows the rules 100\% of the time; bob_semi_informed, who follows the rules 75\% of the time and chooses other visualizations randomly the other 25\% of the time; and bob_uninformed who follows the rules 50\% of the time and chooses other visualizations randomly the other 50\% of the time.

In [5]:
informed = label_graphs(graphs, predicates, conformance=1)
semi_informed = label_graphs(graphs, predicates, conformance=0.75)
uninformed = label_graphs(graphs, predicates, conformance=0.5)

In [6]:
informed

1144    NODELINK
990     NODELINK
478     NODELINK
161     NODELINK
1153    NODELINK
          ...   
414     NODELINK
662     NODELINK
1206    NODELINK
1187    NODELINK
1027    NODELINK
Length: 974, dtype: object

### Learning predicates from labeled data

In [7]:
graphs['graph_type'].value_counts()

graph_type
4    657
3    160
2     83
1     74
Name: count, dtype: int64

In [8]:
graph_types = graphs['graph_type'].unique()
graphs['graph_type_' + pd.Series(graph_types).astype(str)] = (graphs['graph_type'].values[None] == graph_types[:,None]).astype(int).T
graphs = graphs.drop('graph_type', axis=1)

In [9]:
nan_counts = graphs.isna().sum()
graphs = graphs.drop('assortativity', axis=1)

In [10]:
learned_predicates_informed = learn_predicates(graphs, informed, 1000)
learned_predicates_semi_informed = learn_predicates(graphs, semi_informed, 1000)
learned_predicates_uninformed = learn_predicates(graphs, uninformed, 1000)

Learning predicates for NODELINK
[   0] loss 18.78887367248535
[ 100] loss 17.143857955932617
[ 200] loss 15.38908863067627
[ 300] loss 13.700060844421387
[ 400] loss 12.21738338470459
[ 500] loss 10.741120338439941
[ 600] loss 9.250534057617188
[ 700] loss 7.718362808227539
[ 800] loss 6.077023029327393
[ 900] loss 4.117927074432373
[   0] loss 2.990095615386963
[ 100] loss 2.731027126312256
[ 200] loss 2.4917638301849365
[ 300] loss 2.253180980682373
[ 400] loss 2.0152010917663574
[ 500] loss 1.7781802415847778
[ 600] loss 1.5436054468154907
[ 700] loss 1.3124135732650757
[ 800] loss 1.0793907642364502
[ 900] loss 0.8415242433547974
Learning predicates for MATRIX
[   0] loss 0.8893399834632874
[ 100] loss 0.6991077661514282
[ 200] loss 0.6415713429450989
[ 300] loss 0.5845577716827393
[ 400] loss 0.5283137559890747
[ 500] loss 0.472806841135025
[ 600] loss 0.4182966351509094
[ 700] loss 0.36566731333732605
[ 800] loss 0.31597498059272766
[ 900] loss 0.27083635330200195
[   0] loss 22

### Comparing learned predicates to initial predicates

#### Informed User

In [11]:
test_informed = label_graphs(test_graphs, predicates, conformance=1)
evaluation_informed = compute_metrics(predicates, learned_predicates_informed, graphs, test_graphs, informed, test_informed)
print(evaluation_informed)

[0.14168378 0.        ] [1. 0.] [0.24820144 0.        ]
[0.16803279 0.        ] [1. 0.] [0.2877193 0.       ]
[0.9661191 0.       ] [1. 0.] [0.98276762 0.        ]
[0.95901639 0.        ] [1. 0.] [0.9790795 0.       ]
[0.96406571 0.        ] [1. 0.] [0.98170413 0.        ]
[0.96721311 0.        ] [1. 0.] [0.98333333 0.        ]
[0.94045175 0.        ] [1. 0.] [0.96931217 0.        ]
[0.93442623 0.        ] [1. 0.] [0.96610169 0.        ]
[0.99486653 0.        ] [1. 0.] [0.99742666 0.        ]
[0.98770492 0.        ] [1. 0.] [0.99381443 0.        ]
[0.99383984 0.        ] [1. 0.] [0.9969104 0.       ]
[0.98360656 0.        ] [1. 0.] [0.99173554 0.        ]
[0.99897331 0.        ] [1. 0.] [0.99948639 0.        ]
[1.] [1.] [1.]
{'NODELINK': {'exact': {'eccentricity_avg': {'iou': 0.0, 'deviation': 2.5, 'inclusion': 1}}, 'describe': {'precision': array([0.14168378, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.24820144, 0.        ]), 'accuracy': np.float64(0.14168377823408623)}, '

#### Semi-Informed User

In [12]:
test_semi_informed = label_graphs(test_graphs, predicates, conformance=0.75)
evaluation_semi_informed = compute_metrics(predicates, learned_predicates_semi_informed, graphs, test_graphs, semi_informed, test_semi_informed)
print(evaluation_semi_informed)

[0.30800821 0.        ] [1. 0.] [0.47095761 0.        ]
[0.3647541 0.       ] [1. 0.] [0.53453453 0.        ]
[0.91889117 0.        ] [1. 0.] [0.95773141 0.        ]
[0.88114754 0.        ] [1. 0.] [0.93681917 0.        ]
[0.94558522 0.        ] [1. 0.] [0.97203166 0.        ]
[0.93852459 0.        ] [1. 0.] [0.96828753 0.        ]
[0.93737166 0.        ] [1. 0.] [0.96767356 0.        ]
[0.95081967 0.        ] [1. 0.] [0.97478992 0.        ]
[0.96714579 0.        ] [1. 0.] [0.98329854 0.        ]
[0.95081967 0.        ] [1. 0.] [0.97478992 0.        ]
[0.96098563 0.        ] [1. 0.] [0.98010471 0.        ]
[0.97131148 0.        ] [1. 0.] [0.98544699 0.        ]
[0.96201232 0.        ] [1. 0.] [0.98063841 0.        ]
[0.94262295 0.        ] [1. 0.] [0.97046414 0.        ]
{'NODELINK': {'exact': {'eccentricity_avg': {'iou': 0.0, 'deviation': 2.5, 'inclusion': 1}}, 'describe': {'precision': array([0.30800821, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.47095761, 0.        ]), 

#### Unformed User

In [13]:
test_uninformed = label_graphs(test_graphs, predicates, conformance=0.5)
evaluation_uninformed = compute_metrics(predicates, learned_predicates_uninformed, graphs, test_graphs, uninformed, test_uninformed)
print(evaluation_uninformed)

[0.91786448 0.        ] [1. 0.] [0.95717345 0.        ]
[0.91393443 0.        ] [1. 0.] [0.95503212 0.        ]
[0.92607803 0.        ] [1. 0.] [0.96162047 0.        ]
[0.90163934 0.        ] [1. 0.] [0.94827586 0.        ]
[0.90246407 0.        ] [1. 0.] [0.94873179 0.        ]
[0.93442623 0.        ] [1. 0.] [0.96610169 0.        ]
[0.51437372 0.        ] [1. 0.] [0.67932203 0.        ]
[0.49590164 0.        ] [1. 0.] [0.6630137 0.       ]
[0.89425051 0.        ] [1. 0.] [0.94417344 0.        ]
[0.90163934 0.        ] [1. 0.] [0.94827586 0.        ]
[0.92299795 0.        ] [1. 0.] [0.95995729 0.        ]
[0.92622951 0.        ] [1. 0.] [0.96170213 0.        ]
[0.92197125 0.        ] [1. 0.] [0.95940171 0.        ]
[0.92622951 0.        ] [1. 0.] [0.96170213 0.        ]
{'CHORD_DIAGRAM': {'exact': {'n_nodes': {'iou': 0.0, 'deviation': 158.49999999991832, 'inclusion': 0}}, 'describe': {'precision': array([0.91786448, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.95717345, 0. 

### Visualizing Results

In [14]:
data = [evaluation_informed, evaluation_semi_informed, evaluation_uninformed]

In [28]:
data

[{'NODELINK': {'exact': {'eccentricity_avg': {'iou': 0.0,
     'deviation': 2.5,
     'inclusion': 1}},
   'describe': {'precision': array([0.14168378, 0.        ]),
    'recall': array([1., 0.]),
    'f1': array([0.24820144, 0.        ]),
    'accuracy': np.float64(0.14168377823408623)},
   'generalize': {'precision': array([0.16803279, 0.        ]),
    'recall': array([1., 0.]),
    'f1': array([0.2877193, 0.       ]),
    'accuracy': np.float64(0.1680327868852459)}},
  'MATRIX': {'exact': {'avg_eigenvector_centrality': {'iou': 0.0,
     'deviation': 0.42134710563606315,
     'inclusion': 0},
    'modularity': {'iou': 0.0,
     'deviation': 0.5000322576569045,
     'inclusion': 0},
    'avg_betweenness_centrality': {'iou': 0.0,
     'deviation': 0.34585758421848023,
     'inclusion': 0},
    'density': {'iou': 0.03680648124853544,
     'deviation': 0.433437083438159,
     'inclusion': 1}},
   'describe': {'precision': array([0.9661191, 0.       ]),
    'recall': array([1., 0.]),
   

#### Extract

In [46]:
evaluations = ['informed', 'semi-informed', 'uninformed']
visualizations = set(vis for d in data for vis in d.keys())

avg_deviation = {vis: {eval: 0 for eval in evaluations} for vis in visualizations}

for i, eval in enumerate(data):
    for vis in visualizations:
        deviations = []
        avg_deviation_round = []
        for entry in eval[vis]['exact'].values():
            avg_deviation_round.append(entry['deviation'])
        avg_deviation[vis][evaluations[i]] = np.mean(avg_deviation_round) if avg_deviation_round else 0

In [44]:
# Prepare data for plotting
x_labels = ["Informed", "Semi-Informed", "Uninformed"]
plot_data = []

for i, vis in enumerate(visualizations):
    for j, var in enumerate(evaluations):
        plot_data.append({
            'variable': var,
            'deviation': avg_deviation[vis][var],
            'visualization': vis
        })

# Convert to DataFrame
df = pd.DataFrame(plot_data)

# Create the Altair chart
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('variable:N', axis=alt.Axis(title='Variables')),
    y=alt.Y('deviation:Q', axis=alt.Axis(title='Average Deviation')),
    color='visualization:N',
    column='visualization:N',
    tooltip=['variable:N', 'deviation:Q', 'visualization:N']
).properties(
    title='Average Deviations per Variable for Each Visualization',
    width=150,
    height=300
)

# Configure chart appearance
chart.configure_view(
    stroke='transparent'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_title(
    fontSize=16,
    anchor='middle'
)

# Display the chart
chart.show()

#### Describe

In [56]:
data

[{'NODELINK': {'exact': {'eccentricity_avg': {'iou': 0.0,
     'deviation': 2.5,
     'inclusion': 1}},
   'describe': {'precision': array([0.14168378, 0.        ]),
    'recall': array([1., 0.]),
    'f1': array([0.24820144, 0.        ]),
    'accuracy': np.float64(0.14168377823408623)},
   'generalize': {'precision': array([0.16803279, 0.        ]),
    'recall': array([1., 0.]),
    'f1': array([0.2877193, 0.       ]),
    'accuracy': np.float64(0.1680327868852459)}},
  'MATRIX': {'exact': {'avg_eigenvector_centrality': {'iou': 0.0,
     'deviation': 0.42134710563606315,
     'inclusion': 0},
    'modularity': {'iou': 0.0,
     'deviation': 0.5000322576569045,
     'inclusion': 0},
    'avg_betweenness_centrality': {'iou': 0.0,
     'deviation': 0.34585758421848023,
     'inclusion': 0},
    'density': {'iou': 0.03680648124853544,
     'deviation': 0.433437083438159,
     'inclusion': 1}},
   'describe': {'precision': array([0.9661191, 0.       ]),
    'recall': array([1., 0.]),
   

In [91]:
describe = {eval: {"prec": 0, "recall": 0, 'acc': 0} for eval in evaluations}

# Iterate over each eval in data
for i, eval in enumerate(data):
    # Temporary lists for the current eval
    temp_accuracy = []
    temp_precision = []
    temp_recall = []

    # Iterate over each visualization (vis) in eval
    for vis in eval:
        print(i, vis, eval[vis]['describe'])
        # Get the values for accuracy, precision, and recall
        metr = eval[vis]['describe']
        acc = metr['accuracy']
        prec = metr['precision'][0]
        rec = metr['recall'][0]

        # Append the values to the temporary lists
        temp_accuracy.append(acc)
        temp_precision.append(prec)
        temp_recall.append(rec)

    # Store the averages in the res dictionary
    describe[evaluations[i]]['acc'] = np.mean(temp_accuracy)
    describe[evaluations[i]]['prec'] = np.mean(temp_precision)
    describe[evaluations[i]]['recall'] = np.mean(temp_recall)

print(describe)

0 NODELINK {'precision': array([0.14168378, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.24820144, 0.        ]), 'accuracy': np.float64(0.14168377823408623)}
0 MATRIX {'precision': array([0.9661191, 0.       ]), 'recall': array([1., 0.]), 'f1': array([0.98276762, 0.        ]), 'accuracy': np.float64(0.9661190965092402)}
0 NODETRIX {'precision': array([0.96406571, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.98170413, 0.        ]), 'accuracy': np.float64(0.9640657084188912)}
0 PAOHVIS {'precision': array([0.94045175, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.96931217, 0.        ]), 'accuracy': np.float64(0.9404517453798767)}
0 TREEMAP {'precision': array([0.99486653, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.99742666, 0.        ]), 'accuracy': np.float64(0.9948665297741273)}
0 NODELINK_MAP {'precision': array([0.99383984, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.9969104, 0.       ]), 'accuracy': np.float64(0.99383983572895

In [92]:
# Extract values and create lists
accuracy = [describe['informed']['acc'], describe['semi-informed']['acc'], describe['uninformed']['acc']]
precision = [describe['informed']['prec'], describe['semi-informed']['prec'], describe['uninformed']['prec']]
recall = [describe['informed']['recall'], describe['semi-informed']['recall'], describe['uninformed']['recall']]

# Combine lists into a single list for the DataFrame
result = accuracy + precision + recall

describe = pd.DataFrame({
    'Metric': ['Accuracy', 'Accuracy', 'Accuracy', 
               'Precision', 'Precision', 'Precision',
               'Recall', 'Recall', 'Recall'],
    'Evaluation': ['Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed'],
    'Value': result
})

# Create the Altair chart
chart = alt.Chart(describe).mark_bar().encode(
    x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Value:Q', title='Value'),
    color='Evaluation:N',
    column='Metric:N',
    tooltip=['Evaluation:N', 'Value:Q']
).properties(
    title='Metric Evaluation Comparison',
    width=100
).configure_facet(
    spacing=10
)

# Show the chart
chart.show()

### Generalize

In [89]:
generalize = {eval: {"prec": 0, "recall": 0, 'acc': 0} for eval in evaluations}

# Iterate over each eval in data
for i, eval in enumerate(data):
    # Temporary lists for the current eval
    temp_accuracy = []
    temp_precision = []
    temp_recall = []

    # Iterate over each visualization (vis) in eval
    for vis in eval:
        print(i, eval[vis]['generalize'])
        # Get the values for accuracy, precision, and recall
        acc = eval[vis]['generalize']['accuracy']
        prec = eval[vis]['generalize']['precision']
        rec = eval[vis]['generalize']['recall']

        # Check if the values are arrays and take the first element if they are
        if isinstance(acc, (list, np.ndarray)):
            acc = acc[0]
        if isinstance(prec, (list, np.ndarray)):
            prec = prec[0]
        if isinstance(rec, (list, np.ndarray)):
            rec = rec[0]

        # Append the values to the temporary lists
        temp_accuracy.append(acc)
        temp_precision.append(prec)
        temp_recall.append(rec)

    # Store the averages in the res dictionary
    generalize[evaluations[i]]['acc'] = sum(temp_accuracy) / len(temp_accuracy) if temp_accuracy else 0
    generalize[evaluations[i]]['prec'] = sum(temp_precision) / len(temp_precision) if temp_precision else 0
    generalize[evaluations[i]]['recall'] = sum(temp_recall) / len(temp_recall) if temp_recall else 0

print(generalize)

0 {'precision': array([0.16803279, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.2877193, 0.       ]), 'accuracy': np.float64(0.1680327868852459)}
0 {'precision': array([0.95901639, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.9790795, 0.       ]), 'accuracy': np.float64(0.9590163934426229)}
0 {'precision': array([0.96721311, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.98333333, 0.        ]), 'accuracy': np.float64(0.9672131147540983)}
0 {'precision': array([0.93442623, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.96610169, 0.        ]), 'accuracy': np.float64(0.9344262295081968)}
0 {'precision': array([0.98770492, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.99381443, 0.        ]), 'accuracy': np.float64(0.9877049180327869)}
0 {'precision': array([0.98360656, 0.        ]), 'recall': array([1., 0.]), 'f1': array([0.99173554, 0.        ]), 'accuracy': np.float64(0.9836065573770492)}
0 {'precision': array([1.]), 'recall': array([1.])

In [90]:
# Extract values and create lists
accuracy = [generalize['informed']['acc'], generalize['semi-informed']['acc'], generalize['uninformed']['acc']]
precision = [generalize['informed']['prec'], generalize['semi-informed']['prec'], generalize['uninformed']['prec']]
recall = [generalize['informed']['recall'], generalize['semi-informed']['recall'], generalize['uninformed']['recall']]

# Combine lists into a single list for the DataFrame
result = accuracy + precision + recall

generalize = pd.DataFrame({
    'Metric': ['Accuracy', 'Accuracy', 'Accuracy', 
               'Precision', 'Precision', 'Precision',
               'Recall', 'Recall', 'Recall'],
    'Evaluation': ['Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed',
                   'Informed', 'Semi-Informed', 'Uninformed'],
    'Value': result
})

# Create the Altair chart
chart = alt.Chart(generalize).mark_bar().encode(
    x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Value:Q', title='Value'),
    color='Evaluation:N',
    column='Metric:N',
).properties(
    title='Metric Evaluation Comparison',
    width=100
).configure_facet(
    spacing=10
)

# Show the chart
chart.show()