# Simulation of User Preferences

In [1]:
import os
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from vigor import generate_graphs, nobre_predicates, predicates, compute_metrics, Predicate, VIGOR, label_graphs, learn_predicates

## Generate graphs

1. Generate graphs using the fast_gnp_random_graph function from networkx
2. Calculate statistics for each graph

In [2]:
# CSV holding the example graphs and their per-graph statistics
# (presumably written by generate_graphs -- confirm).
file_path = '../data/generated_graphs_example.csv'

# The data set is read from disk. To rebuild it instead, run:
#   all_graphs = generate_graphs(1000, 2, 200, file_path=file_path)
all_graphs = pd.read_csv(file_path)

In [3]:
# Reproducible 80/20 split: sample 80% of the rows with a fixed seed and
# keep every row that was not sampled as the held-out test set.
graphs = all_graphs.sample(frac=0.8, random_state=0)
test_graphs = all_graphs.drop(index=graphs.index)

In [4]:
# Preview the first rows of the 80% sample, with all statistic columns.
graphs.head()

Unnamed: 0,graph_type,is_directed_int,has_spatial_attributes,has_temporal_attributes,is_bipartite,n_components,avg_betweenness_centrality,avg_closeness_centrality,avg_eigenvector_centrality,avg_degree,...,2.129032258064516,62,3.1,10.1,0.2,0.28239026969857217,3.2,1.2,1.3,4
1144,4,0,1,0,1,6,0.003506,0.613409,0.073731,66.769231,...,,,,,,,,,,
990,4,0,1,0,0,2,0.004779,0.60125,0.083575,47.007092,...,,,,,,,,,,
478,2,0,0,1,0,8,0.227273,0.305556,0.288675,2.0,...,,,,,,,,,,
161,4,0,0,1,0,4,0.018649,0.49649,0.125289,9.192982,...,,,,,,,,,,
1153,4,0,0,0,0,1,0.004753,0.560658,0.076448,35.856287,...,,,,,,,,,,


In [5]:
# Keep only the three features used in the rest of the notebook.
graphs = graphs.loc[:, ['graph_type', 'n_nodes', 'density']]

In [6]:
# Show the reduced feature frame (pandas truncates the middle rows when rendering).
graphs

Unnamed: 0,graph_type,n_nodes,density
1144,4,182,0.368891
990,4,141,0.335765
478,2,12,0.181818
161,4,57,0.164160
1153,4,167,0.216002
...,...,...,...
414,4,11,0.272727
662,4,98,0.364191
1206,3,161,0.066149
1187,4,109,0.494903


## Sample designers

We evaluate the ability of VIGOR to recover the rules that were used to represent simulated users. We design 3 versions of this simulated user: bob_informed, who follows the rules 100\% of the time; bob_semi_informed, who follows the rules 75\% of the time and chooses other visualizations randomly the other 25\% of the time; and bob_uninformed who follows the rules 50\% of the time and chooses other visualizations randomly the other 50\% of the time.

In [7]:
# Label the sampled graphs once per simulated user. The conformance values
# 1, 0.75 and 0.5 match the informed, semi-informed and uninformed users
# described above; calls run in that order.
informed, semi_informed, uninformed = [
    label_graphs(graphs, predicates, conformance=level)
    for level in (1, 0.75, 0.5)
]

In [8]:
# Labels produced with conformance=1 (one visualization name per sampled graph).
informed

1144      MATRIX
990       MATRIX
478       MATRIX
161      PAOHVIS
1153      MATRIX
          ...   
414       MATRIX
662       MATRIX
1206     PAOHVIS
1187      MATRIX
1027    NODELINK
Length: 974, dtype: object

### Learning predicates from labeled data

In [9]:
# Number of sampled graphs per graph_type value.
graphs['graph_type'].value_counts()

graph_type
4    657
3    160
2     83
1     74
Name: count, dtype: int64

In [10]:
# One-hot encode graph_type: one 0/1 integer column per distinct value, named
# graph_type_<value>, appended in order of first appearance. The original
# graph_type column is removed.
# NOTE(review): test_graphs is not given this encoding in the visible cells --
# confirm that compute_metrics copes with the differing columns.
graph_types = graphs['graph_type'].unique()
indicators = pd.DataFrame(
    {
        f'graph_type_{kind}': (graphs['graph_type'] == kind).astype(int)
        for kind in graph_types
    },
    index=graphs.index,
)
graphs = pd.concat([graphs.drop(columns='graph_type'), indicators], axis=1)

In [11]:
# Learn predicates from each simulated user's labels. The third argument
# (1000) is presumably the number of training iterations -- the log below
# reports the loss every 100 steps up to 900.
(
    learned_predicates_informed,
    learned_predicates_semi_informed,
    learned_predicates_uninformed,
) = [
    learn_predicates(graphs, labels, 1000)
    for labels in (informed, semi_informed, uninformed)
]

Learning predicates for MATRIX
[   0] loss 5.365243434906006
[ 100] loss 4.062296390533447
[ 200] loss 3.0185632705688477
[ 300] loss 2.039332151412964
[ 400] loss 1.592203140258789
[ 500] loss 1.4379324913024902
[ 600] loss 1.2948869466781616
[ 700] loss 1.1524242162704468
[ 800] loss 1.0067732334136963
[ 900] loss 0.8533504009246826
[   0] loss 6.84258508682251
[ 100] loss 5.968863487243652
[ 200] loss 4.819820880889893
[ 300] loss 4.409422397613525
[ 400] loss 4.030518531799316
[ 500] loss 3.6510426998138428
[ 600] loss 3.256913423538208
[ 700] loss 2.8373770713806152
[ 800] loss 2.3824362754821777
[ 900] loss 1.8555963039398193
Learning predicates for PAOHVIS
[   0] loss 3.276097297668457
[ 100] loss 2.0256848335266113
[ 200] loss 1.7025209665298462
[ 300] loss 1.5366017818450928
[ 400] loss 1.4204962253570557
[ 500] loss 1.3110569715499878
[ 600] loss 1.195203423500061
[ 700] loss 1.0640349388122559
[ 800] loss 0.9185673594474792
[ 900] loss 0.7587756514549255
[   0] loss 9.380393

### Comparing learned predicates to initial predicates

#### Informed User

In [12]:
# Label the held-out graphs with the fully conformant user, then compare the
# learned predicates against the original ones on both splits.
test_informed = label_graphs(test_graphs, predicates, conformance=1)
evaluation_informed = compute_metrics(
    predicates,
    learned_predicates_informed,
    graphs,
    test_graphs,
    informed,
    test_informed,
)
print(evaluation_informed)

MATRIX 1144      MATRIX
990       MATRIX
478       MATRIX
161      PAOHVIS
1153      MATRIX
          ...   
414       MATRIX
662       MATRIX
1206     PAOHVIS
1187      MATRIX
1027    NODELINK
Length: 974, dtype: object
MATRIX 11           NODELINK
23             MATRIX
24      CHORD_DIAGRAM
25             MATRIX
28           NODETRIX
            ...      
1199         NODELINK
1202         NODELINK
1207          PAOHVIS
1210         NODELINK
1214           MATRIX
Length: 244, dtype: object
PAOHVIS 1144      MATRIX
990       MATRIX
478       MATRIX
161      PAOHVIS
1153      MATRIX
          ...   
414       MATRIX
662       MATRIX
1206     PAOHVIS
1187      MATRIX
1027    NODELINK
Length: 974, dtype: object
PAOHVIS 11           NODELINK
23             MATRIX
24      CHORD_DIAGRAM
25             MATRIX
28           NODETRIX
            ...      
1199         NODELINK
1202         NODELINK
1207          PAOHVIS
1210         NODELINK
1214           MATRIX
Length: 244, dtype: object
NODE

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Semi-Informed User

In [13]:
# Label the held-out graphs with the 75%-conformant user, then compare the
# learned predicates against the original ones on both splits.
test_semi_informed = label_graphs(test_graphs, predicates, conformance=0.75)
evaluation_semi_informed = compute_metrics(
    predicates,
    learned_predicates_semi_informed,
    graphs,
    test_graphs,
    semi_informed,
    test_semi_informed,
)
print(evaluation_semi_informed)

MATRIX 1144           MATRIX
990            MATRIX
478            MATRIX
161     CHORD_DIAGRAM
1153           MATRIX
            ...      
414            MATRIX
662            MATRIX
1206          TREEMAP
1187           MATRIX
1027         NODELINK
Length: 974, dtype: object
MATRIX 11          NODELINK
23      NODELINK_MAP
24          NODETRIX
25            MATRIX
28          NODETRIX
            ...     
1199    NODELINK_MAP
1202        NODELINK
1207         PAOHVIS
1210        NODELINK
1214          MATRIX
Length: 244, dtype: object
CHORD_DIAGRAM 1144           MATRIX
990            MATRIX
478            MATRIX
161     CHORD_DIAGRAM
1153           MATRIX
            ...      
414            MATRIX
662            MATRIX
1206          TREEMAP
1187           MATRIX
1027         NODELINK
Length: 974, dtype: object
CHORD_DIAGRAM 11          NODELINK
23      NODELINK_MAP
24          NODETRIX
25            MATRIX
28          NODETRIX
            ...     
1199    NODELINK_MAP
1202        NOD

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Uninformed User

In [14]:
# Label the held-out graphs with the 50%-conformant user, then compare the
# learned predicates against the original ones on both splits.
test_uninformed = label_graphs(test_graphs, predicates, conformance=0.5)
evaluation_uninformed = compute_metrics(
    predicates,
    learned_predicates_uninformed,
    graphs,
    test_graphs,
    uninformed,
    test_uninformed,
)
print(evaluation_uninformed)

MATRIX 1144           MATRIX
990            MATRIX
478           TREEMAP
161           TREEMAP
1153           MATRIX
            ...      
414     CHORD_DIAGRAM
662            MATRIX
1206          PAOHVIS
1187           MATRIX
1027           MATRIX
Length: 974, dtype: object
MATRIX 11           NODELINK
23             MATRIX
24      CHORD_DIAGRAM
25       NODELINK_MAP
28             MATRIX
            ...      
1199           MATRIX
1202         NODELINK
1207         NODETRIX
1210         NODELINK
1214           MATRIX
Length: 244, dtype: object
TREEMAP 1144           MATRIX
990            MATRIX
478           TREEMAP
161           TREEMAP
1153           MATRIX
            ...      
414     CHORD_DIAGRAM
662            MATRIX
1206          PAOHVIS
1187           MATRIX
1027           MATRIX
Length: 974, dtype: object
TREEMAP 11           NODELINK
23             MATRIX
24      CHORD_DIAGRAM
25       NODELINK_MAP
28             MATRIX
            ...      
1199           MATRIX
1202     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Visualizing Results

In [15]:
# Evaluations in a fixed order: informed, semi-informed, uninformed.
# Later cells pair values with user-type labels by position, so keep this order.
data = [evaluation_informed, evaluation_semi_informed, evaluation_uninformed]

#### Extract

In [16]:
# Every visualization name that appears in at least one evaluation.
visualizations = {vis for evaluation in data for vis in evaluation}

# For each visualization, one averaged deviation per evaluation (same order
# as `data`). An evaluation with no 'deviation' entries contributes 0.
# NOTE(review): in the saved output every value is the plain integer 0, i.e.
# the fallback was taken each time and no 'deviation' key was found at this
# nesting level -- confirm the structure returned by compute_metrics.
avg_deviation = {}
for vis in visualizations:
    per_evaluation = []
    for evaluation in data:
        found = [
            entry['deviation']
            for entry in evaluation.get(vis, {}).values()
            if 'deviation' in entry
        ]
        per_evaluation.append(np.mean(found) if found else 0)
    avg_deviation[vis] = per_evaluation

avg_deviation

{'NODETRIX': [0, 0, 0],
 'TREEMAP': [0, 0, 0],
 'NODELINK': [0, 0, 0],
 'MATRIX': [0, 0, 0],
 'PAOHVIS': [0, 0, 0],
 'CHORD_DIAGRAM': [0, 0, 0]}

In [17]:
# Prepare data for plotting: one record per (visualization, simulated user).
# avg_deviation[vis] is ordered like x_labels (informed, semi-informed,
# uninformed), so the position in x_labels selects the matching value.
x_labels = ["Informed", "Semi-Informed", "Uninformed"]
plot_data = [
    {
        'variable': label,
        'deviation': avg_deviation[vis][position],
        'visualization': vis,
    }
    for vis in visualizations
    for position, label in enumerate(x_labels)
]

# Convert to DataFrame
df = pd.DataFrame(plot_data)

# Create the Altair chart: one bar per user type, one facet column per visualization
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('variable:N', axis=alt.Axis(title='Variables')),
    y=alt.Y('deviation:Q', axis=alt.Axis(title='Average Deviation')),
    color='visualization:N',
    column='visualization:N',
    tooltip=['variable:N', 'deviation:Q', 'visualization:N']
).properties(
    title='Average Deviations per Variable for Each Visualization',
    width=150,
    height=300
)

# Configure chart appearance. The configure_* methods return a new chart
# rather than modifying this one, so the result must be assigned back --
# otherwise the styling is silently dropped.
chart = chart.configure_view(
    stroke='transparent'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_title(
    fontSize=16,
    anchor='middle'
)

# Display the chart
chart.show()

#### Describe

In [18]:
# Average the 'describe' metrics over all visualizations, once per evaluation.
# The loop variable is named `evaluation` rather than `eval` so the builtin
# eval() is not shadowed for the rest of the kernel session.
metric_keys = ('accuracy', 'precision', 'recall')
averages = {key: [] for key in metric_keys}

for evaluation in data:
    for key in metric_keys:
        scores = [per_vis['describe'][key] for per_vis in evaluation.values()]
        # An evaluation without any visualization contributes 0.
        averages[key].append(sum(scores) / len(scores) if scores else 0)

# One list per metric, ordered like `data` (informed, semi-informed, uninformed).
accuracy = averages['accuracy']
precision = averages['precision']
recall = averages['recall']

# Flat list of 9 values: 3 accuracies, then 3 precisions, then 3 recalls.
result = accuracy + precision + recall

print(result)

[np.float64(0.75), np.float64(0.8333333333333334), np.float64(0.8333333333333334), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]


In [19]:
# Long-format table for plotting: `result` holds 3 accuracies, 3 precisions
# and 3 recalls, each ordered informed / semi-informed / uninformed.
# NOTE: `result` is reassigned by the Generalize metrics cell further down,
# so run this cell directly after the Describe metrics cell.
user_types = ['Informed', 'Semi-Informed', 'Uninformed']
metric_labels = ['Accuracy', 'Precision', 'Recall']
describe = pd.DataFrame({
    'Metric': [label for label in metric_labels for _ in user_types],
    'Evaluation': user_types * len(metric_labels),
    'Value': result,
})

# Create the Altair chart: one facet column per metric, one bar per user type
chart = (
    alt.Chart(describe)
    .mark_bar()
    .encode(
        x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Value:Q', title='Value'),
        color='Evaluation:N',
        column='Metric:N',
        tooltip=['Evaluation:N', 'Value:Q'],
    )
    .properties(title='Metric Evaluation Comparison', width=100)
    .configure_facet(spacing=10)
)

# Show the chart
chart.show()

#### Generalize

In [20]:
# Average the 'generalize' metrics over all visualizations, once per evaluation.
# The loop variable is named `evaluation` rather than `eval` so the builtin
# eval() is not shadowed for the rest of the kernel session.
metric_keys = ('accuracy', 'precision', 'recall')
averages = {key: [] for key in metric_keys}

for evaluation in data:
    for key in metric_keys:
        scores = [per_vis['generalize'][key] for per_vis in evaluation.values()]
        # An evaluation without any visualization contributes 0.
        averages[key].append(sum(scores) / len(scores) if scores else 0)

# One list per metric, ordered like `data` (informed, semi-informed, uninformed).
accuracy = averages['accuracy']
precision = averages['precision']
recall = averages['recall']

# Concatenate the result into a single list
result = accuracy + precision + recall

print(result)  # This will print the concatenated list with 9 values

[np.float64(0.7879098360655739), np.float64(0.8340163934426229), np.float64(0.7698087431693988), np.float64(0.13194444444444445), np.float64(0.0787037037037037), np.float64(0.08796296296296297), np.float64(0.057926829268292686), np.float64(0.04047619047619048), np.float64(0.09757295975501062)]


In [21]:
# Long-format table for plotting: `result` holds 3 accuracies, 3 precisions
# and 3 recalls, each ordered informed / semi-informed / uninformed.
# NOTE: `result` must hold the Generalize averages here; the Describe metrics
# cell assigns the same name, so run this cell after the Generalize metrics cell.
user_types = ['Informed', 'Semi-Informed', 'Uninformed']
metric_labels = ['Accuracy', 'Precision', 'Recall']
generalize = pd.DataFrame({
    'Metric': [label for label in metric_labels for _ in user_types],
    'Evaluation': user_types * len(metric_labels),
    'Value': result,
})

# Create the Altair chart: one facet column per metric, one bar per user type
chart = (
    alt.Chart(generalize)
    .mark_bar()
    .encode(
        x=alt.X('Evaluation:N', title='Evaluation', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('Value:Q', title='Value'),
        color='Evaluation:N',
        column='Metric:N',
        tooltip=['Evaluation:N', 'Value:Q'],
    )
    .properties(title='Metric Evaluation Comparison', width=100)
    .configure_facet(spacing=10)
)

# Show the chart
chart.show()