# Evaluation with sample users

In the evaluation, we model three types of users:
- Uninformed user (preferences are the opposite of the Nobre et al. scores)
- 50-50 user (aligned with the Nobre et al. scores 50% of the time)
- Informed user (fully aligned with the Nobre et al. scores)

Starting from the initial set of system predicates, this notebook evaluates how the system responds to feedback from each type of user.

In [1]:
import json
import altair as alt
import pandas as pd
from vigor import VIGOR, Visualization, predicates, Predicate
from vigor.visualization_types import VisualizationType
from tqdm import tqdm
import numpy as np


In [12]:
def calculate_correlations(file_path, label, max_interactions=10, num_recommendations=8):
    """Replay recorded interactions against a fresh VIGOR instance and collect metrics.

    A new ``VIGOR`` is populated from the ``predicates`` imported from ``vigor``.
    For each interaction, recommendations are requested for the interaction's
    statistics. Every recommended visualization that appears in the user's
    ``sorted_visualizations`` is updated with feedback ``1 / (position + 1)``,
    and an MAE value is recorded.

    Args:
        file_path: Path to a JSON file containing a sequence of interactions.
            Each interaction must provide the keys ``'statistics'`` and
            ``'sorted_visualizations'``.
        label: Value stored in the ``'label'`` field of every returned record.
        max_interactions: Number of leading interactions to process; ``None``
            processes all of them. The default (10) matches the previously
            hard-coded slice.
        num_recommendations: Second argument passed to ``vigor.recommend``
            (presumably the number of recommendations to return -- TODO confirm).
            The default (8) matches the previously hard-coded value.

    Returns:
        A list of dicts, one per processed interaction, with the keys
        ``'interaction_index'``, ``'label'``, ``'feedback'`` (feedback given to
        the top-ranked recommendation, or NaN if it had none) and ``'mae'``.
    """
    # Initialize VIGOR
    vigor = VIGOR()
    visualization_preds = {}

    # Load in Nobre et al. predicates, grouping them by visualization
    for vis, score, stat, min_val, max_val in predicates:
        if vis not in visualization_preds:
            visualization_preds[vis] = Visualization(vis)
        visualization_preds[vis].add_predicate(Predicate(stat, min_val, max_val, score))

    # Add visualizations to VIGOR
    for vis in visualization_preds.values():
        vigor.add_visualization(vis)

    # Load in data
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Slicing with None keeps every interaction
    interactions = data[:max_interactions]

    # Enum definition order, computed once instead of once per recommendation
    visualization_type_order = list(VisualizationType)

    correlation_data = []

    # Process interactions and update VIGOR with feedback
    for interaction_index, interaction in enumerate(tqdm(interactions, desc="Processing interactions")):
        statistics = interaction['statistics']
        sorted_visualizations = interaction['sorted_visualizations']
        recommendations_with_scores = vigor.recommend(statistics, num_recommendations)

        # NaN (not a list) so the 'feedback' column stays numeric when the
        # top-ranked recommendation received no feedback.
        top_rank_feedback = np.nan

        for rank, (recommendation, _score) in enumerate(recommendations_with_scores):
            recommendation_str = recommendation.name.upper()

            # The user did not rank this visualization, so there is no feedback
            # to apply. Previously this path hit an unbound `feedback` or reused
            # the value computed for the preceding recommendation.
            if recommendation_str not in sorted_visualizations:
                continue

            # Calculate feedback based on position in sorted visualizations
            position = sorted_visualizations.index(recommendation_str)
            feedback = 1 / (position + 1)

            for vis in vigor.visualizations:
                if vis.visualization_type == recommendation:
                    vis.update(feedback, statistics)
                    break

            if rank == 0:
                top_rank_feedback = feedback

        # Rank correlation metrics
        min_len = min(len(recommendations_with_scores), len(sorted_visualizations))
        recommendations_truncated = recommendations_with_scores[:min_len]
        sorted_visualizations_truncated = sorted_visualizations[:min_len]

        # NOTE(review): this compares each recommendation's position in the
        # VisualizationType enum with list positions in sorted_visualizations,
        # not the recommendation's rank with the user's rank for the same
        # visualization. Left unchanged so reported numbers stay comparable --
        # confirm this is the intended metric.
        recommendations_indices = [visualization_type_order.index(rec[0]) for rec in recommendations_truncated]
        sorted_visualizations_indices = [sorted_visualizations.index(visualization) for visualization in sorted_visualizations_truncated]

        # Mean Absolute Error (MAE) calculation; NaN when there is nothing to compare
        absolute_errors = [
            abs(rec_idx - sorted_idx)
            for rec_idx, sorted_idx in zip(recommendations_indices, sorted_visualizations_indices)
        ]
        mae = np.mean(absolute_errors) if absolute_errors else np.nan

        # Store results
        correlation_data.append({
            'interaction_index': interaction_index,
            'label': label,
            'feedback': top_rank_feedback,
            'mae': mae,
        })

    return correlation_data

In [13]:
# Run the evaluation once per simulated user type; each run replays that
# user's JSON file and tags its records with the given label.
correlation_data_1, correlation_data_2, correlation_data_3 = (
    calculate_correlations(user_file, user_label)
    for user_file, user_label in (
        ('../data/evaluation/uninformed_user.json', 'Uninformed User'),
        ('../data/evaluation/informed_user.json', 'Informed User'),
        ('../data/evaluation/fifty_fifty_user.json', 'Balanced User'),
    )
)

Processing interactions:   0%|          | 0/10 [00:00<?, ?it/s]

Processing interactions: 100%|██████████| 10/10 [00:07<00:00,  1.34it/s]
Processing interactions: 100%|██████████| 10/10 [00:07<00:00,  1.34it/s]
Processing interactions: 100%|██████████| 10/10 [00:07<00:00,  1.35it/s]


In [14]:
# Stack the per-user records into a single table (one row per interaction and user type)
all_correlation_data = [
    *correlation_data_1,
    *correlation_data_2,
    *correlation_data_3,
]
correlation_df = pd.DataFrame(all_correlation_data)

# Preview the first rows
correlation_df.head()

Unnamed: 0,interaction_index,label,feedback,mae
0,0,Uninformed User,0.142857,2.0
1,1,Uninformed User,0.166667,2.25
2,2,Uninformed User,0.125,2.25
3,3,Uninformed User,0.125,2.25
4,4,Uninformed User,0.142857,2.25


In [8]:
# Smooth MAE separately for each user type.
# - Grouping by label keeps the window from averaging across user types in the
#   concatenated frame.
# - min_periods=1 yields a value from the first row on; with the default
#   (min_periods == window) a 50-row window over at most 10 rows per label
#   produces only NaN.
# - The result goes into a new column of a new frame, so re-running this cell
#   does not smooth already-smoothed values.
MAE_ROLLING_WINDOW = 50
mae_plot_df = correlation_df.assign(
    mae_smoothed=correlation_df.groupby('label')['mae'].transform(
        lambda series: series.rolling(window=MAE_ROLLING_WINDOW, min_periods=1).mean()
    )
)

# MAE over time plot
alt.Chart(mae_plot_df).mark_line().encode(
    x=alt.X('interaction_index:Q', title='Interaction Index'),
    y=alt.Y('mae_smoothed:Q', title='Mean Absolute Error (MAE)'),
    color='label:N'
).properties(
    title="Mean Absolute Error (MAE) over Interactions",
    width=600,
    height=400
)

In [9]:
# Smooth top-rank feedback separately for each user type.
# - Grouping by label keeps the window from averaging across user types in the
#   concatenated frame.
# - min_periods=1 yields a value from the first row on; with the default
#   (min_periods == window) a 50-row window over at most 10 rows per label
#   produces only NaN.
# - The result goes into a new column of a new frame, so re-running this cell
#   does not smooth already-smoothed values.
FEEDBACK_ROLLING_WINDOW = 50
feedback_plot_df = correlation_df.assign(
    feedback_smoothed=correlation_df.groupby('label')['feedback'].transform(
        lambda series: series.rolling(window=FEEDBACK_ROLLING_WINDOW, min_periods=1).mean()
    )
)

# Feedback over time plot
alt.Chart(feedback_plot_df).mark_line().encode(
    x=alt.X('interaction_index:Q', title='Interaction Index'),
    y=alt.Y('feedback_smoothed:Q', title='Feedback'),
    color='label:N'
).properties(
    title="Feedback over Interactions",
    width=600,
    height=400
)