# Highlighting Differences in Data Visualizations

> This notebook explores the Vega-Lite datasets and experiments with them. Ultimately, it serves as the backbone of a bachelor thesis of the same title.

In [1]:
import os
import json

import pandas as pd
import altair as alt
from vega import VegaLite

from analysis import statistics, compare

### Offline Access to Vega Datasets

In [None]:
# pip install vega_datasets
from vega_datasets import data

In [None]:
# Ask the vega_datasets `data` object for the names of all datasets it knows
# about; the next cell narrows this list down.
datasets_ = data.list_datasets()
print("List of all datasets:\n\n", datasets_)

In [None]:
# Keep only the datasets that are backed by a .json file; datasets of any
# other file type are not of interest in this notebook.

datasets = []
for dataset in datasets_:
    try:
        is_json = getattr(data, dataset).filepath.endswith(".json")
    except (AttributeError, ValueError):
        # This entry exposes no usable file path -> leave it out.
        continue
    if is_json:
        datasets.append(dataset)

print("Available datasets:", datasets)

All datasets with file extension .json have been copied to a folder called "datasets".

## Iris Dataset

This classic dataset contains lengths and widths of petals and sepals for 150 iris flowers, drawn from three species. It was introduced by R.A. Fisher in 1936.

In [2]:
# Move into the folder holding the copied .json datasets.
# A failed chdir (e.g. the cell is re-run while already inside "datasets")
# is ignored on purpose, but only OS-level errors are swallowed so that
# KeyboardInterrupt and genuine programming errors still surface.
try:
    os.chdir("datasets")
except OSError:
    pass

In [None]:
# Field names plotted on the horizontal and vertical axis of every iris chart.
x, y = "sepalLength", "sepalWidth"

In [None]:
# original iris dataset

# The parsed records get their own name: binding them to `data` would
# overwrite the `data` object imported from vega_datasets and break any
# re-run of the dataset-listing cells above.
with open("iris.json", 'r') as f:
    iris_original_records = json.load(f)

iris_original = pd.DataFrame(iris_original_records)

In [None]:
# Interactive scatter plot of the original data, coloured by species; the
# tooltip shows the two plotted fields.
iris_og_plot = (
    alt.Chart(iris_original)
    .mark_point()
    .encode(x=x, y=y, color="species", tooltip=(x, y))
    .interactive()
)

iris_og_plot

In [None]:
# Move into the folder holding the modified datasets.
# A failed chdir (e.g. the cell is re-run while already inside that folder)
# is ignored on purpose, but only OS-level errors are swallowed so that
# KeyboardInterrupt and genuine programming errors still surface.
try:
    os.chdir("datasets_altered")
except OSError:
    pass

In [None]:
# modified iris dataset available 1 - 20
percentage = 20

# The parsed records get their own name: binding them to `data` would
# overwrite the `data` object imported from vega_datasets and break any
# re-run of the dataset-listing cells above.
with open(f"iris{percentage}.json", 'r') as f:
    iris_modified_records = json.load(f)

iris_modified = pd.DataFrame(iris_modified_records)

In [None]:
# Same chart as for the original data, drawn from the modified records.
iris_mod_plot = (
    alt.Chart(iris_modified)
    .mark_point()
    .encode(x=x, y=y, color="species", tooltip=(x, y))
    .interactive()
)

iris_mod_plot

> Each Altair visualization offers a "View Source" option. <br>
> The source of the two charts above was saved manually as "iris_source.JSON" and "iris20_source.JSON".

### analysis.py - statistics(file)

In [None]:
# Switch to the sibling "sources" folder with the saved chart sources.
# A failed chdir is ignored on purpose, but only OS-level errors are
# swallowed so that KeyboardInterrupt and genuine programming errors still
# surface.
try:
    os.chdir("../sources")
except OSError:
    pass

In [None]:
# Original Iris Dataset
# `statistics` comes from the project-local analysis module and is handed the
# saved chart source of the original data; its return value is printed as-is.
# NOTE(review): the exact shape of the result is defined in analysis.py.
stats_iris_og = statistics("iris_source.json")
print(stats_iris_og)

In [None]:
# 20% Modified Iris Dataset
# Same call as for the original data, this time on the saved chart source of
# the modified dataset, so both printed results can be read side by side.
stats_iris_mod = statistics("iris20_source.json")
print(stats_iris_mod)

### analysis.py - compare(file1, file2)

In [None]:
# Comparison is only available for iris.json and iris20.json.
# `compare` comes from the project-local analysis module and receives the two
# saved chart sources; whatever it returns is printed.
# NOTE(review): presumably it also writes the comparison file loaded further
# below -- confirm in analysis.py.

print(compare("iris_source.json", "iris20_source.json"))

In [None]:
# Descend into the "comparisons" sub-folder.
# A failed chdir (e.g. the cell is re-run while already inside that folder)
# is ignored on purpose, but only OS-level errors are swallowed so that
# KeyboardInterrupt and genuine programming errors still surface.
try:
    os.chdir("comparisons")
except OSError:
    pass

In [None]:
# Load the stored comparison between iris.json and iris20.json.
# The parsed records get their own name: binding them to `data` would
# overwrite the `data` object imported from vega_datasets. The file name has
# no placeholders, so a plain string replaces the former f-string.
with open("iris_COMP_iris20.json", 'r') as f:
    iris_comp20_records = json.load(f)

iris_comp20 = pd.DataFrame(iris_comp20_records)

In [None]:
# Plot the records of the comparison file with the same encoding as the two
# charts above.
iris_comp_plot = (
    alt.Chart(iris_comp20)
    .mark_point()
    .encode(x=x, y=y, color="species", tooltip=(x, y))
    .interactive()
)

iris_comp_plot

## Plotting

In [None]:
# Step two directory levels up from the current working directory, undoing
# the descent into "sources" and "comparisons" made by the cells above.
# NOTE: unlike the guarded chdir cells, this one is not idempotent -- every
# additional run moves two more levels up, so run it exactly once.
os.chdir("../..")

In [None]:
def _load_chart(path, x, y, title):
    """Read the JSON records at `path` and return them as a titled, interactive point chart."""
    with open(path, 'r') as source:
        frame = pd.DataFrame(json.load(source))

    return alt.Chart(frame).mark_point().encode(
        x=x,
        y=y
    ).properties(
        title=title
    ).interactive()


def plot(a, b, x, y):
    """
    Display the original dataset, the modified dataset and their difference(s).

    a: original json file, resolved against the current working directory
    b: modified json file, looked up in the "datasets_altered" sub-folder
    x: x value for plots
    y: y value for plots
    output: 3 plots (A, B, "in A but not in B"); a 4th plot ("in B but not
            in A") is added when the reverse comparison file can be loaded

    The comparison files are expected in "sources/comparisons" and are named
    "<a without .json>_COMP_<b>" and "<b without .json>_COMP_<a>".

    Raises whatever open()/json.load() raise (e.g. FileNotFoundError) when one
    of the three mandatory files cannot be read. All files are addressed by
    path relative to the working directory, which is never changed, so a
    failure cannot leave the notebook stranded in a sub-folder.
    """
    suffix_len = len('.json')
    comparison_dir = os.path.join("sources", "comparisons")

    # ORIGINAL
    og_plot = _load_chart(a, x, y, "A")

    # MODIFIED
    mod_plot = _load_chart(os.path.join("datasets_altered", b), x, y, "B")

    # DIFFERENCE
    comp_file = f"{a[:-suffix_len]}_COMP_{b}"
    comp_plot = _load_chart(
        os.path.join(comparison_dir, comp_file), x, y,
        "Difference - in A but not in B"
    )

    plots = [og_plot, mod_plot, comp_plot]

    # The reverse comparison is optional: a missing, unreadable or malformed
    # file just means that the fourth chart is left out.
    comp_file2 = f"{b[:-suffix_len]}_COMP_{a}"
    try:
        plots.append(_load_chart(
            os.path.join(comparison_dir, comp_file2), x, y,
            "Difference - in B but not in A"
        ))
    except (OSError, ValueError):
        pass

    # PLOTS
    # `chart` rather than `plot`, so the loop variable does not shadow this function.
    for chart in plots:
        chart.display()

In [None]:
# Show the iris charts for the original file and its 20 % modification, using
# the x/y field names chosen earlier in the notebook.
plot("iris.json", "iris20.json", x, y)