# Highlighting Differences in Data Visualizations

> This notebook explores the Vega-Lite datasets and experiments with them. Ultimately, it serves as the backbone of a bachelor thesis with the same title.

In [1]:
# pip install pandas altair vega
import os
import json

import pandas as pd
import altair as alt
from vega import VegaLite

### Offline Access to Vega Datasets

In [2]:
# pip install vega_datasets
from vega_datasets import data

In [3]:
# Names of all datasets that the vega_datasets loader reports.
datasets_ = data.list_datasets()
print("List of all datasets:\n\n", datasets_)

List of all datasets:

 ['7zip', 'airports', 'annual-precip', 'anscombe', 'barley', 'birdstrikes', 'budget', 'budgets', 'burtin', 'cars', 'climate', 'co2-concentration', 'countries', 'crimea', 'disasters', 'driving', 'earthquakes', 'ffox', 'flare', 'flare-dependencies', 'flights-10k', 'flights-200k', 'flights-20k', 'flights-2k', 'flights-3m', 'flights-5k', 'flights-airport', 'gapminder', 'gapminder-health-income', 'gimp', 'github', 'graticule', 'income', 'iowa-electricity', 'iris', 'jobs', 'la-riots', 'londonBoroughs', 'londonCentroids', 'londonTubeLines', 'lookup_groups', 'lookup_people', 'miserables', 'monarchs', 'movies', 'normal-2d', 'obesity', 'ohlc', 'points', 'population', 'population_engineers_hurricanes', 'seattle-temps', 'seattle-weather', 'sf-temps', 'sp500', 'stocks', 'udistrict', 'unemployment', 'unemployment-across-industries', 'uniform-2d', 'us-10m', 'us-employment', 'us-state-capitals', 'volcano', 'weather', 'weball26', 'wheat', 'windvectors', 'world-110m', 'zipcodes']


In [4]:
# Only JSON-backed datasets are of interest here, so every dataset whose
# file path has a different extension is dropped.
# NOTE(review): names for which the attribute lookup or `.filepath` raises
# (AttributeError / ValueError) are skipped silently — confirm that this
# does not hide names (e.g. hyphenated ones) that should have been included.

def _has_json_file(name):
    """Tell whether dataset `name` exposes a file path ending in ".json".

    A lookup that raises AttributeError or ValueError counts as "no".
    """
    try:
        return getattr(data, name).filepath.endswith(".json")
    except (AttributeError, ValueError):
        return False


datasets = [name for name in datasets_ if _has_json_file(name)]

print("Available datasets:", datasets)

Available datasets: ['anscombe', 'barley', 'burtin', 'cars', 'crimea', 'driving', 'iris', 'ohlc', 'wheat']


All datasets with file extension .json have been copied to a folder called "datasets".

## Iris Dataset

This classic dataset contains lengths and widths of petals and sepals for 150 iris flowers, drawn from three species. It was introduced by R.A. Fisher in 1936.

In [5]:
# Folder (relative to the current working directory) that is entered below.
rel_path = "datasets_test"

# Later cells open files by bare name, so the working directory is moved into
# `rel_path`. Re-running this cell after a successful chdir would look for a
# nested "datasets_test/datasets_test"; the basename check keeps the cell
# idempotent instead of relying on a swallowed error.
if os.path.basename(os.getcwd()) != rel_path:
    try:
        os.chdir(rel_path)
    except OSError as err:
        # Still best-effort (the notebook keeps running), but no longer silent:
        # only filesystem errors are caught and the reason is shown.
        print(f"Could not change into {rel_path!r} from {os.getcwd()!r}: {err}")

In [6]:
# Column names passed as the x and y encodings of both iris charts.
x, y = "sepalLength", "sepalWidth"

In [7]:
# Original iris dataset.
# The parsed JSON is kept under its own name rather than `data`, so the
# `vega_datasets` loader imported as `data` stays usable when earlier cells
# are re-run. The file is read as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open("iris_test.json", "r", encoding="utf-8") as f:
    iris_original_records = json.load(f)

iris_test_original = pd.DataFrame(iris_original_records)

In [8]:
# Interactive point chart of the original data: positions come from the
# columns named in `x` / `y`, colour from the "species" column.
points_original = alt.Chart(iris_test_original).mark_point()
points_original.encode(x=x, y=y, color="species").interactive()

In [9]:
# Modified iris dataset.
# `percentage` selects which file is read: "iris_test<percentage>.json".
percentage = 60

# The parsed JSON is kept under its own name rather than `data`, so the
# `vega_datasets` loader imported as `data` stays usable when earlier cells
# are re-run. The file is read as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open(f"iris_test{percentage}.json", "r", encoding="utf-8") as f:
    iris_modified_records = json.load(f)

iris_test_modified = pd.DataFrame(iris_modified_records)

In [10]:
# Interactive point chart of the modified data, built with the same
# encodings as the chart of the original data so the two can be compared.
points_modified = alt.Chart(iris_test_modified).mark_point()
points_modified.encode(x=x, y=y, color="species").interactive()

In [11]:
# The rendered Altair visualization offers a "View Compiled Vega" option.
# That compiled Vega source will later be used to compare two visualizations.