# Highlighting Differences in Data Visualizations

> This notebook explores the Vega-Lite example datasets and experiments with them. Ultimately, it serves as the backbone of a bachelor thesis with the same title.

In [1]:
import os
import json

import pandas as pd
import altair as alt
from vega import VegaLite

# from analysis import statistics, compare

### Offline Access to Vega Datasets

In [2]:
# pip install vega_datasets
from vega_datasets import data
# Second handle on the dataset loader; later cells load datasets through it,
# e.g. `vega_datasets.anscombe()`, so they keep working even if `data` is rebound.
vega_datasets = data

In [3]:
# Names of all datasets known to the loader; kept in `datasets_` for the filter cell below.
datasets_ = data.list_datasets()
print("List of all datasets:\n\n", datasets_)

List of all datasets:

 ['7zip', 'airports', 'annual-precip', 'anscombe', 'barley', 'birdstrikes', 'budget', 'budgets', 'burtin', 'cars', 'climate', 'co2-concentration', 'countries', 'crimea', 'disasters', 'driving', 'earthquakes', 'ffox', 'flare', 'flare-dependencies', 'flights-10k', 'flights-200k', 'flights-20k', 'flights-2k', 'flights-3m', 'flights-5k', 'flights-airport', 'gapminder', 'gapminder-health-income', 'gimp', 'github', 'graticule', 'income', 'iowa-electricity', 'iris', 'jobs', 'la-riots', 'londonBoroughs', 'londonCentroids', 'londonTubeLines', 'lookup_groups', 'lookup_people', 'miserables', 'monarchs', 'movies', 'normal-2d', 'obesity', 'ohlc', 'points', 'population', 'population_engineers_hurricanes', 'seattle-temps', 'seattle-weather', 'sf-temps', 'sp500', 'stocks', 'udistrict', 'unemployment', 'unemployment-across-industries', 'uniform-2d', 'us-10m', 'us-employment', 'us-state-capitals', 'volcano', 'weather', 'weball26', 'wheat', 'windvectors', 'world-110m', 'zipcodes']


In [4]:
# Only the .json datasets are of interest to us, therefore we filter out datasets of different file types.

datasets = []
for dataset in datasets_:
    try:
        # Look the dataset up through the dedicated `vega_datasets` alias rather than the
        # generic name `data`: if `data` has been rebound (easy to do by accident in a
        # notebook), every lookup would raise AttributeError, be swallowed below, and the
        # result would silently be an empty list.
        path = getattr(vega_datasets, dataset).filepath
    except (AttributeError, ValueError):
        # No such loader attribute, or no usable file path for this dataset: skip it.
        continue
    # Kept outside the try block so the handler only covers the lookup itself.
    if path.endswith(".json"):
        datasets.append(dataset)

print("Available datasets:", datasets)

Available datasets: ['anscombe', 'barley', 'burtin', 'cars', 'crimea', 'driving', 'iris', 'ohlc', 'wheat']


All datasets with file extension .json have been copied to a folder called "datasets".

## Iris Dataset

This classic dataset contains lengths and widths of petals and sepals for 150 iris flowers, drawn from three species. It was introduced by R.A. Fisher in 1936.

In [5]:
# Change the kernel's working directory; the relative file names opened in the
# following cells are resolved against it.
# NOTE(review): not idempotent - re-running this cell without restarting the kernel
# looks for a nested "datasets" folder inside the current one.
os.chdir("datasets")

In [6]:
# Column names plotted on the horizontal and vertical axes of the iris charts.
x, y = "sepalLength", "sepalWidth"

In [7]:
# original iris dataset

# Reset first so a failed load cannot leave a stale frame behind.
iris_original = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open("iris.json", "r", encoding="utf-8") as f:
    iris_original = pd.DataFrame(json.load(f))

In [8]:
# Scatter plot of the original iris rows, coloured by species.
iris_og_plot = (
    alt.Chart(iris_original)
    .mark_point()
    .encode(x=x, y=y, color='species', tooltip=(x, y))
    .interactive()  # make the axes pannable/zoomable
)

iris_og_plot

In [9]:
# Descend into the sub-folder from which the next cell opens `iris{percentage}.json`.
# The path is relative, so this only works from the directory entered above.
# NOTE(review): not idempotent - re-running it without a kernel restart will not
# end up in the same place.
os.chdir("datasets_altered")

In [14]:
# modified iris dataset available 1 - 20
percentage = 15

# Reset first so a failed load cannot leave a stale frame behind.
iris_modified = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"iris{percentage}.json", "r", encoding="utf-8") as f:
    iris_modified = pd.DataFrame(json.load(f))

In [15]:
# Same scatter plot as for the original data, drawn from the modified iris frame.
iris_mod_plot = (
    alt.Chart(iris_modified)
    .mark_point()
    .encode(x=x, y=y, color='species', tooltip=(x, y))
    .interactive()  # make the axes pannable/zoomable
)

iris_mod_plot

> This Altair Visualization offers an option "View Source". <br>
> The source code of each chart was saved manually as "iris_source.json" and "iris20_source.json".

### analysis.py - statistics(file)

In [12]:
# os.chdir("../sources")

In [13]:
# Original Iris Dataset
# stats_iris_og = statistics("iris_source.json")
# print(stats_iris_og)

In [14]:
# 20% Modified Iris Dataset
# stats_iris_mod = statistics("iris20_source.json")
# print(stats_iris_mod)

### analysis.py - compare(file1, file2)

In [15]:
# comparison only available for iris.json and iris20.json
# print(compare("iris_source.json", "iris20_source.json"))

In [16]:
# Move to the folder holding the comparison file read in the next cell.
# The path is relative to the current working directory, so it depends on the
# earlier chdir cells having run exactly once, in order.
os.chdir("../../vis-dif/public/data/comparisons")

In [17]:
# Reset first so a failed load cannot leave a stale frame behind.
iris_comp20 = None
# Plain string literal (the original f-string had no placeholders). The parsed file is
# kept under its own name instead of rebinding `data`, which is the vega_datasets loader
# imported at the top of the notebook.
with open("iris_COMP_iris20.json", "r", encoding="utf-8") as f:
    comparison_spec = json.load(f)

# The rows live under "datasets", keyed by the name stored in spec["data"]["name"].
iris_comp20 = pd.DataFrame(
    comparison_spec["datasets"][comparison_spec["data"]["name"]]
)

In [18]:
# Scatter plot of the rows contained in the iris / iris20 comparison file.
iris_comp_plot = (
    alt.Chart(iris_comp20)
    .mark_point()
    .encode(x=x, y=y, color='species', tooltip=(x, y))
    .interactive()  # make the axes pannable/zoomable
)

iris_comp_plot

## Plotting

In [19]:
# os.chdir("../..")

In [20]:
# def plot(a, b, x, y):
#     """
#     a: original json file
#     b: modified json file
#     x: x value for plots
#     y: y value for plots
#     output: 3 plots
#     """
    
#     # ORIGINAL
#     original = None
#     with open(a, 'r') as og:
#         data = json.load(og)
#         original = pd.DataFrame(data)
    
#     og_plot = alt.Chart(original).mark_point().encode(
#         x=x,
#         y=y
#     ).properties(
#         title = "A"
#     ).interactive()
    
#     # MODIFIED
#     os.chdir("datasets_altered")
#     modified = None
#     with open(b, 'r') as mod:
#         data = json.load(mod)
#         modified = pd.DataFrame(data)
    
#     mod_plot = alt.Chart(modified).mark_point().encode(
#         x=x,
#         y=y
#     ).properties(
#         title = "B"
#     ).interactive()
    
#     # DIFFERENCE
#     os.chdir("../sources/comparisons")
#     comparison = None
    
#     comp_file = f"{a[:-(len('.json'))]}_COMP_{b}"
#     with open(comp_file, 'r') as comp:
#         data = json.load(comp)
#         comparison = pd.DataFrame(data)
        
#     comp_plot = alt.Chart(comparison).mark_point().encode(
#         x=x,
#         y=y
#     ).properties(
#         title = "Difference - in A but not in B"
#     ).interactive()
    
    
#     comp_file2 = f"{b[:-(len('.json'))]}_COMP_{a}"
#     try:
#         with open(comp_file2, 'r') as comp:
#             data = json.load(comp)
#             comparison = pd.DataFrame(data)

#         comp_plot2 = alt.Chart(comparison).mark_point().encode(
#             x=x,
#             y=y
#         ).properties(
#             title = "Difference - in B but not in A"
#         ).interactive()
#     except:
#         pass
    
#     # PLOTS
#     os.chdir("../..")
#     plots = [og_plot, mod_plot, comp_plot]
#     try:
#         plots.append(comp_plot2)
#     except:
#         pass
    
#     for plot in plots:
#         plot.display()

In [21]:
# plot("iris.json", "iris20.json", x, y)

# All datasets

In [22]:
# Go back to the altered-datasets folder; the `<name>{percent}.json` files opened in
# the cells below are resolved relative to it.
# NOTE(review): relative path - only correct when the previous chdir cells ran exactly once.
os.chdir("../../../../datasets/datasets_altered")

## (anscombe.json)

Anscombe’s Quartet is a famous dataset constructed by Francis Anscombe. The common summary statistics of the series are identical, despite the subsets' different characteristics. <br> It makes sense to draw one subplot per series. Variations of this dataset do not make much sense; however, for demonstration purposes in this project, we will generate them anyway.

In [23]:
# Unmodified Anscombe data, fetched through the vega_datasets loader.
source = vega_datasets.anscombe()

# Column names used by the two Anscombe charts below.
# NOTE: this rebinds the `x`/`y` set for the iris charts earlier in the notebook.
x, y, color = "X", "Y", "Series"

In [24]:
# One panel per series, laid out in two columns.
(
    alt.Chart(source)
    .mark_point()
    .encode(x=x, y=y, color=color, tooltip=(x, y))
    .interactive()
    .facet("Series", columns=2)
)

In [25]:
percent = 20

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"anscombe{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [26]:
# Same faceted chart as for the original data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_point()
    .encode(x=x, y=y, color=color, tooltip=(x, y))
    .interactive()
    .facet("Series", columns=2)
)

## barley.json

Becker’s Barley Trellis charts identify an anomaly in a widely used agricultural dataset, which is called “The Morris Mistake”. They show that "Morris" is the only site whose panel is the reverse of the other panels. The data is usually displayed split by site; however, I will aggregate the sites.

In [24]:
# Unmodified barley data, fetched through the vega_datasets loader.
source = vega_datasets.barley()

# Yield per site in a single chart; the year is encoded as a nominal colour.
(
    alt.Chart(source)
    .mark_point()
    .encode(
        x="yield",
        y="site",
        color="year:N",
        tooltip=("yield", "site"),
    )
)

In [29]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"barley{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [30]:
# Same chart as for the original barley data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_point()
    .encode(
        x="yield",
        y="site",
        color="year:N",
        tooltip=("yield", "site"),
    )
)

## burtin.json

This dataset was gathered by Will Burtin and is used to explore the effectiveness of various antibiotics in treating a variety of bacterial infections.

In [31]:
# Unmodified Burtin data, fetched through the vega_datasets loader.
source = vega_datasets.burtin()

# One bar per bacterium showing the Streptomycin column, coloured by the Penicillin value.
(
    alt.Chart(source)
    .mark_bar()
    .encode(
        x="Bacteria",
        y="Streptomycin",
        color="Penicillin:N",
        tooltip=("Bacteria", "Penicillin"),
    )
)

In [36]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"burtin{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [37]:
# Same bar chart as for the original Burtin data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_bar()
    .encode(
        x="Bacteria",
        y="Streptomycin",
        color="Penicillin:N",
        tooltip=("Bacteria", "Penicillin"),
    )
)

## cars.json

Acceleration, horsepower, fuel efficiency, weight, and other characteristics of different makes and models of cars. 

In [38]:
# Unmodified cars data, fetched through the vega_datasets loader.
source = vega_datasets.cars()

# Horsepower against fuel efficiency, with the cylinder count as a nominal colour.
(
    alt.Chart(source)
    .mark_point()
    .encode(
        x="Horsepower",
        y="Miles_per_Gallon",
        color="Cylinders:N",
        tooltip=("Horsepower", "Miles_per_Gallon"),
    )
    .interactive()
)

In [43]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"cars{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [44]:
# Same scatter plot as for the original cars data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_point()
    .encode(
        x="Horsepower",
        y="Miles_per_Gallon",
        color="Cylinders:N",
        tooltip=("Horsepower", "Miles_per_Gallon"),
    )
    .interactive()
)

## crimea.json

This is a dataset containing monthly casualty counts from the Crimean war. 

In [45]:
# Unmodified Crimea data, fetched through the vega_datasets loader.
source = vega_datasets.crimea()

# One bar per date (treated as nominal) showing `disease`, coloured by the ordinal `other` column.
(
    alt.Chart(source)
    .mark_bar()
    .encode(
        x="date:N",
        y="disease",
        color="other:O",
        tooltip=("disease", "other"),
    )
)

In [50]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"crimea{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [51]:
# Same bar chart as for the original Crimea data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_bar()
    .encode(
        x="date:N",
        y="disease",
        color="other:O",
        tooltip=("disease", "other"),
    )
)

## driving.json

This dataset tracks miles driven per capita along with gas prices annually from 1956 to 2010.

In [52]:
# Unmodified driving data, fetched through the vega_datasets loader.
source = vega_datasets.driving()

# Miles against gas, coloured by year.
(
    alt.Chart(source)
    .mark_point()
    .encode(
        x="miles",
        y="gas",
        color="year",
        tooltip=("miles", "gas"),
    )
    .interactive()
)

In [57]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"driving{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [58]:
# Same scatter plot as for the original driving data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_point()
    .encode(
        x="miles",
        y="gas",
        color="year",
        tooltip=("miles", "gas"),
    )
    .interactive()
)

## iris.json - see above
## ohlc.json

(open, high, low and close prices) - This dataset contains the performance of the Chicago Board Options Exchange.

In [59]:
# Unmodified OHLC data, fetched through the vega_datasets loader.
source = vega_datasets.ohlc()

# Each bar spans low..high for one date; it is green (#1a9850) when
# open <= close and red (#d73027) otherwise.
(
    alt.Chart(source)
    .mark_bar()
    .encode(
        x='date:T',
        color=alt.condition(
            'datum.open <= datum.close',
            alt.value('#1a9850'),
            alt.value('#d73027'),
        ),
        y='low:Q',
        y2='high:Q',
        tooltip=['date:T', 'open:Q', 'high:Q', 'low:Q', 'close:Q'],
    )
)

In [64]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"ohlc{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [65]:
# Same low..high bar chart as for the original OHLC data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_bar()
    .encode(
        x='date:T',
        color=alt.condition(
            'datum.open <= datum.close',
            alt.value('#1a9850'),
            alt.value('#d73027'),
        ),
        y='low:Q',
        y2='high:Q',
        tooltip=['date:T', 'open:Q', 'high:Q', 'low:Q', 'close:Q'],
    )
)

## wheat.json

A historical dataset, originally charted by William Playfair, that records the price of wheat alongside weekly wages by year. The charts below use its `year`, `wheat` and `wages` columns.

In [66]:
# Unmodified wheat data, fetched through the vega_datasets loader.
source = vega_datasets.wheat()

# Wages against wheat, with the year as a quantitative colour scale.
(
    alt.Chart(source)
    .mark_point()
    .encode(
        x="wages",
        y="wheat",
        color="year:Q",
        tooltip=("wages", "wheat"),
    )
    .interactive()
)

In [71]:
percent = 15

# Reset first so a failed load cannot leave another dataset's frame in `alternation`.
alternation = None
# Parse directly into the frame instead of rebinding the name `data`, which is the
# vega_datasets loader imported at the top of the notebook. JSON is read as UTF-8
# regardless of the platform's default encoding.
with open(f"wheat{percent}.json", "r", encoding="utf-8") as f:
    alternation = pd.DataFrame(json.load(f))

In [72]:
# Same scatter plot as for the original wheat data, drawn from the altered frame.
(
    alt.Chart(alternation)
    .mark_point()
    .encode(
        x="wages",
        y="wheat",
        color="year:Q",
        tooltip=("wages", "wheat"),
    )
    .interactive()
)