# Progressive Loading and Visualization

This notebook shows the simplest code to progressively download New York Yellow Taxi trips from 2015. All trips are geolocated, and the trip data is stored in multiple CSV files; this notebook loads the file for January 2015.
We progressively visualize the pickup locations (where passengers were picked up by the taxis).

First, we define a few constants: the URL of the taxi file and the desired resolution.

In [None]:
import warnings
# URL of the bz2-compressed CSV file of yellow-taxi trips to load.
LARGE_TAXI_FILE = "https://www.aviz.fr/nyc-taxi/yellow_tripdata_2015-01.csv.bz2"
# Number of bins along each axis of the 2D histogram built later on.
RESOLUTION = 512

# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

In [None]:
# See https://en.wikipedia.org/wiki/Module:Location_map/data/USA_New_York_City
from dataclasses import dataclass
@dataclass
class Bounds:
    """Latitude/longitude bounding box; the defaults delimit New York City."""

    # Latitude limits (upper and lower edges of the box).
    top: float = 40.92
    bottom: float = 40.49
    # Longitude limits (left and right edges of the box).
    left: float = -74.27
    right: float = -73.68


# Box with the default limits, shared by the following cells.
bounds = Bounds()

In [None]:
from progressivis import (CSVLoader, Histogram2D, ConstDict, Heatmap, PDict, 
    BinningIndexND, RangeQuery2d, Variable)
import progressivis.core.aio as aio

# Names of the two CSV columns holding the pickup coordinates
col_x = "pickup_longitude"
col_y = "pickup_latitude"
# Lower and upper corners of the bounding box, keyed by column name
lo_bnds = PDict({col_x: bounds.left, col_y: bounds.bottom})
up_bnds = PDict({col_x: bounds.right, col_y: bounds.top})
# Create a csv loader asked for the two pickup columns only (usecols);
# no spatial filtering is configured here, it is left to the range query below
csv = CSVLoader(LARGE_TAXI_FILE, index_col=False, usecols=[col_x, col_y])
# Create an indexing module on csv loader output columns
index = BinningIndexND()
# actually one index per column
index.input.table = csv.output.result[col_x, col_y]
# Create a querying module selecting a 2D range over the two columns
query = RangeQuery2d(column_x=col_x, 
                     column_y=col_y
                    )
# Variable modules allow the query ranges to be modified dynamically:
# one feeds the lower corner of the range, the other the upper corner
var_lo = Variable()
var_up = Variable()
query.input.lower = var_lo.output.result
query.input.upper = var_up.output.result
# The query also consumes the index and its min/max outputs
query.input.index = index.output.result
query.input.min = index.output.min_out
query.input.max = index.output.max_out
# Create a module to compute the 2D histogram of the two columns specified
# with the given resolution
histogram2d = Histogram2D(col_x, col_y, xbins=RESOLUTION, ybins=RESOLUTION)
# Connect the module to the query results (not directly to the csv loader)
# and to the query's min,max outputs to rescale
histogram2d.input.table = query.output.result
histogram2d.input.min = query.output.min
histogram2d.input.max = query.output.max
# Create a module to create a heatmap image from the histogram2d
heatmap = Heatmap()
# Connect it to the histogram2d
heatmap.input.array = histogram2d.output.result

In [None]:
# Show the dataflow
import graphviz 
# Ask the scheduler for the graphviz description of its dataflow graph.
src = csv.scheduler().to_graphviz()
# Wrap that description in a graphviz Source object and render it inline.
gvz = graphviz.Source(src)
display(gvz)

In [None]:
heatmap.display_notebook()
# Start the scheduler
csv.scheduler().task_start();
await aio.sleep(1)
await var_lo.from_input(lo_bnds)
await var_up.from_input(up_bnds);

In [None]:
import ipywidgets as widgets
def _make_range_slider(column, description):
    """Build a range slider spanning the bounding box along ``column``.

    Args:
        column: Column name, used as key into ``lo_bnds`` and ``up_bnds``.
        description: Label displayed next to the slider.

    Returns:
        A ``FloatRangeSlider`` initially selecting the whole range and
        moving in steps of one tenth of that range.
    """
    low, high = lo_bnds[column], up_bnds[column]
    return widgets.FloatRangeSlider(
        value=[low, high],
        min=low,
        max=high,
        step=(high - low) / 10,
        description=description,
        disabled=False,
        # Only notify observers when the handle is released, not while dragging.
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        # Two decimals: the bounds have two decimals and one step is ~0.05,
        # so a one-decimal readout would show rounded, colliding values.
        readout_format='.2f',
    )


long_slider = _make_range_slider(col_x, 'Longitude:')
lat_slider = _make_range_slider(col_y, 'Latitude:')

# Strong references to the in-flight update tasks, so that a task cannot be
# garbage-collected before it has completed.
_pending_updates = set()


def observer(_):
    """Send the current slider ranges to the two query Variable modules.

    Args:
        _: Change notification passed by the widget; unused, the current
           values are read from the sliders directly.
    """
    async def _coro():
        long_lo, long_up = long_slider.value
        lat_lo, lat_up = lat_slider.value
        await var_lo.from_input({col_x: long_lo, col_y: lat_lo})
        await var_up.from_input({col_x: long_up, col_y: lat_up})
    # The widget callback is synchronous, so the coroutine is scheduled as a task.
    # NOTE(review): assumes aio.create_task returns an asyncio Task
    # (i.e. provides add_done_callback) -- confirm.
    task = aio.create_task(_coro())
    _pending_updates.add(task)
    task.add_done_callback(_pending_updates.discard)


# Either slider changing triggers a full update of both corners.
long_slider.observe(observer, "value")
lat_slider.observe(observer, "value")
widgets.VBox([long_slider, lat_slider])

In [None]:
# Show what runs: as the last expression of the cell, the scheduler object
# is displayed through its notebook representation
csv.scheduler()

In [None]:
# Stop the scheduler; the trailing ';' suppresses the cell's output value
csv.scheduler().task_stop();