This notebook is derived from the [datashader time series](https://datashader.org/user_guide/Timeseries.html) notebook and serves as a means to compare the functionality of datashader and plotly-resampler.

In [None]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Optional dependencies of this notebook; uncomment to install them into the kernel's environment.
# %pip install jupyter_bokeh datashader panel holoviews bokeh

In [None]:
import datetime
import pandas as pd
import numpy as np
import xarray as xr
import datashader as ds
import datashader.transfer_functions as tf
from collections import OrderedDict
import panel as pn
import holoviews as hv
from holoviews.operation.datashader import datashade

from plotly_resampler import FigureResampler, EfficientLTTB
import plotly.graph_objects as go

In [None]:
# Select Bokeh as the HoloViews plotting backend.
hv.extension("bokeh")
# Initialise Panel for notebook use, with ipywidgets-based comms.
pn.extension(comms='ipywidgets')

### Generate fake data

This data has 10 signal modalities, which are highly correlated; one modality (`a`) has some outliers.

In [None]:
# Constants
np.random.seed(2)                            # Fixed seed so the generated data is reproducible
n = 1_000_000                                # Number of points
cols = list('abcdefg')                       # Column names of samples
start = datetime.datetime(2010, 10, 1, 0)    # Start time


def noise(var, bias, n):
    """Draw ``n`` Gaussian samples centred on ``bias``.

    ``var`` is forwarded as the ``scale`` argument of ``np.random.normal``,
    i.e. it is used as a standard deviation.
    """
    return np.random.normal(bias, var, n)


# Generate a fake signal (cumulative sum of Gaussian steps, offset by 50)
signal = np.random.normal(0, 0.3, size=n).cumsum() + 50

# Generate many noisy samples from the signal
data = {c: signal + noise(1, 10*(np.random.random() - 0.5), n) for c in cols}

# Add some "rogue lines" that differ from the rest
cols.append('x')
data['x'] = signal + np.random.normal(0, 0.02, size=n).cumsum()  # Gradually diverges
cols.append('y')
data['y'] = signal + noise(1, 20*(np.random.random() - 0.5), n)  # Much noisier
cols.append('z')
data['z'] = signal  # No noise at all

# Pick a few samples from the first line and really blow them out
locs = np.random.choice(n, 10)
data['a'][locs] *= 2

# One timestamp per minute, built in a single vectorised call rather than
# n Python-level datetime additions.
data['Time'] = pd.date_range(start, periods=n, freq='min')

# Create a dataframe
df = pd.DataFrame(data)
df.tail(3)

In [None]:
# Integer representation of the timestamps; the datashader calls below use 'ITime' as the x-coordinate.
df['ITime'] = pd.to_datetime(df['Time']).astype('int64')

In [None]:
# Default plot ranges:
# Select the column before the row: `df.iloc[0]` would first build a full row
# across all (mixed-dtype) columns just to read one integer from it.
x_range = (df['ITime'].iloc[0], df['ITime'].iloc[-1])
# NOTE(review): scaling by 1.2 only widens the range when min < 0 < max — confirm
# this holds for the chosen seed / n.
y_range = (1.2*signal.min(), 1.2*signal.max())

print(f"x_range: {x_range} y_range: {y_range}")

## 1. Plotting all the datapoints

In [None]:
%%time
# Canvas describing the output raster (900 x 300) and the data ranges it covers.
cvs = ds.Canvas(x_range=x_range, y_range=y_range, plot_height=300, plot_width=900)
# One line aggregate per column name in `cols`, stored in `cols` order.
aggs= OrderedDict((c, cvs.line(df, 'ITime', c)) for c in cols)

### A single, noisy trace

In [None]:
%%time
# Visualize a single column
# Shade the aggregate computed for column 'a'; the bare `img` on the last line displays it.
img = tf.shade(aggs['a'])
img

The result looks similar to what you might find in any plotting program, but it uses all 1,000,000 datapoints, and would work similarly for 1, 10, or 100 million points (determined by the constant `n` above).

Why is using all the datapoints important? To see, let’s downsample the data by a factor of 10, plotting 100,000 datapoints for the same curve:

In [None]:
# Keep the rows whose index value is a multiple of 10, then rasterize
# column 'a' of that subset on the same canvas.
mask = df.index % 10 == 0
tf.shade(cvs.line(df.loc[mask, ['a', 'ITime']], 'ITime', 'a'))

In [None]:
%%time
# plotly-resampler counterpart of the single-trace plot: column 'a' against the 'Time' column.
fr = FigureResampler(default_n_shown_samples=2000)
for c in ['a']:
    # The full-length data is handed over through hf_x / hf_y, not inside the trace object.
    fr.add_trace(go.Scattergl(name=c, line_width=1), hf_x=df.Time, hf_y=df[c])
fr.update_layout(template='plotly_white')
# Show the figure inline through a Dash app on port 8049.
fr.show_dash(mode='inline', port=8049)

### All the traces

In [None]:
# Apply the same rename mapping to every aggregate, then concatenate them
# along a new 'cols' dimension.
renamed = [agg.rename({key: 'value'}) for key, agg in aggs.items()]
merged = xr.concat(renamed, dim='cols')

In [None]:
# Sum the concatenated aggregates over 'cols', cast to uint32 and shade with how='linear'.
total = tf.shade(merged.sum(dim='cols').astype('uint32'), how='linear')
total

With study, the overall structure of this dataset should be clear, given what we know we put in when we created it:

1. Individual rogue datapoints from curve ‘a’ are clearly visible (the seven sharp spikes)
2. The trend is clearly visible (for the viridis colormap, the darkest greens show the areas of highest overlap)
3. Line ‘x’ that gradually diverges from the trend is clearly visible (as the light blue (low-count) areas that increase below the right half of the plot).

(Note that if you change the random seed or the number of datapoints, the specific values and locations will differ from those mentioned in the text.)

**None of these observations would have been possible with downsampled, overplotted curves as would be typical of other plotting approaches.**

In [None]:
%%time
# plotly-resampler counterpart of the "all traces" plot.
fr = FigureResampler(default_n_shown_samples=2_000)
# One trace per column in `cols`; every trace uses the same colour and opacity 0.15.
for c in cols:
    fr.add_trace(
        go.Scattergl(name=c, marker_color='darkblue', opacity=.15, line_width=1),
        hf_x=df.Time, hf_y=df[c]
    )
fr.update_layout(template='plotly_white')
# Show the figure inline through a Dash app on port 8048.
fr.show_dash(mode='inline', port=8048)

---

## **Intermezzo** Incorporating LTTB into holoviews

In [None]:
# Build a standalone series for column 'a', indexed by the integer timestamps.
# A new Series is constructed instead of assigning `.index` on the object
# returned by `df['a']`, so the column held by `df` is never re-indexed.
s = pd.Series(
    df['a'].to_numpy(),
    index=pd.Index(df['ITime'], name='timestamp'),
    name='a',
)

In [None]:
# Display `s` as a DataFrame, with its index moved into a regular column.
s.reset_index()

In [None]:
# Series for column 'x', indexed by the integer timestamps; this is the `s`
# read by `resample_lttb`. A new Series is constructed instead of assigning
# `.index` on the object returned by `df['x']`, so the column held by `df`
# is never re-indexed.
s = pd.Series(
    df['x'].to_numpy(),
    index=pd.Index(df['ITime'], name='ITime'),
    name='x',
)

**TODO:** turn this into a dynamic map of an overlay of traces.

In [None]:
%%time
def resample_lttb(x_range) -> hv.Curve:
    """Return an LTTB-aggregated ``hv.Curve`` of the global series ``s``.

    Parameters
    ----------
    x_range
        ``None``, or a pair of x-axis bounds. When it is ``None`` or either
        bound is NaN, the whole series is used; otherwise ``s`` is sliced by
        label between the two bounds (converted to ``int``) first.

    Returns
    -------
    hv.Curve
        Curve over the aggregated data, with "timestamp" as key dimension.
    """
    has_bounds = x_range is not None and not (
        np.isnan(x_range[0]) or np.isnan(x_range[1])
    )
    window = s.loc[int(x_range[0]) : int(x_range[1])] if has_bounds else s

    # Aggregate the selected window with n_out=2000.
    aggregated = EfficientLTTB().aggregate(window, n_out=2000)
    aggregated.index.name = "timestamp"
    return hv.Curve(aggregated.reset_index(), "timestamp")


# A single DynamicMap that calls `resample_lttb` with the x-range reported by
# a RangeX stream, wrapped in an overlay.
dynamic_curves = [
    hv.DynamicMap(resample_lttb, streams=[hv.streams.RangeX()]) for _ in range(1)
]
layout = hv.Overlay(dynamic_curves).collate()

curve_opts = hv.opts.Curve(axiswise=True, width=800, height=500, tools=["xwheel_zoom"])
layout.opts(curve_opts)

---

## Datashader vs plotly-resampler

### Datashader vs plotly-resampler: `noisy-sine`

In [None]:
n = 1_000_000
x = np.arange(n)

# Sine wave plus Gaussian jitter, with an amplitude that grows linearly in x.
sine_wave = np.sin(x / 3_000)
jitter = np.random.randn(n) / 10
noisy_sine = (sine_wave + jitter) * x / 5_000

# Two columns: the signal itself and its absolute value.
df_ = pd.DataFrame({"ns": noisy_sine, "ns_abs": np.abs(noisy_sine)})

In [None]:
# Size options applied to the datashaded output.
opts = hv.opts.RGB(width=800, height=400)
# One Curve per column of df_ (row index on x), keyed by column name.
ndoverlay = hv.NdOverlay({c:hv.Curve((df_.index, df_[c])) for c in df_.columns})
# Datashade the overlay with a count aggregator and cnorm='linear'.
datashade(ndoverlay, cnorm='linear', aggregator=ds.count(), line_width=3).opts(opts)

In [None]:
# plotly-resampler counterpart of the noisy-sine plot.
fr = FigureResampler(default_n_shown_samples=3000)
# Iterate the columns directly: the order is stable across runs (a set's
# iteration order is not), and df_ has no "Time" column to exclude.
for c in df_.columns:
    fr.add_trace(
        go.Scattergl(
            name=c,
            marker_color="blue",
            mode="lines+markers",
            opacity=0.1,
            marker_size=3,
        ),
        # No hf_x is passed here, only the y-values.
        hf_y=df_[c],
    )
fr.show_dash(mode="inline", port=8091)

### Datashader vs plotly-resampler: `multiple-trends`

In [None]:
# Three independently drawn cumulative-sum signals, each offset by 50.
signals = [np.random.normal(0, 0.3, size=n).cumsum() + 50 for _ in range(3)]

# Column i follows signal i % 3, with a noise scale of 1 + i and a random bias.
data = {
    c: signals[i % 3] + noise(1 + i, 5*(np.random.random() - 0.5), n)
    for i, c in enumerate(cols)
}
y_range = (
    1.2 * min(sig.min() for sig in signals),
    1.2 * max(sig.max() for sig in signals),
)

data['Time'] = df['Time']
dfm = pd.DataFrame(data)
dfm.shape

In [None]:
# Size options applied to the datashaded output.
opts = hv.opts.RGB(width=600, height=300)
# One Curve per column in `cols`. The second tuple element holds the signal
# values, so the value dimension is named 'Value'; x is the row index of dfm,
# not a timestamp.
ndoverlay = hv.NdOverlay({c: hv.Curve((dfm.index, dfm[c]), vdims=['Value']) for c in cols})
datashade(ndoverlay, cnorm='linear', aggregator=ds.count(), line_width=3).opts(opts)

In [None]:
# plotly-resampler counterpart of the multiple-trends plot.
fr = FigureResampler(default_n_shown_samples=2000)
# Every column except 'Time', in DataFrame column order, so the trace order is
# the same on every run (iterating a set is not order-stable).
for c in [col for col in dfm.columns if col != 'Time']:
    fr.add_trace(go.Scattergl(name=c, marker_color='blue', opacity=0.1), hf_x=dfm.Time, hf_y=dfm[c])
fr.update_layout(template='plotly_white')
# Use a port of its own: 8091 is already taken by the noisy-sine app above, and
# the other apps in this notebook use 8048 and 8049.
fr.show_dash(mode='inline', port=8092)