In [None]:
import vaex
# Open the dataset; the path is relative, so it resolves against the kernel's
# working directory.
df = vaex.open('data.hdf5')

In [None]:
import pandas as pd
# Build a pandas frame with "x" and "y" columns from the "xy" column of df.
# assumes df["xy"] converts to an (n, 2) array, one row per point — TODO confirm
x_and_y = pd.DataFrame(df["xy"].to_numpy(), columns=["x", "y"])

In [None]:
# Attach the "gold" column as a pandas Categorical; the datashader aggregation
# further down groups by this column.
x_and_y["gold"] = pd.Categorical(df["gold"].tolist())

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

# Categorical aggregation: count the points per canvas bin, split by "gold".
# Commented-out variant using ds.count_cat, kept for reference:
# agg = ds.Canvas().points(x_and_y, "x", "y", agg=ds.count_cat(column="gold"))
agg = ds.Canvas().points(x_and_y, 'x', 'y', ds.by('gold', ds.count()))

# agg shape 600 x 600 x 2 (as observed by the author for this dataset)

# Transformations
# NOTE(review): the commented-out line below uses np, which is not imported
# until a later cell.
#tf.Images(tf.shade(agg.where(agg>=np.percentile(agg,99)), name="99th Percentile"))

# Colormap: shade with the default color key; the image is the cell output.
tf.shade(agg, name="Default color key")

In [None]:
# A custom color key can also be supplied: here key 0 maps to green and key 1
# to orange.
# NOTE(review): presumably the keys must match the categories of "gold" — confirm.
image = tf.shade(agg, name="Custom Color Map", color_key={0: "green", 1: "orange"})

In [None]:
# output of tf.shade is an image?

# tf.shade takes as input the counts per pixel and figures out
# how to properly map them to colors
# 1) masks out the background
# 2) transform the bin values - i.e. linear (min-max = 0-1) would 
# be a poor choice, log sometimes works, eq_hist makes sure that all
# values in the colormap are used: "each equal-sized histogram bin to ensure even usage of every displayable color"
# 3) map - simply go from transformed range to the actual colormap
# range. If a colormap is used, masked values are given a fully 
# transparent alpha value, and non-masked ones are given a fully 
# opaque alpha value. If a single color is used, the alpha value 
# starts at min_alpha and increases proportionally to the mapped data 
# value up to the full alpha value. see plotting pitfalls for more info

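# Equal-sized histograms (eq_hist): why does this formula work?
# new_value = (L − 1) * cdf_of_old_normalized_histogram(x), where L is the number of output levels.
# Mapping each value through the CDF of its own distribution gives values that are
# (approximately) uniformly distributed, so every output level is used about equally often.
# eq_hist is somehow not yet supported by the plotly backend?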


# For categorical aggregates, the shade function works similarly
# to providing a single color to a non-categorical aggregate,
# with the alpha (opacity) calculated from the total value across
# all categories (and the color calculated as a weighted mixture of
# the colors for each category).


In [None]:
# Spreading - turns square bins into circles

# NOTE(review): px=0 presumably requests zero pixels of spreading, which would
# explain why the result looks unchanged — confirm against the datashader docs.
tf.shade(tf.spread(agg, px=0)) # From a high level this doesn't seem to do anything

In [None]:
# Putting it together with plotly

# Shade the spread aggregate into an image. This rebinds `image`, replacing the
# custom-color-key image assigned in an earlier cell.
image = tf.shade(tf.spread(agg, px=0))

In [None]:
# Convert the shaded image to a PIL image; later cells pass it to px.imshow
# and np.array.
pil_image = image.to_pil()

In [None]:
import plotly.express as px
# Render the PIL image as a plotly figure (shown as the cell output).
px.imshow(pil_image)

In [None]:
import holoviews as hv
from holoviews.operation.datashader import datashade

# Load the plotly backend for HoloViews.
hv.extension("plotly")

In [None]:
# Datashade the x_and_y frame wrapped as a Points element; the result is the
# cell's displayed output.
datashade(hv.Points(x_and_y)) # Points is recommended as opposed to scatter
# because then there isn't a concept of y being dependent on x

In [None]:
# Down below I use hv.Scatter, which should probably be changed to hv.Points;
# in terms of functionality they seem pretty interchangeable

In [None]:
import numpy as np
# Wrap the PIL image, converted to a numpy array, in a HoloViews RGB element.
hv.RGB(np.array(pil_image))

In [None]:
# Interactive version using HoloViews' datashade operation

# This should theoretically work but it doesn't? Datashader lets you do this.
# NOTE(review): the aggregator is passed here as the string "count_cat" rather
# than a datashader reduction object such as ds.count_cat("gold") — worth
# checking whether that is the cause.
datashade(hv.Points(x_and_y, vdims=["gold"]), aggregator="count_cat") # is this vdim useful?
# the default aggregator is count


In [None]:
# Wrap the frame in a HoloViews Dataset object (which might help with finding
# the selected points later?). The Scatter elements in later cells are built
# from this object.
dataset = hv.Dataset(x_and_y)

In [None]:
# Holoviews doesn't actually operate on the data, but rather is
# a container for data + metadata that chains together operations
# until it's time to execute them
# (datashade is re-imported here; it was already imported in an earlier cell.)
from holoviews.operation.datashader import datashade, shade, dynspread, spread, rasterize


# rasterize() uses Datashader to render the data into what is by default
# a 2D histogram, where every array cell counts the data points falling
# into that pixel. Bokeh then colormaps that array, turning each cell
# into a pixel in an image.

# Instead of having Bokeh do the colormapping, you can instruct
# Datashader to do so, by wrapping the output of rasterize() in a call
# to shade(), where shade() is Datashader’s colormapping function.
# The datashade() operation is also provided as a simple macro, where
# datashade(x) is equivalent to shade(rasterize(x)):
# NOTE(review): the text above talks about Bokeh, but this notebook loaded the
# plotly extension — confirm the description carries over.

# Datashade a Scatter built from the Dataset ("x" as key dimension, "y" as
# value dimension) and display it.
scatter = datashade(hv.Scatter(dataset, kdims="x", vdims="y"))
scatter

In [None]:
# Some more useful info: https://examples.pyviz.org/nyc_taxi/nyc_taxi.html

In [None]:
# Rasterize the scatter, request the "hover" tool on the result, then apply
# dynspread with max_px=40.
# NOTE(review): tools=["hover"] looks like a Bokeh-backend option — confirm it
# is accepted under the plotly extension loaded above.
dynspread(rasterize(hv.Scatter(dataset, kdims="x", vdims="y")).opts(tools=["hover"]), max_px=40)

In [None]:
# this also doesn't work
# a = shade(hv.Scatter(dataset, kdims="x", vdims="y"), color_key={0: "green", 1: "orange"})

In [None]:
# Show the HoloViews help for hv.Points (exploration aid; safe to delete once
# the notebook is finalized).
hv.help(hv.Points)

In [4]:
!pip install -q plotly

Collecting plotly
  Using cached plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
Collecting tenacity>=6.2.0
  Using cached tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.5.0 tenacity-8.0.1


In [13]:
import vaex
import plotly.express as px
import pandas as pd
import warnings

# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" filter also hides deprecation warnings from
# the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")


def get_opacity(len_input):
    """Return a marker opacity that falls linearly as the point count grows.

    Maps ``len_input`` from the range [0, 500_000] onto [0.8, 0.2]: 0 points
    gives 0.8 and 500_000 points gives 0.2. Inputs outside that range are
    clamped to it, so the result always lies in [0.2, 0.8].

    Args:
        len_input: Number of points that will be drawn.

    Returns:
        float: Opacity between 0.2 and 0.8 inclusive.
    """
    # Clamp first: without this, counts above 500_000 extrapolate below 0.2
    # and eventually below zero, which is not a valid opacity.
    fraction = min(max(len_input, 0), 500_000) / 500_000
    return ((1 - fraction) * (0.8 - 0.2)) + 0.2


def get_size(len_input):
    """Return a marker size that falls linearly as the point count grows.

    Maps ``len_input`` from the range [0, 500_000] onto [3, 1]: 0 points gives
    size 3 and 500_000 points gives size 1. Inputs outside that range are
    clamped to it, so the result always lies in [1, 3].

    Args:
        len_input: Number of points that will be drawn.

    Returns:
        float: Marker size between 1 and 3 inclusive.
    """
    # Clamp first: without this, counts above 500_000 produce sizes below 1
    # and eventually negative sizes.
    fraction = min(max(len_input, 0), 500_000) / 500_000
    return ((1 - fraction) * (3 - 1)) + 1


def get_viz(data, color_by="pred"):
    """Build, show and return a plotly scatter of the points in ``data``.

    Args:
        data: pandas DataFrame with "x", "y", "gold", "pred", "text_sample"
            and "data_error_potential" columns (all are referenced below).
            The caller's frame is left unmodified.
        color_by: Column that drives the point color. For "pred" or "gold" the
            points are colored by a categorical copy of that column named
            "<color_by>_ind" with plotly's default colors; any other column is
            used as-is together with the easy/hard/boundary color mapping.

    Returns:
        The plotly figure, after ``fig.show`` has been called on it.
    """
    if color_by in ("pred", "gold"):
        color = f"{color_by}_ind"
        # assign() returns a new frame, so the categorical helper column is
        # not written back into the DataFrame the caller passed in.
        data = data.assign(**{color: pd.Categorical(data[color_by].values)})
        mapping = None
    else:
        color = color_by
        mapping = {"easy": "green", "hard": "red", "boundary": "grey"}

    print(color, mapping)
    print(data[color].unique())
    fig = px.scatter(
        data, x="x", y="y", color=color,
        hover_data=['gold', "pred", 'text_sample', "data_error_potential"],
        color_discrete_map=mapping,
    )

    # Marker size and opacity both shrink as the number of points grows.
    fig.update_traces(marker=dict(size=get_size(len(data)), opacity=get_opacity(len(data))),
                      selector=dict(mode='markers'))

    # Transparent plot and paper backgrounds.
    fig.update_layout({
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    # Hide both axes entirely.
    fig.update_yaxes(matches=None, showticklabels=False, visible=False)
    fig.update_xaxes(matches=None, showticklabels=False, visible=False)

    config = {'scrollZoom': True}

    fig.show(config=config)
    return fig
    
    
def get_hard_easy(df):
    """Derive the "hard" and "easy" thresholds for data_error_potential.

    The easy threshold is the column mean; the hard threshold is the mean plus
    one standard deviation.

    Returns:
        tuple: ``(hard, easy)`` — note that hard comes first.
    """
    mean_dep = df["data_error_potential"].mean()
    hard_threshold = mean_dep + df["data_error_potential"].std()
    return hard_threshold, mean_dep

def format_df(df, num_samples=500_000):
    """Turn the raw frame into a pandas frame ready for plotting.

    Works on a copy of ``df``: splits "xy" into "x" and "y", adds a 50-character
    "text_sample" taken from "text", keeps at most ``num_samples`` rows (never
    more than 500_000), and replaces the "data_error_potential" values with the
    labels "hard", "easy" or "boundary". Categorical "gold_ind" and "pred_ind"
    columns are added as well.

    Args:
        df: Source frame; must offer ``copy()``, column slicing and
            ``to_pandas_df()`` (the notebook passes the result of vaex.open).
        num_samples: Upper bound on the number of rows to keep.

    Returns:
        pandas DataFrame with the columns described above.
    """
    row_cap = min(num_samples, 500_000)

    frame = df.copy()
    frame["x"] = frame["xy"][:,0]
    frame["y"] = frame["xy"][:,1]
    frame["text_sample"] = frame["text"].str.slice(start=0,stop=50)

    # Thresholds are computed on the full frame, before rows are dropped.
    hard, easy = get_hard_easy(frame)

    def label_for(dep):
        # "hard" is checked first, so it wins if both comparisons hold.
        if dep >= hard:
            return "hard"
        if dep <= easy:
            return "easy"
        return "boundary"

    wanted = ["id","x","y","gold","pred", "text_sample", "data_error_potential"]
    pdf = frame[wanted][:row_cap].to_pandas_df()
    pdf["gold_ind"] = pd.Categorical(pdf["gold"].values)
    pdf["data_error_potential"] = pdf["data_error_potential"].apply(label_for)
    pdf["pred_ind"] = pd.Categorical(pdf["pred"].values)
    return pdf

In [14]:
import vaex
from minio import Minio
# The commented-out lines below download the run's data.hdf5 from object
# storage into trec6.hdf5; with them disabled, the file must already exist
# locally.
# NOTE(review): the access key and secret were hardcoded in the commented-out
# Minio(...) call; they are redacted here — read them from the environment.
# !rm trec6.hdf5
# client = Minio("data.dev.rungalileo.io", "<access-key>", "<secret-key>")
# file = "3c73b131-67b5-428c-b5bb-ca9a09f7f560/680eea57-7e71-4f12-94da-9cac62f572b2/training/data/data.hdf5"  # trec6
# # file = "3c73b131-67b5-428c-b5bb-ca9a09f7f560/8fab3177-ac3d-4200-bb18-b8e5ef38c0fc/training/data/data.hdf5"  # conv_intent
# client.fget_object('galileo-project-runs-results',file, 'trec6.hdf5')
df = vaex.open('trec6.hdf5')
# Format with the default row cap, then plot colored by the
# easy/hard/boundary label.
pdf = format_df(df)
fig = get_viz(pdf, "data_error_potential")


data_error_potential {'easy': 'green', 'hard': 'red', 'boundary': 'grey'}
['easy' 'boundary' 'hard']


In [None]:
# NOTE(review): absolute, user-specific path — it will not resolve on other
# machines. This cell also differs from the next two only in the row cap.
df = vaex.open('/Users/benepstein/Downloads/data_for_nikita.hdf5')
# df = vaex.open('/Users/benepstein/Downloads/data.hdf5')
# Keep up to 500_000 rows and plot them colored by the easy/hard/boundary label.
pdf = format_df(df, 500_000)
fig = get_viz(pdf, "data_error_potential")

In [None]:
# NOTE(review): absolute, user-specific path — it will not resolve on other
# machines.
df = vaex.open('/Users/benepstein/Downloads/data_for_nikita.hdf5')
# df = vaex.open('/Users/benepstein/Downloads/data.hdf5')
# Keep up to 50_000 rows and plot them colored by the easy/hard/boundary label.
pdf = format_df(df, 50_000)
fig = get_viz(pdf, "data_error_potential")

In [None]:
# NOTE(review): absolute, user-specific path — it will not resolve on other
# machines.
df = vaex.open('/Users/benepstein/Downloads/data_for_nikita.hdf5')
# df = vaex.open('/Users/benepstein/Downloads/data.hdf5')
# Keep up to 25_000 rows and plot them colored by the easy/hard/boundary label.
pdf = format_df(df, 25_000)
fig = get_viz(pdf, "data_error_potential")

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px


# Synthetic stand-in data: 50_000 points with normally distributed x and y and
# a random categorical column "c".
# NOTE(review): no random seed is set, so the data differ on every run.
some_data = pd.DataFrame({"id":list(range(50_000)), "x":np.random.normal(size=50_000), "y":np.random.normal(size=50_000)})
some_data["c"] = pd.Categorical(np.random.randint(low=0,high=2,size=50_000))


fig = px.scatter(some_data, x="x", y="y", color="c")

# Fixed marker size and opacity (no scaling with the number of points here).
fig.update_traces(marker=dict(size=3, opacity=0.5),
                  selector=dict(mode='markers'))

# Transparent plot and paper backgrounds.
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

# Hide both axes entirely.
fig.update_yaxes(matches=None, showticklabels=False, visible=False)
fig.update_xaxes(matches=None, showticklabels=False, visible=False)


config = dict({'scrollZoom': True})

fig.show(config=config)

In [None]:
# Scratch check of the category generator used in the synthetic-data cell.
# The whole 50_000-element array becomes the cell output.
np.random.randint(low=0,high=2,size=50_000)

In [None]:
import numpy as np

# data = pd.DataFrame({"id":list(range(50_000)), "x":np.random.rand(50_000), "y":np.random.rand(50_000)})
# data
# Scratch check: length of a uniform sample drawn with size=50_000.
len(np.random.uniform(size=50_000))

In [None]:
# NOTE(review): nifty_data is not defined in any cell visible here, so this
# cell fails on a fresh kernel unless it is created elsewhere.
# The trailing ";" suppresses the repr of the returned object.
nifty_data.plot(kind='scatter',
        x='NIFTY FMCG index', 
        y='NIFTY Bank index',
        title = 'Scatter Plot for NIFTY Index values in 2020',
        figsize=(10,6));

In [None]:
import pandas as pd
import pandas_bokeh
# Send pandas_bokeh output to the notebook, then draw the same scatter through
# the plot_bokeh accessor.
# NOTE(review): nifty_data is not defined in any cell visible here.
pandas_bokeh.output_notebook()
nifty_data.plot_bokeh.scatter(x='NIFTY FMCG index', y='NIFTY Bank index');

In [None]:
import vaex
# Open the file and display the resulting frame as the cell output; nothing is
# assigned.
# NOTE(review): absolute, user-specific path — it will not resolve on other
# machines.
vaex.open("/Users/benepstein/Downloads/data.hdf5")