In [None]:
import itertools

import numpy as np
import pandas as pd

from IPython.display import display_html

from engine import query_df
from split_query.expressions import Float
from interface import DataSet

In [None]:
class StaticDataFrameBackend(object):
    ''' Backend that answers every query from a single stored dataframe. '''

    def __init__(self, df):
        ''' Keep a reference to the dataframe that all queries run against. '''
        self.df = df

    def query(self, expr):
        ''' Evaluate expr against the stored dataframe through query_df and
        hand back whatever query_df produces. '''
        result = query_df(self.df, expr)
        return result

    def estimate_count(self, expr):
        ''' Return the size of the first dimension of the query result (the
        row count when query_df yields a dataframe).

        Cheating here: the number comes from actually running the query. The
        idea is to have a custom estimate based on the provided expression
        (e.g. from known properties of time series data). '''
        matched = query_df(self.df, expr)
        row_count = matched.shape[0]
        return row_count

# Stand-in for a real backend: every query is answered from an in-memory grid
# holding each integer (x, y) pair with 0 <= x, y < 10.
grid_points = list(itertools.product(range(10), repeat=2))
backend = StaticDataFrameBackend(
    pd.DataFrame(grid_points, columns=['x', 'y']))

# Interface object: filters like a dataframe.
attributes = list(map(Float, ['x', 'y']))
dataset = DataSet('My dataset', attributes, backend)

# Querying returns a new object; the assert checks that the expression of the
# filtered result differs from the expression of the original dataset.
x_filtered = dataset[dataset.x < 3]
filtered = x_filtered[(dataset.y < 2) | (dataset.y >= 8)]
assert dataset.expr != filtered.expr

In [None]:
# The get() method retrieves the actual data as a dataframe, which can be
# operated on straight away. The alternative would be to call get()
# automagically whenever a function is invoked and apply that function to the
# resulting dataframe, but that is probably a bad idea where large remote
# datasets are concerned.
display_html(dataset)
full_frame = dataset.get()
full_frame.sum()

In [None]:
# Same steps for the filtered view: show it, fetch its data, then sum it.
display_html(filtered)
filtered_frame = filtered.get()
filtered_frame.sum()

In [None]:
# Bind the narrowed view once so the object displayed and the data fetched
# come from the same query. x > 5 is applied on top of the earlier x < 3
# filter, and the two conditions cannot both hold, so this presumably
# demonstrates an empty result - TODO confirm.
contradictory = filtered[filtered.x > 5]
display_html(contradictory)
contradictory.get()