In [74]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [75]:
# Build the PostgreSQL connection URL from the PGUSER / PGPASSWORD / PGHOST /
# PGDATABASE environment variables (a missing variable raises KeyError).
# The user name and password are percent-encoded so that reserved characters
# such as '@', ':', '/' or '%' cannot corrupt the URL; safe='' makes quote()
# encode '/' as well.
engine = create_engine('postgresql://{}:{}@{}/{}'.format(
                quote(os.environ['PGUSER'], safe=''),
                quote(os.environ['PGPASSWORD'], safe=''),
                os.environ['PGHOST'], os.environ['PGDATABASE']))

In [76]:
# Project-local feature module; `db` is the FeatureStorage handle used further
# down to look up the trace cells of individual examples.
import features
db = features.FeatureStorage()

# Features Validation

The purpose of this notebook is to sanity-check the feature generation code.

When run with the default parameters, `features.py` produces the following tables:

In [77]:
# Fully qualified names of the feature tables checked in this notebook: each
# bare table name below is prefixed with the `features` schema.
feature_tables = ['features.{}'.format(name) for name in (
    'burst_binned_lengths',
    'burst_length_aggregates',
    'burst_lengths',
    'cell_numbers',
    'cell_ordering',
    'cell_ordering_differences',
    'cell_timings',
    'initial_cell_directions',
    'interpacket_timings',
    'size_30_windows',
)]

# Check number of rows 

Get the list of exampleids from the `raw.frontpage_examples` table:

In [78]:
# Reference set of example ids: every exampleid in raw.frontpage_examples.
master_indices = pd.read_sql("select exampleid from raw.frontpage_examples", engine)

In [79]:
# Number of rows returned from raw.frontpage_examples.
len(master_indices)

1332

Now let's get the list of exampleids from each table and see if they're consistent, and if not, why not:

In [80]:
def to_list(x):
    """Return the values of ``x.exampleid`` as a plain Python list.

    ``x`` is any object exposing an ``exampleid`` attribute with a
    ``.values`` array (e.g. a DataFrame with an ``exampleid`` column).
    Order and duplicates are preserved.
    """
    example_ids = x.exampleid.values
    return [example_id for example_id in example_ids]

In [81]:
# Map each feature table name to a DataFrame holding that table's exampleid
# column, queried in the order the tables are listed in feature_tables.
feature_indices = {
    table_name: pd.read_sql("select exampleid from {}".format(table_name), engine)
    for table_name in feature_tables
}

In [91]:
# Print one "<table name> <row count>" line per feature table.
for table_name in feature_tables:
    row_count = len(feature_indices[table_name])
    print(table_name, row_count)

features.burst_binned_lengths 1330
features.burst_length_aggregates 1330
features.burst_lengths 1330
features.cell_numbers 1330
features.cell_ordering 1330
features.cell_ordering_differences 1330
features.cell_timings 1330
features.initial_cell_directions 1330
features.interpacket_timings 1330
features.size_30_windows 1330


In [92]:
# Example ids present in raw.frontpage_examples but absent from the
# features.burst_binned_lengths table.
set(to_list(master_indices)) - set(to_list(feature_indices['features.burst_binned_lengths']))

{23, 1340}

What's this then? Examples that do not appear in our feature tables? 

In [84]:
# Look up the trace cells of example 23, one of the two ids missing from the feature tables.
db.get_trace_cells(23)

Unnamed: 0,ingoing,t_trace


In [85]:
# Look up the trace cells of example 1340, the other id missing from the feature tables.
db.get_trace_cells(1340)

Unnamed: 0,ingoing,t_trace


It turns out this is by design. Our feature generation queries filter out those demon examples that snuck into our raw data table even though they have no cells in the recorded trace.

This is how we get all features for all examples:

In [93]:
# Load every column of features.frontpage_features into a single DataFrame.
df_features = pd.read_sql("select * from features.frontpage_features", engine)