# Tabulate the flu data with sliding windows
After the data is [preprocessed](preprocess_flu_data.ipynb),
the resulting `.json` file is processed further and turned into tabular `.csv` files, which are then saved. This is done with a sliding-window technique: the user specifies the width of the sliding
window, which emulates cross-sectional variables, by setting the `LAG` constant below.

In [1]:
import itertools
import json
import numpy as np
import pandas as pd

In [2]:
def tabulate_array(series, lag):
    """Return every length-`lag` sliding window of `series` as the rows of an array.

    Row k holds ``series[k:k+lag]``, so a series of length n produces
    ``n - lag + 1`` rows of `lag` columns each. When `lag` is larger than
    ``len(series)`` no window fits and the result is an empty array.

    Args:
        series: A sliceable sequence (list or 1-D array) with a length.
        lag: Width of the sliding window, i.e. the number of columns.

    Returns:
        A numpy array built from the list of windows.
    """
    # `range` is used instead of the Python 2-only `xrange` so the function
    # runs unchanged on both Python 2 and Python 3.
    windows = [series[end - lag:end] for end in range(lag, len(series) + 1)]
    return np.asarray(windows)

In [3]:
def tabulate_state(state, lag, censor=None):
    """Build the sliding-window table of tweets, vaccinations and ILI for one state.

    Loads the aggregate JSON file, picks the entry for `state`, and turns its
    three series into windows of width `lag` that are stacked side by side.

    Args:
        state: Key of the state inside the aggregate JSON file.
        lag: Width of the sliding window; each series contributes `lag` columns.
        censor: Optional index (or list of indices) into the ILI series, before
            its first entry is dropped, whose values are overwritten with the
            sentinel -1e7. Skipped when falsy.

    Returns:
        A pandas DataFrame with ``3 * lag`` columns named ``tweets_t<i>``,
        ``vaccs_t<i>`` and ``ili_t<i>`` for ``i`` in ``0 .. lag-1``, one row
        per window.

    Raises:
        AssertionError: If the series lengths are inconsistent.
    """
    with open('fludata/generated/Flu_Vacc_ILI_State_Aggregate.json', 'r') as f:
        data = json.load(f)[state]
    # Each entry of a series is indexable; position 1 holds the value.
    tweets = np.asarray([v[1] for v in data['No. of Tweets']])
    vaccs = np.asarray([v[1] for v in data['Vaccination percentage % diff']])
    ili = np.asarray([v[1] for v in data['ILI Rate']])
    # Censor values using the sentinel -1e7, so that censored entries do not
    # collide with nan (genuinely missing data).
    if censor:
        ili[censor] = -1e7
    # vaccs must have one less entry, since it is a first difference.
    assert len(tweets) == len(ili) == len(vaccs)+1
    # Tabulate the series. The first tweet/ili entry is dropped so that all
    # three series line up with the (shorter) differenced vaccs series.
    #  tweet: raw number.
    #  ili: from percentage to basis points.
    #  vaccs: raw fraction to basis points.
    tabular_tweets = tabulate_array(tweets[1:], lag)
    tabular_vaccs = tabulate_array(1e4 * vaccs, lag)
    tabular_ili = tabulate_array(1e2 * ili[1:], lag)
    assert len(tabular_tweets) == len(tabular_vaccs) == len(tabular_ili)
    # Create an overall matrix by column stacking.
    overall_raw = np.column_stack((tabular_tweets, tabular_vaccs, tabular_ili))
    # NOTE: dropping rows with too many missing values is currently disabled
    # (see the commented-out lines); every window is kept.
    # keep = [np.sum(np.isnan(overall_raw), axis=1) < 5]
    # overall = overall_raw[keep]
    overall = overall_raw
    # Return a dataframe. `range` replaces the Python 2-only `xrange` so the
    # function also runs on Python 3.
    columns = list(itertools.chain.from_iterable([
        ['%s_t%d' % (c, i) for i in range(lag)] for c in ['tweets', 'vaccs', 'ili']
    ]))
    return pd.DataFrame(overall, columns=columns)

In [4]:
def get_filename(state, lag, censor):
    """Return the output CSV path for a state's tabulated data.

    The path encodes the state, the window width `lag`, and carries a
    ``_censor`` suffix whenever `censor` is truthy.
    """
    if censor:
        suffix = '_censor'
    else:
        suffix = ''
    template = 'fludata/generated/tabulated_flu_vacc_ili_%s_lag%d%s.csv'
    return template % (state, lag, suffix)

In [5]:
# Run configuration: window width, the ILI indices to censor, and the
# states that have ILI data in the aggregate file.
LAG = 5
CENSOR = [10, 11, 12, 13, 24, 25, 26, 27, 37, 38, 39]
STATES = ['MA', 'MS', 'NC', 'RI', 'TX', 'TN']

# Tabulate every state first, then write one CSV per state.
TABLES = [tabulate_state(s, lag=LAG, censor=CENSOR) for s in STATES]
for state, table in zip(STATES, TABLES):
    destination = get_filename(state, LAG, CENSOR)
    with open(destination, 'w') as f:
        table.to_csv(f, index=False)