# Notebook goal
Evaluate whether xarray is a good way to store our data.
Its main appeal is that it handles multidimensional arrays
while retaining metadata.

# Notebook conclusions
This looks pretty good. We can store both deaths and confirmed cases in the
xr.Dataset structure, which is effectively a labeled 3D array (variable,
time, county). In addition, we can attach any metadata we would like from
the separate ancillary files.

In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
from src.utils.paths import get_parent_dir

In [2]:
# Root directory used below when building the data paths.
# NOTE(review): presumably the argument is the number of directory levels
# to walk up from the current location -- confirm in src.utils.paths.
pdir = get_parent_dir(2)

def read_csse2df(path):
    """
    Load a CSSE time series .csv file into a date-indexed DataFrame.

    Parameters
    ----------
    path : str
        Location of the .csv file to read.

    Returns
    -------
    df : pd.DataFrame
        Contents of the file. The first column of the file becomes the
        index and is converted to datetimes; the remaining columns are
        kept as read (county FIPS codes, per the original notes).
    """
    # TODO: replace usage with class above, then delete
    frame = pd.read_csv(path, index_col=[0])
    # The index is read as plain values; convert it to datetimes here.
    frame.index = pd.to_datetime(frame.index)
    return frame

Read CSSE timeseries data

In [3]:
# Directory that holds the processed US CSSE time series files.
csse_dir = os.path.join(pdir, 'data', 'processed', 'csse', 'US')

# Confirmed cases: build the path, load the table, label the index.
fname_confirmed = "time_series_covid19_confirmed_US_timeseries.csv"
path_confirmed = os.path.join(csse_dir, fname_confirmed)
ts_confirmed = read_csse2df(path_confirmed)
ts_confirmed.index.name = 'time'

# Deaths: same steps for the second file.
fname_deaths = "time_series_covid19_deaths_US_timeseries.csv"
path_deaths = os.path.join(csse_dir, fname_deaths)
ts_deaths = read_csse2df(path_deaths)
ts_deaths.index.name = 'time'

Cheap temporary solution for renaming the problematic "Unnamed: ..." columns.

In [4]:
def _unnamed_columns_to_int(df):
    """
    Replace "Unnamed: N" column labels with the integer N and make all
    column labels numeric.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose column labels are strings, some of which contain
        "Unnamed" followed by a whitespace-separated number.

    Returns
    -------
    pd.DataFrame
        New frame with the same data; every column label has been passed
        through ``pd.to_numeric(..., downcast='integer')``.

    Raises
    ------
    IndexError
        If an "Unnamed" label carries no purely numeric token.
    """
    renames = {}
    for label in df.columns:
        if "Unnamed" not in label:
            continue
        # e.g. "Unnamed: 5" -> tokens ["Unnamed:", "5"] -> 5
        numbers = [int(tok) for tok in label.split() if tok.isdigit()]
        renames[label] = numbers[0]

    renamed = df.rename(columns=renames)
    # The remaining labels are still strings; convert everything at once.
    renamed.columns = pd.to_numeric(renamed.columns, downcast='integer')
    return renamed


# Apply the same clean-up to both tables so their columns stay comparable.
ts_confirmed = _unnamed_columns_to_int(ts_confirmed)
ts_deaths = _unnamed_columns_to_int(ts_deaths)

In [5]:
# diffs in cols or index?
# Index.equals compares elements and order and simply returns False when
# the lengths differ, whereas an element-wise `==` raises in that case.
# It also avoids np.alltrue, which was removed in NumPy 2.0.
assert ts_confirmed.columns.equals(ts_deaths.columns), \
    "confirmed and deaths tables have different columns"
assert ts_confirmed.index.equals(ts_deaths.index), \
    "confirmed and deaths tables have different time indices"

# create coord vars (taken from the confirmed table; the checks above
# guarantee the deaths table matches)
times = ts_confirmed.index.values
locs = ts_confirmed.columns.values

# create dims: rows are time steps, columns are counties
dims = ['time', 'county']

Let's try a DataArray object.

In [6]:
# Wrap the confirmed-case values in a labeled array whose axes are named
# by `dims` and indexed by the time and county coordinate values.
da = xr.DataArray(
    ts_confirmed.values,
    coords=[times, locs],
    dims=dims,
)
print(da)

<xarray.DataArray (time: 111, county: 3252)>
array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0, 151,  16, ...,   0,   0, 103],
       [  0, 151,  16, ...,   0,   0, 103],
       [  0, 151,  19, ...,   0,   0, 103]])
Coordinates:
  * time     (time) datetime64[ns] 2020-01-22 2020-01-23 ... 2020-05-11
  * county   (county) int64 60 66 69 72 78 ... 90053 90054 90055 90056 99999


Let's try a Dataset object.

In [7]:
# Each variable is a (dims, values) pair sharing the same two dimensions.
data_vars = {
    var_name: (dims, table.values)
    for var_name, table in (('confirmed', ts_confirmed),
                            ('deaths', ts_deaths))
}
# Coordinate values for the two shared dimensions.
coords = dict(time=times, county=locs)

ds = xr.Dataset(data_vars=data_vars, coords=coords)

In [14]:
# Convert the deaths variable to a DataFrame and move its index levels
# into ordinary columns.
deaths_da = ds['deaths']
df_deaths = deaths_da.to_dataframe().reset_index()

In [18]:
# Write the deaths table to the working directory in two formats.
feather_target = "test.feather"
pickle_target = "test.pkl"
df_deaths.to_feather(feather_target)
df_deaths.to_pickle(pickle_target)