# Here we're making arrays holding metadata for idx conversion v5
## In particular we get:
- Timestamp of each datum
- Boolean indicating whether each datum was resampled from the 1041x381 grid to the 1081x381 grid
- All non-grid NetCDF attributes (`ds.attrs`) from the dispersion.nc file that the current timestamp comes from (CDATE, CTIME, etc.)

## Import necessary libraries

In [1]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used for interacting with OS file system (to get directory file names)
import os

# Used for processing netCDF time data
import time
import datetime

# Used for numerical work
import numpy as np

# Used for loading data from pickle data
import pickle

# Accessory for generating progress bar to see progress of loops
from tqdm import tqdm

## Import sequence of IDX calls

In [2]:
# The .pkl file must first be copied into the working directory.
# NOTE: unpickling executes whatever the file encodes, so only load a
# idx_calls_v5.pkl that was produced by a trusted run of the conversion.
with open("idx_calls_v5.pkl", "rb") as pkl_handle:
    idx_calls = pickle.load(pkl_handle)

Location of all netCDF files

In [3]:
# location of netCDF files: every file name opened below is joined onto
# this directory as f'{firesmoke_dir}/<file name>'
firesmoke_dir = "/opt/wired-data/firesmoke/final_union_set"

## Step through all `idx_calls` and obtain metadata.

### Specifically we:
1. Get timestamps as saved in `idx_calls`.
2. Get grid size from original netCDF files.
3. Get last initialized time as shown in `idx_calls`.
---
These instructions could have been run during the IDX conversion. However, I decided to run them separately by stepping through the `idx_calls` sequence, so that the IDX conversion loop would not become bloated.

In [4]:
# obtaining the preferred tflag based on original metadata
def reverse_parse_tflag(full_datetime):
    '''
    Build the UBC style tflag pair for a datetime stamp.

    :param datetime full_datetime: datetime object carrying the year, day, hour, minute, and second
    :return: tuple (tflag0, tflag1); tflag0 packs the year and day of the year as YYYYDDD,
             tflag1 packs the hour, minute, and second as HHMMSS
    '''
    # tm_yday is the 1-based day of the year
    ordinal_day = full_datetime.timetuple().tm_yday
    date_flag = full_datetime.year * 1000 + ordinal_day

    # shift hours and minutes left by two decimal digits each
    time_flag = (full_datetime.hour * 100 + full_datetime.minute) * 100 + full_datetime.second

    return date_flag, time_flag

In [5]:
# the final array of n UBC style tflags, one (tflag0, tflag1) tuple per idx call
tflags = []

# whether datapoint at n'th timestep is resampled or not
is_resampled = []

# XORIG of the max lat/lon grid; a file with any other XORIG is marked as resampled
# TODO: this is currently hard-coded.. so fragile
MAX_GRID_XORIG = -160.0

# Use one reference file to decide which attribute names get recorded.
# Opened in a `with` block so the file handle is released as soon as the
# attribute names have been copied out.
with xr.open_dataset(f'{firesmoke_dir}/dispersion_2025316_154209.nc') as ds_attr:
    attrs_desired = list(ds_attr.attrs)

# Only keep attrs_desired that are NOT grid attributes
grid_attrs = {'XORIG', 'YORIG', 'XCELL', 'YCELL', 'NCOLS', 'NROWS', 'XCENT', 'YCENT'}
attrs_desired_non_grid = [attr for attr in attrs_desired if attr not in grid_attrs]

# one list per recorded attribute, filled in the same order as idx_calls
attr_data = {attr: [] for attr in attrs_desired_non_grid}

# create array of UBC style tflags from our sequence of idx_calls
for call in tqdm(idx_calls):
    # get instructions from call:
    # [file name to open, timestamp, TSTEP index to select]
    # (the TSTEP index is not needed to collect metadata)
    file_name = call[0]
    timestamp = call[1]

    # Open the file with xarray. The context manager closes it at the end of
    # the iteration; without it every one of the opened files stays open
    # until garbage collection, which can exhaust file descriptors.
    with xr.open_dataset(f'{firesmoke_dir}/{file_name}') as ds:
        # indicate we resample data if not already on max lat/lon grid
        is_resampled.append(ds.XORIG != MAX_GRID_XORIG)

        # make UBC style tflag and append to our final array
        tflags.append(reverse_parse_tflag(timestamp))

        # save the non-grid attribute information to the arrays;
        # None marks an attribute that this file does not carry
        for attr in attrs_desired_non_grid:
            attr_data[attr].append(ds.attrs.get(attr))


100%|██████████| 39546/39546 [03:04<00:00, 214.70it/s]


In [6]:
# Write each metadata array to its own .npy file: first the tflags, then the
# resampled flags, then one file per recorded non-grid attribute.
arrays_to_save = [
    ('firesmoke_v5-tflags.npy', tflags),
    ('firesmoke_v5-resamp.npy', is_resampled),
]
arrays_to_save.extend(
    (f'firesmoke_v5-{attr}.npy', attr_data[attr]) for attr in attrs_desired_non_grid
)

for out_path, values in arrays_to_save:
    np.save(out_path, values)