# Here we're making arrays holding metadata for idx conversion v4
## In particular we get:
- Timestamp of each datum
- Boolean indicating whether each datum was resampled from the 1041x381 grid to the 1081x381 grid
- Timestamp of each datum's last [WRF-ARW weather forecast initialization](https://firesmoke.ca/resources/BSC-2015ForecastScheduleV6.pdf)

## Import necessary libraries

In [1]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used for interacting with OS file system (to get directory file names)
import os

# Used for processing netCDF time data
import time
import datetime

# Used for numerical work
import numpy as np

# Used for loading data from pickle data
import pickle

# Accessory for generating progress bar to see progress of loops
from tqdm import tqdm

## Import sequence of IDX calls

In [2]:
# The pickle must first be copied into the working directory.
# It holds the ordered sequence of calls produced for the IDX conversion.
idx_calls_path = "idx_calls_v4.pkl"
with open(idx_calls_path, "rb") as pkl_file:
    idx_calls = pickle.load(pkl_file)

In [3]:
# Display the loaded sequence. Each entry is a list of
# [dataset ID, netCDF file name, datetime of the datum, timestep index within the file].
idx_calls

[['BSC12CA12-01',
  'dispersion_20210303.nc',
  datetime.datetime(2021, 3, 4, 0, 0),
  3],
 ['BSC12CA12-01',
  'dispersion_20210303.nc',
  datetime.datetime(2021, 3, 4, 1, 0),
  4],
 ['BSC12CA12-01',
  'dispersion_20210303.nc',
  datetime.datetime(2021, 3, 4, 2, 0),
  5],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 3, 0),
  0],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 4, 0),
  1],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 5, 0),
  2],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 6, 0),
  3],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 7, 0),
  4],
 ['BSC18CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 8, 0),
  5],
 ['BSC00CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 9, 0),
  0],
 ['BSC00CA12-01',
  'dispersion_20210304.nc',
  datetime.datetime(2021, 3, 4, 10, 0),
  1]

Location of all netCDF files

In [4]:
# Root directory of the netCDF files. Individual files are read from
# {firesmoke_dir}/{dataset ID}/{file name}.
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

## Step through all `idx_calls` and obtain metadata.

### Specifically we:
1. Get timestamps as saved in `idx_calls`.
2. Get grid size from original netCDF files.
3. Get the last forecast initialization time, derived from the dataset ID and file date stored in `idx_calls`.
---
These instructions could have been run during the IDX conversion. However, I decided to run them separately by stepping through the `idx_calls` sequence, so that the IDX conversion loop would not become bloated.

In [5]:
# build the tflag pair in the same style as the original metadata
def reverse_parse_tflag(full_datetime):
    '''
    Encode a datetime stamp as a UBC style tflag pair.
    :param datetime full_datetime: datetime object to encode
    :return: tuple (tflag0, tflag1) of ints, where tflag0 is year * 1000 + day of the year (YYYYDDD)
             and tflag1 is hour * 10000 + minute * 100 + second (HHMMSS)
    '''
    # date part: four digit year followed by the three digit day of the year
    date_part = full_datetime.year * 1000 + full_datetime.timetuple().tm_yday

    # time part: hour, minute and second packed two decimal digits each
    time_part = (full_datetime.hour * 100 + full_datetime.minute) * 100 + full_datetime.second

    return date_part, time_part

In [6]:
def get_last_timestamp(id_, file):
    '''
    Return datetime stamp of the last forecast initialization for the given dataset ID and file name.
    :param str id_: The dataset ID used, e.g. "BSC18CA12-01"
    :param str file: The name of the file, ending with its date as "YYYYMMDD.nc"
    :return: datetime of the forecast initialization; for an unrecognized dataset ID this is
             midnight of the file's date
    :raises ValueError: if the file name does not end with a parseable "YYYYMMDD.nc" date
    '''
    # use file's date (the 8 characters right before the ".nc" extension)
    date = datetime.datetime.strptime(file[-len('20210304.nc'):-len('.nc')], "%Y%m%d")

    # set hour of initialization based on dataset ID
    if id_ == "BSC18CA12-01":
        # A BSC18CA12-01 file dated 20210304 already holds data for 2021-03-04 03:00,
        # which is earlier than 18:00 on the file's own date, so the 18:00 initialization
        # it was produced from has to be the one on the previous day.
        # NOTE(review): confirm against the forecast schedule.
        date = (date - datetime.timedelta(days=1)).replace(hour=18)
    elif id_ == "BSC00CA12-01":
        date = date.replace(hour=0)
    elif id_ == "BSC06CA12-01":
        date = date.replace(hour=6)
    elif id_ == "BSC12CA12-01":
        date = date.replace(hour=12)
    return date

In [7]:
# the final array of UBC style tflags
tflags = []

# whether each datum comes from a file whose XORIG differs from -160.0, i.e. a file
# that is not already on the max lat/lon grid and therefore gets resampled
is_resampled = []

# hold the timestamp of when each timestamp saw its last forecast initialization
last_init = []

# hold the attributes for each file, TODO NEED TO CONFIRM WHAT THEY ARE?, SEEMS USEFUL ATTRS
attrs_desired = ['CDATE', 'CTIME', 'WDATE', 'WTIME', 'SDATE', 'STIME']

attr_data = {attr: [] for attr in attrs_desired}

# Per-file metadata of the most recently opened file. Consecutive calls usually refer
# to the same file, so it is only re-opened when the (dataset ID, file name) pair changes.
cached_key = None
cached_resampled = None
cached_attrs = None

# create array of UBC style tflags from our sequence of idx_calls
for call in tqdm(idx_calls):
    # get instructions from call, call looks like:
    # [curr_id, file_str, parse_tflag(ds['TFLAG'].values[tstep_idx][0]), tstep_idx]
    # (the timestep index is not needed for the metadata gathered here)
    curr_id, curr_file, curr_tflag = call[:3]

    if (curr_id, curr_file) != cached_key:
        # open the file with xarray; the context manager closes it again so that
        # file handles do not pile up over the thousands of iterations
        with xr.open_dataset(f'{firesmoke_dir}/{curr_id}/{curr_file}') as ds:
            cached_resampled = ds.XORIG != -160.0
            cached_attrs = {attr: ds.attrs[attr] for attr in attrs_desired}
        cached_key = (curr_id, curr_file)

    # record whether this datum's file is off the max lat/lon grid
    is_resampled.append(cached_resampled)

    # make UBC style tflag and append to our final array
    tflags.append(reverse_parse_tflag(curr_tflag))

    # get UBC style tflag of last forecast update time
    last_timestamp = get_last_timestamp(curr_id, curr_file)
    last_init.append(reverse_parse_tflag(last_timestamp))

    # save the attribute information to the arrays
    for attr in attrs_desired:
        attr_data[attr].append(cached_attrs[attr])

100%|██████████| 27357/27357 [05:10<00:00, 88.07it/s]


In [9]:
# Write every metadata list to its own .npy file in the working directory.
# First the three per-datum lists, then one file per collected netCDF attribute.
core_outputs = (
    ('firesmoke_v4-tflags.npy', tflags),
    ('firesmoke_v4-resamp.npy', is_resampled),
    ('firesmoke_v4-last_init.npy', last_init),
)
for out_name, values in core_outputs:
    np.save(out_name, values)

for attr in attrs_desired:
    np.save(f'firesmoke_v4-{attr}.npy', attr_data[attr])