# Firesmoke Data Conversion to IDX using OpenVisus

## Import necessary libraries, install them if you do not have them. This was developed in Python 3.9

In [1]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used to convert to .idx
from OpenVisus import *

# Used for numerical work
import numpy as np

# Used for processing netCDF time data
import datetime

# Used for interacting with OS file system (to get directory file names)
import os

# To load/save final sequence array to file
import pickle

# Used for resampling arrays to fit the same lat/lon grid
from scipy.interpolate import griddata

# for plotting
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# for checking and using timestamps
import pandas as pd

# Accessory, used to generate progress bar for running for loops
# from tqdm.notebook import tqdm
# import ipywidgets
# import jupyterlab_widgets
from tqdm import tqdm

## Get relevant directory paths

In [2]:
# ******* THIS IS WHEN RUNNING FROM ATLANTIS.SCI **************
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

# path to save idx file and data
idx_dir = "/usr/sci/scratch_nvme/arleth/idx/firesmoke"

In [3]:
# get metadata of datasets, had to be obtained manually
ids = ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
start_dates = ["20210304", "20210304", "20210304", "20210303"]
end_dates = ["20240627", "20240627", "20240627", "20240627"]

id_dates = {ids[i]: {"start_date": start_dates[i], "end_date": end_dates[i]} for i in range(len(ids))}

## Gather information about the metadata of our files, since it is inconsistent file to file. We need to know what to normalize across all files.

### In particular:
#### 1. Count number of files there are per firesmoke directory.
#### 2. Determine maximum row,col dimension sizes for pm25 array.
#### 3. Determine maximum latitude longitude grid parameters.

In [4]:
# List of all files that are available from UBC
successful_files = {id_: [] for id_ in ids}

# Variables to hold maxes, also to track the unique max values
max_ncols = {id_: 0 for id_ in ids}
max_nrows = {id_: 0 for id_ in ids}
ncols = {id_: set() for id_ in ids}
nrows = {id_: set() for id_ in ids}

# Max grid dimensions
max_grid_x = {id_: {"xorig": 0.0, "xcell": 0.0} for id_ in ids}
max_grid_y = {id_: {"yorig": 0.0, "ycell": 0.0} for id_ in ids}
xorigs = {id_: set() for id_ in ids}
xcells = {id_: set() for id_ in ids}
yorigs = {id_: set() for id_ in ids}
ycells = {id_: set() for id_ in ids}

for id_ in ids:
    # get list of netcdf file names for each dataset
    file_names = os.listdir(f'{firesmoke_dir}/{id_}/')
    
    # try opening each file, process only if it successfully opens
    for file in tqdm(file_names):
        # get file's path
        path = f'{firesmoke_dir}/{id_}/{file}'
        
        # keep track of which files successfully open
        try:
            # open the file with xarray
            ds = xr.open_dataset(path)
    
            # append file name to successful_files
            successful_files[id_].append(file)
    
            # update maxes accordingly
            # these *are* allowed to get mixed up between files right? in this case don't need to worry bout it
            max_ncols[id_] = max(max_ncols[id_], ds.NCOLS)
            max_nrows[id_] = max(max_nrows[id_], ds.NROWS)
    
            # these should not get mixed up between files right? or can they?
            # if they do get mixed up, wouldn't it be a ill-defined grid?
            # ref: https://stackoverflow.com/questions/18296755/python-max-function-using-key-and-lambda-expression
            max_grid_x[id_]["xorig"] = max(max_grid_x[id_]["xorig"], ds.XORIG, key=abs)
            max_grid_y[id_]["yorig"] = max(max_grid_y[id_]["yorig"], ds.YORIG, key=abs)
            max_grid_x[id_]["xcell"] = max(max_grid_x[id_]["xcell"], ds.XCELL, key=abs)
            max_grid_y[id_]["ycell"] = max(max_grid_y[id_]["ycell"], ds.YCELL, key=abs)
    
            # update sets
            ncols[id_].add(ds.NCOLS)
            nrows[id_].add(ds.NROWS)
            xorigs[id_].add(ds.XORIG)
            yorigs[id_].add(ds.YORIG)
            xcells[id_].add(ds.XCELL)
            ycells[id_].add(ds.YCELL)
            
        except:
            # netcdf file does not exist
            continue

# Sort datasets' lists of successful files so they're in order of date
for id_ in successful_files:
    successful_files[id_] = np.sort(successful_files[id_]).tolist()

100%|██████████| 1023/1023 [00:12<00:00, 84.06it/s]
100%|██████████| 1210/1210 [00:14<00:00, 86.40it/s]
100%|██████████| 1022/1022 [00:11<00:00, 85.87it/s]
100%|██████████| 1022/1022 [00:11<00:00, 86.91it/s]


In [5]:
# Print the information for all ids
for id_ in ids:
    print(f'dataset: {id_}')
    print(f'Number of successful files: {len(successful_files[id_])}')
    print(f'Max cell sizes: max_ncols = {max_ncols[id_]} and max_nrows = {max_nrows[id_]}')
    print(f'Max xorig & xcell: {max_grid_x[id_]}')
    print(f'Max yorig & ycell: {max_grid_y[id_]}')
    print(f'ncols: {ncols[id_]}')
    print(f'nrows: {nrows[id_]}')
    print(f'xorigs: {xorigs[id_]}')
    print(f'yorigs: {yorigs[id_]}')
    print(f'xcells: {xcells[id_]}')
    print(f'ycells: {ycells[id_]}')
    print()

dataset: BSC18CA12-01
Number of successful files: 1010
Max cell sizes: max_ncols = 1081 and max_nrows = 381
Max xorig & xcell: {'xorig': -160.0, 'xcell': 0.10000000149011612}
Max yorig & ycell: {'yorig': 32.0, 'ycell': 0.10000000149011612}
ncols: {1081, 1041}
nrows: {381}
xorigs: {-160.0, -156.0}
yorigs: {32.0}
xcells: {0.10000000149011612}
ycells: {0.10000000149011612}

dataset: BSC00CA12-01
Number of successful files: 1200
Max cell sizes: max_ncols = 1081 and max_nrows = 381
Max xorig & xcell: {'xorig': -160.0, 'xcell': 0.10000000149011612}
Max yorig & ycell: {'yorig': 32.0, 'ycell': 0.10000000149011612}
ncols: {1081, 1041}
nrows: {381}
xorigs: {-160.0, -156.0}
yorigs: {32.0}
xcells: {0.10000000149011612}
ycells: {0.10000000149011612}

dataset: BSC06CA12-01
Number of successful files: 997
Max cell sizes: max_ncols = 1081 and max_nrows = 381
Max xorig & xcell: {'xorig': -160.0, 'xcell': 0.10000000149011612}
Max yorig & ycell: {'yorig': 32.0, 'ycell': 0.10000000149011612}
ncols: {1081,

### Get latitude/longitude coordinates using the max values and non-max values AMONG ALL DATASETS, this is used for resampling during conversion

Luckily, all datasets have the same 'smaller' and 'larger' lat/lon grid parameters :)

In [6]:
# get parameters for bigger lat/lon
max_xorig = max_grid_x[ids[0]]['xorig']
max_xcell = max_grid_x[ids[0]]['xcell']
max_yorig = max_grid_y[ids[0]]['yorig']
max_ycell = max_grid_y[ids[0]]['ycell']

# get arrays of bigger lat/lon grid
big_lon = np.linspace(max_xorig, max_xorig + max_xcell * (max_ncols[ids[0]] - 1), max_ncols[ids[0]])
big_lat = np.linspace(max_yorig, max_yorig + max_ycell * (max_nrows[ids[0]] - 1), max_nrows[ids[0]])

# get coordinates made of new lat/lon arrays
big_lon_pts, big_lat_pts = np.meshgrid(big_lon, big_lat)
big_tups = np.array([tup for tup in zip(big_lon_pts.flatten(), big_lat_pts.flatten())])

# get arrays of smaller lat/lon grid
sml_ds = xr.open_dataset(firesmoke_dir + "/BSC00CA12-01/dispersion_20210304.nc")
sml_lon = np.linspace(sml_ds.XORIG, sml_ds.XORIG + sml_ds.XCELL * (sml_ds.NCOLS - 1), sml_ds.NCOLS)
sml_lat = np.linspace(sml_ds.YORIG, sml_ds.YORIG + sml_ds.YCELL * (sml_ds.NROWS - 1), sml_ds.NROWS)

# get coordinates made of small lat/lon arrays
sml_lon_pts, sml_lat_pts = np.meshgrid(sml_lon, sml_lat)
sml_tups = np.array([tup for tup in zip(sml_lon_pts.flatten(), sml_lat_pts.flatten())])

## TESTING `resample_array` AND SCRIBBLES

### This is plotting the oiginal 381x1041 file

In [None]:
# # Get the PM25 values, squeeze out empty axis
# vals = np.squeeze(sml_ds['PM25'].values)

# # Perform the interpolation
# arr = griddata(sml_tups, vals[15].flatten(), big_tups, method='cubic', fill_value=0)

# # Any values that are less than a given threshold, make it 0
# arr[arr < 1e-15] = 0

# # Reshape the result to match the new grid shape
# arr = arr.reshape((len(big_lat), len(big_lon)))

# arr = arr.astype(np.float32)

In [None]:
# np.min(arr)

In [None]:
# type(arr[0,0])

In [None]:
# # Let's use matplotlib's imshow, since our data is on a grid
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # Initialize a figure and plot, so we can customize figure and plot of data
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html
# # ref: https://scitools.org.uk/cartopy/docs/latest/getting_started/index.html
# my_fig, my_plt = plt.subplots(figsize=(15, 6), subplot_kw=dict(projection=ccrs.PlateCarree()))

# # Let's set some parameters to get the visualization we want
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # color PM25 values on a log scale, since values are small
# my_norm = "log" 
# # this will number our x and y axes based on the longitude latitude range
# my_extent = [np.min(sml_lon), np.max(sml_lon), np.min(sml_lat), np.max(sml_lat)]
# # ensure the aspect ratio of our plot fits all data, matplotlib can does this automatically
# my_aspect = 'auto'
# # tell matplotlib, our origin is the lower-left corner
# my_origin = 'lower'
# # select a colormap for our plot and the color bar on the right
# my_cmap = 'viridis'

# # create our plot using imshow
# plot = my_plt.imshow(arr, norm=my_norm, extent=my_extent, 
#           aspect=my_aspect, origin=my_origin, cmap=my_cmap)

# # draw coastlines
# my_plt.coastlines()

# # draw latitude longitude lines
# # ref: https://scitools.org.uk/cartopy/docs/latest/gallery/gridlines_and_labels/gridliner.html
# my_plt.gridlines(draw_labels=True)

# # add a colorbar to our figure, based on the plot we just made above
# my_fig.colorbar(plot,location='right', label='ug/m^3')

# # # Set x and y axis labels on our ax
# # my_plt.set_xlabel('Longitude')
# # my_plt.set_ylabel('Latitude')

# # Set title of our figure
# my_fig.suptitle('Ground level concentration of PM2.5 microns and smaller')

# # # Set title of our plot as the timestamp of our data
# # my_plt.set_title(f'{my_timestamp}')

# # Show the resulting visualization
# plt.show()

### This is visualizing the resampled version of array above, from 381x1041 -> 381x1081 grid

In [None]:
# # Let's use matplotlib's imshow, since our data is on a grid
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # Initialize a figure and plot, so we can customize figure and plot of data
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html
# # ref: https://scitools.org.uk/cartopy/docs/latest/getting_started/index.html
# my_fig, my_plt = plt.subplots(figsize=(15, 6), subplot_kw=dict(projection=ccrs.PlateCarree()))

# # Let's set some parameters to get the visualization we want
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # color PM25 values on a log scale, since values are small
# my_norm = "log" 
# # this will number our x and y axes based on the longitude latitude range
# my_extent = [np.min(big_lon), np.max(big_lon), np.min(big_lat), np.max(big_lat)]
# # ensure the aspect ratio of our plot fits all data, matplotlib can does this automatically
# my_aspect = 'auto'
# # tell matplotlib, our origin is the lower-left corner
# my_origin = 'lower'
# # select a colormap for our plot and the color bar on the right
# my_cmap = 'viridis'

# # create our plot using imshow
# plot = my_plt.imshow(arr_resamp, norm=my_norm, extent=my_extent, 
#           aspect=my_aspect, origin=my_origin, cmap=my_cmap, vmin=.00001, vmax=1)

# # draw coastlines
# my_plt.coastlines()

# # draw latitude longitude lines
# # ref: https://scitools.org.uk/cartopy/docs/latest/gallery/gridlines_and_labels/gridliner.html
# my_plt.gridlines(draw_labels=True)

# # add a colorbar to our figure, based on the plot we just made above
# my_fig.colorbar(plot,location='right', label='ug/m^3')

# # # Set x and y axis labels on our ax
# # my_plt.set_xlabel('Longitude')
# # my_plt.set_ylabel('Latitude')

# # Set title of our figure
# my_fig.suptitle('Ground level concentration of PM2.5 microns and smaller')

# # # Set title of our plot as the timestamp of our data
# # my_plt.set_title(f'{my_timestamp}')

# # Show the resulting visualization
# plt.show()

## Determine sequence of files to load later for IDX conversion

### First determine what hours are available in all datasets, from there we construct final sequence

In [7]:
# for parsing time flags (TFLAG) from netcdf files
def parse_tflag(tflag):
    year = int(tflag[0] // 1000)
    day_of_year = int(tflag[0] % 1000)
    date = datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1)

    time_in_day = int(tflag[1])
    hours = time_in_day // 10000
    minutes = (time_in_day % 10000) // 100
    seconds = time_in_day % 100

    full_datetime = datetime.datetime(year, date.month, date.day, hours, minutes, seconds)
    return full_datetime

In [8]:
# get set of all available hours for each dataset using successful_files
id_sets = {id_: {} for id_ in ids}

for id_ in ids:    
    # get successful files to add all successful hours to set
    for file in tqdm(successful_files[id_]):
        # get file's path
        path = f'{firesmoke_dir}/{id_}/{file}'
        
        # open the file with xarray
        ds = xr.open_dataset(path)

        # add each available hour to successful_seq, store the index h, needed for idx conversion
        for h in range(ds.sizes["TSTEP"]):
            id_sets[id_][(file, parse_tflag(ds['TFLAG'].values[h][0]))] = h

100%|██████████| 1010/1010 [00:16<00:00, 60.32it/s]
100%|██████████| 1200/1200 [00:19<00:00, 61.64it/s]
100%|██████████| 997/997 [00:16<00:00, 61.74it/s]
100%|██████████| 1003/1003 [00:16<00:00, 61.69it/s]


### Ideally we use all dates, so step through all hours and grab from datasets accordingly.
**Importantly, we should ideally use first six hours of each dataset**

In [13]:
def next_id(curr_id):
    '''
    Return the string of the next dataset ID to use based on the current ID.
    'Next' means, next most recently updated forecast after curr_id.
    
    Details on forecast update time can be found here: https://firesmoke.ca/forecasts/
    
    Listed in order: ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
    
    :param string curr_id: the ID used:
    '''
    ret = ''
    
    if curr_id == "BSC18CA12-01":
        ret = "BSC00CA12-01"
    if curr_id == "BSC00CA12-01": 
        ret = "BSC06CA12-01"
    if curr_id == "BSC06CA12-01":
        ret = "BSC12CA12-01"
    if curr_id == "BSC12CA12-01":
        ret = "BSC18CA12-01"
    
    return ret

def prev_id(curr_id):
    '''
    Return the string of the previous dataset ID to use based on the current ID.
    'Previous' means, last most recently updated forecast before curr_id.
    
    Details on forecast update time can be found here: https://firesmoke.ca/forecasts/
    
    Listed in order: ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
    
    :param string curr_id: the ID used:
    '''
    ret = ''
    
    if curr_id == "BSC18CA12-01":
        ret = "BSC12CA12-01"
    if curr_id == "BSC00CA12-01": 
        ret = "BSC18CA12-01"
    if curr_id == "BSC06CA12-01":
        ret = "BSC00CA12-01"
    if curr_id == "BSC12CA12-01":
        ret = "BSC06CA12-01"
    
    return ret

In [14]:
def get_id_from_date(date, hour):
    '''
    Return the string of the dataset ID to use based on the date and hour given.
    
    We aim to use the dataset that provides the latest forecast update available for the hour.
    
    Details on forecast update time can be found here: https://firesmoke.ca/forecasts/
    
    :param datetime date: pandas timestamp of the YYYYMMDD date:
    :param datetime hour: pandas timestamp of the 00:00:00 hour:
    '''
    ret = ''
    
    # based on the hour, grab from corresponding dataset id
    if hour <= date.replace(hour=2):
        # HERE WE NEED TO USE PRIOR DATE
        ret = 'BSC12CA12-01'
    if current_hour >= date.replace(hour=3) and date <= date.replace(hour=8):
        ret = 'BSC18CA12-01'
    if current_hour >= date.replace(hour=9) and date <= date.replace(hour=14):
        ret = 'BSC00CA12-01'
    if (current_hour >= date.replace(hour=15) and date <= date.replace(hour=20)):
        ret = 'BSC06CA12-01'
    if current_hour >= date.replace(hour=21):
        ret = 'BSC18CA12-01'
        
    return ret

In [15]:
def dispersion_date_str(date):
    '''
    For a given date object, generate the string for the dispersion file.
    :param pd.Timestamp date: pandas timestamp of the date to make file name string out of
    '''
    return f'dispersion_{date.strftime("%Y%m%d")}.nc'

In [22]:
type(id_sets)

dict

In [23]:
def update_idx_calls(arr, curr_id, hour_file_tuple, id_sets):
    '''
    For the given array, append data specified by tuple if available in id_sets
    :param list arr: array that holds final idx write sequence
    :param tuple hour_file_tuple: tuple that holds the hour and file name to read
    :param dict id_sets: dictionary that holds files that successfully open for each dataset:
    '''
    file_str = hour_file_tuple[0]
    current_hour = hour_file_tuple[1]
    
    # get index of TFLAG of the hour in the file
    tstep_idx = id_sets[curr_id][(file_str, current_hour)]
    
    # get file's path
    path = f'{firesmoke_dir}/{curr_id}/{file_str}'
    # open the file with xarray
    ds = xr.open_dataset(path)
    arr.append([curr_id, file_str, parse_tflag(ds['TFLAG'].values[tstep_idx][0]), tstep_idx])
    
    return arr

In [None]:
# get dispersion file for current_date
today_file_str = dispersion_date_str(current_date)

# get dispersion file for previous 4 days, may be needed:
last_4_days_strs = []
for i in range(5):
    d = current_date + datetime.timedelta(days=-i)
    last_4_days_strs.append(dispersion_date_str(d))

In [19]:
# Arrays to hold the final order we will index files
idx_calls = []

# Define the start and end dates we will step through
start_date = datetime.datetime.strptime("20210304", "%Y%m%d")
end_date = datetime.datetime.strptime("20240627", "%Y%m%d")

# the dataset we wanna use
curr_id_idx = 0

# iterate over each day
current_date = start_date

In [16]:
# step through all files of dates specified
while current_date <= end_date:
    # iterate over each hour of the current day
    current_hour = datetime.datetime(current_date.year, current_date.month, current_date.day)
    # file to open
    file_str = ''
    
    # tuple to hold the current hour and file to index
    hour_file_tuple = (file_str, current_hour)
    
    # if we need to use yesterday dispersion file or not
    yes_yesterday = False
    
    while current_hour < current_date + datetime.timedelta(days=1):
        # get dataset id
        curr_id = get_id_from_date(current_date, current_hour)

        # BSC00CA12-01 generates first hours of the curr_date in yesterday's file
        # e.g. hours 12am-6am for January 2, 2023 are generated in dispersion_01012023.nc in BSC00CA12-01 dataset
        if curr_id == "BSC00CA12-01": # so use yesterday's file
            file_str = dispersion_date_str(current_date + datetime.timedelta(days=-1))
            yes_yesterday = True
        else: # otherwise use today's file
            file_str = dispersion_date_str(current_date)
        
        # if desired timestamp at desired dataset id is available, use it
        if (file_str, current_hour) in id_sets[curr_id]:
            idx_calls = update_idx_calls(idx_calls, curr_id, (file_str, current_hour), id_sets)
        else: # search the others in order of most recent to least
            found = 0
            search_count = 0
            
            # if we are using yesterday file, we now use today file
            if yes_yesterday:
                file_str = dispersion_date_str(current_date)
            
            # first, do a pass over the other datasets
            while found == 0 and search_count < 3:
                # try previous id in sequence
                curr_id = prev_id(curr_id)
                # if timestamp is available, use it
                if (file_str, current_hour) in id_sets[curr_id]:
                    update_idx_calls(idx_calls, curr_id, (file_str, current_hour), id_sets)
                    found = 1
                else:
                    search_count += 1

            # if we *still* haven't found file, try a final check, check ALL datasets again but for yesterday str
            if not found:
                # use yesterday file
                file_str = yest_file_str

                # reset search counters and conditions
                found = 0
                search_count = 0

                while found == 0 and search_count < 4:
                    # try previous id in sequence
                    curr_id_idx = (curr_id_idx - 1) % 4
                    # if timestamp is available, use it
                    if (file_str, current_hour) in id_sets[ids[curr_id_idx]]:
                        # append dataset id, file name, and index of time step for idx conversion
                        tstep_idx = id_sets[ids[curr_id_idx]][(file_str, current_hour)]
                        
                        # append dataset id, file name, and index of time step for idx conversion
                        conversion_seq.append([ids[curr_id_idx], file_str, tstep_idx])
    
                        # get file's path
                        path = f'{firesmoke_dir}/{ids[curr_id_idx]}/{file_str}'
                        # open the file with xarray
                        ds = xr.open_dataset(path)
                        idx_calls.append([ids[curr_id_idx], file_str, parse_tflag(ds['TFLAG'].values[tstep_idx][0])])
    
                        found = 1
                    else:  
                        search_count += 1

                    print(f'FAILED TO FIND, LAST RESORT:')
                    print(f'found = {found}, search_count = {search_count}')
                    print(f'ids[curr_id_idx] = {ids[curr_id_idx]}')
                if not found:
                    print('FAILed TO FIND ANYTHING')
                    
        # move to next hour
        current_hour += datetime.timedelta(hours=1)

    # move to the next day
    current_date += datetime.timedelta(days=1)
    print(f'{current_date}, ids[curr_id_idx] = {ids[curr_id_idx]}')

NameError: name 'current_hour' is not defined

In [None]:
%%capture captured_output
for c in idx_calls:
    print(c)

with open('idx_calls.txt', 'w') as f:
    f.write(captured_output.stdout)

In [None]:
# save conversion_seq to file
with open('conversion_seq.pkl', 'wb') as f:
    pickle.dump(conversion_seq, f)

# save idx_calls to file
with open('idx_calls.pkl', 'wb') as f:
    pickle.dump(idx_calls, f)

### Let's check what hours are missing and _why_ **before** doing conversion

In [17]:
# Load conversion_seq from a file
with open('conversion_seq.pkl', 'rb') as f:
    conversion_seq = pickle.load(f)

# Load idx_calls from a file
with open('idx_calls.pkl', 'rb') as f:
    idx_calls = pickle.load(f)

In [None]:
# last 3 hours of dataset
idx_calls[-3:]

In [None]:
# first 3 hours of dataset
idx_calls[0:3]

In [None]:
# Define the start and end dates
start_date = pd.Timestamp(datetime.datetime.strptime("20210303", "%Y%m%d"))
end_date = pd.Timestamp(datetime.datetime.strptime("20240627", "%Y%m%d"))

# Get all hours between the start and end dates
desired_tflag_set = {start_date + pd.Timedelta(hours=x) for x in range(int((end_date - start_date).total_seconds() // 3600) + 1)}

print(f'There are {len(desired_tflag_set)} hours between 3/3/21 and 6/27/24')

In [None]:
# Get all hours in idx conversion
idx_hours = {call[2] for call in idx_calls}

# Get set of missing hours
hours_missing_set = desired_tflag_set.difference(idx_hours)

### For all missing hours, try opening files to find hour and report errors seen.

In [None]:
# dictionary to hold where one can find hour in files
hours_found = {h : [] for h in hours_missing_set}

for h in hours_missing_set:
    # open the dispersion.nc file on the date of the hour and 3 days before
    for i in range(5):
        # get date string and build file name
        curr_date = h + pd.Timedelta(days=-i)
        file_name = f'dispersion_{curr_date.strftime("%Y%m%d")}.nc'
        
        # search for the file in each dataset
        for id_ in ids:
            # get file's path
            path = f'{firesmoke_dir}/{id_}/{file_name}'
            
            # try opening the file with xarray
            try:
                ds = xr.open_dataset(path)

                # get timestamps
                tflags = [pd.Timestamp(parse_tflag(t[0])) for t in ds['TFLAG'].values]
                
                # if h is in timestamps, add the file to hours_found dictionary
                hours_found[h].append(path) 
                
            except Exception as e:
                print(f'failed to open {path}, exception: {e}')
                print('---')
                continue

In [None]:
for h in hours_found.keys():
    # print if one of the hours' lists have no files..
    if len(hours_found[h]) == 0:
        print(f'hour {h} was in no files')

At this point the issue is not a lack of missing netCDF files, it is of incorrect sequencing.

## Do conversion from netCDF files to IDX

In [None]:
#********** FOR TESTING
# my_seq = conversion_seq[22200:22200+145]
my_seq = conversion_seq

In [None]:
print(f'len(conversion_seq) = {len(conversion_seq)}')
print(f'len(idx_calls) = {len(idx_calls)}')
print(f'len(my_seq) = {len(my_seq)}')

In [None]:
# # Create idx file of i'th dataset
# # useful for dealing with fields that are not all the same size:
# # https://github.com/sci-visus/OpenVisus/blob/master/Samples/jupyter/nasa_conversion_example.ipynb
   
# # create OpenVisus field for the pm25 variable
# f = Field('PM25', 'float32')

# # create the idx file for this dataset using field f
# # dims is maximum array size, we will resample data accordingly to fit this
# # time is number of files * 24 (hours)
# db = CreateIdx(url=idx_dir + '/firesmoke.idx', fields=[f], 
#                dims=[int(max_ncols[ids[0]]), int(max_nrows[ids[0]])], time=[0, len(my_seq) - 1, '%00000000d/'])

# # to track what timestep we are on in idx
# tstep = 0

# # threshold to use to change small-enough resampled values to 0
# thresh = 1e-15

# for call in tqdm(my_seq):
#     # get instructions from call
#     curr_id = call[0]
#     curr_file = call[1]
#     tstep_index = call[2]
#     # open the file with xarray
#     ds = xr.open_dataset(f'{firesmoke_dir}/{curr_id}/{curr_file}')
    
#     # Get the PM25 values, squeeze out empty axis
#     file_vals = np.squeeze(ds['PM25'].values)
    
#     # to decide if we need to resample or not
#     resamp = ds.XORIG != max_xorig
    
#     # resample data if not already on max lat/lon grid
#     if resamp:
#         # Perform the interpolation
#         file_vals_resamp = griddata(sml_tups, file_vals[tstep_index].flatten(), big_tups, method='cubic', fill_value=0)
        
#         # Any values that are less than a given threshold, make it 0
#         file_vals_resamp[file_vals_resamp < thresh] = 0
        
#         # Reshape the result to match the new grid shape
#         file_vals_resamp = file_vals_resamp.reshape((len(big_lat), len(big_lon)))
#         # Write resampled values at hour h to timestep t and field f
#         db.write(data=file_vals_resamp.astype(np.float32),field=f,time=tstep)
#     else:
#         # Write original values at hour h to timestep t and field f
#         db.write(data=file_vals[tstep_index], field=f, time=tstep)

#     # move to next timestep in IDX
#     tstep = tstep + 1

In [None]:
# # go to idx data directory
# os.chdir(idx_dir)

In [None]:
# # compress dataset
# db.compressDataset(['zip'])