# Firesmoke Data Conversion to IDX using OpenVisus

## Import the necessary libraries; install them if you do not have them. This was developed in Python 3.9.

In [23]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used to convert to .idx
from OpenVisus import *

# Used for numerical work
import numpy as np

# Used for processing netCDF time data
import datetime

# Used for interacting with OS file system (to get directory file names)
import os

# To load/save final sequence array to file
import pickle

# Used for resampling arrays to fit the same lat/lon grid
from scipy.interpolate import griddata

# for plotting
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# for checking and using timestamps
import pandas as pd

# Accessory, used to generate progress bar for running for loops
# from tqdm.notebook import tqdm
# import ipywidgets
# import jupyterlab_widgets
from tqdm import tqdm

## Get relevant directory paths

In [7]:
# ******* THIS IS WHEN RUNNING FROM ATLANTIS.SCI **************
# Root directory of the downloaded netCDF files; each dataset id has its own
# sub-directory underneath it. The path already ends in "/" and later cells
# join it with another "/", so the built paths contain "//" (POSIX path
# resolution treats repeated separators as one).
firesmoke_dir = "/usr/sci/scratch_nvme/arleth/download/"

# path to save idx file and data
idx_dir = "/usr/sci/scratch_nvme/arleth/idx_conversions/westerncanada"

In [8]:
# Dataset metadata (had to be collected manually): forecast ids and the
# start/end date recorded for each of them
ids = ["BSC00WC04-01"]
start_dates = ["20210315"]
end_dates = ["20241021"]

# Map every forecast id to its own date range
id_dates = {
    id_: {"start_date": first_day, "end_date": last_day}
    for id_, first_day, last_day in zip(ids, start_dates, end_dates)
}

## Gather information about the metadata of our files, since it is inconsistent file to file. We need to know what to normalize across all files.

### In particular:
#### 1. Count number of files there are per firesmoke directory.
#### 2. Determine maximum row,col dimension sizes for pm25 array.
#### 3. Determine maximum latitude longitude grid parameters.

In [9]:
# Scan every downloaded file once, recording which files open and which grid
# metadata they carry, so later steps know what (if anything) must be
# normalized across files.

# Files that xarray managed to open, per dataset id
successful_files = {id_: [] for id_ in ids}

# Files that raised while being opened or inspected, per dataset id, stored as
# (file name, repr of the error) so failures can be reviewed afterwards
failed_files = {id_: [] for id_ in ids}

# Variables to hold maxes, also to track the unique max values
max_ncols = {id_: 0 for id_ in ids}
max_nrows = {id_: 0 for id_ in ids}
ncols = {id_: set() for id_ in ids}
nrows = {id_: set() for id_ in ids}

# Max grid dimensions (largest by absolute value) and the distinct values seen
max_grid_x = {id_: {"xorig": 0.0, "xcell": 0.0} for id_ in ids}
max_grid_y = {id_: {"yorig": 0.0, "ycell": 0.0} for id_ in ids}
xorigs = {id_: set() for id_ in ids}
xcells = {id_: set() for id_ in ids}
yorigs = {id_: set() for id_ in ids}
ycells = {id_: set() for id_ in ids}

for id_ in ids:
    # get list of netcdf file names for each dataset
    file_names = os.listdir(f'{firesmoke_dir}/{id_}/')

    # try opening each file, process only if it successfully opens
    for file in tqdm(file_names):
        # get file's path
        path = f'{firesmoke_dir}/{id_}/{file}'

        try:
            # the context manager closes the file handle once its metadata is read
            with xr.open_dataset(path) as ds:
                # the file opened, so it counts as successful
                successful_files[id_].append(file)

                # NOTE(review): the row/column maxima are tracked independently,
                # so they may come from different files.
                max_ncols[id_] = max(max_ncols[id_], ds.NCOLS)
                max_nrows[id_] = max(max_nrows[id_], ds.NROWS)

                # NOTE(review): if the origin or cell size differs between files
                # the grid is not consistently defined; the sets filled below
                # make any such difference visible.
                # key=abs picks the value with the largest magnitude.
                max_grid_x[id_]["xorig"] = max(max_grid_x[id_]["xorig"], ds.XORIG, key=abs)
                max_grid_y[id_]["yorig"] = max(max_grid_y[id_]["yorig"], ds.YORIG, key=abs)
                max_grid_x[id_]["xcell"] = max(max_grid_x[id_]["xcell"], ds.XCELL, key=abs)
                max_grid_y[id_]["ycell"] = max(max_grid_y[id_]["ycell"], ds.YCELL, key=abs)

                # update sets
                ncols[id_].add(ds.NCOLS)
                nrows[id_].add(ds.NROWS)
                xorigs[id_].add(ds.XORIG)
                yorigs[id_].add(ds.YORIG)
                xcells[id_].add(ds.XCELL)
                ycells[id_].add(ds.YCELL)

        except Exception as exc:
            # Best effort: skip files that cannot be read, but remember why.
            # Catching Exception (not a bare except) keeps Ctrl-C working
            # during this long-running scan.
            failed_files[id_].append((file, repr(exc)))

# Sort datasets' lists of successful files so they're in order of date
for id_ in successful_files:
    successful_files[id_].sort()

100%|██████████| 901/901 [00:28<00:00, 31.57it/s]


In [10]:
# Report what the metadata scan found for every dataset id
for id_ in ids:
    # one argument per report line; sep='\n' puts each on its own line
    print(
        f'dataset: {id_}',
        f'Number of successful files: {len(successful_files[id_])}',
        f'Max cell sizes: max_ncols = {max_ncols[id_]} and max_nrows = {max_nrows[id_]}',
        f'Max xorig & xcell: {max_grid_x[id_]}',
        f'Max yorig & ycell: {max_grid_y[id_]}',
        f'ncols: {ncols[id_]}',
        f'nrows: {nrows[id_]}',
        f'xorigs: {xorigs[id_]}',
        f'yorigs: {yorigs[id_]}',
        f'xcells: {xcells[id_]}',
        f'ycells: {ycells[id_]}',
        sep='\n',
    )
    # blank line between datasets
    print()

dataset: BSC00WC04-01
Number of successful files: 901
Max cell sizes: max_ncols = 841 and max_nrows = 601
Max xorig & xcell: {'xorig': -143.0, 'xcell': 0.05000000074505806}
Max yorig & ycell: {'yorig': 40.0, 'ycell': 0.05000000074505806}
ncols: {841}
nrows: {601}
xorigs: {-143.0}
yorigs: {40.0}
xcells: {0.05000000074505806}
ycells: {0.05000000074505806}



## Determine sequence of files to load later for IDX conversion

### First determine what hours are available in the dataset

In [12]:
# for parsing time flags (TFLAG) from netcdf files
def parse_tflag(tflag):
    '''
    Convert one TFLAG entry into a datetime.
    :param tflag: two-element sequence (date, time); date is an integer encoded
        as YYYYDDD (year followed by day of year) and time is an integer
        encoded as HHMMSS
    :return: the corresponding naive datetime.datetime
    :raises ValueError: if the HHMMSS part is not a valid time of day
    '''
    # split YYYYDDD into year and day of year
    year, day_of_year = divmod(int(tflag[0]), 1000)
    date = datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1)

    # split HHMMSS into its components
    time_in_day = int(tflag[1])
    hours = time_in_day // 10000
    minutes = (time_in_day % 10000) // 100
    seconds = time_in_day % 100

    # Build on `date` itself (not on `year`) so a day of year that rolls
    # across a year boundary keeps the year computed by the date arithmetic.
    return date.replace(hour=hours, minute=minutes, second=seconds)

In [17]:
# Build, for each dataset, a lookup from (file name, timestamp) to the index of
# that timestamp along the file's TSTEP dimension; the index is needed later
# for the idx conversion.
id_sets = {id_: {} for id_ in ids}

for id_ in ids:
    # only files that opened successfully during the metadata scan are read
    for file in tqdm(successful_files[id_]):
        # get file's path
        path = f'{firesmoke_dir}/{id_}/{file}'

        # the context manager closes the file handle before the next file is opened
        with xr.open_dataset(path) as ds:
            # read the TFLAG array and the step count once per file
            tflags = ds['TFLAG'].values
            n_steps = ds.sizes["TSTEP"]

        # NOTE(review): tflags[h][0] is taken to be the (date, time) pair of
        # timestep h — confirm against the file layout.
        for h in range(n_steps):
            id_sets[id_][(file, parse_tflag(tflags[h][0]))] = h

100%|██████████| 901/901 [00:43<00:00, 20.53it/s]


In [18]:
# show the (file name, timestamp) -> timestep index mapping of the first dataset
id_sets[ids[0]]

{('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 9, 0)): 0,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 10, 0)): 1,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 11, 0)): 2,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 12, 0)): 3,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 13, 0)): 4,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 14, 0)): 5,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 15, 0)): 6,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 16, 0)): 7,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 17, 0)): 8,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 18, 0)): 9,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 19, 0)): 10,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 20, 0)): 11,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 21, 0)): 12,
 ('dispersion_20210315.nc', datetime.datetime(2021, 3, 15, 22, 0)): 13,
 ('

### Ideally we have a forecast prediction for all dates, so step through all hours and grab from datasets accordingly.
**Importantly, we should ideally use first 24 hours of each dataset.**
**For any given timestep t, if it's not available in the current file for date d, search the 3 files previous to date d (dates d-1, d-2, d-3)**

In [43]:
def update_idx_calls(arr, curr_id, hour_file_tuple, global_tstep, id_sets):
    '''
    Append one entry to the idx write sequence for the requested file and hour.
    :param list arr: array that holds final idx write sequence
    :param curr_id: id of the dataset (forecast) the file belongs to
    :param tuple hour_file_tuple: (file name, timestamp) pair identifying the data to read
    :param global_tstep: the global timestep of the current tuple
    :param dict id_sets: per dataset id, a dictionary mapping (file name, timestamp)
        to the index of that timestamp inside the file
    :return: arr, with the new entry appended in place
    :raises KeyError: if the pair is not present in id_sets[curr_id]
    '''
    file_str = hour_file_tuple[0]
    current_hour = hour_file_tuple[1]

    # get index of TFLAG of the hour in the file
    tstep_idx = id_sets[curr_id][(file_str, current_hour)]

    # The lookup key already carries the timestamp parsed from the file's TFLAG
    # at tstep_idx, so it is recorded directly; re-opening the netCDF file just
    # to parse the same value again cost one unclosed file handle per call.
    arr.append({'forecast_id': curr_id, 'file_name': file_str,
                'timestamp': current_hour,
                'timestamp_index': tstep_idx, 'global_timestamp': global_tstep})

    return arr

In [44]:
# Final, ordered list of idx write calls (filled in by the stepping loop)
idx_calls = []

# First and last day we will step through
start_date, end_date = (
    datetime.datetime.strptime(day, "%Y%m%d") for day in ("20210304", "20240627")
)

# day cursor, begins on the first day
current_date = start_date

# hour cursor, begins at midnight of the first day
current_hour = current_date.replace(hour=0, minute=0, second=0, microsecond=0)

# name of the file to open
file_str = ''

# tell functions to print for debugging
verbose = 1

**This could be parallelized, global_tstep preserves global_order**

plan:
1. make list of all timestamps to use
2. for each timestamp

Actually, global_tstep is updated as we step through each date and check whether the timestep's data is available, so with more wrangling it *might* be parallelizable?

In [45]:
# Walk every hour from start_date through end_date. For each hour, take the
# data from the file of the same day if it contains that hour, otherwise from
# the file of one, two or three days earlier. Hours found nowhere are skipped
# and do not consume a global timestep.
global_tstep = 0
one_day = datetime.timedelta(days=1)
one_hour = datetime.timedelta(hours=1)

while current_date <= end_date:
    next_midnight = current_date + one_day

    while current_hour < next_midnight:
        # only one dataset id is in use
        curr_id = ids[0]

        # days_back = 0 is the current day, 1..3 are the previous three days
        for days_back in range(4):
            candidate_day = current_hour - datetime.timedelta(days=days_back)

            # dispersion file that would hold this hour
            file_str = f'dispersion_{candidate_day.strftime("%Y%m%d")}.nc'
            lookup_key = (file_str, current_hour)

            # first file containing the hour wins; stop searching
            if lookup_key in id_sets[curr_id]:
                update_idx_calls(idx_calls, curr_id, lookup_key, global_tstep, id_sets)
                global_tstep += 1
                break

        # move to next hour
        current_hour += one_hour

    # move to the next day
    current_date += one_day

In [46]:
# save idx_calls to file
# NOTE: the handle is deliberately not named `f`: `f` is also the module-level
# OpenVisus Field that write_call_to_db reads, and re-running this cell would
# otherwise replace that Field with a closed file object.
# NOTE: only load this pickle back from a trusted location.
with open('westerncanada_calls_v4.pkl', 'wb') as calls_file:
    pickle.dump(idx_calls, calls_file)

## Do conversion from netCDF files to IDX

In [51]:
# glance at the idx_calls generated above: print how many entries there are,
# then show a small slice of them as the cell's output
print(f'len(idx_calls) = {len(idx_calls)}')
idx_calls[20:27]

len(idx_calls) = 19593


[{'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210315.nc',
  'timestamp': datetime.datetime(2021, 3, 16, 5, 0),
  'timestamp_index': 20,
  'global_timestamp': 20},
 {'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210315.nc',
  'timestamp': datetime.datetime(2021, 3, 16, 6, 0),
  'timestamp_index': 21,
  'global_timestamp': 21},
 {'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210315.nc',
  'timestamp': datetime.datetime(2021, 3, 16, 7, 0),
  'timestamp_index': 22,
  'global_timestamp': 22},
 {'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210315.nc',
  'timestamp': datetime.datetime(2021, 3, 16, 8, 0),
  'timestamp_index': 23,
  'global_timestamp': 23},
 {'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210316.nc',
  'timestamp': datetime.datetime(2021, 3, 16, 9, 0),
  'timestamp_index': 0,
  'global_timestamp': 24},
 {'forecast_id': 'BSC00WC04-01',
  'file_name': 'dispersion_20210316.nc',
  'timestamp': datetime.datetime

In [60]:
# Set up the IDX dataset that the selected forecast hours are written into.
# Useful for dealing with fields that are not all the same size:
# https://github.com/sci-visus/OpenVisus/blob/master/Samples/jupyter/nasa_conversion_example.ipynb

# OpenVisus field describing the pm25 variable
f = Field('PM25', 'float32')

# dims: the maximum (ncols, nrows) found while scanning the first dataset's files
dataset_id = ids[0]
grid_dims = [int(max_ncols[dataset_id]), int(max_nrows[dataset_id])]

# time: one timestep per entry of idx_calls, numbered 0 to len(idx_calls) - 1
time_range = [0, len(idx_calls) - 1, '%00000000d/']

# create the idx file for this dataset using field f
db = CreateIdx(url=idx_dir + '/BSC00WC04-01.idx', fields=[f], dims=grid_dims, time=time_range)

# to track what timestep we are on in idx
tstep = 0

# threshold to use to change small-enough resampled values to 0
thresh = 1e-15

def write_call_to_db(call):
    '''
    Write the PM25 data described by one idx_calls entry to the IDX dataset.
    Meant to be mapped over idx_calls so the writes can be spread over workers.
    Uses the module-level db (IDX dataset) and f (PM25 field).
    :param call: the idx_call dictionary which contains:
        {'forecast_id': _, 'file_name': _, 'timestamp': _,
            'timestamp_index': _, 'global_timestamp': _}
    :return: None
    '''
    # get parameters from call
    curr_id = call['forecast_id']
    curr_file = call['file_name']
    tstep_index = call['timestamp_index']
    global_tstep = call['global_timestamp']

    # open the file with xarray; the context manager closes it after reading
    with xr.open_dataset(f'{firesmoke_dir}/{curr_id}/{curr_file}') as ds:
        # Select the requested timestep first, then squeeze out empty axes.
        # Squeezing the whole array before indexing would also drop the time
        # axis of a file holding a single timestep, so the index would pick a
        # row instead of a timestep. Selecting first also avoids pulling every
        # timestep of the file into memory for each call.
        # NOTE(review): assumes the timestep is the first axis of PM25 — confirm.
        hour_vals = np.squeeze(ds['PM25'][tstep_index].values)

    # Write the values of this hour to global timestep t and field f
    db.write(data=hour_vals, field=f, time=global_tstep)

In [61]:
# write every idx_calls entry to the IDX dataset in parallel, collecting each call's return value in `issues`
# Explicit imports: this cell uses both modules, and no import cell of the
# notebook brings them in by name.
import multiprocessing
import time

# Map write_call_to_db over idx_calls with a pool of worker processes
with multiprocessing.Pool() as pool:
    # Start a timer to measure how long the conversion takes
    start_time = time.time()
    print('starting')
    # pool.map keeps the order of idx_calls; write_call_to_db returns None,
    # so `issues` holds one None per call
    issues = pool.map(write_call_to_db, idx_calls)
    print('done!')
    # End the timer and print the elapsed time
    end_time = time.time()
    print(f'Total elapsed time: {end_time - start_time}')

starting
done!
Total elapsed time: 348.3490369319916


In [62]:
# go to idx data directory (changes the working directory for everything run after this cell)
os.chdir(idx_dir)

In [63]:
# compress the IDX dataset, passing 'zip' as the compression setting
db.compressDataset(['zip'])