# Firesmoke Data Conversion to IDX using OpenVisus

## Import the necessary libraries (install them if you do not have them). This was developed in Python 3.9.

In [None]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used to convert to .idx
from OpenVisus import *

# Used for numerical work
import numpy as np

# Used for processing netCDF time data
import datetime

# Used for interacting with OS file system (to get directory file names)
import os

# # Used for redownloading netCDF files from https://firesmoke.ca/ if opening currently downloaded file fails
# import wget

# Accessory, used to generate progress bar for running for loops
# from tqdm.notebook import tqdm
# import ipywidgets
# import jupyterlab_widgets
from tqdm import tqdm

## Get relevant directory paths and file information

In [None]:
# ******* Paths used when running on ATLANTIS.SCI *******
# Output location for the generated idx file and its data
idx_dir = "/usr/sci/scratch_nvme/arleth/idx/firesmoke2d"

# Root directory holding all firesmoke data; change this to match your machine
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

In [None]:
# # ******* THIS IS WHEN RUNNING FROM MY MACBOOK **************
# # directory to all firesmoke data, mounted on my personal machine, change accordingly
# firesmoke_dir = "/Users/arleth/Mount/firesmoke"

# # path to save idx file and data
# idx_dir = "/Users/arleth/Mount/idx/firesmoke2d"

In [None]:
# init times are only necessary if opening a file below fails and it needs
# to be redownloaded
init_time = "08"

# directory name for each dataset
dataset_name = "BSC00CA12-01"

# the dataset's netCDF files live in a subdirectory named after the dataset
dataset_dir = firesmoke_dir + "/" + dataset_name

# Point idx_dir at this dataset's own subdirectory. The guard keeps the cell
# idempotent: without it, re-running the cell would append dataset_name a
# second time and produce a path that does not exist.
if not idx_dir.endswith("/" + dataset_name):
    idx_dir = idx_dir + "/" + dataset_name

## Count the number of files in each firesmoke directory and the maximum dimensionality per directory

In [None]:
# Names of the files in dataset_dir that xarray was able to open
successful_files = []

# get list of file names for BSC00CA12-01 dataset
file_names = os.listdir(dataset_dir)

# try opening each file, recording the ones that open
for file in tqdm(file_names):
    # get file's path
    path = dataset_dir + "/" + file

    try:
        # Open inside a `with` so the underlying file handle is released
        # right away; only whether the open succeeds matters here.
        with xr.open_dataset(path):
            pass
    except Exception:
        # Best-effort scan: a file that cannot be opened is skipped.
        # `Exception` (not a bare `except`) lets Ctrl-C / kernel interrupt
        # still stop the loop.
        continue

    # the open succeeded, so remember this file name
    successful_files.append(file)

In [None]:
# Display the shape of successful_files, i.e. how many files opened successfully
np.shape(successful_files)

## Do conversion from netCDF files to IDX

In [None]:
# Create the idx file of each dataset from its netCDF files.
# Useful for dealing with fields that are not all the same size:
# https://github.com/sci-visus/OpenVisus/blob/master/Samples/jupyter/nasa_conversion_example.ipynb
# NOTE(review): dataset_dirs, dataset_dims, idx_dirs, ids and
# dataset_files_count are not defined in the cells visible here; they must
# already exist when this cell runs -- confirm where they come from.
# for each dataset (tqdm wraps the list itself so the progress bar knows the total)
for i, netcdf_dir in enumerate(tqdm(dataset_dirs)):
    # get the dimensions for i'th dataset
    curr_dims = dataset_dims[i]

    print("creatingING field!")
    # create OpenVisus field for the PM25 data's *variables*
    # NOTE(original author): this probably means the 'preprocessing' that we
    # did of the data was no use, since it won't be converted as well?
    f = Field('PM25', 'float32')

    print("created field!")
    idx_path = idx_dirs[i] + '/' + ids[i] + '.idx'
    print(f"idx path is {idx_path}")

    # create the idx file for this dataset, it is 2D and the time range is
    # sized as number of files * 24 (total number of hours)
    db = CreateIdx(url=idx_path, fields=[f],
                   dims=[int(curr_dims[5]), int(curr_dims[4])],
                   time=[0, dataset_files_count[i] * 24, '%00000000d/'])

    print("created empty idx!")

    # get list of file names for dataset at netcdf_dir
    print(f"getting files at {netcdf_dir}")
    file_names = os.listdir(netcdf_dir)

    # to track what timestep we are on in idx
    tstep = 0

    # for all netcdf files we downloaded for i'th dataset, in sorted name order
    for file in tqdm(sorted(file_names)):
        # get file's path
        path = netcdf_dir + "/" + file

        # if opening the file with xarray fails, skip the file
        try:
            ds = xr.open_dataset(path)
        except Exception:
            # `Exception` (not a bare `except`) keeps the skip-on-failure
            # behaviour while still letting Ctrl-C interrupt the conversion.
            continue

        # Read the PM25 values once per file (not once per hour), squeeze
        # out empty axes, and close the dataset as soon as they are loaded.
        with ds:
            file_vals = np.squeeze(ds['PM25'].values)

        # For all 24 hours in current file, a full day is time slices 15 through 15+23
        for h in range(15, 15 + 24):
            # Get h'th hour of data, a 2D slice
            tstep_hour_vals = file_vals[h]

            # print the slice's shape (the label promises the shape, not the data)
            print(f'np.shape(tstep_hour_vals) = {np.shape(tstep_hour_vals)}')

            # Write values at hour h to the current idx timestep and field f
            db.write(data=tstep_hour_vals, field=f, time=tstep)

            # move to next timestep in IDX
            tstep = tstep + 1

In [None]:
# Display dataset_dirs, the directories iterated over by the conversion loop
dataset_dirs

In [None]:
# Display dataset_files_count; each entry is multiplied by 24 to size the idx time range
dataset_files_count

In [None]:
# Change directory to the directory that holds this dataset's idx data.
# Use the idx_dir configured earlier in the notebook instead of a hard-coded
# MacBook path, so this works with whichever machine's paths are active.
os.chdir(idx_dir)

In [None]:
# Call compressDataset on db, passing 'zip' as the only entry in the list.
# db is whichever idx dataset the conversion loop created last.
# NOTE(review): presumably 'zip' selects the compression algorithm -- confirm against OpenVisus docs.
db.compressDataset(['zip'])