# Firesmoke Data Conversion to IDX using OpenVisus

## Import the necessary libraries (install any that are missing). This notebook was developed with Python 3.9.

In [None]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used to convert to .idx
from OpenVisus import *

# Used for numerical work
import numpy as np

# Used for processing netCDF time data
import datetime

# Used for interacting with OS file system (to get directory file names)
import os

# Used for resampling arrays to fit the same lat/lon grid
from scipy.interpolate import griddata

# for plotting
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# Accessory, used to generate progress bar for running for loops
# from tqdm.notebook import tqdm
# import ipywidgets
# import jupyterlab_widgets
from tqdm import tqdm

## Get relevant directory paths

In [None]:
# ******* THIS IS WHEN RUNNING FROM ATLANTIS.SCI **************
# base path under which the idx file and its data get saved
idx_dir = "/usr/sci/scratch_nvme/arleth/idx/firesmoke2d"

# root directory holding all of the firesmoke data; change it to match
# wherever the data is mounted on your own machine
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

In [None]:
# # ******* THIS IS WHEN RUNNING FROM MY MACBOOK **************
# # directory to all firesmoke data, mounted on my personal machine, change accordingly
# firesmoke_dir = "/Users/arleth/Mount/firesmoke"

# # path to save idx file and data
# idx_dir = "/Users/arleth/Mount/idx/firesmoke2d"

In [None]:
# directory name for dataset
dataset_name = "BSC00CA12-01"

# Inside dataset_dir are the netcdf files
dataset_dir = firesmoke_dir + "/" + dataset_name

# Inside idx_dir is where to save the final idx file.
# idx_dir is rebound in place, so only append the dataset name when it is not
# already the last path component; otherwise re-running this cell would keep
# nesting the dataset name deeper (.../BSC00CA12-01/BSC00CA12-01/...).
if os.path.basename(idx_dir) != dataset_name:
    idx_dir = idx_dir + "/" + dataset_name

## Gather the metadata of our files, since it is inconsistent from file to file. We need to know what to normalize across all files.

### In particular:
#### 1. Count the number of files in each firesmoke directory.
#### 2. Determine maximum row,col dimension sizes for pm25 array.
#### 3. Determine maximum latitude longitude grid parameters.

In [None]:
# list of all files that are available from UBC, i.e. every file that both
# opened and exposed all of the grid attributes read below
successful_files = []

# files that could not be used, stored as (file name, raised exception) pairs
failed_files = []

# variables to hold maxes, also to track the unique max values
# (integer zeros: these maxima are later used as array dimensions)
max_ncols = 0
max_nrows = 0
ncols = set()
nrows = set()

# useful:
# longitude = np.linspace(xorig, xorig + xcell * (ncols - 1), ncols)
# latitude = np.linspace(yorig, yorig + ycell * (nrows - 1), nrows)
max_grid_x = {"xorig" : 0.0, "xcell" : 0.0}
max_grid_y = {"yorig" : 0.0, "ycell" : 0.0}
xorigs = set()
xcells = set()
yorigs = set()
ycells = set()

# get list of netcdf file names for BSC00CA12-01 dataset
file_names = os.listdir(dataset_dir)

# try opening each file, process only if it successfully opens
for file in tqdm(file_names):
    # get file's path
    path = dataset_dir + "/" + file

    # Read every attribute we need while the file is open; the context manager
    # closes the handle right away so handles do not pile up across files.
    try:
        with xr.open_dataset(path) as ds:
            file_ncols, file_nrows = ds.NCOLS, ds.NROWS
            file_xorig, file_yorig = ds.XORIG, ds.YORIG
            file_xcell, file_ycell = ds.XCELL, ds.YCELL
    except Exception as err:
        # The file is missing, unreadable, or lacks a grid attribute. Skipping
        # is still best-effort, but the failure is recorded instead of being
        # silently dropped, and Ctrl-C is no longer swallowed.
        failed_files.append((file, err))
        continue

    # Only now is the file known to be usable, so no partially-read file can
    # end up in successful_files.
    successful_files.append(file)

    # update maxes accordingly
    # these *are* allowed to get mixed up between files right? in this case don't need to worry about it
    max_ncols = max(max_ncols, file_ncols)
    max_nrows = max(max_nrows, file_nrows)

    # these should not get mixed up between files right? or can they?
    # if they do get mixed up, wouldn't it be an ill-defined grid?
    # key=abs keeps whichever value has the largest magnitude, sign included
    # ref: https://stackoverflow.com/questions/18296755/python-max-function-using-key-and-lambda-expression
    max_grid_x["xorig"] = max(max_grid_x["xorig"], file_xorig, key=abs)
    max_grid_y["yorig"] = max(max_grid_y["yorig"], file_yorig, key=abs)
    max_grid_x["xcell"] = max(max_grid_x["xcell"], file_xcell, key=abs)
    max_grid_y["ycell"] = max(max_grid_y["ycell"], file_ycell, key=abs)

    # update sets
    ncols.add(file_ncols)
    nrows.add(file_nrows)
    xorigs.add(file_xorig)
    yorigs.add(file_yorig)
    xcells.add(file_xcell)
    ycells.add(file_ycell)

# make skipped files visible rather than silently ignoring them
if failed_files:
    print(f'Skipped {len(failed_files)} of {len(file_names)} files that could not be read')

In [None]:
# Build the report of the metadata scan first, then emit it one line at a time
report_lines = [
    f'Number of successful files is {np.size(successful_files)}',
    f'Max cell sizes are max_ncols = {max_ncols} and max_nrows = {max_nrows}',
    f'Max xorig & xcell: {max_grid_x}',
    f'Max yorig & ycell: {max_grid_y}',
    f'ncols: {ncols}',
    f'nrows: {nrows}',
    f'xorigs: {xorigs}',
    f'yorigs: {yorigs}',
    f'xcells: {xcells}',
    f'ycells: {ycells}',
]
for report_line in report_lines:
    print(report_line)

### Get latitude/longitude coordinates for both the max-sized grid and the smaller grid; these are used for resampling during conversion

In [None]:
# get parameters for bigger lat/lon
max_xorig = max_grid_x['xorig']
max_xcell = max_grid_x['xcell']
max_yorig = max_grid_y['yorig']
max_ycell = max_grid_y['ycell']

# get arrays of bigger lat/lon grid
# np.linspace requires an integer sample count, so cast it explicitly
# (the same cast is applied when these maxima are used as the idx dims)
big_lon = np.linspace(max_xorig, max_xorig + max_xcell * (max_ncols - 1), int(max_ncols))
big_lat = np.linspace(max_yorig, max_yorig + max_ycell * (max_nrows - 1), int(max_nrows))

# get coordinates made of new lat/lon arrays
# column_stack builds the same (n_points, 2) array of (lon, lat) pairs as
# zipping the flattened grids, without a Python-level loop over every point
big_lon_pts, big_lat_pts = np.meshgrid(big_lon, big_lat)
big_tups = np.column_stack((big_lon_pts.ravel(), big_lat_pts.ravel()))

# get arrays of smaller lat/lon grid
# the sample file is looked up inside dataset_dir so it follows dataset_name
# instead of repeating the dataset's directory name here;
# sml_ds is left open on purpose so it can still be inspected afterwards
sml_ds = xr.open_dataset(dataset_dir + "/dispersion_20210304.nc")
sml_lon = np.linspace(sml_ds.XORIG, sml_ds.XORIG + sml_ds.XCELL * (sml_ds.NCOLS - 1), int(sml_ds.NCOLS))
sml_lat = np.linspace(sml_ds.YORIG, sml_ds.YORIG + sml_ds.YCELL * (sml_ds.NROWS - 1), int(sml_ds.NROWS))

# get coordinates made of small lat/lon arrays
sml_lon_pts, sml_lat_pts = np.meshgrid(sml_lon, sml_lat)
sml_tups = np.column_stack((sml_lon_pts.ravel(), sml_lat_pts.ravel()))

## TESTING `resample_array` AND SCRIBBLES

### This is plotting the original 381x1041 file

In [None]:
# # Get the PM25 values, squeeze out empty axis
# vals = np.squeeze(sml_ds['PM25'].values)

# # Perform the interpolation
# arr = griddata(sml_tups, vals[15].flatten(), big_tups, method='cubic', fill_value=0)

# # Any values that are less than a given threshold, make it 0
# arr[arr < 1e-15] = 0

# # Reshape the result to match the new grid shape
# arr = arr.reshape((len(big_lat), len(big_lon)))

# arr = arr.astype(np.float32)

In [None]:
# np.min(arr)

In [None]:
# type(arr[0,0])

In [None]:
# # Let's use matplotlib's imshow, since our data is on a grid
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # Initialize a figure and plot, so we can customize figure and plot of data
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html
# # ref: https://scitools.org.uk/cartopy/docs/latest/getting_started/index.html
# my_fig, my_plt = plt.subplots(figsize=(15, 6), subplot_kw=dict(projection=ccrs.PlateCarree()))

# # Let's set some parameters to get the visualization we want
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # color PM25 values on a log scale, since values are small
# my_norm = "log" 
# # this will number our x and y axes based on the longitude latitude range
# my_extent = [np.min(sml_lon), np.max(sml_lon), np.min(sml_lat), np.max(sml_lat)]
# # ensure the aspect ratio of our plot fits all data, matplotlib can do this automatically
# my_aspect = 'auto'
# # tell matplotlib, our origin is the lower-left corner
# my_origin = 'lower'
# # select a colormap for our plot and the color bar on the right
# my_cmap = 'viridis'

# # create our plot using imshow
# plot = my_plt.imshow(arr, norm=my_norm, extent=my_extent, 
#           aspect=my_aspect, origin=my_origin, cmap=my_cmap)

# # draw coastlines
# my_plt.coastlines()

# # draw latitude longitude lines
# # ref: https://scitools.org.uk/cartopy/docs/latest/gallery/gridlines_and_labels/gridliner.html
# my_plt.gridlines(draw_labels=True)

# # add a colorbar to our figure, based on the plot we just made above
# my_fig.colorbar(plot,location='right', label='ug/m^3')

# # # Set x and y axis labels on our ax
# # my_plt.set_xlabel('Longitude')
# # my_plt.set_ylabel('Latitude')

# # Set title of our figure
# my_fig.suptitle('Ground level concentration of PM2.5 microns and smaller')

# # # Set title of our plot as the timestamp of our data
# # my_plt.set_title(f'{my_timestamp}')

# # Show the resulting visualization
# plt.show()

### This is visualizing the resampled version of the array above, from the 381x1041 grid -> 381x1081 grid

In [None]:
# # Let's use matplotlib's imshow, since our data is on a grid
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # Initialize a figure and plot, so we can customize figure and plot of data
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html
# # ref: https://scitools.org.uk/cartopy/docs/latest/getting_started/index.html
# my_fig, my_plt = plt.subplots(figsize=(15, 6), subplot_kw=dict(projection=ccrs.PlateCarree()))

# # Let's set some parameters to get the visualization we want
# # ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html

# # color PM25 values on a log scale, since values are small
# my_norm = "log" 
# # this will number our x and y axes based on the longitude latitude range
# my_extent = [np.min(big_lon), np.max(big_lon), np.min(big_lat), np.max(big_lat)]
# # ensure the aspect ratio of our plot fits all data, matplotlib can do this automatically
# my_aspect = 'auto'
# # tell matplotlib, our origin is the lower-left corner
# my_origin = 'lower'
# # select a colormap for our plot and the color bar on the right
# my_cmap = 'viridis'

# # create our plot using imshow
# plot = my_plt.imshow(arr_resamp, norm=my_norm, extent=my_extent, 
#           aspect=my_aspect, origin=my_origin, cmap=my_cmap, vmin=.00001, vmax=1)

# # draw coastlines
# my_plt.coastlines()

# # draw latitude longitude lines
# # ref: https://scitools.org.uk/cartopy/docs/latest/gallery/gridlines_and_labels/gridliner.html
# my_plt.gridlines(draw_labels=True)

# # add a colorbar to our figure, based on the plot we just made above
# my_fig.colorbar(plot,location='right', label='ug/m^3')

# # # Set x and y axis labels on our ax
# # my_plt.set_xlabel('Longitude')
# # my_plt.set_ylabel('Latitude')

# # Set title of our figure
# my_fig.suptitle('Ground level concentration of PM2.5 microns and smaller')

# # # Set title of our plot as the timestamp of our data
# # my_plt.set_title(f'{my_timestamp}')

# # Show the resulting visualization
# plt.show()

## Do conversion from netCDF files to IDX

In [None]:
# Create idx file of i'th dataset
# useful for dealing with fields that are not all the same size:
# https://github.com/sci-visus/OpenVisus/blob/master/Samples/jupyter/nasa_conversion_example.ipynb

# create OpenVisus field for the pm25 variable
f = Field('PM25', 'float32')

# create the idx file for this dataset using field f
# dims is maximum array size, we will resample data accordingly to fit this
# time is number of files * 24 (hours)
db = CreateIdx(url=idx_dir + '/' + dataset_name + '.idx', fields=[f], 
               dims=[int(max_ncols), int(max_nrows)], time=[0, (len(successful_files) * 24) - 1, '%00000000d/'])

# put file names *in order* so we store data chronologically
# perhaps do this in a less *fragile* way?
# NOTE(review): this relies on the file names sorting chronologically — confirm
successful_files = np.sort(successful_files)

# to track what timestep we are on in idx
tstep = 0

# threshold to use to change small-enough resampled values to 0
thresh = 1e-15

# for all netcdf files we downloaded for i'th dataset
for file in tqdm(successful_files):
    # open the file with xarray; the context manager closes the handle once
    # everything needed is in memory, so handles do not accumulate per file
    with xr.open_dataset(dataset_dir + "/" + file) as ds:
        # Get the PM25 values, squeeze out empty axis
        file_vals = np.squeeze(ds['PM25'].values)

        # to decide if we need to resample or not
        file_xorig = ds.XORIG
        resamp = file_xorig != max_xorig

    # the decision is the same for all 24 hours, so report it once per file
    # NOTE(review): resampling always uses sml_tups, i.e. it assumes every
    # file that is not on the max grid shares the sample file's grid — confirm
    if resamp:
        print(f'resampling... {file_xorig} != {max_xorig}')

    # For all 24 hours in current file, a full day is time slices 15 through 15+23
    # NOTE(review): assumes every file holds at least 39 time slices — confirm
    for h in tqdm(np.arange(15, 15+24)):
        # resample data if not already on max lat/lon grid
        if resamp:
            # Perform the interpolation
            file_vals_resamp = griddata(sml_tups, file_vals[h].flatten(), big_tups, method='cubic', fill_value=0)

            # Any values that are less than a given threshold, make it 0
            file_vals_resamp[file_vals_resamp < thresh] = 0

            # Reshape the result to match the new grid shape
            file_vals_resamp = file_vals_resamp.reshape((len(big_lat), len(big_lon)))
            print(f'np.min(file_vals_resamp) = {np.min(file_vals_resamp)}')
            # Write resampled values at hour h to timestep t and field f
            db.write(data=file_vals_resamp.astype(np.float32),field=f,time=tstep)
        else:
            # Write original values at hour h to timestep t and field f,
            # as float32 to match the field and the resampled branch
            # (copy=False makes this a no-op when the data is already float32)
            db.write(data=file_vals[h].astype(np.float32, copy=False), field=f, time=tstep)

        # move to next timestep in IDX
        tstep = tstep + 1

In [None]:
# go to idx data directory
# idx_dir now points at this dataset's own folder, so its parent is the base
# idx directory; deriving it keeps this cell in sync with whichever path
# configuration was selected above instead of hard-coding one machine's path
os.chdir(os.path.dirname(idx_dir))

In [None]:
# compress dataset
# Calls compressDataset on the db handle returned by CreateIdx, passing ['zip'].
# NOTE(review): presumably this selects zip compression for the data written
# above — confirm against the OpenVisus documentation.
db.compressDataset(['zip'])