# Visualize ECCC and UBC

## Import libraries and data

In [1]:
# for numerical work
import numpy as np
import itertools
# from sklearn.preprocessing import normalize

# for accessing file system
import os

# for loading netcdf files, for metadata
import xarray as xr
# for connecting OpenVisus framework to xarray
# from https://github.com/sci-visus/openvisuspy, 
from openvisuspy.xarray_backend import OpenVisusBackendEntrypoint

# Used for processing netCDF time data
import time
import datetime
import requests
# Used for indexing via metadata
import pandas as pd

# for plotting
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import folium
from folium.plugins import HeatMap

# Store the OpenVisus cache in a local directory (the name says it can be erased)
import os
os.environ["VISUS_CACHE"]="./visus_cache_can_be_erased"
# NOTE(review): an empty CURL_CA_BUNDLE looks like a workaround for TLS
# certificate errors while fetching data - confirm it is still needed, since
# it may weaken HTTPS certificate verification for this session.
os.environ['CURL_CA_BUNDLE'] = ''

### Import UBC data from IDX

In [2]:
# URL of the tiny NetCDF file
url = 'https://github.com/sci-visus/NSDF-WIRED/raw/main/data/firesmoke_metadata.nc'

# Download the file using requests; the timeout keeps the cell from hanging
# forever if the server never answers
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a .nc file
response.raise_for_status()

local_netcdf = 'firesmoke_metadata.nc'
with open(local_netcdf, 'wb') as f:
    f.write(response.content)

# Open the tiny NetCDF with xarray through the OpenVisus backend
ds_idx = xr.open_dataset(local_netcdf, engine=OpenVisusBackendEntrypoint)

ov.LoadDataset(http://atlantis.sci.utah.edu/mod_visus?dataset=UBC_fire_smoke_BSC&cached=1)
PM25
Adding field  PM25 shape  [27357, 381, 1081, 21] dtype  float32 labels  ['time', 'ROW', 'COL', 'resolution'] Max Resolution  20


### Compute metadata for IDX dataset

In [3]:
# for parsing time flags (TFLAG) from netcdf files
def parse_tflag(tflag):
    """Convert one TFLAG pair into a ``datetime.datetime``.

    ``tflag[0]`` is read as ``year * 1000 + day_of_year`` and ``tflag[1]`` as
    an ``HHMMSS`` integer.
    """
    year, day_of_year = (int(part) for part in divmod(tflag[0], 1000))
    # Walk forward from 1 January to find the calendar month and day.
    calendar_day = datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1)

    hours, remainder = divmod(int(tflag[1]), 10000)
    minutes, seconds = divmod(remainder, 100)

    return datetime.datetime(
        year, calendar_day.month, calendar_day.day, hours, minutes, seconds
    )

In [4]:
# Grid attributes needed to derive the lon and lat axes
xorig, yorig = ds_idx.XORIG, ds_idx.YORIG
xcell, ycell = ds_idx.XCELL, ds_idx.YCELL
ncols, nrows = ds_idx.NCOLS, ds_idx.NROWS

# One evenly spaced value per column / row, starting at the grid origin
longitude = np.linspace(start=xorig, stop=xorig + xcell * (ncols - 1), num=ncols)
latitude = np.linspace(start=yorig, stop=yorig + ycell * (nrows - 1), num=nrows)

# Attach the axes as coordinates on the existing ROW / COL dimensions
ds_idx.coords['lat'] = ('ROW', latitude)
ds_idx.coords['lon'] = ('COL', longitude)

# Make lat/lon the dimensions in place of ROW/COL (credit: Aashish Panta)
ds_idx = ds_idx.swap_dims({'ROW': 'lat', 'COL': 'lon'})

In [5]:
# every time flag stored in the dataset
tflag_values = ds_idx['TFLAG'].values

# one pandas timestamp per time step, parsed from the first entry of each TFLAG row
timestamps = [pd.Timestamp(parse_tflag(row[0])) for row in tflag_values]

# use these pandas timestamps as the values of the 'time' coordinate
ds_idx.coords['time'] = ('time', timestamps)

### Import ECCC 2021 and 2022 Data

In [6]:
# read from CSV, skipping the first 7 rows (indices 0-6) of non-data info
skiprows = list(range(7))
df_2021 = pd.read_csv('PM25_2021.csv', skiprows=skiprows)
df_2022 = pd.read_csv('PM25_2022.csv', skiprows=skiprows)

# combine into 1 dataframe; ignore_index gives every row a unique label
# instead of repeating each file's own 0..n-1 index
df_eccc = pd.concat([df_2021, df_2022], ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'PM25_2021.csv'

### Data Cleaning
1. drop unused columns
2. replace the -999 placeholder with NaN and drop rows that contain NaN
3. rename each hour column header to an integer (0–23) for easier processing
4. convert dates to pandas timestamp datatype

In [None]:
# names of columns in CSV to drop
cols_to_drop = [
    'Province/Territory//Province/Territoire',
    'Pollutant//Polluant',
    'Method Code//Code Méthode',
    'NAPS ID//Identifiant SNPA',
    'City//Ville',
]

# drop the unused columns
df_eccc.drop(cols_to_drop, axis=1, inplace=True)

# set -999 to NaN, then drop every row that contains a NaN
# (np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0)
df_eccc = df_eccc.replace(-999, np.nan)
df_eccc = df_eccc.dropna()

# rename the 24 columns at positions 3-26 (the hour columns) to the integers 0-23
new_names = dict(zip(df_eccc.columns.tolist()[3:27], np.arange(24)))
df_eccc = df_eccc.rename(columns=new_names)

# convert the date column to the pandas timestamp datatype
df_eccc['Date//Date'] = pd.to_datetime(df_eccc['Date//Date'])
df_eccc.head()

---

## Select date to visualize (non-timeseries visualization)
Change `curr_date` to the date you would like to see

In [None]:
# the date (and hour) shown by the static plots; the time-series cells also
# use it as the first hour of their range
curr_date = pd.Timestamp(year=2021, month=7, day=1, hour=0)

# end (exclusive) of the time-series range. The time-series cells read
# `last_date`, `time_delta` and `time_slice`; without these definitions they
# stop with a NameError.
last_date = curr_date + pd.Timedelta(days=1)

# number of hourly time steps in [curr_date, last_date)
time_delta = int((last_date - curr_date) // pd.Timedelta(hours=1))

# label-based slices include both endpoints, so stop one hour before last_date
# NOTE(review): assumes the dataset has exactly one time step per hour - confirm
time_slice = slice(curr_date, last_date - pd.Timedelta(hours=1))

# folium (visual) parameters
static_r = 4 # radius
static_b = 3 # blur
map_start = [51, -106] # start in canada
z = 4 # initial zoom level

### Plot ECCC Smoke Emissions (static)

In [None]:
# get the rows for curr_date's calendar day. The hour is dropped before
# comparing: the date column presumably holds midnight timestamps (the
# time-series cell compares it against a date-only Timestamp), so a curr_date
# with hour != 0 would otherwise match nothing.
date_cond = df_eccc['Date//Date'] == curr_date.normalize()
df_curr_date = df_eccc[date_cond]

# keep the coordinates plus the column for curr_date's hour
eccc_data = df_curr_date[['Latitude//Latitude', 'Longitude//Longitude', curr_date.hour]]
np.shape(eccc_data.values)

In [None]:
# Create the base map centred on map_start at zoom level z
eccc_m = folium.Map(location=map_start, zoom_start=z)

# Overlay the ECCC readings as a heat map with the static radius/blur settings
eccc_heat_layer = HeatMap(eccc_data, radius=static_r, blur=static_b)
eccc_heat_layer.add_to(eccc_m)

# Display the map
eccc_m

### Plot IDX Smoke Emissions (static)

In [None]:
# get lats and lons used in ECCC data
eccc_lats = eccc_data['Latitude//Latitude'].values
eccc_lons = eccc_data['Longitude//Longitude'].values

# desired resolution
data_resolution = 0

# find nearest lat,lon values in idx to eccc lat,lon values
ds_curr_date = ds_idx.loc[dict(time=curr_date, resolution=data_resolution)].sel(
    lat=eccc_lats, lon=eccc_lons, method='nearest')

# pair up the selected lat and lon values, one (lat, lon) row per ECCC reading
idx_coords = np.column_stack([ds_curr_date.lat.values, ds_curr_date.lon.values])

# [lat, lon, PM25] triples for the heat map
idx_data = []

for lat, lon in idx_coords:
    # A failed lookup is reported and skipped so one bad point does not stop
    # the plot. `except Exception` (rather than a bare except) still lets a
    # keyboard/kernel interrupt stop this slow loop, and the message now says
    # what went wrong.
    try:
        val = ds_idx['PM25'].sel(time=curr_date, lat=lat,
            lon=lon, resolution=data_resolution).values
        # populate with lat, lon, and PM25 value
        idx_data.append([lat, lon, float(val)])
    except Exception as err:
        print(f'failed at {lat}, {lon}: {err}')

**Note**: there may be a better way to index these values than querying the dataset once for each (lat, lon) pair.

In [None]:
# Create the base map centred on map_start at zoom level z
idx_map = folium.Map(location=map_start, zoom_start=z)

# Overlay the IDX values as a heat map with the static radius/blur settings
idx_heat_layer = HeatMap(idx_data, radius=static_r, blur=static_b)
idx_heat_layer.add_to(idx_map)

# Display the map
idx_map

---

## Select dates to visualize (timeseries)
Set `curr_date` (start of the range) and `last_date` (end of the range) to the date range you would like to see; the cells below read these two names.

**Note**: If the date range is more than a few days, visualization takes a long time to produce.

In [None]:
def normalize(arr):
    """Rescale all values to lie between 0 and 1 using the array-wide min and max.

    ref: https://www.codecademy.com/article/normalization

    Parameters
    ----------
    arr : array-like
        Values to rescale.

    Returns
    -------
    array-like
        ``(arr - min) / (max - min)``. An empty input is returned as an empty
        float array, and an input whose values are all equal maps to all
        zeros instead of dividing by zero.
    """
    # nothing to rescale; np.min would raise on an empty array
    if np.size(arr) == 0:
        return np.asarray(arr, dtype=float)

    # get min and the spread between min and max
    min_val = np.min(arr)
    value_range = np.max(arr) - min_val

    # constant input: (arr - min) is 0 everywhere, so skip the 0/0 division
    if value_range == 0:
        return np.zeros(np.shape(arr), dtype=float)

    # perform normalization
    return (arr - min_val) / value_range

Set `normalize_data` to 1 to normalize the PM2.5 values, or to 0 to skip normalization.

In [None]:
# 1 = normalize the smoke readings before plotting the time series, 0 = skip
normalize_data = 1

### Plot ECCC Smoke Emissions over time

In [None]:
lat_col = 'Latitude//Latitude'
lon_col = 'Longitude//Longitude'
date_col = 'Date//Date'

# get rows that are within desired time range. The start is compared by
# calendar day so that a curr_date with hour != 0 still keeps its own day.
date_cond = (df_eccc[date_col] <= last_date) & (df_eccc[date_col] >= curr_date.normalize())
df_eccc_time = df_eccc[date_cond]

# one entry per distinct station; the rows hold one station per day, so using
# the rows directly would repeat every station once per day in the range
stations = df_eccc_time[[lat_col, lon_col]].drop_duplicates()
num_coords = len(stations)
station_index = pd.MultiIndex.from_frame(stations)

# init arr to hold time steps and the lats, lons, vals at each timestep
eccc_time_data = np.zeros((time_delta, num_coords, 3))

# populate array with coordinate data
eccc_time_data[:, :, 0] = stations[lat_col].values
eccc_time_data[:, :, 1] = stations[lon_col].values

# we will loop through each hour
d = pd.to_timedelta(1, unit='h')
tstep = 0
tmp_date = curr_date

# loop through each hour, never writing past the rows allocated above
while tmp_date < last_date and tstep < time_delta:
    # rows of the current calendar day, first row per station
    day_rows = df_eccc_time[df_eccc_time[date_col] == tmp_date.normalize()]
    day_rows = day_rows.drop_duplicates(subset=[lat_col, lon_col])

    if not day_rows.empty:
        # PM2.5 value for this date and hour, keyed by (lat, lon); looking all
        # stations up at once replaces a scan of the whole frame per station
        hourly = day_rows.set_index([lat_col, lon_col])[tmp_date.hour]
        # stations without a row on this day keep the value 0
        eccc_time_data[tstep, :, 2] = hourly.reindex(station_index).fillna(0).to_numpy()

    tmp_date += d
    tstep += 1

    # print progress of stepping through timesteps
    if tstep % 2 == 0:
        print('.', end='', flush=True)

# normalize smoke readings across all timesteps and coordinates
if normalize_data:
    eccc_time_data[:, :, 2] = normalize(eccc_time_data[:, :, 2])

In [None]:
# largest value in the readings column, as a quick check
eccc_time_data[..., 2].max()

In [None]:
# start at the same centre and zoom as the static maps instead of repeating
# the numbers here
eccc_time_map = folium.Map(map_start, zoom_start=z)

# data needs to be in list format
# ref: https://stackoverflow.com/questions/75330933/folium-heatmapwithtime-html-file-generated-is-blank
eccc_time_hm = folium.plugins.HeatMapWithTime(eccc_time_data.tolist(), radius=20,
    blur=1, auto_play=True)

eccc_time_hm.add_to(eccc_time_map)

# Display the map
eccc_time_map

---

### Plot IDX Smoke Emissions (over time)

In [None]:
# init arr to hold time steps and the lats,lons,vals at each timestep
idx_time_data = np.zeros((time_delta, len(idx_coords), 3))

for c, (lat, lon) in enumerate(idx_coords):
    # A failed lookup is reported and that coordinate's rows stay zero.
    # `except Exception` (rather than a bare except) still lets a
    # keyboard/kernel interrupt stop this slow loop.
    try:
        # get all PM25 values of current coord for all time steps
        vals = ds_idx['PM25'].sel(time=time_slice, resolution=data_resolution,
            lat=lat, lon=lon).values
        # one [lat, lon, PM25] row per time step for this coordinate
        idx_time_data[:, c] = np.column_stack(
            [np.full(time_delta, lat), np.full(time_delta, lon), vals])
    except Exception as err:
        print(f'failed at {lat}, {lon}: {err}')

# normalize PM2.5 values to be between 0 and 1, honouring the same
# normalize_data switch as the ECCC time series
if normalize_data:
    idx_time_data[:, :, 2] = normalize(idx_time_data[:, :, 2])

In [None]:
# start at the same centre and zoom as the static maps instead of repeating
# the numbers here
idx_time_map = folium.Map(map_start, zoom_start=z)

# data needs to be in list format
# ref: https://stackoverflow.com/questions/75330933/folium-heatmapwithtime-html-file-generated-is-blank
hm = folium.plugins.HeatMapWithTime(idx_time_data.tolist(), radius=20,
    blur=1, auto_play=True)

hm.add_to(idx_time_map)

# Display the map
idx_time_map