# Identify missing hours between March 3, 2021 and February 10, 2024.
#### We want to fill all gaps if possible.

#### Import necessary libraries

In [None]:
# for loading netcdf files, for metadata
import xarray as xr
from backend_v3 import *

# Used for processing netCDF time data
import time
import datetime

# Used for numerical work
import numpy as np

# Used for loading data from pickle data
import pickle

# For working with pandas timestamps
import pandas as pd

# For downloading from the internet, retrying netCDF files
import wget

# Accessory for generating progress bar to see progress of loops
from tqdm import tqdm

#### Set relevant directories

In [None]:
# ******* THIS IS WHEN RUNNING FROM ATLANTIS.SCI **************
# Location of the firesmoke data on that host; adjust when running elsewhere.
# NOTE(review): not referenced by any of the cells visible in this part of the notebook.
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

### 1. Determine the hours that failed to visualize in video, even though those hours are available in netCDF files/idx...

#### Import the issues found while generating visualizations of each timestep from IDX conversion and original netCDF files

In [None]:
# Load the two pickled issue collections recorded while visualizing each
# timestep: one for the IDX conversion, one for the original netCDF files.
# Both objects are used as mappings (keyed lookups, .keys()) in later cells.
with open("new_idx_issues.pkl", "rb") as idx_file, \
        open("new_netcdf_issues.pkl", "rb") as netcdf_file:
    idx_issues = pickle.load(idx_file)
    netcdf_issues = pickle.load(netcdf_file)

# report how many issues each source has
print(f'Number of IDX issues: {len(idx_issues)}')
print(f'Number of netCDF issues: {len(netcdf_issues)}')

#### Let's see where the IDX conversion and netCDF files agree on issues. 
If a timestep is an issue in the original netCDF file, it will be an issue in the IDX conversion too. However, the converse is not necessarily true.

In [None]:
# Key sets (datetimes) of the issues recorded for each version of the data
idx_keys = set(idx_issues.keys())
netcdf_keys = set(netcdf_issues.keys())

# datetime keys reported by both the IDX and the netCDF issues
common_dates = netcdf_keys & idx_keys

# datetime keys reported by only one version of the firesmoke data
idx_only_issues = idx_keys - netcdf_keys
netcdf_only_issues = netcdf_keys - idx_keys

# report the size of each of the three groups
print(f'Num. of issues IDX and netCDF agree on is {len(common_dates)}')
print(f'Num. of issues unique to IDX is {len(idx_only_issues)}')
print(f'Num. of issues unique to netCDF is {len(netcdf_only_issues)}')

#### Now let's see what's wrong with the array at the issues found above...
I suspect these arrays are all zeros

In [None]:
# Keep the key of every issue whose stored array compares equal to 0 in
# every element, separately for the netCDF and the IDX issues.
netcdf_zeros = {
    key for key, array in netcdf_issues.items() if np.all(array == 0)
}
idx_zeros = {
    key for key, array in idx_issues.items() if np.all(array == 0)
}

print(f'Number of idx issues where array is all zeros is: {len(idx_zeros)}')
print(f'Number of netcdf issues where array is all zeros is: {len(netcdf_zeros)}')

#### Let's see what dates failed i.e. are just all zeros

In [None]:
# Truncate every idx_issues key to the hour and collect the distinct hours
# as pandas Timestamps.
# NOTE(review): this walks all of idx_issues, not only idx_zeros, so calling
# the result "all zeros" holds only if the zero check flagged every entry of
# idx_issues -- confirm against the counts printed above.
zero_hours = {
    pd.Timestamp(year=key.year, month=key.month, day=key.day, hour=key.hour)
    for key in idx_issues
}

In [None]:
# Report how many distinct hours ended up in zero_hours (built from the idx_issues keys).
print(f'there are {len(zero_hours)} hours whose data are all zeros')

## 2. See what timesteps are unavailable due to netCDF files being missing
#### These are the timesteps available from our IDX conversion

#### Import data available from URL in our metadata file.

In [None]:
# path to tiny netcdf
tiny_netcdf = "firesmoke_metadata.nc"

# open tiny netcdf with xarray and OpenVisus backend
# NOTE(review): OpenVisusBackendEntrypoint is not defined in this notebook;
# presumably it comes from `from backend_v3 import *` -- confirm.
ds = xr.open_dataset(tiny_netcdf, engine=OpenVisusBackendEntrypoint)
# peek at the first TFLAG entry; as the last expression of the cell the
# notebook displays its value
ds['TFLAG'].values[0][0]

In [None]:
def parse_tflag(tflag) -> datetime.datetime:
    """
    Return the tflag as a datetime object

    The date is resolved as "January 1st of the year plus (day of year - 1)
    days", and the year of the result is taken from that resolved date. A day
    of year outside the calendar year therefore rolls into the neighbouring
    year (e.g. day 366 of a non-leap year is January 1st of the next year)
    instead of being paired with the original year.

    :param list tflag: a list of two int32, the 1st representing date as
        YYYYDDD (year, day of year) and the 2nd representing time as HHMMSS
    :return: naive datetime for the given date and time
    :rtype: datetime.datetime
    :raises ValueError: if the year, hour, minute or second is out of the
        range accepted by ``datetime.datetime``
    """
    # split the date field (YYYYDDD) into year and 1-based day of year
    year, day_of_year = divmod(int(tflag[0]), 1000)

    # split the time field (HHMMSS); it is named hhmmss so the imported
    # `time` module is not shadowed inside this function
    hhmmss = int(tflag[1])
    hours, remainder = divmod(hhmmss, 10000)
    minutes, seconds = divmod(remainder, 100)

    # calendar date: start of the year plus the day-of-year offset
    final_date = datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1)

    # set the clock on the resolved date; keeping final_date's own year
    # (rather than re-using `year`) keeps year, month and day consistent
    return final_date.replace(hour=hours, minute=minutes, second=seconds)

#### Create the set of the available hours in this dataset
Including the hours with all zeros...

In [None]:
# Convert the first TFLAG pair of every timestep into a datetime object and
# keep the distinct values; this set is compared against the desired hours
# later to see what is missing.
actual_tflag_set = {parse_tflag(step[0]) for step in ds['TFLAG'].values}

print(f'there are {len(actual_tflag_set)} hours available in firesmoke dataset')

#### Create the set of all hours we would like, between dates 3/3/2021 - 2/10/2024

In [None]:
# First and last hour of interest, both at midnight
start_date = pd.Timestamp(datetime.datetime(2021, 3, 3))
end_date = pd.Timestamp(datetime.datetime(2024, 2, 10))

# Walk from start_date to end_date (inclusive) one hour at a time.
# Note that only the midnight hour of the end date is included.
one_hour = pd.Timedelta(hours=1)
desired_tflag_set = set()
current_hour = start_date
while current_hour <= end_date:
    desired_tflag_set.add(current_hour)
    current_hour = current_hour + one_hour

print(f'There are {len(desired_tflag_set)} hours between 3/3/21 and 2/10/24')

#### Determine what hours are apparently missing netCDF files using the sets above

In [None]:
# Hours we want that are not among the hours available in the dataset
missing_tflags = desired_tflag_set - actual_tflag_set

print(f'there are {len(missing_tflags)} missing hours from our dataset')

## We must account for these missing hours.

### Import all calls made during conversion. Determine holes in those calls and, again, confirm the holes really are because of missing data.

In [None]:
# Load the pickled record of the calls made during the IDX conversion
with open("idx_calls.pkl", "rb") as calls_file:
    idx_calls = pickle.load(calls_file)

print(f'total hours called during idx conversion is {len(idx_calls)}')

#### Ensure that none of the missing_tflags were called during IDX conversion in the first place.

In [None]:
# Distinct timestamps of all recorded calls, as pandas Timestamps.
# NOTE(review): assumes element 2 of each call record is its datetime --
# confirm against whatever produced idx_calls.pkl.
idx_calls_set = {pd.Timestamp(record[2]) for record in idx_calls}

# Desired hours that were never requested during the conversion
desired_not_called = desired_tflag_set - idx_calls_set

# Desired-but-uncalled hours that are not already counted as missing
uncalled_but_not_missing = desired_not_called - missing_tflags

print(f'len(idx_calls_set) = {len(idx_calls_set)}')
print(f'len(desired_not_called.difference(missing_tflags)) = {len(uncalled_but_not_missing)}')

#### Good, now let's see if the missing_tflags aren't available from firesmoke.ca...
For each missing date, let's grab that day and the 3 days that **precede** it (4 days in total) from each dataset. Then we'll check whether any of the downloaded files contain the missing hours...

In [None]:
# directory to hold the downloaded files
# (the download targets built below are placed in one subfolder per dataset id under it)
garbage = '/usr/sci/scratch_nvme/arleth/total_garbage'

In [None]:
# from all the failed hours, get the DDMMYYYY dates, this is what we use to query firesmoke.ca
missing_dates = set()

for t in missing_tflags:
    missing_dates.add(t.normalize())

missing_dates

In [None]:
# make sequence of queries to download, using missing_tflags and dataset metadata
ids = ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
init_times = ["02", "08", "14", "20"]
to_download = set()

# for all missing dates
for date in missing_dates:
    # for 4 days
    for i in range(4):
        curr_date = date + pd.Timedelta(days=-i)
        
        # build date string for query
        date_str = curr_date.strftime('%Y%m%d')
        
        # for each dataset
        for id_, init_time in zip(ids, init_times):
            # build URL string to download from and directory & filename to download to
            url = f'https://firesmoke.ca/forecasts/{id_}/{date_str}{init_time}/dispersion.nc'
            directory = f'{garbage}/{id_}/dispersion_{date_str}.nc'
            to_download.add((url, directory))

print(f'there are {len(to_download)} files to try downloading...')

In [None]:
# Download every (url, destination) pair queued in to_download.
# Tip: do this as a python script by doing `nbconvert --to script...`,
# it's faster somehow.
import os

# (url, destination, error) for every download that did not complete
failed_downloads = []

for url, destination in tqdm(to_download):
    print(f'downloading {url} to {destination}')

    # make sure the per-dataset folder exists before writing into it
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    try:
        wget.download(url, out=destination)
    except OSError as err:
        # A file that is not available is an expected outcome here (we are
        # probing which forecasts exist), so one failed request must not
        # abort the remaining downloads. urllib's HTTPError/URLError and
        # filesystem errors are all OSError subclasses.
        failed_downloads.append((url, destination, err))
        print(f'could not download {url}: {err}')

print(f'{len(failed_downloads)} of {len(to_download)} downloads failed')

#### See all hours available from files we downloaded. Check intersection with our missing tflags.

In [None]:
# # Track files that successfully opened
# successful_files = {id_: [] for id_ in ids}

# # Try opening all downloaded files
# for q in to_download:
    