# Here we debug the sequence produced by our firesmoke conversion scripts

## We need to make sure that the missing dates are truly missing because the netCDF files are unavailable

In [None]:
# Used to read/manipulate netCDF data
import xarray as xr

# Used for numerical work
import numpy as np

# Used for processing netCDF time data
import datetime

# To load/save final sequence array to file
import pickle

# for checking and using timestamps
import pandas as pd

# for downloading from internet, we use wget here to forcibly download whatever is available at URL...
import wget

# Accessory, used to generate progress bar for running for loops
# from tqdm.notebook import tqdm
# import ipywidgets
# import jupyterlab_widgets
from tqdm import tqdm

In [None]:
# Restore idx_calls from its pickle file.
# NOTE: unpickling executes arbitrary code, so only load a file you produced yourself.
idx_calls_path = 'idx_calls_v4.pkl'
with open(idx_calls_path, 'rb') as pkl_file:
    idx_calls = pickle.load(pkl_file)

In [None]:
# First and last timestamps (both at midnight) of the period we expect to cover
day_format = "%Y%m%d"
start_date, end_date = (
    pd.Timestamp(datetime.datetime.strptime(day, day_format))
    for day in ("20210303", "20240627")
)

# Every whole hour from start_date through end_date, both endpoints included
one_hour = pd.Timedelta(hours=1)
whole_hours = (end_date - start_date) // one_hour
desired_tflag_set = {start_date + step * one_hour for step in range(whole_hours + 1)}

print(f'There are {len(desired_tflag_set)} hours between 3/3/21 and 6/27/24')

In [None]:
# Collect the hour recorded in each idx conversion call
# (the hour is read from index 2 of every entry in idx_calls)
idx_hours = set()
for call in idx_calls:
    idx_hours.add(call[2])

# Desired hours that never show up in the idx conversion
hours_missing_set = desired_tflag_set - idx_hours

print(f'There are {len(hours_missing_set)} missing hours')

In [None]:
# Sorted array of the missing hours, earliest first
hours_missing_sorted = np.sort(list(hours_missing_set))

# check out first 1000 missing hours
print(hours_missing_sorted[:1000])

#### Let's see if the hours missing aren't available from firesmoke.ca...
For each missing date, let's grab that day and the 3 days that **precede** it (4 days in total), for each dataset. Then we'll check whether any of the downloaded files contain the missing hours...

We may have somehow failed to download all available netCDF files in `data_download` workflow...

In [None]:
# Scratch directory that holds the files downloaded for this check;
# the download queries place each file under <garbage>/<dataset id>/
garbage = '/usr/sci/scratch_nvme/arleth/total_garbage'

From all the missing hours, get the distinct calendar dates (later formatted as YYYYMMDD); these are what we use to query firesmoke.ca.

In [None]:
# Distinct calendar days (each hour normalized to midnight) that contain
# at least one missing hour
missing_dates = {t.normalize() for t in hours_missing_sorted}

print(f'{len(missing_dates)} total missing dates')

In [None]:
# Build the set of (url, local path) download queries from missing_dates
# and the per-dataset metadata below.
ids = ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
init_times = ["02", "08", "14", "20"]

# For every missing date look at that date and the 3 days before it (4 days total)
lookback_offsets = [pd.Timedelta(days=n) for n in range(4)]

to_download = set()
for missing_day in missing_dates:
    for lookback in lookback_offsets:
        # YYYYMMDD string used in both the URL and the local file name
        date_str = (missing_day - lookback).strftime('%Y%m%d')

        # one query per dataset; each id is paired with the init time at the
        # same position in init_times
        to_download.update(
            (
                f'https://firesmoke.ca/forecasts/{id_}/{date_str}{init_time}/dispersion.nc',
                f'{garbage}/{id_}/dispersion_{date_str}.nc',
            )
            for id_, init_time in zip(ids, init_times)
        )

print(f'there are {len(to_download)} files to try downloading...')

The following code block downloads the files. I commented it out to avoid rerunning the download script, overwriting files, etc.

In [None]:
# # download files, do this as a python script by doing `nbconvert --to script...`, 
# # it's faster somehow
# for q in tqdm(to_download):
#     print(f'downloading {q[0]} to {q[1]}')
#     wget.download(q[0], out=q[1])

In [None]:
# Display the (url, local path) pairs queued for download
to_download

#### See all hours available from files we downloaded. Check intersection with our missing tflags.

In [None]:
# Track files that successfully opened, keyed by dataset id
successful_files = {id_: [] for id_ in ids}

# Set of (dataset id, hour) pairs available from the newly downloaded files
new_hours = set()

# Try opening all downloaded files
for _url, path in to_download:
    # Paths are built as <garbage>/<dataset id>/<file name>, so the last two
    # path components are the dataset id and the file name.
    _, id_, file_name = path.rsplit('/', 2)

    # Guard ONLY the open call. Previously a bare `except:` wrapped the whole
    # body, so any error while reading TFLAG (including a NameError if
    # parse_tflag is not defined) was reported as a missing file and
    # new_hours silently stayed empty.
    try:
        ds = xr.open_dataset(path)
    except (OSError, ValueError) as err:
        # file is absent, or is not a readable netCDF file
        print(f'FAILED, {id_}, {file_name}: {err}')
        continue

    # Close every dataset once its hours are collected so file handles do not
    # pile up across the whole download set.
    with ds:
        successful_files[id_].append(path)

        # Read TFLAG once per file instead of once per time step
        tflags = ds['TFLAG'].values

        # add each available hour to new_hours
        for h in range(ds.sizes["TSTEP"]):
            # NOTE(review): parse_tflag is not defined in the visible part of
            # this notebook; it must be in scope before running this cell.
            curr_tflag = parse_tflag(tflags[h][0])
            new_hours.add((id_, pd.Timestamp(curr_tflag)))

In [None]:
# Per-dataset count of downloaded files that xarray could open
for dataset_id in ids:
    opened_paths = successful_files[dataset_id]
    print(
        f'For {dataset_id}:',
        f'Of files downloaded, there are {len(opened_paths)} that open successfully.',
        '---',
        sep='\n',
    )

# new_hours holds (dataset id, hour) pairs, so this counts pairs
print(f'Of files downloaded, there are {len(new_hours)} hours available.')

#### Determine how many new hours were downloaded that we could add to the final IDX conversion, if any...

In [None]:
# Keep only the timestamp of every (dataset id, timestamp) pair downloaded
hours_set = {stamp for _, stamp in new_hours}

# Downloaded timestamps that are also among the missing hours
recovered_hours = hours_set & hours_missing_set

print(f'there are {len(recovered_hours)} hours available from downloaded data')

In [None]:
# Display the downloaded hours that are also in the set of missing hours
hours_set.intersection(hours_missing_set)

## Turns out all of the hours we see as missing are truly unavailable.
We will proceed to share.

In [None]:
# Display the shape of the sorted missing-hours array
np.shape(hours_missing_sorted)

In [None]:
# Display the sorted missing hours
hours_missing_sorted

In [None]:
# Table of missing hours: a header row followed by one
# [hour, day, month, year] row per missing timestamp
csv_arr = [['hour', 'day', 'month', 'year']]
csv_arr.extend([h.hour, h.day, h.month, h.year] for h in hours_missing_sorted)

# Display the table's shape
np.shape(csv_arr)

In [None]:
# ref: https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
# ref: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# csv_arr[0] is the header row and the remaining rows are the data, so use it
# as the real column names. index=False keeps pandas' positional index out of
# the file; previously it was written as an extra unnamed first column, which
# made the header line read "0,hour,day,month,year".
df = pd.DataFrame(csv_arr[1:], columns=csv_arr[0])
df.to_csv("missing_hours.csv", index=False)