# Identify missing hours between March 3, 2021 and February 10, 2024.
#### We want to fill all gaps if possible.

#### Import necessary libraries

In [49]:
# to download latest firesmoke metadata netCDF file
import requests

# for loading netcdf files, for metadata
import xarray as xr
# for connecting OpenVisus framework to xarray
# from https://github.com/sci-visus/openvisuspy, 
from openvisuspy.xarray_backend import OpenVisusBackendEntrypoint

# Used for processing netCDF time data
import time
import datetime

# Used for numerical work
import numpy as np

# Used for loading data from pickle data
import pickle

# For working with pandas timestamps
import pandas as pd

# # For downloading from the internet, retrying netCDF files
# import wget

# For controlling via operating system
import os
import shutil

# Accessory for generating progress bar to see progress of loops
from tqdm import tqdm

### 1. Determine the hours that failed to visualize in video, even though those hours are available in netCDF files/idx...

#### Import the issues found while generating visualizations of each timestep from IDX conversion and original netCDF files

These were generated while running IDX conversion at `conversion`, v3.

In [2]:
with open("new_idx_issues.pkl", "rb") as input_file:
    idx_issues = pickle.load(input_file)

with open("new_netcdf_issues.pkl", "rb") as input_file:
    netcdf_issues = pickle.load(input_file)

# print how many issues there are
print(f'Number of IDX issues: {len(idx_issues)}')
print(f'Number of netCDF issues: {len(netcdf_issues)}')

Number of IDX issues: 444
Number of netCDF issues: 444


#### Let's see where the IDX conversion and netCDF files agree on issues. 
If it's an issue in netCDF original file, it'll be an issue in IDX too. However converse is not necessarily true.

In [3]:
# make set of common datetime keys betwix idx and netcdf issues
common_dates = set(netcdf_issues.keys()).intersection(idx_issues.keys())

# get issues unique to each version of firesmoke data
idx_only_issues = set(idx_issues.keys()).difference(netcdf_issues.keys())
netcdf_only_issues = set(netcdf_issues.keys()).difference(idx_issues.keys())

# see how many such issues exist in these sets
print(f'Num. of issues IDX and netCDF agree on is {len(common_dates)}')
print(f'Num. of issues unique to IDX is {len(idx_only_issues)}')
print(f'Num. of issues unique to netCDF is {len(netcdf_only_issues)}')

Num. of issues IDX and netCDF agree on is 444
Num. of issues unique to IDX is 0
Num. of issues unique to netCDF is 0


#### Now let's see what's wrong with the array at the issues found above...
I suspect these arrays are all zeros

In [4]:
netcdf_zeros = set()
idx_zeros = set()

for i in netcdf_issues:
    if np.all(netcdf_issues[i] == 0):
        netcdf_zeros.add(i)
        
for i in idx_issues:
    if np.all(idx_issues[i] == 0):
        idx_zeros.add(i)
        
print(f'Number of idx issues where array is all zeros is: {len(idx_zeros)}')
print(f'Number of netcdf issues where array is all zeros is: {len(netcdf_zeros)}')

Number of idx issues where array is all zeros is: 444
Number of netcdf issues where array is all zeros is: 444


#### Let's get what dates failed i.e. are just all zeros

In [5]:
zero_hours = set()

for k in idx_issues.keys():
    zero_hours.add(pd.Timestamp(year=k.year, month=k.month, day=k.day, hour=k.hour))

In [6]:
print(f'there are {len(zero_hours)} hours whose data are all zeros')

there are 444 hours whose data are all zeros


## 2. See what timesteps are unavailable due to netCDF files being missing
#### This is the available conversion from our IDX conversion

#### Import data available from firesmoke_metadata.nc

In [7]:
# path to tiny NetCDF
url = 'https://github.com/sci-visus/NSDF-WIRED/raw/main/data/firesmoke_metadata_recent.nc'

# Download the file using requests
response = requests.get(url)
local_netcdf = 'firesmoke_metadata.nc'
with open(local_netcdf, 'wb') as f:
    f.write(response.content)

# open tiny netcdf with xarray and OpenVisus backend
ds = xr.open_dataset(local_netcdf, engine=OpenVisusBackendEntrypoint)

ov.LoadDataset(http://atlantis.sci.utah.edu/mod_visus?dataset=UBC_fire_smoke_BSC&cached=1)
PM25
Adding field  PM25 shape  [25053, 381, 1081, 21] dtype  float32 labels  ['time', 'ROW', 'COL', 'resolution'] Max Resolution  20


In [8]:
def parse_tflag(tflag):
    """
    Return the tflag as a datetime object
    :param list tflag: a list of two int32, the 1st representing date and 2nd representing time
    """
    # obtain year and day of year from tflag[0] (date)
    date = int(tflag[0])
    year = date // 1000 # first 4 digits of tflag[0]
    day_of_year = date % 1000 # last 3 digits of tflag[0]

    # create datetime object representing date
    final_date = datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1)

    # obtain hour, mins, and secs from tflag[1] (time)
    time = int(tflag[1])
    hours = time // 10000 # first 2 digits of tflag[1]
    minutes = (time % 10000) // 100 # 3rd and 4th digits of tflag[1] 
    seconds = time % 100  # last 2 digits of tflag[1]

    # create final datetime object
    full_datetime = datetime.datetime(year, final_date.month, final_date.day, hours, minutes, seconds)
    return full_datetime

#### Create the set of the available hours in this dataset
Including the hours with all zeros...

In [9]:
actual_tflag_set = set()

# get TFLAGs as datetime objects, friendlier, make a set, wanna see what's missing
for t in ds['TFLAG'].values:
    actual_tflag_set.add(parse_tflag(t[0]))

print(f'there are {len(actual_tflag_set)} hours available in firesmoke_metadata.nc dataset')

there are 25053 hours available in firesmoke_metadata.nc dataset


#### Create the set of all hours we would like, between dates 3/3/2021 - 2/10/2024

In [10]:
# Define the start and end dates
start_date = pd.Timestamp(datetime.datetime.strptime("20210303", "%Y%m%d"))
end_date = pd.Timestamp(datetime.datetime.strptime("20240210", "%Y%m%d"))

# Get all hours between the start and end dates
desired_tflag_set = {start_date + pd.Timedelta(hours=x) for x in range(int((end_date - start_date).total_seconds() // 3600) + 1)}

print(f'There are {len(desired_tflag_set)} hours between 3/3/21 and 2/10/24')

There are 25777 hours between 3/3/21 and 2/10/24


#### Determine what hours are apparently missing netCDF files using the sets above

In [11]:
missing_tflags = desired_tflag_set.difference(actual_tflag_set)

print(f'there are {len(missing_tflags)} missing hours from our dataset')

there are 747 missing hours from our dataset


In [12]:
len(zero_hours.intersection(missing_tflags))

0

In [13]:
missing_tflags

{Timestamp('2021-03-03 00:00:00'),
 Timestamp('2021-03-03 01:00:00'),
 Timestamp('2021-03-03 02:00:00'),
 Timestamp('2021-03-03 03:00:00'),
 Timestamp('2021-03-03 04:00:00'),
 Timestamp('2021-03-03 05:00:00'),
 Timestamp('2021-03-03 06:00:00'),
 Timestamp('2021-03-03 07:00:00'),
 Timestamp('2021-03-03 08:00:00'),
 Timestamp('2021-03-03 09:00:00'),
 Timestamp('2021-03-03 10:00:00'),
 Timestamp('2021-03-03 11:00:00'),
 Timestamp('2021-03-03 12:00:00'),
 Timestamp('2021-03-03 13:00:00'),
 Timestamp('2021-03-03 14:00:00'),
 Timestamp('2021-03-03 15:00:00'),
 Timestamp('2021-03-03 16:00:00'),
 Timestamp('2021-03-03 17:00:00'),
 Timestamp('2021-03-03 18:00:00'),
 Timestamp('2021-03-03 19:00:00'),
 Timestamp('2021-03-03 20:00:00'),
 Timestamp('2021-03-03 21:00:00'),
 Timestamp('2021-03-03 22:00:00'),
 Timestamp('2021-03-03 23:00:00'),
 Timestamp('2022-02-16 00:00:00'),
 Timestamp('2022-02-16 01:00:00'),
 Timestamp('2022-02-16 02:00:00'),
 Timestamp('2022-06-02 00:00:00'),
 Timestamp('2022-06-

## We must account for these missing hours.

### Import all calls made during IDX conversion. Determine holes in those calls and, again, confirm the holes really are because of missing data.

Latest `idx_calls.pkl` is created in latest `conversion` version.

In [14]:
with open("idx_calls.pkl", "rb") as input_file:
    idx_calls = pickle.load(input_file)

print(f'total hours called during idx conversion is {len(idx_calls)}')

total hours called during idx conversion is 25053


#### Ensure that all the missing_tflags were not called during IDX conversion in the first place.

In [15]:
idx_calls_set = set()

for call in idx_calls:
    idx_calls_set.add(pd.Timestamp(call[2]))

desired_not_called = desired_tflag_set.difference(idx_calls_set)

print(f'len(idx_calls_set) = {len(idx_calls_set)}')
print(f'len(desired_not_called.difference(missing_tflags)) = {len(desired_not_called.difference(missing_tflags))}')

len(idx_calls_set) = 25053
len(desired_not_called.difference(missing_tflags)) = 0


#### Good, now let's see if the missing_tflags aren't available from firesmoke.ca...
For each missing date, let's just grab that day and the 4 that **precede** it, at each dataset. Then we'll check if any of those downloaded files have the missing hours available in them...

We may have somehow failed to download all available netCDF files in `data_download` workflow...

In [16]:
# directory to hold the downloaded files
garbage = '/usr/sci/scratch_nvme/arleth/total_garbage'

From all the failed hours, get the DDMMYYYY dates, this is what we use to query firesmoke.ca


In [17]:
missing_dates = set()

for t in missing_tflags:
    missing_dates.add(t.normalize())

missing_dates

{Timestamp('2021-03-03 00:00:00'),
 Timestamp('2022-02-16 00:00:00'),
 Timestamp('2022-06-02 00:00:00'),
 Timestamp('2022-06-03 00:00:00'),
 Timestamp('2022-06-04 00:00:00'),
 Timestamp('2022-06-05 00:00:00'),
 Timestamp('2022-06-06 00:00:00'),
 Timestamp('2022-06-07 00:00:00'),
 Timestamp('2022-06-08 00:00:00'),
 Timestamp('2022-06-09 00:00:00'),
 Timestamp('2022-06-10 00:00:00'),
 Timestamp('2022-06-11 00:00:00'),
 Timestamp('2022-06-12 00:00:00'),
 Timestamp('2022-06-13 00:00:00'),
 Timestamp('2022-06-14 00:00:00'),
 Timestamp('2022-06-15 00:00:00'),
 Timestamp('2022-06-16 00:00:00'),
 Timestamp('2022-06-17 00:00:00'),
 Timestamp('2022-06-18 00:00:00'),
 Timestamp('2022-06-19 00:00:00'),
 Timestamp('2022-06-20 00:00:00'),
 Timestamp('2022-06-21 00:00:00'),
 Timestamp('2022-06-22 00:00:00'),
 Timestamp('2022-06-23 00:00:00'),
 Timestamp('2022-06-24 00:00:00'),
 Timestamp('2022-06-25 00:00:00'),
 Timestamp('2022-06-26 00:00:00'),
 Timestamp('2022-06-27 00:00:00'),
 Timestamp('2022-06-

In [18]:
# make sequence of queries to download, using missing_tflags and dataset metadata
ids = ["BSC18CA12-01", "BSC00CA12-01", "BSC06CA12-01", "BSC12CA12-01"]
init_times = ["02", "08", "14", "20"]
to_download = set()

# for all missing dates
for date in missing_dates:
    # for 4 days
    for i in range(4):
        curr_date = date + pd.Timedelta(days=-i)
        
        # build date string for query
        date_str = curr_date.strftime('%Y%m%d')
        
        # for each dataset
        for id_, init_time in zip(ids, init_times):
            # build URL string to download from and directory & filename to download to
            url = f'https://firesmoke.ca/forecasts/{id_}/{date_str}{init_time}/dispersion.nc'
            directory = f'{garbage}/{id_}/dispersion_{date_str}.nc'
            to_download.add((url, directory))

print(f'there are {len(to_download)} files to try downloading...')

there are 212 files to try downloading...


The following code block is for downloading the files, I commented it out to avoid rerunning the download script, overwriting files.. etc.

In [19]:
# # download files, do this as a python script by doing `nbconvert --to script...`, 
# # it's faster somehow
# for q in tqdm(to_download):
#     print(f'downloading {q[0]} to {q[1]}')
#     wget.download(q[0], out=q[1])

In [20]:
to_download

{('https://firesmoke.ca/forecasts/BSC00CA12-01/2021022808/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20210228.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01/2021030108/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20210301.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01/2021030208/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20210302.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01/2021030308/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20210303.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01/2022021308/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20220213.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01/2022021408/dispersion.nc',
  '/usr/sci/scratch_nvme/arleth/total_garbage/BSC00CA12-01/dispersion_20220214.nc'),
 ('https://firesmoke.ca/forecasts/BSC00CA12-01

#### See all hours available from files we downloaded. Check intersection with our missing tflags.

In [21]:
# Track files that successfully opened
successful_files = {id_: [] for id_ in ids}

# Set of all hours available from the files newly downloaded
new_hours = set()

# Try opening all downloaded files
for query in to_download:
    # get file path from current query from to_download
    path = query[1]
    # get file name from query
    file_name = path[-len('dispersion_20210228.nc'):]
    # get id from path string
    id_ = path[-len('BSC00CA12-01/dispersion_20210228.nc'):-len('/dispersion_20210228.nc')]

    # keep track of which files successfully open
    try:
        # open the file with xarray
        ds = xr.open_dataset(path)

        # append file name to successful_files
        successful_files[id_].append(path)
        # add each available hour to new_hours
        for h in range(ds.sizes["TSTEP"]):
            vals = ds['TFLAG'].values[h]
            curr_tflag = parse_tflag(ds['TFLAG'].values[h][0])
            panda_tflag = pd.Timestamp(curr_tflag)
            new_hours.add((id_, panda_tflag))
    except:
        # netcdf file does not exist
        print(f'FAILED, {id_}, {file_name}')
        continue

FAILED, BSC18CA12-01, dispersion_20230228.nc
FAILED, BSC18CA12-01, dispersion_20231231.nc
FAILED, BSC12CA12-01, dispersion_20240101.nc
FAILED, BSC18CA12-01, dispersion_20240101.nc
FAILED, BSC12CA12-01, dispersion_20220214.nc
FAILED, BSC12CA12-01, dispersion_20220215.nc
FAILED, BSC18CA12-01, dispersion_20231111.nc
FAILED, BSC06CA12-01, dispersion_20240102.nc
FAILED, BSC12CA12-01, dispersion_20231110.nc
FAILED, BSC12CA12-01, dispersion_20231109.nc
FAILED, BSC06CA12-01, dispersion_20240103.nc
FAILED, BSC06CA12-01, dispersion_20231112.nc
FAILED, BSC00CA12-01, dispersion_20230228.nc
FAILED, BSC06CA12-01, dispersion_20220624.nc
FAILED, BSC12CA12-01, dispersion_20240102.nc
FAILED, BSC06CA12-01, dispersion_20231110.nc
FAILED, BSC00CA12-01, dispersion_20220215.nc
FAILED, BSC12CA12-01, dispersion_20240103.nc
FAILED, BSC12CA12-01, dispersion_20230228.nc
FAILED, BSC06CA12-01, dispersion_20231109.nc
FAILED, BSC00CA12-01, dispersion_20231111.nc
FAILED, BSC18CA12-01, dispersion_20240102.nc
FAILED, BS

In [22]:
for id_ in ids:
    print(f'For {id_}:')
    print(f'Of files downloaded, there are {len(successful_files[id_])} that open successfully.')
    print('---')
print(f'Of files downloaded, there are {len(new_hours)} hours available.')

For BSC18CA12-01:
Of files downloaded, there are 41 that open successfully.
---
For BSC00CA12-01:
Of files downloaded, there are 49 that open successfully.
---
For BSC06CA12-01:
Of files downloaded, there are 42 that open successfully.
---
For BSC12CA12-01:
Of files downloaded, there are 40 that open successfully.
---
Of files downloaded, there are 4932 hours available.


#### Determine how many new hours were downloaded that we could add to the final IDX conversion, if any...

In [23]:
# get all timestamps downloaded
hours_set = set()

# get each timestamp, add it to a set
for hour in new_hours:
    hours_set.add(hour[1])

print(f'there are {len(hours_set.intersection(missing_tflags))} hours available from downloaded data')

there are 747 hours available from downloaded data


In [24]:
hours_set.intersection(missing_tflags)

{Timestamp('2021-03-03 00:00:00'),
 Timestamp('2021-03-03 01:00:00'),
 Timestamp('2021-03-03 02:00:00'),
 Timestamp('2021-03-03 03:00:00'),
 Timestamp('2021-03-03 04:00:00'),
 Timestamp('2021-03-03 05:00:00'),
 Timestamp('2021-03-03 06:00:00'),
 Timestamp('2021-03-03 07:00:00'),
 Timestamp('2021-03-03 08:00:00'),
 Timestamp('2021-03-03 09:00:00'),
 Timestamp('2021-03-03 10:00:00'),
 Timestamp('2021-03-03 11:00:00'),
 Timestamp('2021-03-03 12:00:00'),
 Timestamp('2021-03-03 13:00:00'),
 Timestamp('2021-03-03 14:00:00'),
 Timestamp('2021-03-03 15:00:00'),
 Timestamp('2021-03-03 16:00:00'),
 Timestamp('2021-03-03 17:00:00'),
 Timestamp('2021-03-03 18:00:00'),
 Timestamp('2021-03-03 19:00:00'),
 Timestamp('2021-03-03 20:00:00'),
 Timestamp('2021-03-03 21:00:00'),
 Timestamp('2021-03-03 22:00:00'),
 Timestamp('2021-03-03 23:00:00'),
 Timestamp('2022-02-16 00:00:00'),
 Timestamp('2022-02-16 01:00:00'),
 Timestamp('2022-02-16 02:00:00'),
 Timestamp('2022-06-02 00:00:00'),
 Timestamp('2022-06-

## Turns out these hours are available... 
### Need to investiage further:
1. Do we need to use these downloaded files in the IDX conversion because somehow we didn't already download them?
2. Or, do we already have the files and we instead sequenced through the netCDF files incorrectly...

### 1. Let's see if the files we downloaded are all available in the files we originally downloaded...
We will check if they open successfully as well. Otherwise we should redownload failed files.

In [25]:
# Get directory to where we downloaded netCDF for IDX conversion

# ******* THIS IS WHEN RUNNING FROM ATLANTIS.SCI **************
firesmoke_dir = "/usr/sci/cedmav/data/firesmoke"

print(f'downloaded netCDF data for IDX conversions at\n {firesmoke_dir}')
print(f'downloaded netCDF data for diagnosing issues at\n {garbage}')

downloaded netCDF data for IDX conversions at
 /usr/sci/cedmav/data/firesmoke
downloaded netCDF data for diagnosing issues at
 /usr/sci/scratch_nvme/arleth/total_garbage


#### For each file we downloaded and successfully opened at `garbage` dir, see if it exists in the `firesmoke_dir`

In [26]:
# to keep track of successful_files we downloaded for IDX conversion already
idx_successful_files = {id_: [] for id_ in ids}
idx_new_hours = set()

# to track what files failed and WHY, that way we know WHAT files to 'fix'
idx_failed_files = {id_: [] for id_ in ids}

# for each dataset
for id_ in ids:
    # for each file downloaded for said dataset
    for file in successful_files[id_]:
        # get file name
        file_name = file[-len('dispersion_20220616.nc'):]
        # get path to netCDF file that may be in IDX netCDF downloads
        idx_file_path = f'{firesmoke_dir}/{id_}/{file_name}'
        
        # keep track of which files successfully open
        try:
            # open the file with xarray
            ds = xr.open_dataset(idx_file_path)

            # append file name to successful_files
            idx_successful_files[id_].append(file_name)

            # add each available hour to idx_new_hours
            for h in range(ds.sizes["TSTEP"]):
                vals = ds['TFLAG'].values[h]
                curr_tflag = parse_tflag(ds['TFLAG'].values[h][0])
                panda_tflag = pd.Timestamp(curr_tflag)
                idx_new_hours.add((id_, panda_tflag))
        # ref: https://stackoverflow.com/questions/9823936/how-do-i-determine-what-type-of-exception-occurred
        except Exception as e:
            # if file cannot be opened by xarray
            if 'did not find a match in any of xarray' in str(e):
                idx_failed_files[id_].append((file_name, 'INCOMPATIBLE FILE'))
            # if file isn't found
            elif 'No such file or directory' in str(e):
                idx_failed_files[id_].append((file_name, 'NO SUCH FILE'))
            else:
            # otherwise... who knows check issue
                idx_failed_files[id_].append((file_name, str(e)))
            continue

In [27]:
idx_failed_files

{'BSC18CA12-01': [('dispersion_20220630.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220606.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220603.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220610.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220624.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220604.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220619.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220613.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20210228.nc', 'NO SUCH FILE'),
  ('dispersion_20220621.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220627.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220628.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220607.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220609.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220626.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20210301.nc', 'NO SUCH FILE'),
  ('dispersion_20220620.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220629.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220605.nc', 'INCOMPATIBLE FILE'),
  ('dispersion_20220617.n

In [28]:
for id_ in ids:
    print(f'For {id_}')
    print(f'Of {len(successful_files[id_])} newly downloaded, there are {len(idx_successful_files[id_])} files for idx conversion.')

For BSC18CA12-01
Of 41 newly downloaded, there are 9 files for idx conversion.
For BSC00CA12-01
Of 49 newly downloaded, there are 15 files for idx conversion.
For BSC06CA12-01
Of 42 newly downloaded, there are 9 files for idx conversion.
For BSC12CA12-01
Of 40 newly downloaded, there are 8 files for idx conversion.


#### For reference future reference, I got the following output:
```
For BSC18CA12-01
Of 41 newly downloaded, there are 9 files for idx conversion.
For BSC00CA12-01
Of 49 newly downloaded, there are 15 files for idx conversion.
For BSC06CA12-01
Of 42 newly downloaded, there are 9 files for idx conversion.
For BSC12CA12-01
Of 40 newly downloaded, there are 8 files for idx conversion.
```

Great, so we should redo the IDX conversion and add these newly downloaded netCDFs to fill in the gaps!

#### Wait but what about files that successfully open, do they hold any of the hours we are missing...

In [29]:
# get all timestamps downloaded
idx_hours_set = set()

# get each timestamp, add it to a set
for hour in idx_new_hours:
    idx_hours_set.add(hour[1])

print(f'len(idx_new_hours.intersection(missing_tflags)) = {len(idx_new_hours.intersection(missing_tflags))}')

len(idx_new_hours.intersection(missing_tflags)) = 0


#### `len(idx_new_hours.intersection(missing_tflags)) = 0`
#### So now we know to fix the missing 747 hours we must add the newly downloaded netCDF files to the set of netCDF files used during IDX conversion.

We must have failed to download those files initially.

go ahead and delete all files from the `firesmoke_dir` that are listed as `successful_files` here. then copy paste `successful_files` to corresponding `firesmoke_dir`, then we're ready to redo conversion........

In [43]:
# delete files from firesmoke_dir, we will replace them next...
for id_ in ids:
    for f in successful_files[id_]:
        to_delete = f'{firesmoke_dir}/{f[-len("BSC18CA12-01/dispersion_20220630.nc"):]}'
        # try deleting the file
        try:
            # ref: https://stackoverflow.com/questions/6996603/how-can-i-delete-a-file-or-folder-in-python
            os.remove(to_delete)
        # catch exception, likely from 'no such file or directory'
        except Exception as e:
            print(e)

[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC18CA12-01/dispersion_20210228.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC18CA12-01/dispersion_20210301.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC18CA12-01/dispersion_20210302.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC00CA12-01/dispersion_20210301.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC00CA12-01/dispersion_20210303.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC00CA12-01/dispersion_20210228.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC00CA12-01/dispersion_20210302.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC06CA12-01/dispersion_20210302.nc'
[Errno 2] No such file or directory: '/usr/sci/cedmav/data/firesmoke/BSC06CA12-01/dispersion_20210301.nc'
[Errno 2] No such file or directory: '/usr/sci

now add newly downloaded files to the firesmoke dir accordingly

In [52]:
# # move files to firesmoke_dir
# # ref: https://stackoverflow.com/questions/123198/how-to-copy-files
# for id_ in ids:
#     for f in successful_files[id_]:
#         move_to_dir = f'{firesmoke_dir}/{f[-len("BSC18CA12-01/dispersion_20220630.nc"):]}'
#         shutil.copyfile(f, move_to_dir)
#         # print(f'MOVING {f} TO {move_to_dir}')