In [16]:
import os
import json
import harp
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import time

import S3tools

import sys
sys.path.append('../')  # Adjusts path to include the parent directory, 'Code'
# import HARP.harptools as ht
import SentHub.catalog_tools as ct

# get keys from Data\keys.json
with open('../../Data/keys.json') as f:
    keys = json.load(f)
    # Define your parameters
    access_key = keys["s3"]["access_key"]
    secret_key = keys["s3"]["secret_key"]


In [17]:
def simple_operations(product: str, spatial_extent: list, filter_vars: bool = False) -> str:
    """
    Returns a string of operations to be used in the harp.import_product function
    This function needs to be changed if a user wants to use different operations or products.
    The product names in harp are different from the product names in the S5P data. And need to be changed manually to ensure accuracy.
    # product names: https://stcorp.github.io/harp/doc/html/ingestions/index.html#sentinel-5p-products
    """
    lon_min, lat_min, lon_max, lat_max = spatial_extent
    variables = {
    "HCHO": ["HCHO_slant_column_number_density", "tropospheric_HCHO_column_number_density_validity"],
    "NO2": ["NO2_slant_column_number_density", "tropospheric_NO2_column_number_density_validity"],
    "SO2": ["SO2_slant_column_number_density", "SO2_column_number_density_validity"],
    "misc": ["datetime_start", "latitude", "longitude", "cloud_fraction", 
             "surface_meridional_wind_velocity", "latitude_bounds", "longitude_bounds",
             "solar_zenith_angle", "sensor_zenith_angle", "solar_azimuth_angle", "sensor_azimuth_angle"],
    }
    
    ops = [f"latitude>={lat_min}",f"latitude<={lat_max}",f"longitude>={lon_min}",f"longitude<={lon_max}"]
    
    if filter_vars:
        ops.append(f"keep({','.join([x for x in variables[product]])})")
    
    operations = ";".join(ops)
    return operations

In [18]:
def merge_reduce(files, products, spatial_extent, root : str = "", delete = False, filter_vars: bool = False):
    """
    Merges and regrids multiple netcdf files into one file using harp
    During the merge, unwanted variables are removed.
    The 
    
    Parameters
    ----------
    files : str
        List of files to merge, equivalent to the urls of the files
    products : str
        List of products to merge, equivalent to the names of the products
    root : str
        Root directory of the input files, must end with a '/'
    """
    
    final_product = harp.Product()
        
    # load products' data
    for file, product in zip(files, products):  
        operation = simple_operations(product, spatial_extent)
        gridded = harp.import_product(f"{root}{file}", operations=operation)
        for variable in gridded:
            if variable not in final_product:
                final_product[variable] = gridded[variable]

    # add misc variables
    if filter_vars:
        operation = simple_operations("misc", spatial_extent)
        gridded = harp.import_product(f"{root}{files[0]}", operations=operation)
        for variable in gridded:
            final_product[variable] = gridded[variable]

    # add source product to the final product for reference, it will automatically extract the file name
    final_product.source_product = files[-1]

    # get orbit id from the file name
    orbit_id = ct.parse_s5p_filename(files[0].split("/")[-2], "orbit_id")
    date = files[0].split("/")[3:6]

    # make new filename out of the old filename
    path = f"{root}Sentinel-5P/TROPOMI/L3__Merged_/{date[0]}/{date[1]}/{date[2]}/{orbit_id}.nc"

    # save the final product
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
    
    harp.export_product(final_product, path)

    # delete the original files and the folders L2__HCHO__, L2__NO2__, L2__SO2__
    if delete:
        for file in files:
            try:
                file_dir = '/'.join(file.split("/")[:-1])
                if os.path.exists(f"{root}{file_dir}"):
                    os.remove(f"{root}{file}")
                os.removedirs(f"{root}{file_dir}")
            except:
                pass
    
    return

Merging tends to be very fast, most of the time is spent downloading

`
S5P_RPRO_L2__NO2____20190828T121434_20190828T135604_09706_03_020400_20221106T095252.nc:   1%|          | 8/1059 [05:37<12:07:40, 41.54s/orbit]
`

Due to the long runtime, api side error can occur, simply restarting the program would fix these
S5P_RPRO_L2__HCHO___20190804T080247_20190804T094416_09363_03_020401_20230122T105828 wont download

In [19]:
ROOT = "../../Data/"

# open ../../Data/aoi.json as dict
with open(f"{ROOT}aoi.json", "r") as f:
  aoi = json.load(f)

# iterate over the areas of interest
for aoi_name, area_of_interest in aoi.items():
  # check if the download list already exists
  if not os.path.exists(f"{ROOT}Catalog/{aoi_name}/download_list.csv"):
    print(f"Download list for {aoi_name} doesn't exists, skipping...")
    continue

  target_path = f"{ROOT}EODATA/{aoi_name}/"
  
  # retrieve a list of .nc files in the merged folder
  merged_files = glob(f"{target_path}Sentinel-5P/TROPOMI/L3__Merged_/*/*/*/*.nc")
  # create a list of orbit ids from the merged files, this might work on windows only
  merged_orbits = [int(file.split("\\")[-1][:-3]) for file in merged_files]
  
  # path to the product to download
  product_df = pd.read_csv(f"{ROOT}Catalog/{aoi_name}/download_list.csv")
  
  # remove duplicate rows
  product_df.drop_duplicates(subset=["s3_url"], inplace=True)
  
  # get the unique orbit ids
  unique_orbits = product_df["sat:absolute_orbit"].unique().tolist()
  # remove the orbits that already have a merged file
  unique_orbits = [orbit for orbit in unique_orbits if orbit not in merged_orbits]
  
  if len(unique_orbits) == 0:
    # fake progress bar to simulate processing
    with tqdm(total=10, desc=f"Processing {aoi_name}...") as pbar:  # Create the progress bar
        for _ in range(10):  # A simple loop
            time.sleep(0.1)  # Simulate processing delay
            pbar.update(1)   # Update the progress bar
    print()
    continue
  
  # iterate over unique orbit ids. There should be 3 products for each orbit
  loop = tqdm(unique_orbits, desc=f"Downloading {aoi_name} orbits for box {area_of_interest}...\n", 
                       unit="orbit", colour="green", dynamic_ncols=True)
  for orbit_id in loop:
      
      # get the first product for each orbit
      orbit_df = product_df[product_df["sat:absolute_orbit"] == orbit_id]
      files = orbit_df["s3_url"].to_list()
      
      # download the products
      for url in files:
          loop.set_description_str(f"{url.split('/')[-1]}")
          S3tools.download(url, target_path)
      
      products = orbit_df["s5p:type"].to_list()
      loop.set_description_str(f"Merging {products} for orbit {orbit_id}")
      merged = merge_reduce(files, products, area_of_interest, target_path, delete=True)

Processing Mediterranean...: 100%|██████████| 10/10 [00:01<00:00,  9.22it/s]





Processing Biscay Bay...: 100%|██████████| 10/10 [00:01<00:00,  9.15it/s]





Processing Arabian Sea...: 100%|██████████| 10/10 [00:01<00:00,  9.15it/s]





Processing Bengal Bay...: 100%|██████████| 10/10 [00:01<00:00,  9.30it/s]





