# Coddington, 2022 - Full Spectrum Extension (FSE)

## Setup

In [None]:
# Imports

import os
import re
import tempfile
import typing as t

import requests
import xarray as xr

import tengen
from tengen import unit_registry as ureg

# Dataset attributes

# Short identifier of this dataset; the Run cell uses it to name the raw and
# formatted data sub-directories, and format() uses it to name output files.
IDENTIFIER = "coddington_2022-fse"
# Base URL the original files are downloaded from; also passed to
# tengen.to_dataset as ``data_url`` in format().
DATA_URL = "https://lasp.colorado.edu/lisird/resources/lasp/hsrs/v2/"
# The four strings below are stored as the "title", "institution", "source"
# and "references" attributes of each formatted dataset (see format()).
TITLE = "Total and Spectral Solar Irradiance Sensor-1 (TSIS-1) Hybrid Solar Reference Spectrum (HSRS), Version 2 - Full Spectrum Extension"
INSTITUTION = "Laboratory for Atmospheric and Space Physics"
SOURCE = "TSIS-1 Spectral Irradiance Monitor (SIM), CubeSat Compact SIM (CSIM), Air Force Geophysical Laboratory ultraviolet solar irradiance balloon observations, ground-based Quality Assurance of Spectral Ultraviolet Measurements In Europe Fourier transform spectrometer solar irradiance observations, Kitt Peak National Observatory solar transmittance atlas and the semi-empirical Solar Pseudo-Transmittance Spectrum atlas with independent observations and theoretical knowledge where no observations exist"
REFERENCES = "Coddington, O., Richard, E., Harber, D., Pilewskie, P., Woods, T. N., Snow, M., Chance, K., Liu, X., and Sun, K. (2022, accepted) Version 2 of the TSIS-1 Hybrid Solar Reference Spectrum and Extension to the Full Spectrum, Earth and Space Science Journal."

# Notebook configuration

# When True, the Run cell downloads into tengen.RAW_DATA_DIR and writes the
# formatted files into tengen.FORMATTED_DATA_DIR; when False, everything
# happens in a temporary directory and nothing is written to the cache.
UPDATE_CACHE = False  # change to True to update the cache when running this notebook

## Download

In [None]:
# File name prefixes of the original data files. Each prefix is concatenated
# with FILENAME_SUFFIX to build the name of a file to download and format;
# format() treats a prefix containing "binned" as the binned variant.
FILENAMES = [
    "binned_fs_",
    "fs_",
]
# Common trailing part of the original file names (appended to each prefix).
FILENAME_SUFFIX = "hybrid_reference_spectrum_c2022-11-30_with_unc.nc"


def download(
    url: str,
    path: t.Optional[os.PathLike] = None,
) -> t.Union[str, os.PathLike]:
    """Download original data from url.

    Every file named ``<prefix><FILENAME_SUFFIX>`` (one per entry of
    ``FILENAMES``) is fetched from ``url`` and written to ``path`` under the
    same file name.

    Args:
        url: Base URL to download data from. A trailing slash is added if
            missing.
        path: Path to save data to (must be a directory). If None, a temporary
            directory is created and the raw data is saved there.

    Returns:
        The directory the files were written to (``path``, or the newly
        created temporary directory when ``path`` is None).

    Raises:
        ValueError: If ``path`` is not an existing directory.
        requests.HTTPError: If the server answers with an error status.
    """
    if path is None:
        # A TemporaryDirectory object would be finalized as soon as this
        # function returns, deleting the files that were just downloaded.
        # mkdtemp leaves the directory in place; the caller owns its cleanup.
        path = tempfile.mkdtemp()

    # path must be a directory
    if not os.path.isdir(path):
        raise ValueError(f"Path {path} must be a directory.")

    # Tolerate a base URL given without its trailing slash.
    base_url = url if url.endswith("/") else f"{url}/"

    for filename in FILENAMES:
        file = f"{filename}{FILENAME_SUFFIX}"
        # The timeout keeps a stalled connection from hanging forever.
        response = requests.get(base_url + file, timeout=60)
        # Fail loudly instead of saving an HTTP error page as a .nc file.
        response.raise_for_status()
        with open(os.path.join(path, file), "wb") as f:
            f.write(response.content)

    return path

## Format

In [None]:
def format_missing_carats_units(s: str) -> str:
    """Add missing carets to a malformed unit string.

    Will format a string 'm-1' to 'm^-1' and 'm2' to 'm^2'. Exponents that
    already carry a caret are left untouched, so the function is idempotent
    ('m^-1' stays 'm^-1').

    Args:
        s: Unit string.

    Returns:
        Formatted unit string.
    """
    # Signed exponents: put a caret in front of a sign followed by a digit,
    # unless a caret is already there (the lookbehind avoids 'm^^-1').
    s = re.sub(r"(?<!\^)([-+][0-9])", r"^\1", s)

    # Unsigned exponents: put a caret between a lowercase letter (or a space)
    # and the digit that directly follows it.
    return re.sub(r"([a-z ])([0-9])", r"\1^\2", s)


def format(
    data: os.PathLike,
    path: t.Optional[os.PathLike] = None,
) -> t.Optional[t.List[xr.Dataset]]:
    """Convert the original files into formatted datasets.

    One dataset is built per entry of ``FILENAMES``, from the wavelength
    ("Vacuum Wavelength") and solar spectral irradiance ("SSI") variables of
    the corresponding original file.

    Args:
        data: Directory holding the original data files.
        path: Directory to write the formatted netCDF files to. If None,
            nothing is written and the formatted datasets are returned.

    Returns:
        The list of formatted datasets if ``path`` is None, otherwise None.

    Raises:
        ValueError: If ``path`` is given but is not an existing directory.
    """
    write_to_disk = path is not None

    # an output location, when given, has to be an existing directory
    if write_to_disk and not os.path.isdir(path):
        raise ValueError(f"Path {path} must be a directory.")

    formatted = []

    for prefix in FILENAMES:
        binned = "binned" in prefix
        source_file = f"{data}/{prefix}{FILENAME_SUFFIX}"

        with xr.open_dataset(source_file, engine="netcdf4") as raw:
            # wavelength: units are taken as stored in the file
            wavelength_units = raw["Vacuum Wavelength"].attrs["units"]
            wavelength_values = raw["Vacuum Wavelength"].values
            w = ureg.Quantity(wavelength_values, wavelength_units)

            # solar spectral irradiance: the unit string is repaired first
            irradiance_units = format_missing_carats_units(
                raw["SSI"].attrs["units"]
            )
            irradiance_values = raw["SSI"].values
            ssi = ureg.Quantity(irradiance_values, irradiance_units)

        metadata = {
            "title": f"{TITLE} (binned)" if binned else TITLE,
            "institution": INSTITUTION,
            "source": SOURCE,
            "references": REFERENCES,
        }
        result = tengen.to_dataset(
            ssi=ssi,
            w=w,
            data_url=DATA_URL,
            attrs=metadata,
        )

        if not write_to_disk:
            formatted.append(result)
            continue

        # the binned variant gets its own output file name
        if binned:
            target_name = f"{IDENTIFIER}_binned.nc"
        else:
            target_name = f"{IDENTIFIER}.nc"
        result.to_netcdf(os.path.join(path, target_name))

    if write_to_disk:
        return None
    return formatted

## Run

In [None]:
# (leave this cell as is)

# UPDATE_CACHE selects between refreshing the data directories managed by
# tengen and a throwaway run that only uses a temporary directory.
if UPDATE_CACHE:
    # Download the original files into RAW_DATA_DIR/<IDENTIFIER>, creating
    # the directory (and its parents) if it does not exist yet.
    original_data_dir = tengen.RAW_DATA_DIR / IDENTIFIER
    original_data_dir.mkdir(parents=True, exist_ok=True)

    download(url=DATA_URL, path=original_data_dir)

    # Write the formatted files into FORMATTED_DATA_DIR/<IDENTIFIER>.
    formatted_data_dir = tengen.FORMATTED_DATA_DIR / IDENTIFIER
    formatted_data_dir.mkdir(parents=True, exist_ok=True)

    format(data=original_data_dir, path=formatted_data_dir)

else:
    # The temporary directory and the downloaded files are removed when the
    # with block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        download(url=DATA_URL, path=tmpdir)
        # With path=None, format() returns the formatted datasets as a list
        # instead of writing them to disk.
        dataset = format(data=tmpdir, path=None)