# WHI, 2008

## Setup

In [None]:
# Imports

from datetime import date
import os
import tempfile
import typing as t
from pathlib import Path

import numpy as np
import requests
import xarray as xr

import tengen
from tengen import unit_registry as ureg


# Dataset attributes
# These module-level constants are read by the download/format/run cells below.

# Short identifier; used to build output file names and cache sub-directories.
IDENTIFIER = "whi_2008"
# Location of the original (raw) data file.
DATA_URL = "https://lasp.colorado.edu/lisird/resources/whi_ref_spectra/data/ref_solar_irradiance_whi-2008_ver2.dat"
# The following four strings are written to the attributes of each formatted
# dataset (title, institution, source, references).
TITLE = "Whole Heliosphere Interval (WHI) solar irradiance reference spectrum (2008)"
INSTITUTION = "Laboratory for Atmospheric and Space Physics, University of Colorado, Boulder, Colorado, USA"
SOURCE = "Combination of satellite observations from the SEE and SORCE instruments onboard the TIMED satellite and a prototype EVE instrument onboard a sounding rocket launched on 2008-04-14."
REFERENCES = "https://doi.org/10.1029/2008GL036373"

# Notebook configuration

# Selects which branch of the "Run" cell is executed.
UPDATE_CACHE = False  # change to True to update the cache when running this notebook

## Download

In [None]:
# Name under which the raw data file is stored inside the download directory.
FILE = "ref_solar_irradiance_whi-2008_ver2.dat"


def download(url: str, path: t.Optional[os.PathLike] = None) -> Path:
    """Download original data from url.

    Args:
        url: URL to download data from.
        path: Directory to save data to; it is created if it does not exist.
            If None, the data is saved to a newly created temporary
            directory, which the caller is responsible for removing.

    Returns:
        Path to the downloaded file (``<directory>/FILE``).

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer in time.
    """
    # A timeout prevents the notebook from hanging forever on a stalled
    # connection; raise_for_status() prevents an HTTP error page from being
    # silently saved as if it were the data file.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    if path is None:
        # mkdtemp() (unlike a local TemporaryDirectory object) leaves the
        # directory in place after this function returns, so the downloaded
        # file is still there when the caller looks at the returned path.
        directory = Path(tempfile.mkdtemp())
    else:
        directory = Path(path)
        directory.mkdir(parents=True, exist_ok=True)

    target = directory / FILE
    target.write_bytes(response.content)
    return target

## Format

In [None]:
# Observation period (start date, end date) of each WHI 2008 reference
# spectrum, keyed by the name of the solar activity state. The insertion order
# is relied upon by format() to select the matching irradiance column.
WHI_2008_TIME_PERIOD = {
    "sunspot active": (date(2008, 3, 25), date(2008, 3, 29)),
    "faculae active": (date(2008, 3, 29), date(2008, 4, 4)),
    "quiet sun": (date(2008, 4, 10), date(2008, 4, 16)),
}


def format(
    data: os.PathLike,
    path: t.Optional[os.PathLike] = None,
) -> t.Optional[t.List[xr.Dataset]]:
    """Format original data.

    One dataset is produced per entry of ``WHI_2008_TIME_PERIOD``.

    Args:
        data: Path to original data directory (must contain ``FILE``).
        path: Directory to save formatted data to; it is created if it does
            not exist. If None, the datasets are returned instead of being
            written to disk.

    Returns:
        List of formatted datasets if ``path`` is None, otherwise None.

    Raises:
        ValueError: If ``path`` exists but is not a directory.
    """
    data_dir = Path(data)

    if path is not None:
        path = Path(path)
        # Only reject a path that exists and is something other than a
        # directory; a missing directory is simply created.
        if path.exists() and not path.is_dir():
            raise ValueError(f"Path must be a directory (got {path}).")
        path.mkdir(parents=True, exist_ok=True)

    # Lines starting with ";" are treated as comments; the first 142 lines
    # are skipped (presumably the file header -- verify against the file).
    table = np.loadtxt(
        fname=data_dir / FILE,
        comments=";",
        skiprows=142,
    )

    # Column 0 holds the wavelength, so the irradiance columns start at 1.
    # NOTE(review): assumes columns 1..3 follow the order of
    # WHI_2008_TIME_PERIOD -- confirm against the data file header.
    wavelength = table[:, 0]
    datasets = []
    for column, (identifier, (start, end)) in enumerate(
        WHI_2008_TIME_PERIOD.items(), start=1
    ):
        dataset = tengen.to_dataset(
            ssi=table[:, column] * ureg.W / ureg.m ** 2 / ureg.nm,
            w=wavelength * ureg.nm,
            data_url=DATA_URL,
            attrs={
                "title": TITLE + f" ({identifier})",
                "institution": INSTITUTION,
                "source": SOURCE,
                "references": REFERENCES,
                "observation_period": " to ".join(
                    [x.strftime("%Y-%m-%d") for x in [start, end]]
                ),
            },
        )
        if path is None:
            datasets.append(dataset)
        else:
            # File names cannot conveniently contain spaces.
            label = identifier.replace(" ", "_")
            dataset.to_netcdf(path / f"{IDENTIFIER}-{label}.nc")

    if path is None:
        return datasets
    return None

## Run

In [None]:
# (leave this cell as is)

# Two modes, selected by UPDATE_CACHE:
#   * True:  download the raw file into tengen.RAW_DATA_DIR / IDENTIFIER and
#            write the formatted netCDF files into
#            tengen.FORMATTED_DATA_DIR / IDENTIFIER.
#   * False: download into a throw-away temporary directory and keep the
#            formatted datasets in memory only.
if UPDATE_CACHE:

    original_data_dir = tengen.RAW_DATA_DIR / IDENTIFIER
    original_data_dir.mkdir(parents=True, exist_ok=True)

    download(url=DATA_URL, path=original_data_dir)

    formatted_data_dir = tengen.FORMATTED_DATA_DIR / IDENTIFIER
    formatted_data_dir.mkdir(parents=True, exist_ok=True)

    format(data=original_data_dir, path=formatted_data_dir)

else:
    # The temporary directory (and the raw file in it) is removed when the
    # with-block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        download(url=DATA_URL, path=tmpdir)
        # With path=None, format() returns a list of datasets, so `dataset`
        # holds a list despite its singular name.
        dataset = format(data=tmpdir, path=None)