# Thuillier, 2003

## Setup

In [None]:
# Imports

import os
from pathlib import Path
import tempfile
import typing as t

import numpy as np
import requests
import xarray as xr

import tengen
from tengen import unit_registry as ureg


# Dataset attributes: identification of the dataset and the metadata that is
# written to the attributes of the formatted dataset.

# Unique identifier for the dataset in the cache.
IDENTIFIER = "thuillier_2003"
RAW_DATA_URL = "https://oceancolor.gsfc.nasa.gov/docs/rsr/f0.txt"
TITLE = "Thuillier (2003) solar irradiance spectrum"
INSTITUTION = (
    "Service d'Aéronomie du CNRS, F91371, Verrières-le-Buisson, France."
)
SOURCE = (
    "Combined observations from the SOLSPEC instrument during the ATLAS-1 "
    "mission (from 1992-03-24 to 1992-04-02) and the SOSP instrument onboard "
    "the EURECA satellite (from 1992-8-7 to 1993-7-1), with the Kurucz and "
    "Bell (1995) synthetic spectrum."
)
REFERENCES = "https://doi.org/10.1023/A:1024048429145"


# Notebook configuration

# Change to True to update the cache when running this notebook.
UPDATE_CACHE = False

## Raw data download

In [None]:
FILE = "f0.txt"  # name of the raw data file inside the raw data directory


def download_raw_data(
    url: str,
    path: t.Optional[os.PathLike] = None,
    timeout: float = 60.0,
) -> Path:
    """Download raw data from url.

    Args:
        url: URL to download data from.
        path: Directory to save data to; it is created if it does not exist.
            If None, the data is saved to a newly created temporary
            directory, which the caller is responsible for removing.
        timeout: Number of seconds to wait for the server before giving up.

    Returns:
        The directory that holds the downloaded file (named ``FILE``).

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than saving an error page as if it were data.
    response.raise_for_status()

    if path is None:
        # mkdtemp() leaves the directory in place after this function
        # returns; a TemporaryDirectory object would delete it (and the
        # downloaded file) as soon as it is garbage-collected.
        target_dir = Path(tempfile.mkdtemp())
    else:
        target_dir = Path(path)
        target_dir.mkdir(parents=True, exist_ok=True)

    (target_dir / FILE).write_bytes(response.content)
    return target_dir

## Raw data formatting

In [None]:
def format_raw_data(
    raw_data: os.PathLike,
    path: t.Optional[os.PathLike] = None,
) -> t.Optional[xr.Dataset]:
    """Format raw data.

    Reads the two-column text file ``FILE`` found in ``raw_data``, attaches
    units to both columns and builds a dataset with ``tengen.to_dataset``.

    Args:
        raw_data: Path to raw data directory.
        path: Directory to save formatted data to; it is created if it does
            not exist. If None, the dataset is returned instead of saved.

    Returns:
        The formatted dataset if ``path`` is None, otherwise None (the
        dataset is written to ``<path>/<IDENTIFIER>.nc``).
    """
    file = Path(raw_data) / FILE

    # Text following "/" or "!" is treated as a comment and skipped.
    data = np.loadtxt(
        fname=file,
        comments=["/", "!"],
        encoding="latin-1",
    )
    # First column is taken as nanometres, second as microwatt / cm^2 / nm.
    w = data[:, 0] * ureg.nm
    ssi = data[:, 1] * ureg.microwatt / ureg.cm ** 2 / ureg.nm

    ds = tengen.to_dataset(
        w=w,
        ssi=ssi,
        data_url=RAW_DATA_URL,
        attrs={
            "Conventions": "CF-1.10",
            "title": TITLE,
            "institution": INSTITUTION,
            "source": SOURCE,
            "references": REFERENCES,
        },
    )

    if path is None:
        return ds

    # Coerce to Path so that str paths also support the "/" operator.
    output_dir = Path(path)
    output_dir.mkdir(parents=True, exist_ok=True)
    ds.to_netcdf(output_dir / f"{IDENTIFIER}.nc")
    return None

## Run

In [None]:
# (leave this cell as is)

if not UPDATE_CACHE:
    # Dry run: download and format inside a throwaway directory and keep the
    # resulting dataset in memory only; the cache is left untouched.
    with tempfile.TemporaryDirectory() as tmpdir:
        download_raw_data(url=RAW_DATA_URL, path=tmpdir)
        dataset = format_raw_data(raw_data=tmpdir, path=None)

else:
    # Cache update: store the raw file and the formatted dataset in the
    # per-dataset cache directories.
    raw_data_dir = tengen.RAW_DATA_DIR / IDENTIFIER
    raw_data_dir.mkdir(parents=True, exist_ok=True)
    formatted_data_dir = tengen.FORMATTED_DATA_DIR / IDENTIFIER
    formatted_data_dir.mkdir(parents=True, exist_ok=True)

    download_raw_data(url=RAW_DATA_URL, path=raw_data_dir)
    format_raw_data(raw_data=raw_data_dir, path=formatted_data_dir)