# SOLID, 2017

## Setup

In [None]:
# Imports

import os
import pathlib
import shutil
import tempfile
import typing as t
import urllib.request
from contextlib import closing
from datetime import date, timedelta

import pandas as pd
import xarray as xr

import tengen
from tengen import unit_registry as ureg

# Dataset attributes
#
# IDENTIFIER names the per-dataset cache sub-directories and the formatted
# output file; DATA_URL is the remote directory the raw files are fetched
# from; the remaining constants are copied into the formatted dataset's
# metadata.

IDENTIFIER = "solid_2017"
DATA_URL = (
    "ftp://ftp.pmodwrc.ch/pub/projects/SOLID/database/"
    "composite_published/SOLID_1978_published/"
)
TITLE = "SOLID solar irradiance composite spectrum (2017)"
INSTITUTION = (
    "Physikalisch-Meteorologisches Observatorium and World Radiation Center, "
    "Davos Dorf, Switzerland"
)
SOURCE = "Combined original SSI observations from 20 different instruments"
REFERENCES = "https://doi.org/10.1002/2016JA023492"


# Notebook configuration
#
# Set UPDATE_CACHE to True to refresh the on-disk cache when running this
# notebook; when False the data only lives in a temporary directory.

UPDATE_CACHE = False

## Download

In [None]:
# Names of the raw files to fetch: "solid_<offset>_100.nc" for offsets
# 0, 100, ..., 1900 (20 files in total).
FILES = [f"solid_{offset}_100.nc" for offset in range(0, 2000, 100)]


def download(url: str, path: t.Optional[os.PathLike] = None) -> pathlib.Path:
    """Download original data from url.

    Every file listed in ``FILES`` is fetched from ``url`` and written, under
    the same name, into the target directory.

    Args:
        url: URL of the remote directory to download data from. A trailing
            slash is added if missing.
        path: Path to save data to (must be a directory; it is created if it
            does not exist). If None, a temporary directory is created and
            the raw data is saved there.

    Returns:
        The directory the files were written to. When ``path`` is None this
        is the newly created temporary directory, which the caller is
        responsible for removing.

    Raises:
        ValueError: If ``path`` exists but is not a directory.
    """
    if path is None:
        # mkdtemp() leaves the directory on disk. A TemporaryDirectory object
        # held only in a local variable would be cleaned up as soon as this
        # function returns, taking the downloaded files with it.
        path = pathlib.Path(tempfile.mkdtemp())
    else:
        path = pathlib.Path(path)
        # Check before mkdir(): with exist_ok=True, mkdir() itself raises
        # FileExistsError when the path is an existing non-directory, which
        # would make this validation unreachable.
        if path.exists() and not path.is_dir():
            raise ValueError(f"Path must be a directory (got {path}).")
        path.mkdir(parents=True, exist_ok=True)

    # File names are appended directly, so the base must end with a slash.
    base_url = url if url.endswith("/") else url + "/"

    for file in FILES:
        with closing(urllib.request.urlopen(base_url + file)) as response:
            with open(path / file, "wb") as out:
                shutil.copyfileobj(response, out)

    return path

## Format

In [None]:
import re


def format_missing_carats_units(s: str) -> str:
    """Add missing carats to a malformed unit string.

    Will format a string 'm-1' to 'm^-1' and 'm2' to 'm^2'. Exponents that
    already carry a carat are left alone, so the function can safely be
    applied to an already well-formed string.

    Args:
        s: Unit string.

    Returns:
        Formatted unit string.
    """
    # Signed exponents: insert '^' in front of a sign followed by a digit,
    # unless a carat is already there (the lookbehind keeps 'm^-1' intact).
    s = re.sub(r"(?<!\^)([-+][0-9])", r"^\1", s)

    # Unsigned exponents: insert '^' between a lowercase letter (or a space)
    # and the digit that directly follows it.
    return re.sub(r"([a-z ])([0-9])", r"\1^\2", s)


def format(
    data: os.PathLike,
    path: t.Optional[os.PathLike] = None,
) -> t.Optional[xr.Dataset]:
    """Format original data.

    Opens every ``*.nc`` file found in ``data``, merges them, attaches units
    to the irradiance and wavelength arrays and builds the formatted dataset
    with ``tengen.to_dataset``.

    Args:
        data: Path to original data directory.
        path: Directory to save formatted data to. If None, the dataset is
            returned.

    Returns:
        Formatted data if ``path`` is None, otherwise None (the dataset is
        written to ``<path>/<IDENTIFIER>.nc`` instead).

    Raises:
        FileNotFoundError: If ``data`` contains no ``*.nc`` file.
    """
    # Sorted so the merge input order does not depend on the file system.
    files = sorted(pathlib.Path(data).glob("*.nc"))
    if not files:
        raise FileNotFoundError(f"No '*.nc' file found in {data}.")

    # Opened one by one (rather than in a comprehension) so that the files
    # already opened are still closed if a later one fails to open.
    datasets = []
    try:
        for file in files:
            datasets.append(xr.open_dataset(file))
        merged = xr.merge(datasets)

        # Pull everything needed into memory before the files are closed.
        n_times = merged.time.size
        ssi_magnitude = merged.data.values.transpose()
        ssi_units = format_missing_carats_units(merged.data.attrs["units"])
        w_magnitude = merged.wavelength.values
        w_units = merged.wavelength.attrs["units"]
    finally:
        # Release the file handles; otherwise the caller may be unable to
        # remove the directory holding the raw files.
        for dataset in datasets:
            dataset.close()

    # NOTE(review): the time axis is rebuilt as consecutive days ending on
    # 2014-12-30 instead of being read from the files - confirm that this
    # matches the time coordinate of the raw data.
    end = date(2014, 12, 30)
    start = end - timedelta(n_times - 1)
    time_index = pd.date_range(start, end, freq="D")

    ssi = ssi_magnitude * ureg(ssi_units)
    w = w_magnitude * ureg(w_units)

    ds = tengen.to_dataset(
        w=w,
        ssi=ssi,
        data_url=DATA_URL,
        t=time_index,
        attrs={
            "title": TITLE,
            "institution": INSTITUTION,
            "source": SOURCE,
            "references": REFERENCES,
            "observation_period": f"{start} - {end}",
        },
    )

    if path is None:
        return ds

    # Accept plain strings as well as path objects for the output directory.
    ds.to_netcdf(pathlib.Path(path) / f"{IDENTIFIER}.nc")
    return None

## Run

In [None]:
# (leave this cell as is)
#
# Entry point of the notebook: download the raw files and format them.
# UPDATE_CACHE selects whether the results are kept on disk or only live in
# a temporary directory.

if UPDATE_CACHE:
    # Persistent mode: keep the raw files under tengen.RAW_DATA_DIR ...
    original_data_dir = tengen.RAW_DATA_DIR / IDENTIFIER
    original_data_dir.mkdir(parents=True, exist_ok=True)

    download(url=DATA_URL, path=original_data_dir)

    # ... and write the formatted dataset under tengen.FORMATTED_DATA_DIR.
    formatted_data_dir = tengen.FORMATTED_DATA_DIR / IDENTIFIER
    formatted_data_dir.mkdir(parents=True, exist_ok=True)

    # A non-None `path` makes format() save the dataset instead of returning it.
    format(data=original_data_dir, path=formatted_data_dir)

else:
    # Transient mode: the raw files go to a temporary directory that is
    # removed when the `with` block exits; only the in-memory dataset remains.
    with tempfile.TemporaryDirectory() as tmpdir:
        download(url=DATA_URL, path=tmpdir)
        # path=None makes format() return the formatted dataset.
        dataset = format(data=tmpdir, path=None)