# Meftah, 2018

## Setup

In [None]:
# Imports

import datetime
import os
from pathlib import Path
import tempfile
import typing as t
import zlib

import numpy as np
import requests
import xarray as xr

import tengen
from tengen import unit_registry as ureg


# Dataset attributes
# (metadata describing the Meftah et al. (2018) spectrum; the title,
# institution, source and references strings are attached to the formatted
# dataset as attributes)

IDENTIFIER = "meftah_2018"
RAW_DATA_URL = (
    "http://cdsarc.u-strasbg.fr/ftp/J/A+A/611/A1/spectrum.dat.gz"
)
TITLE = "Meftah et al (2018) solar irradiance reference spectrum"
INSTITUTION = (
    "CNRS, LATMOS, Université Paris Saclay, "
    "Université Pierre et Marie Curie, UVSQ, INSU, IPSL, "
    "75005 Paris, France"
)
SOURCE = (
    "Observations from the SOLSPEC instrument of the SOLAR payload "
    "onboard the international space station"
)
REFERENCES = "https://doi.org/10.1051/0004-6361/201731316"


# Notebook configuration

# Set to True to refresh the on-disk cache when running this notebook;
# when False, the data is only downloaded to a temporary directory.
UPDATE_CACHE = False

## Raw data download

In [None]:
FILE = "spectrum.dat"  # name of the decompressed raw data file


def download_raw_data(url: str, path: t.Optional[os.PathLike] = None) -> None:
    """Download the gzip-compressed raw data and save it decompressed.

    The payload fetched from ``url`` is decompressed in memory and written
    to a file named ``FILE`` inside the target directory.

    Args:
        url: URL to download data from.
        path: Directory to save data to; it is created if it does not exist.
            If None, the data is saved to a temporary directory.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        zlib.error: If the downloaded payload is not valid compressed data.
    """
    # Honour the caller-supplied URL rather than a module-level constant, and
    # bound the wait so that an unresponsive server cannot hang the notebook.
    response = requests.get(url, timeout=60)
    # Fail early on HTTP errors instead of trying to decompress an error page.
    response.raise_for_status()
    # Adding 32 to the window size enables automatic zlib/gzip header
    # detection.
    raw_data = zlib.decompress(response.content, zlib.MAX_WBITS | 32)

    if path is None:
        # NOTE(review): the temporary directory is removed when `tmpdir` is
        # finalized, i.e. typically as soon as this function returns, so the
        # file is not available to the caller afterwards. Pass an explicit
        # `path` to keep the data.
        tmpdir = tempfile.TemporaryDirectory()
        path = Path(tmpdir.name)
    else:
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

    (path / FILE).write_bytes(raw_data)

## Raw data formatting

In [None]:
def format_raw_data(
    raw_data: os.PathLike,
    path: t.Optional[os.PathLike] = None,
) -> t.Optional[xr.Dataset]:
    """Format raw data.

    Reads the two-column raw file (wavelength, spectral irradiance), drops
    the points below 165 nm, attaches units and metadata, and either writes
    the result to a netCDF file or returns it.

    Args:
        raw_data: Path to raw data directory (must contain the ``FILE`` file).
        path: Directory to save formatted data to; it is created if it does
            not exist. If None, the dataset is returned instead.

    Returns:
        The formatted dataset if ``path`` is None, otherwise None.
    """
    file = Path(raw_data) / FILE

    # Read raw data; "---" entries are treated as missing and become NaN.
    data = np.genfromtxt(
        fname=file,
        missing_values="---",
        filling_values=np.nan,
    )

    wavelength = data[:, 0]
    spectral_irradiance = data[:, 1]

    # The raw data covers the 0.5 to 3000.10 nm range whereas the range
    # indicated by Meftah (2018) in:
    # https://doi.org/10.1051/0004-6361/201731316
    # is 165 to 3000 nm.
    # Therefore, we ignore wavelengths < 165, and keep the 3000.10 nm point.
    mask = wavelength >= 165.0

    w = wavelength[mask] * ureg.nm
    ssi = spectral_irradiance[mask] * ureg.W / ureg.m ** 2 / ureg.nm

    start = datetime.date(2008, 4, 5)
    end = datetime.date(2016, 12, 31)
    # date.isoformat() yields YYYY-MM-DD.
    observation_period = f"{start.isoformat()} to {end.isoformat()}"

    ds = tengen.to_dataset(
        ssi=ssi,
        w=w,
        data_url=RAW_DATA_URL,
        attrs={
            "title": TITLE,
            "institution": INSTITUTION,
            "source": SOURCE,
            "references": REFERENCES,
            "observation_period": observation_period,
        },
    )

    if path is None:
        return ds

    # Coerce to Path so that plain strings are accepted too (``str / str``
    # raises TypeError), and make sure the output directory exists.
    output_dir = Path(path)
    output_dir.mkdir(parents=True, exist_ok=True)
    ds.to_netcdf(output_dir / f"{IDENTIFIER}.nc")
    return None

## Run

In [None]:
# (leave this cell as is)

# Two modes, selected by UPDATE_CACHE:
# - False: download into a throwaway directory and keep the formatted
#   dataset in memory only (as `dataset`);
# - True: download into and write the formatted file under the tengen
#   raw/formatted data directories for this dataset identifier.
if not UPDATE_CACHE:
    with tempfile.TemporaryDirectory() as tmpdir:
        download_raw_data(url=RAW_DATA_URL, path=tmpdir)
        dataset = format_raw_data(raw_data=tmpdir, path=None)

else:
    raw_data_dir = tengen.RAW_DATA_DIR / IDENTIFIER
    raw_data_dir.mkdir(parents=True, exist_ok=True)

    formatted_data_dir = tengen.FORMATTED_DATA_DIR / IDENTIFIER
    formatted_data_dir.mkdir(parents=True, exist_ok=True)

    download_raw_data(url=RAW_DATA_URL, path=raw_data_dir)
    format_raw_data(raw_data=raw_data_dir, path=formatted_data_dir)