# Prepare Inputs

This notebook fetches the external data and updates the copies stored in this repository.

1. Fetch the Google Sheet containing the list of products and descriptions
   - Store the data in `input/overview.csv`
2. Fetch example product files for each product and extract metadata
   - Store tables of metadata for variables in `input/vartables/*.csv`

In [None]:
import io
import json
import os
import posixpath
import shutil
import urllib
import urllib.request
import zipfile
from ftplib import FTP, error_perm
from tempfile import NamedTemporaryFile
# import datetime as dt
# from time import sleep

import cdflib
import pandas as pd
import requests
# import pooch

In [None]:
# Published link that exports the products spreadsheet in xlsx format
DOC_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vStz17Gi-O3tJjWcT_F0zYj4eCVuiiaU9ewpKTLlu_qRak-Cd0NHG3oQa0lcVFmWC2TFK3ecZHvdPxT/pub?output=xlsx"
# Output locations, made absolute relative to the current working directory
CSV_PATH = os.path.abspath("input/overview.csv")
CSV_VARTABLES_PATH = os.path.abspath("input/vartables")

## Fetch Google Doc (products overview)

In [None]:
def load_google_sheet(url=DOC_URL):
    """Download the published spreadsheet and return its "Overview" sheet.

    Args:
        url (str): Link to the spreadsheet, published in xlsx format.

    Returns:
        pandas.DataFrame: The "Overview" sheet, read with its second row as
        the header, indexed by the "Name" column, and with empty cells
        replaced by "-".

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail here on an HTTP error rather than handing an error page to the
    # Excel parser
    response.raise_for_status()
    # read_excel wants a path or a file-like object, so wrap the raw bytes
    workbook = io.BytesIO(response.content)
    overview = pd.read_excel(workbook, sheet_name="Overview", header=1)
    return overview.set_index("Name").fillna("-")

In [None]:
# Download the products overview; the cells below read and modify it
overview = load_google_sheet(DOC_URL)

In [None]:
# Preview the first rows of the table
overview.head()

In [None]:
# Turn the percent-encoded slashes of the HTTP links back into "/", then build
# a candidate FTP link from each of them
url_http = overview.get("Link: HTTP").apply(
    lambda link: link.replace("%2F", "/")
)
url_ftp_guessed = url_http.apply(
    lambda link: link.replace("#swarm/", "").replace("https", "ftp")
)
# Wherever the sheet only holds the "-" placeholder, use the guessed FTP link
for i, name in enumerate(overview.index):
    if overview["Link: FTP"].iloc[i] == "-":
        overview.loc[name, "Link: FTP"] = url_ftp_guessed.iloc[i]

In [None]:
# Display the whole table to check the FTP links
overview

In [None]:
# Write the overview table (index included) to CSV_PATH
overview.to_csv(CSV_PATH)

## Fetch VirES `product_types.json` (tables of metadata for variables in products)

In [None]:
# Raw product_types.json taken from the "staging" branch of VirES-Server
PRODUCT_TYPES_URL = "https://raw.githubusercontent.com/ESA-VirES/VirES-Server/staging/vires/vires/data/product_types.json"

In [None]:
def load_product_types_json(url=PRODUCT_TYPES_URL):
    """Download the product type definitions and key them by product name.

    Args:
        url (str): Link to a JSON document holding a list of objects, each
            with a "name" entry.

    Returns:
        dict: Maps each "name" to its full definition. If a name appears more
        than once, the last definition wins.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail here on an HTTP error rather than trying to parse an error page
    response.raise_for_status()
    product_types = json.loads(response.content)
    return {product["name"]: product for product in product_types}

In [None]:
# Product definitions keyed by product name
product_metadata = load_product_types_json(PRODUCT_TYPES_URL)

In [None]:
# Mapping between names used in json file and our csv
# Keys are the short names, values are the same names with the "SW_" prefix.
# The commented-out entries are not part of the mapping.
names_short_to_long = {
    "MODx_SC_1B": 'SW_MODx_SC_1B',
    "MAGx_LR_1B": 'SW_MAGx_LR_1B',
    "MAGx_HR_1B": 'SW_MAGx_HR_1B',
    "EFIx_LP_1B": 'SW_EFIx_LP_1B',
    "IBIxTMS_2F": 'SW_IBIxTMS_2F',
    "EEFxTMS_2F": 'SW_EEFxTMS_2F',
    "FACxTMS_2F": 'SW_FACxTMS_2F',
    "TECxTMS_2F": 'SW_TECxTMS_2F',
    "IPDxIRR_2F": 'SW_IPDxIRR_2F',
    # "": 'SW_AUX_IMF_2_',
    # "AEJxLPL_2F": 'SW_AEJxLPL_2F',
    # "AEJxPBL_2F": 'SW_AEJxPBL_2F',
    # "AEJxLPS_2F": 'SW_AEJxLPS_2F',
    # "AEJxPBS_2F": 'SW_AEJxPBS_2F',
    # "AOBxFAC_2F": 'SW_AOBxFAC_2F',
    # "MITx_LP_2F": 'SW_MITx_LP_2F',
    # "MITxTEC_2F": 'SW_MITxTEC_2F',
    # "PPIxFAC_2F": 'SW_PPIxFAC_2F',
    # "": 'OMNI_HR_1min',
    # "AUX_OBSS2_": 'SW_AUX_OBSx2_',
    # "AUX_OBSM2_": 'SW_AUX_OBSx2_',
    # "AUX_OBSH2_": 'SW_AUX_OBSH2_',
    # "VOBS_1M_2_": 'SW_VOBS_xM_2_',
    # "VOBS_4M_2_": 'SW_VOBS_xM_2_',
    # "": 'GRACE_x_MAG',
    # "": 'GFx_FGM_ACAL',
    # "": 'CS_MAG'
}

In [None]:
# A collection may hold several subcollections: show the dataset keys of each
for content in product_metadata.values():
    print(content["datasets"].keys())

## Attempt to fetch example file for each product and extract metadata

In [None]:
def split_host_link(link):
    """Split a link such as "ftp://host/some/dir" into host and directory.

    Args:
        link (str): Link made of a scheme, a host and an optional path.

    Returns:
        tuple: (host, directory). The directory has no leading or trailing
        "/" and is "" when the link has no path after the host.

    Raises:
        ValueError: If the link holds no host.
    """
    # Links use "/" on every operating system, so use posixpath rather than
    # os.path. normpath collapses "scheme://" to "scheme:/" and drops repeated
    # or trailing slashes, which leaves [scheme, host, dir, ...] after the
    # split.
    parts = posixpath.normpath(link).split("/")
    if len(parts) < 2:
        raise ValueError(f"No host found in link: {link!r}")
    host = parts[1]
    # Joining by hand also copes with a link that has no directory part
    directory = "/".join(parts[2:])
    return host, directory


def find_file(host, directory, match=""):
    """Try to find a file to use

    Lists the .ZIP files of the "Sat_A" subdirectory ("Sat_AC" when match is
    "FAC_TMS_2F"), falling back to the directory itself when that listing is
    empty, and picks the name that sorts last.

    Args:
        host (str): FTP server to log in to (anonymous login).
        directory (str): Directory on the server to search.
        match (str): When not empty, keep only the file names containing it
            and drop those containing "VAL_".

    Returns:
        str or None: "ftp://" link to the chosen file, or None when nothing
        suitable was found (the reason is printed).
    """
    def check_dir(dir_):
        """List the .ZIP files found in the given directory."""
        print("Searching:", dir_)
        with FTP(host) as ftp:
            ftp.login()
            try:
                # FTP paths use "/" whatever the local operating system
                return ftp.nlst(posixpath.join(dir_, "*.ZIP"))
            except error_perm as err:
                # Some servers answer "550" instead of an empty listing when
                # nothing matches; treat that as "no files" so that the
                # fallback below still runs
                if str(err).startswith("550"):
                    return []
                raise

    subdirectory = "Sat_AC" if match == "FAC_TMS_2F" else "Sat_A"
    files = check_dir(posixpath.join(directory, subdirectory))
    if not files:
        files = check_dir(directory)
        if not files:
            print("! No .ZIP files at:", directory)
            return None
    if match != "":
        # Reduce to the subset containing "match", without validation reports
        files = [f for f in files if match in f and "VAL_" not in f]
        if not files:
            print(f"! No match found containing '{match}' and without 'VAL_'")
            return None
    # Identify most recent file (the name that sorts last)
    matched_file = max(files)
    # Build the link by hand: a path join would drop the host if the server
    # returned an absolute path
    file_url = "ftp://" + host + "/" + matched_file.lstrip("/")
    print("Found:", file_url)
    return file_url


# cdf_file_name_exceptions_match_pattern = {
#     "MAGx_CA_1B": "MDR_MAG",
#     "MAGx_HR_1B": "MDR_MAG",
#     "MAGx_LR_1B": "MDR_MAG"
# }


def fetch_zipped_file(url, match="cdf"):
    """Fetch a given file type from within an online zip file

    Args:
        url (str): Link to the zip archive.
        match (str): Text that the name of the wanted archive member must
            contain, either as given or upper-cased.

    Returns:
        file object or None: An open NamedTemporaryFile holding the extracted
        member, positioned at its start. The file is deleted when it is
        closed, so the caller must keep it open while using its name. None
        when no member matches.
    """
    # Download into a temporary file that is removed as soon as this block
    # ends, so that no copy of the archive is left behind
    with NamedTemporaryFile() as zip_tmp:
        with urllib.request.urlopen(url) as response:
            shutil.copyfileobj(response, zip_tmp)
        zip_tmp.seek(0)
        with zipfile.ZipFile(zip_tmp, 'r') as zip_ref:
            file_names = zip_ref.namelist()
            file_names_matched = [
                i for i in file_names if (match in i) or (match.upper() in i)
            ]
            if not file_names_matched:
                print("! Could not find match out of:", file_names)
                return None
            if len(file_names_matched) > 1:
                print("! Found too many:", file_names_matched)
                # Special case with MDR_MAG files
                potential = [i for i in file_names_matched if "MDR_MAG" in i]
                if len(potential) == 1:
                    file_names_matched = potential
            file_name = file_names_matched[0]
            print("Using file:", file_name)
            # Only create the output file once there is something to put in it
            output_file = NamedTemporaryFile()
            try:
                with zip_ref.open(file_name) as f:
                    shutil.copyfileobj(f, output_file)
                # Make sure the content is on disk for readers that open the
                # file by name
                output_file.flush()
                output_file.seek(0)
            except Exception:
                output_file.close()
                raise
    return output_file


# def try_x_times(fn, x=3, delay_factor=10):
#     """Attempt function, fn, x times, with increasing delays"""
#     for attempt in range(x):
#         try:
#             outputs = fn()
#         except Exception as e:
#             if attempt < x - 1:
#                 delay = (attempt + 1) * delay_factor
#                 print(f"Failed ({e}). Trying again in {delay}s...")
#                 sleep(delay)
#                 continue
#             else:
#                 raise
#         return outputs

In [None]:
def get_var_attributes(cdf, varname) -> dict:
    """Collect the metadata of one variable of an opened CDF file.

    Args:
        cdf: Object offering varattsget(name) and varinq(name), both returning
            dictionaries.
        varname (str): Name of the variable to describe.

    Returns:
        dict: "Name", "Units", "Description", "Dim" and "Type" of the
        variable. "Units" and "Description" fall back to "-" when the
        attribute is missing. "Dim" is a string: "1" for an empty "Dim_Sizes"
        list, the single size for a one-element list, otherwise the text of
        the whole list.
    """
    varatts = cdf.varattsget(varname)
    # Query the variable information once: both the dimensions and the data
    # type come from it
    varinfo = cdf.varinq(varname)
    dims = varinfo.get("Dim_Sizes")
    if dims == []:
        dims = "1"
    elif len(dims) == 1:
        dims = str(dims[0])
    else:
        dims = str(dims)
    return {
        "Name": varname,
        "Units": varatts.get("UNITS", "-"),
        "Description": varatts.get("DESCRIPTION", "-"),
        "Dim": dims,
        "Type": varinfo.get("Data_Type_Description"),
    }


def make_vartable(cdf) -> pd.DataFrame:
    """Build a table describing every zVariable listed by cdf.cdf_info().

    The table has one row per variable (index named "Variable") and the
    columns "Units", "Description", "Dim" and "Type", filled from
    get_var_attributes.
    """
    columns = ["Units", "Description", "Dim", "Type"]
    variable_names = cdf.cdf_info().get("zVariables")
    table = pd.DataFrame(columns=columns, index=variable_names)
    table.index.name = "Variable"
    for variable in variable_names:
        attributes = get_var_attributes(cdf, variable)
        # The attributes also carry "Name", which is already the row label
        for column, value in attributes.items():
            if column in columns:
                table.loc[variable, column] = value
    return table

In [None]:
def find_file_and_make_vartable(product_name):
    """Find an example file of a product and tabulate its variables.

    Args:
        product_name (str): Row label of the product in the overview table.

    Returns:
        pandas.DataFrame or None: Table made by make_vartable, or None when
        the product has no FTP link, no file was found on the server, or the
        archive holds no cdf file (the reason is printed).
    """
    # Find example file
    ftp_link = overview.loc[product_name, "Link: FTP"]
    if ftp_link[:3] != "ftp":
        print("! No ftp link for:", product_name)
        return None
    host, directory = split_host_link(ftp_link)
    matchname = product_name.replace("x", "A").replace("*", "")
    file_zip_url = find_file(host, directory, match=matchname)
    if file_zip_url is None:
        # find_file has already printed the reason; do not attempt a download
        print("! Unable to find match for:", product_name)
        return None
    # Download and open file, extracting the metadata
    _file = fetch_zipped_file(file_zip_url, match="cdf")
    if _file is None:
        print("! Unable to find match for:", product_name)
        return None
    # The file is read by name, so it must stay open (closing deletes it)
    # until the table is built; the with block then closes it
    with _file:
        cdf = cdflib.cdfread.CDF(_file.name, string_encoding='utf-8')
        return make_vartable(cdf)

In [None]:
# Make sure the output folder exists, otherwise every product would fail at
# the saving step, after its file has already been downloaded
os.makedirs(CSV_VARTABLES_PATH, exist_ok=True)
failed_products = []
products = overview.index
# Uncomment one of these lines to process only a few products
# products = ["MAGx_CA_1B", "MAGx_HR_1B", "MAGx_LR_1B"]
# products = ["MCO_SHA_2C", "MMA_SHA_2F", "MCR_1DM2"]
# products = ["FAC_TMS_2F"]
for product_name in products:
    try:
        vartable = find_file_and_make_vartable(product_name)
        if vartable is not None:
            csv_name = f"{product_name}.csv"
            vartable.to_csv(os.path.join(CSV_VARTABLES_PATH, csv_name))
            print("Saved:", csv_name)
    except Exception as e:
        # Best effort: report the problem, remember the product and carry on
        # with the next one
        print("! Failed:", product_name)
        print(e)
        failed_products.append(product_name)
    print()