In [1]:
# NMDB helper functions: download monthly / yearly neutron monitor data
# in a robust way, directly into pandas DataFrames.
import re
from calendar import monthrange
from io import StringIO
from pathlib import Path

import pandas as pd
import requests

# Base URL for NMDB "draw_graph" interface
BASE_URL = "https://www.nmdb.eu/nest/draw_graph.php"

In [2]:
def nmdb_month_df(year, month, stations,
                  dtype="corr_for_efficiency",
                  tresolution=2,
                  timeout=300):
    """
    Download one month of NMDB data and return a tidy pandas DataFrame.

    Parameters
    ----------
    year : int
        Year (e.g. 2000).
    month : int
        Month as integer 1–12.
    stations : list of str (or a single str)
        Requested station codes, e.g.
        ["MXCO", "JUNG1", "LMKS", "NEWK", "KERG", "OULU", "THUL", "SOPO"].
        A single code given as a plain string is treated as a one-element list.
    dtype : str, optional
        NMDB data type, e.g. "corr_for_efficiency", "count_rate", etc.
    tresolution : int, optional
        Time resolution in minutes (as used by NMDB).
    timeout : float or (connect, read) tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        server would block the call indefinitely.

    Returns
    -------
    df : pandas.DataFrame
        DataFrame with columns:
          - "DATETIME" (UTC timestamps)
          - one float64 column per station in `stations`, in the requested
            order (NaN where missing)

        If a station is requested but not available for that month, the column
        is still present and filled with NaN.

    Raises
    ------
    ValueError
        If the response contains no line starting with a "YYYY-MM-DD" date.
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).

    Notes
    -----
    - NMDB sometimes returns only a subset of the requested stations for
      a given time range. We detect which stations actually appear in the
      header and then add empty columns for the missing ones, so the
      column layout is consistent across months.
    - Lines that do not look like data (e.g. header, comments, legal text)
      are ignored.
    - Values that cannot be parsed as numbers become NaN, so the station
      columns always have a numeric dtype.
    """

    # A bare string would otherwise be iterated character by character below.
    if isinstance(stations, str):
        stations = [stations]
    station_list = list(stations)

    last_day = monthrange(year, month)[1]

    # Parameters are essentially those encoded in the NMDB query URL
    params = {
        "formchk": 1,
        "stations[]": station_list,
        "tabchoice": "revori",
        "dtype": dtype,
        "tresolution": tresolution,
        "force": 1,
        "yunits": 0,
        "date_choice": "bydate",
        "start_day": 1,
        "start_month": month,
        "start_year": year,
        "start_hour": 0,
        "start_min": 0,
        "end_day": last_day,
        "end_month": month,
        "end_year": year,
        "end_hour": 23,
        "end_min": 59,
        "output": "ascii",
    }

    response = requests.get(BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    lines = response.text.splitlines()

    # A data line starts with "YYYY-MM-DD" followed by whitespace.
    timestamp_re = re.compile(r'^\s*\d{4}-\d{2}-\d{2}\s')

    # ------------------------------------------------------------------
    # 1) Detect the station header line.
    #    It is the last non-comment, non-empty line that appears before the
    #    first data line.
    # ------------------------------------------------------------------
    header_line = None
    for ln in lines:
        if ln.startswith("#") or not ln.strip():
            continue
        # If it's already a timestamp, we've reached the data block.
        if timestamp_re.match(ln):
            break
        header_line = ln

    if header_line is not None:
        # Example: "                       MXCO    KERG    OULU    SOPO"
        stations_in_file = header_line.split()
    else:
        # Fallback: assume all requested stations are present (rare case).
        stations_in_file = station_list

    # ------------------------------------------------------------------
    # 2) Extract only the data lines, which start with "YYYY-MM-DD ..."
    # ------------------------------------------------------------------
    data_lines = [ln for ln in lines if timestamp_re.match(ln)]

    if not data_lines:
        raise ValueError(f"No data lines found for {year}-{month:02d}. "
                         "Check that the stations / dates are valid and NMDB has data.")

    # Columns as they appear in the file: DATETIME + stations_in_file
    file_columns = ["DATETIME"] + list(stations_in_file)

    # ------------------------------------------------------------------
    # 3) Read into a DataFrame using only valid data lines
    # ------------------------------------------------------------------
    df = pd.read_csv(
        StringIO("\n".join(data_lines)),
        sep=";",
        header=None,
        names=file_columns,
        na_values=["null"],
        on_bad_lines="skip",  # ignore malformed lines instead of failing
        engine="python",
    )

    # ------------------------------------------------------------------
    # 4) Parse timestamps and drop any rows where DATETIME is invalid
    # ------------------------------------------------------------------
    df["DATETIME"] = pd.to_datetime(df["DATETIME"].str.strip(),
                                    errors="coerce", utc=True)
    df = df.dropna(subset=["DATETIME"]).reset_index(drop=True)

    # ------------------------------------------------------------------
    # 5) Make every requested station a float64 column.
    #    Present columns are coerced (unparseable tokens -> NaN); missing
    #    ones are created as float NaN rather than pd.NA, which would give
    #    an object-dtype column.
    # ------------------------------------------------------------------
    for st in station_list:
        if st in df.columns:
            df[st] = pd.to_numeric(df[st], errors="coerce").astype("float64")
        else:
            df[st] = float("nan")

    # ------------------------------------------------------------------
    # 6) Reorder columns: DATETIME + stations (in requested order)
    # ------------------------------------------------------------------
    ordered_cols = ["DATETIME"] + station_list
    df = df[ordered_cols]

    return df


def nmdb_year_df(year, stations, dtype="corr_for_efficiency", tresolution=2):
    """
    Download a full year of NMDB data for a set of stations and return
    a single concatenated DataFrame.

    Each of the 12 months is fetched with `nmdb_month_df`. A month is
    skipped (with a printed message) when that call raises ``ValueError``
    or ``requests.HTTPError``, or when it returns an empty DataFrame.
    Any other exception propagates to the caller.

    Parameters
    ----------
    year : int
        Year (e.g. 2000, 2018, 2025, etc.).
    stations : list of str
        Station list, e.g.
        ["MXCO", "JUNG1", "LMKS", "NEWK", "KERG", "OULU", "THUL", "SOPO"].
    dtype : str, optional
        NMDB data type (passed to `nmdb_month_df`).
    tresolution : int, optional
        Time resolution in minutes (passed to `nmdb_month_df`).

    Returns
    -------
    df_year : pandas.DataFrame
        Concatenated DataFrame for the whole year (for the months that
        actually have data), sorted by "DATETIME" with repeated timestamps
        removed (first occurrence kept) and a fresh 0..n-1 index.

    Raises
    ------
    RuntimeError
        If no month yielded any data.
    """

    monthly_dfs = []

    for month in range(1, 13):
        print(f"Downloading {year}-{month:02d} ...", flush=True)
        try:
            df_m = nmdb_month_df(
                year=year,
                month=month,
                stations=stations,
                dtype=dtype,
                tresolution=tresolution,
            )
        except ValueError as e:
            # No data lines found for this month (e.g., future month)
            print(f"  -> Skipping {year}-{month:02d}: {e}")
            continue
        except requests.HTTPError as e:
            # The server answered with an error status for this month.
            print(f"  -> Skipping {year}-{month:02d} due to HTTP error: {e}")
            continue

        if df_m.empty:
            print(f"  -> Skipping {year}-{month:02d}: empty DataFrame returned.")
            continue

        monthly_dfs.append(df_m)

    if not monthly_dfs:
        raise RuntimeError(
            f"No monthly data could be retrieved for year {year}. "
            "Check the station list and the availability of NMDB data."
        )

    # Concatenate all available months
    df_year = pd.concat(monthly_dfs, ignore_index=True)

    # Sort chronologically with a stable algorithm so that, among rows with
    # the same timestamp, download order is preserved and keep="first"
    # really keeps the first one downloaded. The default sort is not stable.
    df_year = (
        df_year
        .sort_values("DATETIME", kind="mergesort")
        .drop_duplicates(subset="DATETIME", keep="first")
        .reset_index(drop=True)
    )

    return df_year

In [3]:
# Default NMDB station codes downloaded by get_all().
stations = [
    "DOMB", "DOMC", "MRNY", "INVK",
    "APTY", "AATB", "AATA", "ROME",
]
def get_all(year, station_codes=None, out_dir="Data2"):
    """
    Download one year of NMDB data and save it as a single CSV file.

    Parameters
    ----------
    year : int
        Year to download (passed to `nmdb_year_df`).
    station_codes : list of str, optional
        Station codes to request. Defaults to the module-level `stations`.
    out_dir : str or path-like, optional
        Directory that receives "nmdb_<year>_all_stations.csv". It is
        created if it does not exist yet.

    Returns
    -------
    None
        The result is written to disk; progress is reported with `print`.
    """
    if station_codes is None:
        station_codes = stations

    df = nmdb_year_df(year, station_codes)
    # (optional but recommended) sort by time
    df = df.sort_values("DATETIME").reset_index(drop=True)
    # (optional) report how many timestamps are duplicated
    dups = df.duplicated(subset="DATETIME").sum()
    print(f"Número de timestamps duplicados: {dups}")
    # (recommended) drop duplicates by DATETIME, keeping the first occurrence
    df = df.drop_duplicates(subset="DATETIME", keep="first").reset_index(drop=True)

    # Create the target directory first: to_csv does not create missing
    # folders and would fail on a fresh checkout.
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Save the whole year in a single CSV
    df.to_csv(target_dir / "nmdb_{}_all_stations.csv".format(year), index=True)
    print("Dataset para el año {} guardado exitosamente... DOne...!".format(year))
    return

In [7]:
# Download and save one CSV per year; range() excludes its stop value,
# so this covers 2018 and 2019.
for i in range(2018, 2020):
    get_all(i)

Downloading 2018-01 ...
Downloading 2018-02 ...
Downloading 2018-03 ...
Downloading 2018-04 ...
Downloading 2018-05 ...
Downloading 2018-06 ...
Downloading 2018-07 ...
Downloading 2018-08 ...
Downloading 2018-09 ...
Downloading 2018-10 ...
Downloading 2018-11 ...
Downloading 2018-12 ...
Número de timestamps duplicados: 0
Dataset para el año 2018 guardado exitosamente... DOne...!
Downloading 2019-01 ...
Downloading 2019-02 ...
Downloading 2019-03 ...
Downloading 2019-04 ...
Downloading 2019-05 ...
Downloading 2019-06 ...
Downloading 2019-07 ...
Downloading 2019-08 ...
Downloading 2019-09 ...
Downloading 2019-10 ...
Downloading 2019-11 ...
Downloading 2019-12 ...
Número de timestamps duplicados: 0
Dataset para el año 2019 guardado exitosamente... DOne...!
