In [1]:
import re
import pandas as pd
import numpy as np
from calendar import month_name

In [2]:
def read_emodnet(filepath, flag):
    input_columns = [
        "datetime", "longitude", "latitude", "depth",
        "nitrate", "nitrite", "ammonium", "phosphate", "silicate", "oxygen", "chlorophyll",
        "instrument_type", "cruise_id"]
    output_columns = [
        "YYYY", "MM", "DD", "lat", "lon", "depth",
        "nitrate", "nitrite", "ammonium", "phosphate", "silicate", "oxygen", "chlorophyll",
        "ID_type_profile", "cruise_id", "flag"]

    df = pd.read_parquet(filepath, columns=input_columns)
    df["YYYY"] = df.datetime.map(lambda dt: dt.year)
    df["MM"] = df.datetime.map(lambda dt: dt.month)
    df["DD"] = df.datetime.map(lambda dt: dt.day)
    df["instrument_type"] = df.instrument_type.map(lambda b: b.decode("utf-8"))
    df["cruise_id"] = df.cruise_id.map(lambda b: b.decode("utf-8"))
    df["flag"] = flag
    df.rename(columns={"latitude": "lat", "longitude": "lon", "instrument_type": "ID_type_profile"}, inplace=True)

    return df[output_columns]

In [3]:
def parse_month(m):
    if re.fullmatch(r'[0-9]+\.0+', m):
        return float(m)
    else:
        search = map(lambda s: re.match(f"(?i){m}", s) is not None, month_name)
        return float(list(search).index(True))

In [4]:
def parse_instrument_type(t):
    if re.match("1", t) is not None:
        return "nut"
    else:
        return "probe"

In [5]:
def read_cruises(filename, flag, idcampains):
    df = pd.read_csv(filename, converters={"MM": parse_month, "ID_type_profile": parse_instrument_type})
    df["YYYY"] = df.YYYY.map(int)
    df["MM"] = df.MM.map(int)
    df["DD"] = df.DD.map(int)
    df["idcampain"] = df.idcampain.map(int)
    df["cruise_id"] = np.fromiter(map(lambda n: idcampains[n], df.idcampain - 1), dtype=object)
    for name in ["density", "id_profile"]:
        if name in df.columns:
            df.drop(columns=name, inplace=True)
    df["flag"] = flag
    return df

In [8]:
emodnet_profiles = read_emodnet("../1999-2023/parquet/emodnet_profile.parquet", "1-emodnet_profile")
emodnet_timeseries = read_emodnet("../1999-2023/parquet/emodnet_timeseries.parquet", "1-emodnet_timeseries")
emodnet = pd.concat([emodnet_timeseries, emodnet_profiles], ignore_index=True)

emodnet

ValueError: Following columns were requested but are not available: {'chlorophyll'}.
All requested columns: ['datetime', 'longitude', 'latitude', 'depth', 'nitrate', 'nitrite', 'ammonium', 'phosphate', 'silicate', 'oxygen', 'chlorophyll', 'instrument_type', 'cruise_id']
Available columns: ['longitude', 'latitude', 'cruise_id', 'instrument_type', 'platform_type', 'station_id', 'station_ncref', 'depth', 'datetime', 'nitrate', 'nitrate_qc', 'nitrite', 'nitrite_qc', 'ammonium', 'ammonium_qc', 'phosphate', 'phosphate_qc', 'silicate', 'silicate_qc', 'oxygen', 'oxygen_qc', 'alkalinity', 'alkalinity_qc', 'dic', 'dic_qc', 'salinity', 'salinity_qc', 'temperature', 'temperature_qc']

In [None]:
emodnet_rounded = emodnet[["YYYY", "MM", "DD"]].copy()
emodnet_rounded["lon"] = np.round(emodnet.lon, 3)
emodnet_rounded["lat"] = np.round(emodnet.lat, 3)
emodnet_rounded["depth"] = np.round(emodnet.depth, 1)

emodnet = emodnet[~emodnet_rounded.duplicated(keep=False)]
emodnet

In [None]:
vdb_idcampains = ['TALPRO', 'SOMBA', 'PEACETIME', 'MEDWAVES', 'MSM72', 'GIANI']
vdb_cruises = read_cruises("df_Nut_cruises_Med_New.csv", "2-cruises_new", vdb_idcampains)

gpc_idcampains = ['06MT51/2','BIOPT06','CANARI','DYFAMED','DYFAMED/PAPADOC - 99','MEDCIESM',
'MEDGOOS2','MEDGOOS3','MEDGOOS4','MEDGOOS5','MELISSA 2004','MT84_3','NORBAL','NORBAL2',
'NORBAL3','NORBAL4','POSEIDONE1M3A','PROSOPE','RHOFI 1','RHOFI 2','RHOFI 3','SINAPSI-3',
'SINAPSI-4']
gpc_cruises = read_cruises("df_Nut_cruises_Med_GP.csv", "3-cruises_old", gpc_idcampains)

cruises = pd.concat([vdb_cruises, gpc_cruises], ignore_index=True)
cruises

In [None]:
data = pd.concat([emodnet, cruises])
data.sort_values(by=["YYYY", "MM", "DD", "lat", "lon", "depth", "flag"], inplace=True)
data.drop_duplicates(subset=["YYYY", "MM", "DD", "lat", "lon", "depth"], inplace=True, ignore_index=True)
data["idcampain"] = pd.Categorical(data.cruise_id).codes
data

In [None]:
data.to_parquet("1999-2023_nutrients.parquet")