In [None]:
import re

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from calendar import month_name

In [5]:
def read_emodnet(filepath, flag):
    """Load an EMODnet parquet export and reshape it to the merged-table layout.

    Parameters
    ----------
    filepath
        Path of the parquet file. Only the columns named in ``input_columns``
        are read.
    flag
        Value written to the ``flag`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A frame holding exactly the columns of ``output_columns``, in that
        order: ``datetime`` is split into ``YYYY`` / ``MM`` / ``DD``, and
        ``latitude`` / ``longitude`` / ``instrument_type`` are renamed to
        ``lat`` / ``lon`` / ``ID_type_profile``.
    """
    input_columns = [
        "datetime", "longitude", "latitude", "depth",
        "nitrate", "nitrite", "ammonium", "phosphate", "silicate", "oxygen", "chlorophyll",
        "instrument_type", "cruise_id"]
    output_columns = [
        "YYYY", "MM", "DD", "lat", "lon", "depth",
        "nitrate", "nitrite", "ammonium", "phosphate", "silicate", "oxygen", "chlorophyll",
        "ID_type_profile", "cruise_id", "flag"]

    df = pd.read_parquet(filepath, columns=input_columns)

    # Split the date with the vectorised `.dt` accessor instead of calling a
    # Python lambda once per row. `pd.to_datetime` leaves a datetime64 column
    # as it is and also accepts a column that arrived as date/datetime objects.
    # NOTE(review): the integer width of the resulting columns is whatever the
    # installed pandas returns from `.dt.year` etc. — confirm if a fixed width
    # matters downstream.
    stamps = pd.to_datetime(df["datetime"])
    df = df.assign(
        YYYY=stamps.dt.year,
        MM=stamps.dt.month,
        DD=stamps.dt.day,
        flag=flag,
    ).rename(columns={"latitude": "lat", "longitude": "lon", "instrument_type": "ID_type_profile"})

    return df[output_columns]

In [6]:
def parse_month(m):
    """Convert a month written as a number or as an (abbreviated) name to a float.

    Accepted forms, after surrounding whitespace is removed:

    * digits, optionally followed by ``.`` and zeros: ``"8"``, ``"8.0"``,
      ``"08.00"`` -> ``8.0`` (the number is not range-checked);
    * a case-insensitive prefix of an entry of ``calendar.month_name``,
      optionally followed by dots: ``"Aug"``, ``"august"``, ``"Sept."``.
      The first entry that starts with the prefix wins, so an ambiguous
      prefix such as ``"Ma"`` resolves to March.

    An empty string is a prefix of the empty placeholder at
    ``month_name[0]`` and therefore yields ``0.0``.

    Raises
    ------
    ValueError
        If the text is neither numeric nor the start of a month name.
    """
    text = str(m).strip()

    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    if re.fullmatch(r"[0-9]+(\.0*)?", text):
        return float(text)

    # Plain prefix comparison instead of interpolating the input into a regex,
    # so characters that are special in patterns cannot change the match.
    prefix = text.rstrip(".").casefold()
    for number, name in enumerate(month_name):
        if name.casefold().startswith(prefix):
            return float(number)

    raise ValueError(f"unrecognised month: {m!r}")

In [7]:
def parse_instrument_type(t):
    """Map a raw instrument code to a label.

    Returns ``"nut"`` when the text begins with the character ``1`` and
    ``"probe"`` for anything else.
    """
    # re.match only matches at the start of the string.
    begins_with_one = re.match("1", t)
    return "probe" if begins_with_one is None else "nut"

In [8]:
def read_cruises(filename, flag, idcampains):
    """Load a cruise CSV file and normalise it for merging.

    Parameters
    ----------
    filename
        Path of the CSV file. Its ``MM`` column is parsed with
        ``parse_month`` and its ``ID_type_profile`` column with
        ``parse_instrument_type``.
    flag
        Value written to the ``flag`` column of every returned row.
    idcampains
        Campaign names; the 1-based ``idcampain`` column of the file is used
        to look up ``idcampains[idcampain - 1]``.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``YYYY``, ``MM``, ``DD`` and ``idcampain`` as
        integers, an added ``cruise_id`` column holding the campaign name, an
        added ``flag`` column, and without the ``density`` / ``id_profile``
        columns if the file had them.

    Raises
    ------
    IndexError
        If an ``idcampain`` value is smaller than 1 (raised here) or larger
        than the number of campaign names (raised by the lookup itself).
    """
    df = pd.read_csv(
        filename,
        converters={"MM": parse_month, "ID_type_profile": parse_instrument_type},
    )

    # Whole-column integer casts instead of calling int() on every row.
    for column in ("YYYY", "MM", "DD", "idcampain"):
        df[column] = df[column].astype("int64")

    # An id of 0 (or below) would become a negative position and silently pick
    # a name counted from the end of `idcampains`; reject it instead.
    positions = df["idcampain"] - 1
    if (positions < 0).any():
        raise IndexError(
            f"idcampain values must be >= 1, found {int(df['idcampain'].min())}"
        )
    df["cruise_id"] = np.fromiter(
        (idcampains[n] for n in positions), dtype=object, count=len(positions)
    )

    # errors="ignore" skips whichever of these columns the file does not have.
    df = df.drop(columns=["density", "id_profile"], errors="ignore")
    df["flag"] = flag
    return df

In [9]:
# Read the two EMODnet exports. The second argument is stored in each row's
# `flag` column, so the origin of a row stays identifiable after stacking.
emodnet_timeseries = read_emodnet("nutrients_timeseries.parquet", "1-emodnet_timeseries")
emodnet_profiles = read_emodnet("nutrients_profile.parquet", "1-emodnet_profile")

# Time series first, then profiles, renumbered with a fresh 0..n-1 index.
emodnet = pd.concat((emodnet_timeseries, emodnet_profiles), ignore_index=True)
emodnet

Unnamed: 0,YYYY,MM,DD,lat,lon,depth,nitrate,nitrite,ammonium,phosphate,silicate,oxygen,chlorophyll,ID_type_profile,cruise_id,flag
0,2007,10,17,45.489220,13.582770,1.400000,,,,,,300.728851,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
1,2007,10,17,45.489220,13.582770,1.500000,,,,,,301.058472,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
2,2007,10,17,45.489220,13.582770,1.600000,,,,,,301.359009,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
3,2007,10,17,45.489220,13.582770,1.700000,,,,,,301.583649,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
4,2007,10,17,45.489220,13.582770,1.800000,,,,,,301.718109,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7812374,1999,10,23,41.587502,2.850500,75.690529,,,,,,,0.13002,probe,Yoyo II,1-emodnet_profile
7812375,1999,10,23,41.587502,2.850500,100.980591,,,,,,,0.10783,probe,Yoyo II,1-emodnet_profile
7812376,1999,10,23,41.587502,2.850500,150.758179,,,,,,,0.12745,probe,Yoyo II,1-emodnet_profile
7812377,2008,1,1,34.723991,33.342461,0.500000,0.40,0.40,0.70,0.02,,,,probe,Zigy_monitoring,1-emodnet_profile


In [10]:
# Rounded copy of the date/position/depth key: coordinates to 3 decimals,
# depth to 1 decimal.
emodnet_rounded = emodnet[["YYYY", "MM", "DD"]].assign(
    lon=emodnet.lon.round(3),
    lat=emodnet.lat.round(3),
    depth=emodnet.depth.round(1),
)

# keep=False marks every member of a duplicate group, so all rows that collide
# on the rounded key are removed (none of them is kept). The index is not reset.
emodnet = emodnet.loc[~emodnet_rounded.duplicated(keep=False)]
emodnet

Unnamed: 0,YYYY,MM,DD,lat,lon,depth,nitrate,nitrite,ammonium,phosphate,silicate,oxygen,chlorophyll,ID_type_profile,cruise_id,flag
0,2007,10,17,45.489220,13.582770,1.400000,,,,,,300.728851,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
1,2007,10,17,45.489220,13.582770,1.500000,,,,,,301.058472,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
2,2007,10,17,45.489220,13.582770,1.600000,,,,,,301.359009,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
3,2007,10,17,45.489220,13.582770,1.700000,,,,,,301.583649,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
4,2007,10,17,45.489220,13.582770,1.800000,,,,,,301.718109,,probe,NIB-CTD-20071017_0357-0035,1-emodnet_timeseries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7812374,1999,10,23,41.587502,2.850500,75.690529,,,,,,,0.13002,probe,Yoyo II,1-emodnet_profile
7812375,1999,10,23,41.587502,2.850500,100.980591,,,,,,,0.10783,probe,Yoyo II,1-emodnet_profile
7812376,1999,10,23,41.587502,2.850500,150.758179,,,,,,,0.12745,probe,Yoyo II,1-emodnet_profile
7812377,2008,1,1,34.723991,33.342461,0.500000,0.40,0.40,0.70,0.02,,,,probe,Zigy_monitoring,1-emodnet_profile


In [11]:
# Campaign names for the "new" cruise file; position i holds the name for
# idcampain == i + 1.
vdb_idcampains = [
    "TALPRO",
    "SOMBA",
    "PEACETIME",
    "MEDWAVES",
    "MSM72",
    "GIANI",
]
vdb_cruises = read_cruises("df_Nut_cruises_Med_New.csv", "2-cruises_new", vdb_idcampains)

# Campaign names for the "old" cruise file, same 1-based convention.
gpc_idcampains = [
    "06MT51/2",
    "BIOPT06",
    "CANARI",
    "DYFAMED",
    "DYFAMED/PAPADOC - 99",
    "MEDCIESM",
    "MEDGOOS2",
    "MEDGOOS3",
    "MEDGOOS4",
    "MEDGOOS5",
    "MELISSA 2004",
    "MT84_3",
    "NORBAL",
    "NORBAL2",
    "NORBAL3",
    "NORBAL4",
    "POSEIDONE1M3A",
    "PROSOPE",
    "RHOFI 1",
    "RHOFI 2",
    "RHOFI 3",
    "SINAPSI-3",
    "SINAPSI-4",
]
gpc_cruises = read_cruises("df_Nut_cruises_Med_GP.csv", "3-cruises_old", gpc_idcampains)

# New cruises first, then old ones, renumbered with a fresh 0..n-1 index.
cruises = pd.concat((vdb_cruises, gpc_cruises), ignore_index=True)
cruises

Unnamed: 0,YYYY,MM,DD,lat,lon,depth,nitrate,nitrite,ammonium,phosphate,silicate,oxygen,chlorophyll,ID_type_profile,idcampain,cruise_id,flag,total_chlorophyll
0,2016,8,19,38.300000,13.390000,462.475261,4.794709,0.000000,,0.268091,4.629730,,,nut,1,TALPRO,2-cruises_new,
1,2016,8,19,38.300000,13.390000,398.791751,5.112925,0.000000,,0.257708,4.267643,,,nut,1,TALPRO,2-cruises_new,
2,2016,8,19,38.300000,13.390000,299.612310,4.667531,0.010304,,0.216376,3.276545,,,nut,1,TALPRO,2-cruises_new,
3,2016,8,19,38.300000,13.390000,198.996911,4.273914,0.000000,,0.195673,2.657036,,,nut,1,TALPRO,2-cruises_new,
4,2016,8,19,38.300000,13.390000,100.316211,1.214182,0.061738,,0.102897,0.864333,,,nut,1,TALPRO,2-cruises_new,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16406,2002,4,9,42.000168,18.083834,755.000000,,0.040000,,0.140000,7.400000,,,nut,23,SINAPSI-4,3-cruises_old,
16407,2002,4,9,42.000168,18.083834,806.000000,,0.030000,,0.200000,7.200000,,,nut,23,SINAPSI-4,3-cruises_old,
16408,2002,4,9,42.000168,18.083834,1008.000000,,0.020000,,0.130000,7.300000,,,nut,23,SINAPSI-4,3-cruises_old,
16409,2002,4,9,42.000168,18.083834,1110.000000,,0.040000,,0.170000,9.200000,,,nut,23,SINAPSI-4,3-cruises_old,


In [12]:
# Two rows count as the same measurement when date, position and depth agree.
key_columns = ["YYYY", "MM", "DD", "lat", "lon", "depth"]

data = (
    pd.concat([emodnet, cruises])
    # "flag" is the last sort key, so among rows with an identical key the one
    # with the smallest flag string comes first ...
    .sort_values(by=key_columns + ["flag"])
    # ... and that first row is the one drop_duplicates keeps.
    .drop_duplicates(subset=key_columns, ignore_index=True)
)

# Overwrite idcampain with integer category codes of cruise_id computed over
# the merged table.
data["idcampain"] = pd.Categorical(data.cruise_id).codes
data

Unnamed: 0,YYYY,MM,DD,lat,lon,depth,nitrate,nitrite,ammonium,phosphate,silicate,oxygen,chlorophyll,ID_type_profile,cruise_id,flag,idcampain,total_chlorophyll
0,1991,1,24,43.428000,7.852000,10.000000,2.51,0.06,,0.11,2.0,,,nut,DYFAMED,3-cruises_old,415,
1,1991,1,24,43.428000,7.852000,20.000000,2.59,0.05,,0.13,2.3,,,nut,DYFAMED,3-cruises_old,415,
2,1991,1,24,43.428000,7.852000,30.000000,2.58,0.06,,0.10,2.0,,,nut,DYFAMED,3-cruises_old,415,
3,1991,1,24,43.428000,7.852000,40.000000,2.77,0.07,,0.11,2.0,,,nut,DYFAMED,3-cruises_old,415,
4,1991,1,24,43.428000,7.852000,50.000000,2.77,0.07,,0.13,2.1,,,nut,DYFAMED,3-cruises_old,415,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7047978,2022,9,23,36.530499,34.236198,21.837097,,,,,,,0.0186,probe,Joint MBB / SODA projects survey,1-emodnet_profile,576,
7047979,2022,9,23,36.530499,34.236198,22.829638,,,,,,,0.0186,probe,Joint MBB / SODA projects survey,1-emodnet_profile,576,
7047980,2022,9,23,36.530499,34.236198,23.822172,,,,,,,0.0182,probe,Joint MBB / SODA projects survey,1-emodnet_profile,576,
7047981,2022,9,23,36.530499,34.236198,24.814703,,,,,,,0.0181,probe,Joint MBB / SODA projects survey,1-emodnet_profile,576,


In [13]:
# Write the merged table to a parquet file in the working directory.
data.to_parquet("full_nutrients.parquet")