In [4]:
import xarray as xr
import pandas as pd
import numpy as np

In [29]:
# ROOT DIRECTORY OF THE CORA PRODUCT (NO TRAILING SLASH)
root = (
    '/home/ref-coriolis-public/copernicus/'
    'INSITU_GLO_TS_OA_REP_OBSERVATIONS_013_002_b/'
    'CORIOLIS-GLOBAL-CORA05.2-OBS_FULL_TIME_SERIE'
)

def cora_preproc(ds, level_1000m=101, level_2000m=151):
    """Reduce one CORA temperature file to a few per-profile variables.

    Written to be used as the ``preprocess`` callback of
    ``xr.open_mfdataset``: it receives the dataset of a single file and
    returns a slimmed-down dataset.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset of one file. It must provide ``ds.encoding['source']``, the
        ``N_PROF`` and ``N_LEVELS`` dimensions, a ``TEMP`` variable and every
        variable listed in ``unwanted`` below. ``ds`` is modified in place
        (new variables are assigned to it) before the reduced copy is built.
    level_1000m, level_2000m : int, optional
        ``N_LEVELS`` indices extracted as ``TEMP1000`` and ``TEMP2000``.
        The defaults (101 and 151) are the indices the original author
        associates with 1000 m and 2000 m.
        NOTE(review): confirm these against the ``DEPH`` values of the product.

    Returns
    -------
    xarray.Dataset
        Dataset holding the remaining variables plus ``SOURCE``, ``TEMP0``,
        ``TEMP1000`` and ``TEMP2000``, with length-1 dimensions squeezed out.
    """
    # NAME OF THE FILE THIS DATASET COMES FROM, REPEATED FOR EVERY PROFILE.
    # STORED AS FIXED-WIDTH BYTES: 32 BYTES AS BEFORE, WIDENED WHEN THE NAME
    # IS LONGER SO THAT IT IS NEVER SILENTLY TRUNCATED.
    fname = ds.encoding['source'].split('/')[-1].encode('utf-8')
    width = max(32, len(fname))
    ds['SOURCE'] = xr.DataArray(
        np.full(ds.sizes['N_PROF'], fname, dtype='S%d' % width),
        dims='N_PROF',
    )

    # TEMPERATURE AT A FEW FIXED LEVELS, THESE ARE JUST SIMPLE EXAMPLES.
    # A LOT OF PROFILES WILL HAVE NaN FOR THE 1000m & 2000m VALUES.
    temp = ds['TEMP']
    ds['TEMP0'] = temp.isel(N_LEVELS=0)               # SURFACE
    ds['TEMP1000'] = temp.isel(N_LEVELS=level_1000m)  # 1000m
    ds['TEMP2000'] = temp.isel(N_LEVELS=level_2000m)  # 2000m

    # VARIABLES THAT ARE NOT NEEDED ANY MORE
    unwanted = ['REFERENCE_DATE_TIME', 'DATA_TYPE',
                'DC_REFERENCE', 'DEPH', 'TEMP', 'TEMP_PROC',
                'TEMP_QC', 'TEMP_CLMN', 'TEMP_CLSD',
                'TEMP_ERME', 'TEMP_ERUR', 'TEMP_RESI']
    # Dataset.drop IS DEPRECATED FOR VARIABLES IN FAVOUR OF drop_vars;
    # FALL BACK TO drop ON XARRAY VERSIONS THAT DO NOT HAVE drop_vars YET.
    drop_variables = getattr(ds, 'drop_vars', None)
    if drop_variables is None:
        drop_variables = ds.drop
    ds = drop_variables(unwanted)

    # REMOVE ANY LENGTH-1 DIMENSIONS
    return ds.squeeze()

# XARRAY MULTIFILE LOAD
# HERE WE OPEN EACH CORA TEMP FILE, WE DON'T DECODE TIME,
# WE APPLY THE CORA_PREPROC FUNCTION TO EACH FILE AND WE
# CONCATENATE THE RESULTS ALONG THE N_PROF DIMENSION.
# combine='nested' IS NEEDED FOR concat_dim TO BE ACCEPTED: THE DEFAULT
# combine='by_coords' OF CURRENT XARRAY REJECTS AN EXPLICIT concat_dim.

CORA = xr.open_mfdataset(
    root + '/data/????/*TEMP.nc',
    decode_times=False,
    combine='nested',
    concat_dim='N_PROF',
    preprocess=cora_preproc,
)
CORA

<xarray.Dataset>
Dimensions:          (N_PROF: 9058551)
Dimensions without coordinates: N_PROF
Data variables:
    PLATFORM_NUMBER  (N_PROF) |S8 dask.array<shape=(9058551,), chunksize=(13927,)>
    WMO_INST_TYPE    (N_PROF) |S4 dask.array<shape=(9058551,), chunksize=(13927,)>
    JULD             (N_PROF) float64 dask.array<shape=(9058551,), chunksize=(13927,)>
    LATITUDE         (N_PROF) float64 dask.array<shape=(9058551,), chunksize=(13927,)>
    LONGITUDE        (N_PROF) float64 dask.array<shape=(9058551,), chunksize=(13927,)>
    SOURCE           (N_PROF) |S32 b'OA_CORA5.1_19900115_dat_TEMP.nc' ... b'OA_CORA5.2_20180615_dat_TEMP.nc'
    TEMP0            (N_PROF) float32 dask.array<shape=(9058551,), chunksize=(13927,)>
    TEMP1000         (N_PROF) float32 dask.array<shape=(9058551,), chunksize=(13927,)>
    TEMP2000         (N_PROF) float32 dask.array<shape=(9058551,), chunksize=(13927,)>
Attributes:
    Conventions:       CF-1.4
    title:             Monthly analysis
    instit

In [50]:
# CONVERT TO PANDAS DATAFRAME (EASY BECAUSE THERE IS ONLY 1 DIMENSION)
CORD = CORA.to_dataframe()

# JULD (NUMBER OF DAYS COUNTED FROM 1950-01-01) TO CALENDAR DATE
CORD['DATE'] = pd.to_datetime(
    CORD['JULD'].values, unit='D', origin=pd.Timestamp('1950-01-01')
)
CORD = CORD.drop(columns='JULD')

# BYTE STRING TO STRING
# THE BUILTIN `object` REPLACES `np.object`, WHICH WAS REMOVED IN NUMPY 1.24.
# DECODING COLUMN BY COLUMN AVOIDS BUILDING A STACKED COPY OF ALL THE
# OBJECT COLUMNS AND ALSO WORKS WHEN THERE IS NO OBJECT COLUMN AT ALL.
for col in CORD.select_dtypes(include=[object]).columns:
    CORD[col] = CORD[col].str.decode('utf-8')

# HERE IS OUR PROFILE INDEX AS A PANDAS DATAFRAME
CORD

Unnamed: 0_level_0,PLATFORM_NUMBER,WMO_INST_TYPE,LATITUDE,LONGITUDE,SOURCE,TEMP0,TEMP1000,TEMP2000,DATE
N_PROF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,OCL0056,999,38.167999,-9.765500,OA_CORA5.1_19900115_dat_TEMP.nc,17.810001,,,1989-12-05 10:20:09.374956800
1,OCL0056,999,38.179001,-10.152500,OA_CORA5.1_19900115_dat_TEMP.nc,17.480000,,,1989-12-05 14:31:52.500000000
2,OCL0056,999,38.178001,-10.465000,OA_CORA5.1_19900115_dat_TEMP.nc,17.590000,,,1989-12-05 17:44:59.999971200
3,OCL0056,999,38.736164,-9.630667,OA_CORA5.1_19900115_dat_TEMP.nc,17.133333,,,1989-12-11 15:40:46.874956800
4,OCL0056,999,38.735001,-10.017250,OA_CORA5.1_19900115_dat_TEMP.nc,17.500000,,,1989-12-11 20:57:11.250000000
5,OCL0056,999,38.735001,-10.423333,OA_CORA5.1_19900115_dat_TEMP.nc,17.273335,,,1989-12-12 02:15:00.000000000
6,OCL0056,999,38.737499,-10.887501,OA_CORA5.1_19900115_dat_TEMP.nc,16.740002,,,1989-12-12 07:32:48.750000000
7,OCL0056,999,38.732498,-11.332500,OA_CORA5.1_19900115_dat_TEMP.nc,16.625000,,,1989-12-12 12:45:00.000000000
8,OCL0056,999,39.165001,-11.191999,OA_CORA5.1_19900115_dat_TEMP.nc,16.530001,,,1989-12-12 18:39:22.500000000
9,OCL0056,999,39.167999,-10.693333,OA_CORA5.1_19900115_dat_TEMP.nc,16.876667,,,1989-12-12 22:46:52.500000000
