# How to create monthly observations datasets (CSIC)

In [1]:
import nes
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

## 1. Collect data

We have two dataframes: the first contains the monthly NH3 data for each station, and the second contains the locations (longitude and latitude) of the stations.

In [2]:
# Monthly NH3 observations with one column per station; the 'Date-hour in'
# column is parsed as dates and used as the index.
nh3_csv_path = '/gpfs/projects/bsc32/models/NES_tutorial_data/NH3_barcelona_2019_csic.csv'
df_nh3 = pd.read_csv(
    nh3_csv_path,
    index_col='Date-hour in',
    parse_dates=True,
)
df_nh3

Unnamed: 0_level_0,traffic_site,urban_site
Date-hour in,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,4.988988,2.553235
2019-02-01,3.422535,1.556226
2019-03-01,2.675065,1.686355
2019-04-01,3.425522,1.975486
2019-05-01,5.314809,1.119245
2019-06-01,3.139495,1.626567
2019-07-01,0.0,2.226856
2019-08-01,0.0,2.469638
2019-09-01,0.0,3.727355
2019-10-01,0.0,1.535056


In [3]:
# Station table: one row per station with its name ('station') and its
# coordinates ('Lon', 'Lat').
stations_csv_path = '/gpfs/projects/bsc32/models/NES_tutorial_data/NH3_stations_CSIC.csv'
df_stations = pd.read_csv(stations_csv_path)
df_stations

Unnamed: 0,station,Lon,Lat
0,urban_site,2.1151,41.3875
1,traffic_site,2.1534,41.3987


## 2. Create dataset with all timesteps

### Define coordinates

In [4]:
# Time axis: one timestamp per row of the observations dataframe.
times = df_nh3.index.to_pydatetime()

# The rows of df_stations (urban_site, traffic_site) are not in the same order
# as the data columns of df_nh3 (traffic_site, urban_site). Reorder the station
# table by the data column names so that lat[i] / lon[i] describe the station
# whose values are stored in column i; taking the coordinates in file order
# would write each site with the other site's location.
# .loc raises a KeyError if a data column has no entry in the station table.
stations_aligned = df_stations.set_index('station').loc[df_nh3.columns]
lat = stations_aligned['Lat'].to_numpy()
lon = stations_aligned['Lon'].to_numpy()

In [5]:
# Build the NES object from the station coordinates and the time axis defined
# above. NOTE(review): projection=None presumably selects a points (stations)
# dataset -- confirm against the NES documentation.
nessy = nes.create_nes(
    comm=None,
    info=False,
    projection=None,
    parallel_method='X',
    lat=lat,
    lon=lon,
    times=times,
)

### Add data

In [6]:
# Station names: taken from the column labels of the observations dataframe.
station_name_var = {
    'data': df_nh3.columns.to_numpy(),
    'dimensions': ('station',),
    'dtype': str,
}

# NH3 values: the dataframe body, laid out as (time, station).
sconcnh3_var = {
    'data': df_nh3.to_numpy(),
    'dimensions': ('time', 'station',),
    'dtype': float,
}

variables = {
    'station_name': station_name_var,
    'sconcnh3': sconcnh3_var,
}

In [7]:
# Attach the variables dictionary (station names and NH3 values) to the NES object.
nessy.variables = variables

### Write dataset

In [8]:
# NOTE(review): set_strlen presumably fixes the length used to store string
# variables such as station_name -- confirm against the NES documentation.
output_file = 'points_csic_nh3.nc'
nessy.set_strlen(75)

# Write the full-period dataset; info=True prints the writing progress.
nessy.to_netcdf(output_file, info=True)

  warn(msg)


Rank 000: Creating points_csic_nh3.nc
Rank 000: NetCDF ready to write
Rank 000: Dimensions done
Rank 000: Writing station_name var (1/2)
Rank 000: Var station_name created (1/2)
Rank 000: Filling station_name)
Rank 000: Var station_name data (1/2)
Rank 000: Var station_name completed (1/2)
Rank 000: Writing sconcnh3 var (2/2)
Rank 000: Var sconcnh3 created (2/2)
Rank 000: Filling sconcnh3)
Rank 000: Var sconcnh3 data (2/2)
Rank 000: Var sconcnh3 completed (2/2)


## 3. Create one dataset per month (Ready for Providentia)

### Add columns with month and year

In [9]:
# Add the calendar month and year of each timestamp as columns so that the
# observations can be grouped per month in the next step.
df_nh3 = df_nh3.assign(
    month=df_nh3.index.month,
    year=df_nh3.index.year,
)
df_nh3

Unnamed: 0_level_0,traffic_site,urban_site,month,year
Date-hour in,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,4.988988,2.553235,1,2019
2019-02-01,3.422535,1.556226,2,2019
2019-03-01,2.675065,1.686355,3,2019
2019-04-01,3.425522,1.975486,4,2019
2019-05-01,5.314809,1.119245,5,2019
2019-06-01,3.139495,1.626567,6,2019
2019-07-01,0.0,2.226856,7,2019
2019-08-01,0.0,2.469638,8,2019
2019-09-01,0.0,3.727355,9,2019
2019-10-01,0.0,1.535056,10,2019


### Iterate through each month

In [10]:
# Station columns of df_nh3: every column except the 'year' / 'month' helper
# columns used for grouping. Selecting them by name avoids relying on the
# column positions.
station_columns = df_nh3.columns.drop(['year', 'month'])

# Station coordinates reordered to follow the data columns, so that the i-th
# latitude/longitude belongs to the i-th station name and data column.
# (The station table lists the sites in a different order than the data.)
station_coords = df_stations.set_index('station').loc[station_columns]
station_lat = station_coords['Lat'].to_numpy()
station_lon = station_coords['Lon'].to_numpy()

# Making directory (once, before the loop; no error if it already exists)
# To run Providentia, this folder should be moved to:
# '/esarchive/obs/' as in '/esarchive/obs/csic/csic/monthly/sconcnh3/'
netcdf_path = 'csic/csic/monthly/sconcnh3/'
os.makedirs(netcdf_path, exist_ok=True)

for (year, month), current in df_nh3.groupby(['year', 'month']):

    # Read time
    times = current.index.to_pydatetime()

    # Fill altitude with nans (one value per station)
    altitude = np.full(len(station_columns), np.nan)

    # Read metadata
    metadata = {'station_name': {'data': station_columns.to_numpy(),
                                 'dimensions': ('station',),
                                 'dtype': str,
                                 'standard_name': ''},
                'altitude': {'data': altitude,
                             'dimensions': ('station',),
                             'units': 'meters',
                             'standard_name': 'altitude'},
                'sconcnh3': {'data': current[station_columns].to_numpy(),
                             # Micro sign written as an escape so the units
                             # cannot be garbled by a wrong file encoding.
                             'units': '\u00b5g m-3',
                             'dimensions': ('time', 'station',),
                             'long_name': ''}
               }

    # Create object
    points_grid = nes.create_nes(comm=None, info=False, projection=None, parallel_method='X',
                                 lat=station_lat, lon=station_lon, times=times)

    # Assign metadata
    points_grid.variables = metadata
    points_grid.set_strlen(75)

    # Save files (one file per month, named sconcnh3_YYYYMM.nc)
    file_name = 'sconcnh3_{0}{1}.nc'.format(year, str(month).zfill(2))
    points_grid.to_netcdf(os.path.join(netcdf_path, file_name))

    del points_grid
    print('Done {0}'.format(file_name))

  warn(msg)


Done sconcnh3_201901.nc


  warn(msg)
  warn(msg)


Done sconcnh3_201902.nc


  warn(msg)


Done sconcnh3_201903.nc


  warn(msg)


Done sconcnh3_201904.nc
Done sconcnh3_201905.nc


  warn(msg)


Done sconcnh3_201906.nc


  warn(msg)
  warn(msg)


Done sconcnh3_201907.nc
Done sconcnh3_201908.nc


  warn(msg)


Done sconcnh3_201909.nc


  warn(msg)
  warn(msg)


Done sconcnh3_201910.nc
Done sconcnh3_201911.nc


  warn(msg)


Done sconcnh3_201912.nc
