# Regridding NO2:
    
This notebook generates the results in the NO2 column of Table 1 (except the last two rows), as well as Figure 5, the data needed to get the results in the first row of Table 4, and Table 9. Before running it, replace every file path with the corresponding path on your machine.



First, we import all relevant libraries.

In [None]:
import sys
import os
import netCDF4 as ntf
from pyhdf.SD import SD, SDC
import numpy as np
import math
from netCDF4 import Dataset 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
import cartopy
import xesmf as xmf
import xarray as xr
import glob

Next, we load in the raw dataset, where all the files that were downloaded are kept in one directory, and then extract the desired data into an array.

In [None]:
# Open every daily file in the download directory as a single dataset,
# concatenated along the 'TIMERANGE' dimension.
raw_no2 = xr.open_mfdataset(
    "/data0/rm3873/daily_no2_v2/*.nc4",
    combine='nested',
    concat_dim='TIMERANGE',
)

In [None]:
# Load the tropospheric NO2 column into memory as a plain NumPy array.
# The parameter cell and the regridding loop read this array as `no2_data`,
# while the missing-data check reads it as `data`, so both names are bound
# to the same array.
no2_data = np.array(raw_no2['ColumnAmountNO2Trop'])
data = no2_data

Here, we set up our parameters including info about the desired region, the size of the target grid, filepaths, latitude stride for the regridding, selected days, scale factor to apply to the data, and the key threshold to use for marking a target box as missing or not.



In [None]:
# ---- File locations -------------------------------------------------------
binary_path = '/data0/rm3873/no2_binary_regridded.nc'         # output file for the regridded NO2
land_mask_path = '/data0/zzheng/GEOS-Chem-grid/land_mask.nc'  # land-mask file opened alongside it

# ---- Marker values --------------------------------------------------------
TARGET_MISSING = -9999  # sentinel the regridding loop treats as "no data"
MISSING = np.nan        # not referenced by the cells shown in this notebook
WATER = -4999           # only referenced by the disabled water-mask code

# ---- Selected days: range(day_start, day_end) -> days 1..365 --------------
day_start = 1
day_end = 366

# ---- Target-grid coordinates (start, exclusive end, spacing) --------------
lat_st = 6
lat_end = 36.25     # exclusive, so the last latitude is 36.0
lat_siz = 0.25
lon_st = 68.125
lon_end = 97.8126   # just above 97.8125 so that value is still included
lon_siz = 0.3125

# ---- Index window of the source grid walked by the regridding loop --------
lat_start_idx = 0
lon_start_idx = 0
lat_end_idx = no2_data.shape[1]
lon_end_idx = no2_data.shape[2]

# ---- Target-grid size -----------------------------------------------------
target_grid_lats = 121
target_grid_lons = 96
orig_grid_size = 0.25   # not referenced by the cells shown in this notebook

# ---- Regridding options ---------------------------------------------------
SCALE_FACTOR = 1          # multiplier applied to every source value
missing_threshold = 0.1   # minimum fraction of non-missing source cells per target box

Let's check the percent of missing data in the raw grid.

In [None]:
# Percentage of raw grid cells that are missing (NaN).
# The denominator is the actual array size rather than a hard-coded
# 365*121*121, so the figure stays correct if the input shape changes.
# The share of cells that do hold a value is 100 - pct_missing.
n_missing = np.count_nonzero(np.isnan(data))
pct_missing = n_missing / data.size * 100
pct_missing

... and allocate the empty target grid (one slice per selected day), initialised to NaN.

In [None]:
# Target array (day, lat, lon); every box starts out as NaN, i.e. "missing".
regridded = np.empty((day_end - day_start, target_grid_lats, target_grid_lons))
regridded.fill(np.nan)

Let's set up the regridded NetCDF file with our desired dimensions; we will fill in the 'no2' variable.

In [None]:
# Open the land mask read-only and create the output file for the regridded NO2.
land_mask = Dataset(land_mask_path, mode='r', format='NETCDF4_CLASSIC')
ncfile = Dataset(binary_path, mode='w', format='NETCDF4_CLASSIC')

# Fixed-size dimensions of the target grid and of the selected day range.
lat_dim = ncfile.createDimension('lat', target_grid_lats)
lon_dim = ncfile.createDimension('lon', target_grid_lons)
ncfile.createDimension('time', day_end - day_start)

# Latitude coordinate.
lat = ncfile.createVariable('lat', np.float32, ('lat',))
lat.setncatts({'units': 'degrees_north', 'long_name': 'latitude'})
lat[:] = np.arange(lat_st, lat_end, lat_siz)

# Longitude coordinate.
lon = ncfile.createVariable('lon', np.float32, ('lon',))
lon.setncatts({'units': 'degrees_east', 'long_name': 'longitude'})
lon[:] = np.arange(lon_st, lon_end, lon_siz)

# Time coordinate: one entry per selected day.
time = ncfile.createVariable('time', np.float64, ('time',))
time.setncatts({'units': 'days of 2015', 'long_name': 'days_of_the_year'})
time[:] = np.arange(day_start, day_end)

# 3D data variable laid out as (time, lat, lon).
no2 = ncfile.createVariable('no2', np.float64, ('time', 'lat', 'lon'))

In [None]:
# Regrid every selected day from the source grid onto the target grid by
# averaging the source cells that fall into each target box.
#
# Box layout, as encoded below: each target row covers one source row; the
# first 25 target columns of a row cover two source columns each, and every
# later column covers a single source column.
#
# A source cell counts as missing when it is NaN or equals TARGET_MISSING.
# (Comparing against TARGET_MISSING alone lets NaN cells through, which turns
# the whole box average into NaN and makes `missing_threshold` ineffective.)
# A target box stays NaN unless at least `missing_threshold` of its source
# cells are non-missing.
#
# The land/water masking step existed here only as disabled code and is not
# applied.
for day in range(day_start, day_end):
    print('Day: ' + str(day))
    # Days are 1-based, array slices 0-based. A dedicated name is used so the
    # notebook-level `data` array is not overwritten by a single day's slice.
    day_field = no2_data[day - 1]
    out_day = regridded[day - day_start]

    ct_lat = 0
    lat_str = 1
    i = lat_start_idx
    while i < lat_end_idx:
        j = lon_start_idx
        ct_lon = 0
        lon_str = 2
        while j < lon_end_idx:
            # From target column 25 onwards the longitude stride drops to 1.
            if ct_lon == 25:
                lon_str = 1

            # Source cells that make up the current target box.
            cur_box = day_field[i:i + lat_str, j:j + lon_str]
            total_ct = cur_box.size

            # Keep only the cells that carry a real value, then scale them.
            is_valid = ~np.isnan(cur_box) & (cur_box != TARGET_MISSING)
            non_miss = cur_box[is_valid] * SCALE_FACTOR

            # Average only when enough of the box is populated.
            cur_val = np.nan
            if non_miss.size > 0 and non_miss.size >= missing_threshold * total_ct:
                cur_val = np.average(non_miss)

            out_day[ct_lat, ct_lon] = cur_val
            ct_lon = ct_lon + 1
            j = j + lon_str

        ct_lat = ct_lat + 1
        i = i + lat_str

# Store the regridded field in the 'no2' variable of the output file created
# earlier, then close the file so the data are flushed to disk. Re-running
# this cell requires re-running the file-setup cell first.
no2[:] = regridded
ncfile.close()

Let's read in the simulated data (already matched to the GEOS-Chem target grid) and copy each variable into a list of dictionaries keyed by species name, one dictionary per source dataset.

In [None]:
# Latitude/longitude window applied to every simulated dataset.
sim_region = dict(lat=slice(6, 36), lon=slice(68, 98))

raw_emission = xr.open_dataset("/data0/rm3873/dsi_india/daily_emission.nc").sel(**sim_region)
raw_gas = xr.open_dataset("/data0/rm3873/dsi_india/daily_gas_column.nc").sel(**sim_region)
raw_pm = xr.open_dataset("/data0/rm3873/dsi_india/daily_surface_pm25_RH50.nc").sel(**sim_region)
raw_met = xr.open_dataset("/data0/rm3873/dsi_india/daily_meteo.nc").sel(**sim_region)
raw_aod = xr.open_dataset("/data0/rm3873/dsi_india/daily_aod.nc").sel(**sim_region)

# Total natural dust emission = sum of the four dust variables.
raw_emission["EmisDST_Natural"] = (
    raw_emission["EmisDST1_Natural"]
    + raw_emission["EmisDST2_Natural"]
    + raw_emission["EmisDST3_Natural"]
    + raw_emission["EmisDST4_Natural"]
)

# One dictionary per dataset in `sets` (same order); each key is a variable
# name and is filled below with that variable's values as a NumPy array.
feature_ml = [
    {"PM25": []},
    {'CO_trop': [], 'SO2_trop': [], 'NO2_trop': [], 'CH2O_trop': [], 'NH3_trop': []},
    {'AOT_C': [], 'AOT_DUST_C': []},
    {'T2M': [], 'PBLH': [], 'U10M': [], 'V10M': [], 'PRECTOT': [], 'RH': []},
    {'EmisDST_Natural': [],
     'EmisNO_Fert': [], 'EmisNO_Lightning': [], 'EmisNO_Ship': [], 'EmisNO_Soil': []},
]
sets = [raw_pm, raw_gas, raw_aod, raw_met, raw_emission]

# Convert each variable to an independent NumPy array in one step. This
# yields the same array as copying it row by row through xarray indexing,
# without the per-element overhead and per-index progress output.
# np.array() copies, so masking these arrays later leaves the datasets
# untouched.
for cur_set, species in zip(sets, feature_ml):
    for spec in species:
        print(spec)
        species[spec] = np.array(cur_set[spec])

Now we can simply apply our NO2 regridded missing mask onto all these datasets!

In [None]:
# Boolean mask of the target boxes that are missing in the regridded NO2.
missing_mask = np.isnan(regridded)
# Index triples of the missing boxes, kept available under the original name.
missing_list = np.argwhere(missing_mask)

# Blank out the same boxes in every simulated variable. `np.nan` is used
# because the `np.NaN` alias no longer exists in NumPy 2.0.
for species in feature_ml:
    for spec in species:
        print(spec)
        field = species[spec]
        if field.shape != missing_mask.shape:
            raise ValueError(
                "shape mismatch for " + str(spec) + ": " + str(field.shape)
                + " vs regridded NO2 " + str(missing_mask.shape)
            )
        field[missing_mask] = np.nan

Finally, let's write all these datasets, now with the NO2 missing mask applied to them, to disk!

In [None]:
# Write every masked variable to its own NetCDF file, laid out as
# (time, lat, lon). Sizes and coordinates come from the same configuration
# values used for the regridded NO2 file, so the two layouts cannot drift
# apart.
for species in feature_ml:
    for spec in species:
        fname = '/data0/rm3873/custom_regridded_no2_' + str(spec) + '.nc'
        # The `with` block closes the file even if one of the writes fails.
        with Dataset(fname, mode='w', format='NETCDF4_CLASSIC') as ncfile:
            lat_dim = ncfile.createDimension('lat', target_grid_lats)
            lon_dim = ncfile.createDimension('lon', target_grid_lons)
            ncfile.createDimension('time', day_end - day_start)

            lat = ncfile.createVariable('lat', np.float32, ('lat',))
            lat.units = 'degrees_north'
            lat.long_name = 'latitude'
            lon = ncfile.createVariable('lon', np.float32, ('lon',))
            lon.units = 'degrees_east'
            lon.long_name = 'longitude'
            time = ncfile.createVariable('time', np.float64, ('time',))
            time.units = 'days of 2015'
            time.long_name = 'days_of_the_year'
            # 3D variable named after the species that holds the masked data.
            key_var = ncfile.createVariable(spec, np.float64, ('time', 'lat', 'lon'))

            lat[:] = np.arange(lat_st, lat_end, lat_siz)
            lon[:] = np.arange(lon_st, lon_end, lon_siz)
            time[:] = np.arange(day_start, day_end)
            key_var[:] = species[spec]