In [16]:
import sys

import glob
import numpy as np
import netCDF4 as nc
from pprint import pprint
import pandas as pd
from functools import partial

import multiprocessing as mp

In [21]:
# Display the installed netCDF4 version for the record: things break with
# newer versions of netCDF4 (when opening xarray datasets), so the version this
# notebook was run against is worth keeping visible.
nc.__version__

'1.5.3'

In [2]:
# Load the autoreload extension into the IPython kernel.
%load_ext autoreload
# Autoreload mode 2: modules are automatically reloaded when they change, so
# edits to local modules are picked up without restarting the kernel.
%autoreload 2
# Switch the IPython completer to greedy mode for tab completion.
%config IPCompleter.greedy=True

In [9]:
from preprocess_modis import preprocess_aod_data, read_and_process_hdfs

In [4]:
# List every .hdf granule under the 2009 directory tree. glob returns paths in
# arbitrary, filesystem-dependent order, so sort them: positional picks such as
# modis_granules[10] then refer to the same file on every run.
modis_granules = sorted(glob.glob('/neodc/modis/data/MYD04_L2/collection61/2009/*/*/*.hdf'))

In [20]:
# Open one sample granule just long enough to list the variables it holds.
# The context manager closes the file handle once the names have been read,
# instead of leaving it open for the lifetime of the kernel (`dset` stays
# bound afterwards, but refers to a closed dataset).
with nc.Dataset(modis_granules[10]) as dset:
    all_datasets = list(dset.variables.keys())

# Variables to keep; everything else found in the sample granule is dropped.
target_products = [
    'Scan_Start_Time',
    'Latitude',
    'Longitude',
    'Solar_Zenith',
    'Land_sea_Flag',
    'Optical_Depth_Land_And_Ocean',
    'Land_Ocean_Quality_Flag',
    'PSML003_Ocean',
    'AOD_550_Dark_Target_Deep_Blue_Combined',
    'AOD_550_Dark_Target_Deep_Blue_Combined_QA_Flag',
    'AOD_550_Dark_Target_Deep_Blue_Combined_Algorithm_Flag',
]

# Filter in file order rather than via a set difference, so the resulting list
# is the same on every run instead of depending on set iteration order.
vars_to_drop = [name for name in all_datasets if name not in target_products]

In [21]:
# Root directory of the MYD04_L2 collection 6.1 archive.
modis_dir = '/neodc/modis/data/MYD04_L2/collection61/'

# One daily DatetimeIndex per year, each spanning 1 January to 31 December.
dates_2008, dates_2009 = (
    pd.date_range(start=first_day, end=last_day, freq='1D')
    for first_day, last_day in (
        ('2008/01/01', '2008/12/31'),
        ('2009/01/01', '2009/12/31'),
    )
)

In [25]:
# Bind every fixed argument of read_and_process_hdfs so the resulting callable
# only needs the one remaining argument (a date, as supplied by pool.map).
#
# dim must be 'x' here because every tenth MOD file has an output grid size of
# 204 x 135 (vs 203 x 135 for most).
#
# base_dir reuses modis_dir rather than repeating the path literal, so the
# archive location is defined in exactly one place.
preprocess = partial(
    read_and_process_hdfs,
    base_dir=modis_dir,
    dim='x',
    drop_variables=vars_to_drop,
    preprocess_func=preprocess_aod_data,
    verbose=True,
)

In [26]:
%%time
with mp.Pool(mp.cpu_count()) as pool:
    res = pool.map(preprocess, dates_2009)
    pool.close()
    pool.join()

Processing MODIS files for 31/01/2009
Processing MODIS files for 19/01/2009Processing MODIS files for 15/01/2009Processing MODIS files for 23/01/2009

Processing MODIS files for 03/01/2009Processing MODIS files for 01/01/2009


Processing MODIS files for 21/01/2009Processing MODIS files for 07/01/2009Processing MODIS files for 13/01/2009Processing MODIS files for 27/01/2009Processing MODIS files for 02/02/2009

Processing MODIS files for 17/01/2009Processing MODIS files for 29/01/2009
Processing MODIS files for 11/01/2009Processing MODIS files for 09/01/2009


Processing MODIS files for 16/02/2009


Processing MODIS files for 10/03/2009
Processing MODIS files for 05/01/2009

Processing MODIS files for 12/03/2009Processing MODIS files for 26/02/2009

Processing MODIS files for 18/02/2009Processing MODIS files for 14/03/2009Processing MODIS files for 20/02/2009Processing MODIS files for 06/03/2009Processing MODIS files for 24/03/2009Processing MODIS files for 14/02/2009Processing MODIS f

In [28]:
%%time
with mp.Pool(mp.cpu_count()) as pool:
    res_2009 = pool.map(preprocess, dates_2008)
    pool.close()
    pool.join()