In [None]:
import pandas as pd
import numpy as np
import glob
from pyhdf.SD import *
from tqdm.auto import tqdm
import datetime as dt
from calendar import monthrange
import os

In [None]:
def gp(file):
    ''' gp = granule process
    Receives: the path of an .HDF granule file
    Returns: a dataframe with one row per footprint. The leading integer-named
             columns hold the radiances (one column per entry of the last axis
             of the 'radiances' dataset), followed by lat, lon, sza, sci,
             scanang, state, topog and time. Only rows with
             -15 <= scanang <= 15 and state == 0 are kept.
    Raises: whatever exception pyhdf raises when the file cannot be opened
            (after printing the file name), or when a dataset is missing.
    '''
    try:
        f = SD(file, SDC.READ)
    except Exception:
        # Name the offending granule, then propagate the real error instead of
        # falling through to code that would fail on the unbound handle.
        print(file, "is unreadable")
        raise

    # (dataset name in the HDF file, output column name, dtype of the column)
    fields = [
        ('Latitude', 'lat', 'float64'),
        ('Longitude', 'lon', 'float64'),
        ('solzen', 'sza', 'float64'),
        ('spectral_clear_indicator', 'sci', 'int32'),
        ('scanang', 'scanang', 'float64'),
        ('state', 'state', 'int32'),
        ('topog', 'topog', 'float64'),
        ('Time', 'time', 'float64'),
    ]

    try:
        # Read the radiance cube once; its first two axes define the footprint grid.
        rad = f.select('radiances')[:,:,:]
        y, x, _ = rad.shape
        n_rows = y * x

        # Flatten the grid so every footprint becomes one row of radiances.
        parts = [pd.DataFrame(rad.reshape(n_rows, -1))]
        for dataset, column, dtype in fields:
            values = f.select(dataset)[:,:].reshape(n_rows, -1).astype(dtype)
            parts.append(pd.DataFrame(data=values, columns=[column]))
    finally:
        # Everything needed is now in memory; always release the HDF handle.
        f.end()

    df = pd.concat(parts, axis=1)

    keep = df['scanang'].between(-15, 15) & (df['state'] == 0)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df.loc[keep].copy()

    df['time'] = df['time'].apply(ti)
    df['lat'] = df['lat'].round(4) # this is done to be comparable to L2 lat/lon's
    df['lon'] = df['lon'].round(4) # this is done to be comparable to L2 lat/lon's
    df['topog'] = df['topog'].round(1) # presumably rounded to be comparable to the L2 values as well
    return df

def ti(x):
    '''
    ti = time index
    Receives: a number of seconds counted from Jan 1, 1993 00:00:00
    Returns: the matching datetime, truncated (not rounded) to whole seconds
    '''
    epoch = dt.datetime(1993, 1, 1)
    stamp = epoch + dt.timedelta(seconds=x)
    # Discard the sub-second part.
    return stamp.replace(microsecond=0)

def files_to_df(folder):
    '''
    Receives: a folder
    Returns: a dataframe of the .hdf files in that folder (sorted by path) with
             a 'file' column holding the full path and separate integer columns
             for 'year', 'month' and 'day'
    Raises: ValueError / IndexError when a file name does not have numeric
            year, month and day as its 2nd, 3rd and 4th dot-separated fields
    '''
    # os.path.join picks the right separator and tolerates a trailing one on `folder`.
    files = sorted(glob.glob(os.path.join(folder, '*.hdf')))

    # Parse the date from the file NAME only, so dots in directory names
    # cannot shift the field positions.
    dates = [os.path.basename(path).split('.')[1:4] for path in files]

    df = pd.DataFrame(data=files, columns = ['file'])
    for position, column in enumerate(('year', 'month', 'day')):
        # Explicit int64 keeps the column integer-typed even when no files match.
        df[column] = np.array([int(fields[position]) for fields in dates], dtype='int64')
    return df

def days_in_month(year, month):
    '''
    Receives: a year and a month number (1-12)
    Returns: the number of days in that month of that year
    '''
    # monthrange returns (weekday of the 1st, number of days); only the latter is needed.
    _, n_days = monthrange(year, month)
    return n_days

### Source Data Location

In [None]:
# Root folder of the L1b granules; the processing code looks for files under
# <folder_L1b><year>\<month>\*.hdf
folder_L1b = 'C:\\data\\AIRS\\L1b\\'

# Bin sizes for eventual grouping/averaging
lat_width = 20 # bin height
lon_width = 20 # bin width

# Years to process (2003 through 2021 inclusive).
# For a quick single-year run use: years = [2003]
years = np.arange(2003, 2022)

# Zero-padded month strings '01' .. '12', used both in folder names and output file names
months = [str(month_number).zfill(2) for month_number in range(1, 13)]

In [None]:
# Optional, you can pre-read the files to ensure the code won't crash when processing the data.
# Disabled by default because it reads every granule; set RUN_PRECHECK = True to enable it.
RUN_PRECHECK = False

if RUN_PRECHECK:
    bad_files = []
    for year in tqdm(years, "Years", ncols = 400, position = 0):
        for month in tqdm(months, "Months", ncols = 400, position = 1, leave = False):
            for file in glob.glob(os.path.join(folder_L1b, str(year), month, '*.hdf')):
                try:
                    gp(file)
                except Exception:
                    # Exception (not a bare except) so Ctrl-C can still stop the scan.
                    print(file, 'is unreadable')
                    # list.append mutates in place and returns None, so its
                    # result must not be assigned back to bad_files.
                    bad_files.append(file)
    if bad_files:
        print('list of bad files:')
        for file in bad_files:
            print(file)
    else:
        print('no bad files found.')

### Process L1b data:

In [None]:
# For every day that has granules, keep the rows whose 'sci' value is 2 or -2
# and write them to one gzip-compressed CSV per day.
save_path = folder_L1b + '_AIRS2\\'

for year in tqdm(years, "Years", ncols = 400, position = 0):
    for month in tqdm(months, "Months", ncols = 400, position = 1, leave = False):
        df1 = files_to_df(folder_L1b+str(year)+'\\'+month)
        days = np.arange(1, days_in_month(year, int(month))+1, 1)

        for day in tqdm(days, "Days", ncols = 400, position = 2, leave = False):
            files_L1b = df1.loc[(df1['year'] == year) & (df1['month'] == int(month)) & (df1['day'] == day), 'file']

            if len(files_L1b) < 1:
                continue

            # Collect the per-granule frames and concatenate once: growing a
            # frame with pd.concat inside the loop re-copies all earlier rows
            # on every iteration.
            frames = []
            for file in files_L1b:
                df = gp(file)
                frames.append(df.loc[df['sci'].isin([2, -2])])
            dfo = pd.concat(frames)

            # Eliminate a few unnecessary columns before saving.
            # errors='ignore' skips labels that are absent; a try/except around
            # a plain drop() would abort the whole drop when any one label is
            # missing and leave 'state' in the output.
            dfo = dfo.drop(columns=['state', 'TotCld_4_CCfinal'], errors='ignore')

            os.makedirs(save_path, exist_ok=True)

            dfo.to_csv(save_path+str(year)+'_'+month+'_'+str(day)+'_L1b.csv.gz', compression = 'gzip', index=False, header=True)