In [None]:
import pandas as pd
import numpy as np
import glob
import datatable as dt
from tqdm.auto import tqdm

In [None]:
def gzp(file):
    '''
    GZP = gzip process
    Receives: path to a data file (intended for gzipped .csv files)
    Returns: the file contents as a Pandas df, read through datatable's fread

    No averaging or gridding happens here; the table is returned as read.
    '''
    return dt.fread(file).to_pandas()

def files_to_df(files):
    '''
    Receives: a list of file paths; each file name must start with the year
              followed by an underscore (e.g. a file named 2003_rad.csv.gz)
    Returns: a dataframe with one row per file and two columns:
             'file' (the path as given) and 'year' (int64)
    Raises: ValueError if the text before the first underscore of a file
            name is not an integer
    '''
    df = pd.DataFrame(data=files, columns=['file'])

    years = []
    for path in df['file']:
        # Take the last path component instead of a fixed component index so
        # the result does not depend on how deep the data folder is nested.
        # Both '\' and '/' are accepted as separators.
        file_name = path.replace('\\', '/').rsplit('/', 1)[-1]
        years.append(int(file_name.split('_', 1)[0]))

    # Explicit dtype keeps the column integer-typed even when `files` is empty.
    df['year'] = pd.Series(years, index=df.index, dtype='int64')
    return df

def df_grid(df, lat_min, lon_min, lat_width=20, lon_width=20):
    '''
    Receives: df with at least 'lat', 'lon' and 'count' columns, the lower-left
              corner of the grid cell (lat_min, lon_min) and the cell size in
              degrees (lat_width, lon_width; both default to 20)
    Returns: a single-row df holding the mean of every numeric column (rounded
             to 5 decimals) over the rows with lat_min <= lat < lat_min + lat_width
             and lon_min <= lon < lon_min + lon_width. 'count' is replaced by the
             summed count of those rows. 'count' (and 'year', when present) are
             cast to int32. An empty df is returned when the summed count is
             not positive.
    '''
    lat_max = lat_min + lat_width
    lon_max = lon_min + lon_width
    in_cell = ((df['lat'] >= lat_min) &
               (df['lat'] < lat_max) &
               (df['lon'] >= lon_min) &
               (df['lon'] < lon_max))
    cell = df.loc[in_cell]

    ct = cell['count'].sum()
    if not ct > 0:
        return pd.DataFrame()

    # Build the one-row result directly: DataFrame.append no longer exists in
    # pandas >= 2.0.
    dfo = cell.mean(numeric_only=True).round(5).to_frame().T
    # Plain column assignment; the former chained `.iloc[-1].at[...] = ct`
    # is not guaranteed to write back into the frame.
    dfo['count'] = ct
    int_cols = {col: 'int32' for col in ('year', 'count') if col in dfo.columns}
    return dfo.astype(int_cols)

## Locate files, create single-year ~1Gb daily radiance avg gzip file

In [None]:
# Grid-cell size in degrees (latitude and longitude).
lat_width = lon_width = 20

# Find every gzipped input file and tabulate it together with its year.
files = glob.glob('F:\\L1b\\*.gz')
dff = files_to_df(files)
years = dff['year'].unique()

print('Found', len(files), 'files.')
print('Found these years:', years)
dff

## Save one year of monthly averages on a 20 deg lat x 20 deg lon grid for the entire globe

In [None]:
# The globe is covered by 9 latitude bands x 18 longitude bands (20 deg each).
n_lat_bands = 9
n_lon_bands = 18

for year in years:
    print('Reading in >1Gb file, please wait...')
    # Only the first file found for this year is processed.
    file = dff.loc[dff['year'] == year, 'file'].values[0]
    df = gzp(file)
    print('df mem size:', round(df.memory_usage().sum()/2**20, 0), 'MB')

    # Gather the one-row cell summaries and concatenate once per year.
    # DataFrame.append no longer exists in pandas >= 2.0, and appending inside
    # the loop re-copied the whole accumulated frame on every call.
    cells = []
    for month in tqdm(range(1, 13)):
        # Select the month once, not once per grid cell; the selection does
        # not depend on the cell.
        month_df = df.loc[df['month'] == month]

        for i in range(1, n_lat_bands + 1):
            # Band index i = 1..9 maps to a lower edge of -90, -70, ..., 70.
            lat = -90 + ((i + 8) % 9) * 20
            for j in range(1, n_lon_bands + 1):
                # Band index j = 1 starts at 0 deg; j = 10 wraps to -180 deg.
                lon = -180 + ((j + 8) % 18) * 20
                dfm = df_grid(month_df, lat, lon)
                if dfm.empty:
                    # No data in this cell for this month: nothing to record.
                    continue
                # Replace the mean coordinates with the grid-band indices.
                dfm['lat'] = i
                dfm['lon'] = j
                cells.append(dfm)

    df2 = pd.concat(cells, ignore_index=True) if cells else pd.DataFrame()
    # NOTE(review): the '_lat9_lon18' suffix is kept only for compatibility
    # with the existing output name (it was built from the final loop
    # indices); the file holds every grid cell of the year.
    df2.to_csv('C:\\data\\AIRS\\L1B_globe_20x20\\'+str(year)+'_lat'+str(n_lat_bands)+'_lon'+str(n_lon_bands)+'.csv.gz', index=False, header=True, compression = 'gzip')

print('Finished.')
# clear memory
df = pd.DataFrame()
df2 = pd.DataFrame()
dfm = pd.DataFrame()
month_df = pd.DataFrame()
cells = []