In [None]:
import datatable as dt  # pip install datatable
import pandas as pd # pip install pandas
import numpy as np
import glob
from tqdm.auto import tqdm
import os

In [None]:
def gzp(file):
    '''
    GZP = gzip process
    Reads `file` with datatable's fread and returns the contents as a Pandas df.
    No gridding or averaging is done here; the table is returned as read.
    '''
    return dt.fread(file).to_pandas()

def mean_df(df, lat_width, lon_width):
    '''
    Averages the rows of `df` over a regular lat/lon grid.

    Only rows whose absolute 'sci' value equals 2 are used. Latitude bins start
    at -90 and longitude bins at -180; there are int(180/lat_width) and
    int(360/lon_width) of them, each half-open: [min, max).

    Receives:
        df        - dataframe with at least the columns 'lat', 'lon' and 'sci'
        lat_width - height of a grid cell (same units as 'lat')
        lon_width - width of a grid cell (same units as 'lon')
    Returns:
        A new dataframe with one row per non-empty grid cell, ordered by lat bin
        then lon bin. Each row holds the mean of every numeric column of the
        selected rows plus 'count', the number of rows that fell in that cell.
        An empty dataframe is returned when no row falls in any cell.

    `df` itself is not modified.
    NOTE: because bins are half-open, a row lying exactly on the upper edge of
    the last bin (e.g. lat == 90 when lat_width divides 180) is in no cell.
    '''
    # The 'sci' filter does not depend on the cell, so apply it once up front.
    selected = df.loc[abs(df['sci']) == 2]

    rows = []
    for i in range(int(180/lat_width)):
        lat_min = -90 + i*lat_width
        lat_max = lat_min + lat_width
        # Narrow to the latitude band once, then slice it by longitude.
        band = selected.loc[(selected['lat'] >= lat_min) & (selected['lat'] < lat_max)]
        if band.empty:
            continue
        for j in range(int(360/lon_width)):
            lon_min = -180 + j*lon_width
            lon_max = lon_min + lon_width
            cell = band.loc[(band['lon'] >= lon_min) & (band['lon'] < lon_max)]
            ct = len(cell)
            if ct > 0:
                cell_mean = cell.mean(numeric_only=True)
                # Set the count on the Series itself; writing through
                # df_out.iloc[-1] targets a temporary row and can be lost.
                cell_mean['count'] = ct
                rows.append(cell_mean)

    if not rows:
        return pd.DataFrame()
    # Build the result in one go instead of concatenating inside the loop.
    return pd.DataFrame(rows).reset_index(drop=True)

def _ymd_from_path(path):
    '''
    Returns (year, month, day) as ints, parsed from the first three
    underscore-separated fields of the file name at the end of `path`.
    '''
    # Use the last path component, whichever slash style separates it, rather
    # than a fixed component index that only fits one directory depth.
    name = path.replace('\\', '/').rsplit('/', 1)[-1]
    fields = name.split('_')
    return int(fields[0]), int(fields[1]), int(fields[2])

def files_to_df(files):
    '''
    Receives: a list of file paths whose file names start with year_month_day_...
    Returns: a dataframe with one row per path, holding the path in 'file' and
             integer 'year', 'month' and 'day' columns parsed from the file name
    Raises: IndexError if a file name has fewer than three '_'-separated fields,
            ValueError if one of the first three fields is not an integer
    '''
    df = pd.DataFrame(data=files, columns = ['file'])
    dates = [_ymd_from_path(path) for path in df['file']]
    # An explicit dtype keeps the columns int64 even when `files` is empty.
    for pos, col in enumerate(('year', 'month', 'day')):
        df[col] = pd.Series([d[pos] for d in dates], index=df.index, dtype='int64')
    return df

## Verify all files are readable

In [None]:
files = glob.glob('C:\\data\\AIRS\\L1b\\_AIRS2\\*.csv.gz')
print('Found', len(files), 'files.')

# Set to False to skip the check below, which reads every file in full.
VERIFY_FILES = True

if VERIFY_FILES:
    for file in tqdm(files):
        try:
            # Only readability matters here, so the loaded table is discarded.
            gzp(file)
        except Exception as err:
            # Catch Exception rather than using a bare except so Ctrl-C still
            # interrupts the run; report the reason alongside the file name.
            print(file, 'is unreadable', repr(err))

## Locate files, create a single-year (~1 GB) table of daily radiance averages for each lat × lon grid cell -> gzip file

In [None]:
# dff = "dataframe of files": one row per input file with its year/month/day
dff = files_to_df(files)
n_files = len(files)
years_found = dff['year'].unique()
print('Found', n_files, 'files.')
print('Found these years:', years_found)

In [None]:
# Grid cell size passed to mean_df (lat_width, lon_width)
GRID_LAT_WIDTH = 20
GRID_LON_WIDTH = 20

# Output folder for the per-year files
save_path = 'C:\\data\\AIRS\\L1b\\_AIRS3\\'

for year in dff['year'].unique():
    dffs = dff.loc[dff['year'] == year]  # dataframe of files, single year (dffs)
    print('Processing year', str(year)+'...')

    # Collect the per-file averages and concatenate once after the loop,
    # instead of growing dfo with a concat per file.
    per_file = []
    for entry in tqdm(dffs.itertuples(index=False), total=len(dffs)):
        df = mean_df(gzp(entry.file), GRID_LAT_WIDTH, GRID_LON_WIDTH)
        df['year'] = entry.year
        df['month'] = entry.month
        # Without the day, rows from different days of the same month cannot
        # be told apart in the "daily averages" output.
        df['day'] = entry.day
        per_file.append(df)
    dfo = pd.concat(per_file, ignore_index=True)
    print('dfo mem size:', round(dfo.memory_usage().sum()/2**20, 0), 'MB')

    # Prepare output folder (exist_ok makes a separate isdir check unnecessary)
    os.makedirs(save_path, exist_ok=True)

    dfo.to_csv(save_path + str(year) + '_dailyavgs.csv.gz', compression = 'gzip', index=False, header=True)

# clear df's from memory
df = pd.DataFrame()
dfo = pd.DataFrame()