In [2]:
import numpy as np
import pandas as pd
import glob
from tqdm.auto import tqdm # library for progress bars
from IPython.display import clear_output
from IPython import display
from time import strptime
import datatable as dt
import tarfile
import io

import gzip
import shutil
import os

### This notebook pre-processes LBL (line-by-line) data so that it can be loaded and processed quickly in the _AIRS6 notebook

## Decompress .gz files:

In [3]:
# Typical input is a folder of LBL files resembling: lbl_ERA5_AIRS_L1b_v4-2003.tar.gz
# The first time this runs, decompress each .gz into a .tar, then delete the .gz

folder_LBL = 'C:\\data\\LBL\\res_v4\\'
files = glob.glob(folder_LBL+'*.gz')

# One-off maintenance switches. Both stay False for normal runs; set them to
# True only for the first run on a freshly downloaded folder.
DECOMPRESS_GZ = False
DELETE_GZ = False

# Unzip the contents of the .gz files
if DECOMPRESS_GZ:
    for file in tqdm(files):
        target = file[:-3]           # strip the trailing ".gz"
        partial = target + '.part'
        # Write to a temporary name first so that an interrupted run never
        # leaves a truncated file under the final .tar name.
        with gzip.open(file, 'rb') as f_in, open(partial, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial, target)

# Delete the .gz files after decompressing
if DELETE_GZ:
    print('Deleting gzips...')
    for item in os.listdir(folder_LBL):
        if not item.endswith(".gz"):
            continue
        gz_path = os.path.join(folder_LBL, item)
        # Only remove an archive whose decompressed counterpart exists;
        # otherwise the data would be lost for good.
        if os.path.isfile(gz_path[:-3]):
            os.remove(gz_path)
        else:
            print('Keeping', item, '(no decompressed file found)')
    print('Done.')

## Data

In [4]:
# Several LBL flavours (ERA5, GHG, MET, CFC...) may share the same folder;
# the tag restricts processing to the archives whose name contains it.
tag = 'ERA5'
tar_pattern = f'{folder_LBL}*{tag}*.tar'
files = glob.glob(tar_pattern)
print(f'Found {len(files)} TAR files.')

Found 19 TAR files.


## Open each .tar file, add each .dat file to the lgrid dataframe

In [9]:
# lgrid means "Line-by-line grid (lat x lon) of radiances":
# one row per (year, month, lat, lon), one column per wavenumber.
index_names = ['year', 'month', 'lat', 'lon']
wavenumbers = np.arange(0.1, 1800.1, 0.1).round(1)

# Collect the spectra in a dict and build the DataFrame once at the end.
# Growing a DataFrame row by row with .loc reallocates it on each new row,
# which gets slow with this many columns. Keying by the index tuple keeps
# the "last file wins" behaviour for duplicate (year, month, lat, lon) keys.
spectra = {}

for file in tqdm(files, 'TAR Files', ncols = 400, position = 0):
    with tarfile.open(file) as tar:
        for member in tqdm(tar, 'DAT Files', ncols = 400, position = 1, leave = False):
            if not member.isreg():      # skip directories and other non-regular entries
                continue
            #print("{} - {} bytes".format(member.name, member.size))
            csv_file = io.StringIO(tar.extractfile(member).read().decode('ascii'))
            # sep=r'\s+' replaces delim_whitespace=True, deprecated in pandas 2.2.
            df_in = pd.read_csv(csv_file, header=None, sep=r'\s+')

            # The row key is parsed from the member name:
            #   year  = text between the first and second '-'
            #   month = third '-'-separated part of the 5th '_'-separated field (abbreviated name, e.g. 'Jan')
            #   lat   = 6th '_'-separated field without its 3-character prefix
            #   lon   = 7th '_'-separated field without its 3-character prefix
            fields = member.name.split('_')
            year = int(member.name.split('-')[1])
            month = strptime(fields[4].split('-')[2], '%b').tm_mon
            lat = int(fields[5][3:])
            lon = int(fields[6][3:])

            # Column 1 (the second whitespace-separated column) holds the values used here.
            # NOTE(review): the original comment said "convert mW --> W", but multiplying
            # by 1000 corresponds to W --> mW. The factor is kept as-is; confirm the units.
            values = df_in[1].values*1000
            if values.shape[0] != wavenumbers.shape[0]:
                raise ValueError(
                    '{}: expected {} spectral points, found {}'.format(
                        member.name, wavenumbers.shape[0], values.shape[0]))
            spectra[(year, month, lat, lon)] = values

columns = pd.Index(wavenumbers, name="wavenumber")
if spectra:
    lgrid = pd.DataFrame(
        np.vstack(list(spectra.values())),
        index=pd.MultiIndex.from_tuples(list(spectra), names=index_names),
        columns=columns,
    )
else:
    # No data files found: keep an empty frame with the expected layout.
    lgrid = pd.DataFrame(
        index=pd.MultiIndex.from_arrays([[], [], [], []], names=index_names),
        columns=columns,
        dtype=float,
    )
del spectra  # the stacked copy now lives in lgrid

lgrid = lgrid.sort_index()

HBox(children=(HTML(value='TAR Files'), FloatProgress(value=0.0, layout=Layout(flex='2'), max=19.0), HTML(valu…

HBox(children=(HTML(value='DAT Files'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), ma…




## Degrade lgrid resolution, then save

In [23]:
n = 4 # Degrade LBL by a factor of 4 (i.e., 0.1, 0.2, 0.3, ...0.9 cm-1 --> 0.1 cm-1, 0.5 cm-1, 0.9 cm-1)

# Average every n adjacent wavenumber columns. The grouping is done on the
# transposed frame because groupby works along rows.
group_ids = np.arange(lgrid.shape[1]) // n
lgridd = lgrid.T.groupby(group_ids).mean().T    #lgridd name: extra "d" means "degraded"

# Label each averaged column with the first wavenumber of its group, taken
# from the existing columns. A float-step np.arange is not guaranteed to
# produce exactly one label per group.
lgridd.columns = lgrid.columns[::n].to_numpy().round(3)

SAVE_LGRIDD = True
if SAVE_LGRIDD:
    out_dir = 'C:\\data\\LBL\\lgridd_v4\\'
    # Create the output folder up front so the save cannot fail on a missing directory.
    os.makedirs(out_dir, exist_ok=True)
    print('Saving file...')
    lgridd.to_csv(out_dir+tag+'_v4.csv.gz', compression = 'gzip', index=True, header=True)
    print('Done.')

Saving file...
Done.
