In [None]:
import sys
import os
# Presumably makes the local fstcatalog sources importable when the kernel
# is started from a notebooks directory -- TODO confirm the intended layout.
# The appended path is also checked, so re-running this cell does not keep
# adding duplicate entries to sys.path.
_checked_path = os.getcwd().replace('/fstcatalog/fstcatalog/notebooks', '')
_package_path = os.getcwd().replace('/notebooks', '/fstcatalog')
if _checked_path not in sys.path and _package_path not in sys.path:
    sys.path.append(_package_path)


In [None]:
import pandas as pd
from glob import glob
from pathlib import Path
from datetime import datetime, timedelta
import os
from multiprocessing import Pool, Manager
import concurrent.futures
import numpy as np

# Manager that backs the shared lists (FILES, DATAFRAMES) which the Pool
# workers in later cells append to.
manager = Manager()

In [None]:
# Substrings that exclude a directory from the crawl when they occur
# anywhere in its path.
REJECT = {
    'color', 'iau', 'fstcomp', 'logs', 'difax', 'xml', 'pds', 'gpkg', 'plot',
    'bulletins', 'work', 'images', 'scribe', 'grib', 'netcdf', 'umos', 'banco',
    'cutoff', 'backup', 'blacklisting', 'iweb', 'stormtrack', 'monitoring',
    'edigraf', 'anal', 'trial', 'restart', 'prof',
}
# Presumably the record names treated as metadata -- TODO confirm; nothing
# else in this notebook reads this list.
META_DATA = ["^>", ">>", "^^", "!!", "!!SF", "HY", "P0", "PT", "E1"]
# Shared, initially empty list that find_files() workers extend with matches.
FILES = manager.list()

In [None]:
def get_subdirectories_for_path(path):
    """Return the paths of the immediate subdirectories of ``path``.

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    list of str
        ``entry.path`` for every entry whose ``is_dir()`` is true.  The list
        is empty, and the file system is not touched, when any REJECT
        substring occurs anywhere in ``path``.
    """
    # REJECT entries are matched as plain substrings of the whole path.
    if any(r in path for r in REJECT):
        return []
    # The context manager closes the scandir iterator even when an entry
    # check raises part-way through the listing.
    with os.scandir(path) as entries:
        return [entry.path for entry in entries if entry.is_dir()]
    
def get_subdirectories(base_path, max_workers=2):
    """Collect every directory below ``base_path``, level by level.

    Parameters
    ----------
    base_path : str
        Root of the walk.  The root itself is not part of the result.
    max_workers : int, optional
        Number of threads that scan directories concurrently (default 2).

    Returns
    -------
    list of str
        Paths of all directories found whose path contains no REJECT
        substring.  Order depends on which scans finish first.
    """
    subdirectories = []
    to_visit = [base_path]
    # One pool serves every level of the walk instead of building a new
    # executor for each level.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        while to_visit:
            pending = [executor.submit(get_subdirectories_for_path, path) for path in to_visit]
            to_visit = []
            for future in concurrent.futures.as_completed(pending):
                found = future.result()
                subdirectories.extend(found)
                to_visit.extend(found)

    # A rejected directory is still reported by the scan of its parent, so it
    # has to be removed here; its own scan returned nothing, which means none
    # of its children were ever visited.
    return [sd for sd in subdirectories if not any(r in sd for r in REJECT)]

def maybeFST(filename) -> bool:
    """Tell whether ``filename`` is a regular file carrying the FST signature.

    The signature is the four bytes ``STDR`` at offsets 12..15 of the file
    (same check as c_wkoffit in librmn).  Missing paths, non-regular files
    and files shorter than 16 bytes all yield False.
    """
    if os.path.isfile(filename):
        with open(filename, 'rb') as stream:
            header = stream.read(16)
        return len(header) == 16 and header[12:16] == b'STDR'
    return False

def get_days(num_days):
    """Return the zero-padded day-of-month strings for today and the next
    ``num_days`` days (``num_days + 1`` strings in total).

    Only the day number is returned; month and year are not included.
    """
    start = datetime.today()
    return [
        f"{start + timedelta(days=offset):%d}"
        for offset in range(num_days + 1)
    ]

def find_files(base_path, num_days = 3):
    """Glob ``base_path`` recursively for files named ``YYYYMMDDHH_PPP``.

    One pattern is built for today and for each of the next ``num_days``
    days, and all matches are appended to the shared FILES list.

    Parameters
    ----------
    base_path : str
        Directory under which to search.
    num_days : int, optional
        Number of days after today to include (default 3).

    Returns
    -------
    None
        Results are delivered through FILES so the function can be mapped
        over a process pool.
    """
    # NOTE(review): the window runs forward from today, as the original did
    # through get_days() -- confirm that forward (not backward) is intended.
    today = datetime.today()
    fst_files = []
    for offset in range(num_days + 1):
        # Format year, month and day from the same shifted date so a window
        # that crosses a month or year boundary yields the right stamp.
        stamp = (today + timedelta(days=offset)).strftime("%Y%m%d")
        pattern = f"**/{stamp}[0-9][0-9]_[0-9][0-9][0-9]"
        fst_files.extend(glob(f'{base_path}/{pattern}', recursive = True))
    FILES.extend(fst_files)
    
def raw_headers(filename):
    """Read the packed record parameters from the directory pages of a file.

    The first page is read at address 27; addresses are in 8-byte units and
    1-based, hence the ``pageaddr*8-8`` byte offset.  Each page is read as
    ``8 + 256*18`` big-endian 32-bit words: 8 leading words followed by 256
    entries of 9 word pairs.  Word 5 gives the number of entries kept from
    the page and word 4 the address of the next page; a value <= 0 ends the
    chain.

    The file is not validated here; callers are expected to check it first
    (fstindex_to_pandas does so with maybeFST).

    Parameters
    ----------
    filename : str or path-like
        File to read.

    Returns
    -------
    numpy.ndarray
        1-D uint8 array with the bytes of all kept entries, in page order.
    """
    pageaddr = 27
    raw = []

    # The context manager releases the handle even if a short or corrupt
    # page makes the reshape below raise.
    with open(filename, 'rb') as f:
        while pageaddr > 0:
            f.seek(pageaddr*8-8, 0)
            page = np.fromfile(f, '>i4', 8+256*18)
            params = page[8:].reshape(256,9,2)
            nent = page[5]
            raw.append(params[:nent].view('B').flatten())
            pageaddr = page[4]

    return np.concatenate(raw)


def decode_headers(raw):
    """Unpack raw directory-entry bytes into a DataFrame, one row per record.

    Parameters
    ----------
    raw : numpy.ndarray
        Byte array as produced by raw_headers(); it is reinterpreted as
        big-endian 32-bit words and reshaped to (records, 9, 2).

    Returns
    -------
    pandas.DataFrame
        Columns nomvar, typvar, etiket, ni, nj, nk, dateo, ip1, ip2, ip3,
        deet, npas, datyp, nbits, grtyp, ig1..ig4, datev, lng, swa and
        shape (an (ni, nj) tuple).  Rows whose ``dltf`` field is non-zero
        are dropped, and the ``dltf`` and ``ubc`` columns are removed.
        No ``path`` column is added here.
    """
    # Work on unsigned words so the integer divisions below act as bit-field extraction.
    raw = raw.view('>i4').astype('uint32').reshape(-1,9,2)
    nrecs = raw.shape[0]
    out = {}


    # Pre-allocate one output array per field.
    out['nomvar'] = np.empty(nrecs, dtype='|S4')
    out['typvar'] = np.empty(nrecs, dtype='|S2')
    out['etiket'] = np.empty(nrecs, dtype='|S12')
    out['ni'] = np.empty(nrecs, dtype='int32')
    out['nj'] = np.empty(nrecs, dtype='int32')
    out['nk'] = np.empty(nrecs, dtype='int32')
    out['dateo'] = np.empty(nrecs, dtype='int32')
    out['ip1'] = np.empty(nrecs, dtype='int32')
    out['ip2'] = np.empty(nrecs, dtype='int32')
    out['ip3'] = np.empty(nrecs, dtype='int32')
    out['deet'] = np.empty(nrecs, dtype='int32')
    out['npas'] = np.empty(nrecs, dtype='int32')
    out['datyp'] = np.empty(nrecs, dtype='ubyte')
    out['nbits'] = np.empty(nrecs, dtype='byte')
    out['grtyp'] = np.empty(nrecs, dtype='|S1')
    out['ig1'] = np.empty(nrecs, dtype='int32')
    out['ig2'] = np.empty(nrecs, dtype='int32')
    out['ig3'] = np.empty(nrecs, dtype='int32')
    out['ig4'] = np.empty(nrecs, dtype='int32')
    out['datev'] = np.empty(nrecs, dtype='int32')
    out['lng'] = np.empty(nrecs, dtype='int32')
    out['dltf'] = np.empty(nrecs, dtype='ubyte')
    out['swa'] =  np.empty(nrecs, dtype='uint32')
    out['ubc'] = np.empty(nrecs, dtype='uint16')
    # out['key'] = np.empty(nrecs, dtype='int32')

    # Scratch buffers for bit fields that are discarded or recombined later.
    temp8 = np.empty(nrecs, dtype='ubyte')
    temp32 = np.empty(nrecs, dtype='int32')

    # Each divmod splits a word into its high part (quotient) and low part
    # (remainder), writing both directly into the given output arrays.
    np.divmod(raw[:,0,0],2**24, temp8, out['lng'])
    out['lng'] *= 2 # Convert from 8-byte to 4-byte units.
    # Top bit of the high byte becomes dltf (presumably a "deleted" flag -- TODO confirm).
    np.divmod(temp8,128, out['dltf'], temp8)
    out['swa'][:] = raw[:,0,1]
    np.divmod(raw[:,1,0],256, out['deet'], out['nbits'])
    np.divmod(raw[:,1,1],256, out['ni'], out['grtyp'].view('ubyte'))
    np.divmod(raw[:,2,0],256, out['nj'], out['datyp'])
    np.divmod(raw[:,2,1],4096, out['nk'], out['ubc'])
    out['npas'][:] = raw[:,3,0]//64
    # ig2 is stored as three separate bytes, reassembled here from high to low.
    np.divmod(raw[:,3,1],256, out['ig4'], temp32)
    out['ig2'][:] = (temp32 << 16) # ig2a
    np.divmod(raw[:,4,0],256, out['ig1'], temp32)
    out['ig2'] |= (temp32 << 8) # ig2b
    np.divmod(raw[:,4,1],256, out['ig3'], temp32)
    out['ig2'] |= temp32 # ig2c
    # Packed character fields: etiket is spread over three pieces, with
    # typvar sharing a word with the last etiket piece.
    etik15 = raw[:,5,0]//4
    etik6a = raw[:,5,1]//4
    et = raw[:,6,0]//256
    etikbc, _typvar = divmod(et, 4096)
    _nomvar = raw[:,6,1]//256
    # The low 4 bits of the ip1 word go to the scratch buffer and are not used.
    np.divmod(raw[:,7,0],16, out['ip1'], temp8)
    out['ip2'][:] = raw[:,7,1]//16
    out['ip3'][:] = raw[:,8,0]//16
    date_stamp = raw[:,8,1]
    # Reassemble and decode.
    # (Based on fstd98.c)
    # Characters are 6-bit codes; adding 32 maps them to byte values 32..95.
    etiket_bytes = np.empty((nrecs,12),dtype='ubyte')
    for i in range(5):
        etiket_bytes[:,i] = ((etik15 >> ((4-i)*6)) & 0x3f) + 32
    for i in range(5,10):
        etiket_bytes[:,i] = ((etik6a >> ((9-i)*6)) & 0x3f) + 32
    etiket_bytes[:,10] = ((etikbc >> 6) & 0x3f) + 32
    etiket_bytes[:,11] = (etikbc & 0x3f) + 32
    out['etiket'][:] = etiket_bytes.flatten().view('|S12')
    nomvar_bytes = np.empty((nrecs,4),dtype='ubyte')
    for i in range(4):
      nomvar_bytes[:,i] = ((_nomvar >> ((3-i)*6)) & 0x3f) + 32
    out['nomvar'][:] = nomvar_bytes.flatten().view('|S4')
    typvar_bytes = np.empty((nrecs,2),dtype='ubyte')
    typvar_bytes[:,0] = ((_typvar >> 6) & 0x3f) + 32
    typvar_bytes[:,1] = ((_typvar & 0x3f)) + 32
    out['typvar'][:] = typvar_bytes.flatten().view('|S2')
    # The stored stamp supplies datev; dateo is derived from it below.
    out['datev'][:] = (date_stamp >> 3) * 10 + (date_stamp & 0x7)
    # Note: this dateo calculation is based on my assumption that
    # the raw stamps increase in 5-second intervals.
    # Doing it this way to avoid a gazillion calls to incdat.
    date_stamp = date_stamp - (out['deet']*out['npas'])//5
    out['dateo'][:] = (date_stamp >> 3) * 10 + (date_stamp & 0x7)
    # out['xtra1'][:] = out['datev']
    # out['xtra2'][:] = 0
    # out['xtra3'][:] = 0
    # Convert the fixed-width byte strings to str and strip the padding.
    out['nomvar'] = np.char.strip(out['nomvar'].astype('str'))
    out['typvar'] = np.char.strip(out['typvar'].astype('str'))
    out['etiket'] = np.char.strip(out['etiket'].astype('str'))
    out['grtyp'] = np.char.strip(out['grtyp'].astype('str'))
    df = pd.DataFrame(out)

  # df['path'] = path

    # Keep only entries whose dltf field is zero, then drop the bookkeeping columns.
    df = df.loc[df.dltf == 0]
    df = df.drop(labels=['dltf', 'ubc'], axis=1)

    # to_numpy() makes the assignment positional, so the filtered frame's
    # non-contiguous index does not misalign the tuples.
    df['shape'] = pd.Series(zip(df.ni.to_numpy(),df.nj.to_numpy()),dtype='object').to_numpy()

    return df

def fstindex_to_pandas(filename):
    """Build the record-header table for ``filename``.

    Returns the DataFrame from decode_headers() with an added ``path``
    column holding ``filename``, or None when maybeFST() rejects the file.
    """
    if not maybeFST(filename):
        return None
    headers = decode_headers(raw_headers(filename))
    headers['path'] = filename
    return headers

In [None]:
# Root of the tree to crawl (absolute, site-specific path).
base_path = Path('/home/smco500/cmcprod/ppp5/suites/gdps/g1')
# Collect the non-rejected directories below the root.  The path is passed as
# str because the REJECT check does substring tests on it.
subdirectories = get_subdirectories(str(base_path))

In [None]:
# Glob every directory in a pool of 10 worker processes.  find_files() returns
# nothing; the workers push their matches into the shared FILES list.
with Pool(10) as p:
    p.map(find_files, subdirectories)

In [None]:
# One row per unique file path, sorted; base_path is the file's parent
# directory.  Naming the columns explicitly keeps both present when FILES is
# empty, so a later groupby('base_path') does not fail with KeyError.
df = pd.DataFrame(
    [{'base_path': Path(f).parent, 'path': f} for f in sorted(set(FILES))],
    columns=['base_path', 'path'],
)

In [None]:
# Shared list that the worker processes append their per-file header tables to.
DATAFRAMES = manager.list([])
def get_records(filename):
    """Index one file and store its header table in DATAFRAMES.

    Files that fstindex_to_pandas() rejects (it returns None for them) are
    skipped, so the shared list only ever holds DataFrames.
    """
    records = fstindex_to_pandas(filename)
    if records is not None:
        DATAFRAMES.append(records)

groups = df.groupby('base_path')
# A single pool serves every directory instead of starting ten new processes
# per group.  The group key is bound to `group_dir` so that the notebook-level
# `base_path` variable is not overwritten by the loop.
with Pool(10) as p:
    for group_dir, sub_df in groups:
        p.map(get_records, sub_df.path.to_list())



In [None]:
# Stack the per-file header tables into one frame.  Each table keeps its own
# row labels, so index values can repeat across files.
# NOTE(review): this rebinds `df`, which until now held the file listing.
df = pd.concat(list(DATAFRAMES))
df

In [2]:
import fstcatalog
from glob import glob
import datetime
from pathlib import Path

In [3]:
# Year and month of the current date, e.g. '202401'.
current_date = datetime.datetime.now()
year_month_string = f'{current_date:%Y%m}'

base_path1 = Path('/home/smco500/cmcprod/ppp5/suites/gdps/g1/gridpt.usr/prog/eta')
base_path2 = Path('/home/smco500/cmcprod/ppp5/suites/gdps/g1/gridpt.usr/prog/diag')
base_path3 = Path('/home/smco500/cmcprod/ppp5/suites/gdps/g1/gridpt.usr/prog/pres')
# Files stamped <year><month>0100 in each directory, in directory order;
# the first directory is limited to names whose suffix starts with '00'.
files = [
    match
    for directory, suffix in ((base_path1, '00*'), (base_path2, '*'), (base_path3, '*'))
    for match in glob(f'{directory}/{year_month_string}0100_{suffix}')
]

In [4]:
# Pass the globbed candidates through fstcatalog.get_fst_files (presumably
# keeps only the valid FST files -- TODO confirm against fstcatalog).
filtered_files = fstcatalog.get_fst_files(files)

In [5]:
# NOTE(review): the return value is not assigned, and the saved output of this
# cell ends in a ValueError.  Later cells read `df`, so confirm where that
# frame is supposed to come from.
fstcatalog.fst_index(filtered_files)

Processing 346 files...


ValueError: Must have equal len keys and value when setting with an iterable

In [14]:
# import pandas as pd
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 1000)
# Only the last expression of a cell is echoed, so the level-2 values need an
# explicit display() to show up next to the full index.
display(df.index.get_level_values(2).unique().to_list())
df.index
# df.loc[df.nomvar == 'ES']
# nomvar typvar     etiket    ni    nj  nk      dateo    ip1    ip2  ip3  deet  npas datyp  nbits grtyp    ig1    ig2    ig3    ig4


MultiIndex([(  '2Z', 'P',   'EDYNTRP',   69,   51, 1, 'N',     '620390', ...),
            (  '5P', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            (  'AB', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ( 'ABE', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            (  'AD', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            (  'AE', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ('AFSD', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ('AFSF', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ('AFSI', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ('AFSV', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            ...
            (  'WH', 'P', 'G1WE_2_0X', 1801, 1251, 1, 'Z', '6883990098', ...),
            (  'WT', 'P', 'G1_8_1_0N', 1801, 1251, 1, 'Z', '6883990098', ...),
            (  'WW', 'P', 'G1_8_1_0N

In [None]:
# Rebind df to the result of fstcatalog.remove_meta (presumably drops the
# metadata records -- TODO confirm).
# NOTE(review): df is overwritten, so the unfiltered frame is no longer
# available after this cell has run.
df = fstcatalog.remove_meta(df)

# np.sort(df.nomvar.unique())

In [None]:
# Inspect the column labels of df.
df.columns


In [None]:
# Move the identifying columns into a MultiIndex.
# NOTE(review): 'grid' and 'vctype' are not produced by decode_headers() in
# this notebook; presumably they are added by fstcatalog -- confirm.
df2 = df.set_index(['nomvar','etiket','ni','nj', 'nk', 'grtyp', 'grid', 'vctype'])
df2

In [None]:
import fstpy
# Show the first filtered file name, read that file with fstpy, and pass the
# resulting frame to fstpy.voir.
display(filtered_files[0])
df1 = fstpy.StandardFileReader(filtered_files[0]).to_pandas()
fstpy.voir(df1)

In [None]:
import fstd2nc
# Hand the paths of all rows whose nomvar is 'TD' to fstd2nc, restricted to
# that variable, and convert the result to xarray.
# NOTE(review): the list has one entry per matching row, so a path can appear
# more than once -- confirm that fstd2nc.Buffer tolerates duplicates.
ds = fstd2nc.Buffer(df.loc[df.nomvar == 'TD'].path.tolist(), vars=['TD']).to_xarray()
ds
