# NetCDF Explore

In [1]:
import numpy as np
import glob
import re
import xarray as xr

import pandas as pd

from os.path import join

## Get Files

Get all files from a day

In [2]:
# Recursively collect every .nc file below the data directory.
file_dir = '../../rp_weather_data/20200102'
to_get_path = join(file_dir, "**/*.nc")
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order (and everything derived from it) reproducible between runs.
nc_files = sorted(glob.glob(to_get_path, recursive=True))

Turn the file paths into a DataFrame, with one column per field of the file name, so that we can pick out the AVG and STD files for the values we need.

In [3]:
# One row per discovered file; the path is the only column at this point.
df = pd.DataFrame(data=nc_files, columns=['file_path'])

# ChatGPT (so ???)
def extract_parts(filename):
    '''
    Split a file name into its underscore-separated fields.

    A trailing ".nc" suffix is stripped, the remainder is split on "_", and the
    first literal 'hence' token (if there is one) is dropped.

    Parameters:
        filename: file name (no directory part) to split.

    Returns:
        list of str: the remaining fields, in their original order.
    '''
    filename = re.sub(r'\.nc$', '', filename)
    # The separator is a literal character, so a plain str.split is enough.
    filename_parts = filename.split('_')
    # list.remove raises ValueError when the token is absent; guard it so a
    # name without 'hence' is still split instead of aborting the whole apply.
    if 'hence' in filename_parts:
        filename_parts.remove('hence')
    return filename_parts

# Split each file's basename into its fields, one new column per field.
# All names are parsed first and a single DataFrame is built from the rows,
# rather than constructing a pandas Series per row inside .apply().
# index=df.index keeps the new columns aligned with the existing rows.
df[['date_time', 'region', 'grid_length', 'config', 'region_size', 'sampling', 'time', 'processing', 'code']] = pd.DataFrame(
    [extract_parts(path.split('/')[-1]) for path in df['file_path']],
    index=df.index,
)


Group by region and time so that each group holds the AVG and STD files for every code (i.e. our X and y data).

In [4]:
# Keep only the columns used from here on, then bucket the rows by (region, time).
simple_df = df.loc[:, ['region', 'time', 'code', 'file_path', 'processing']]
df_big_group = simple_df.groupby(by=['region', 'time'])

Convert to list of values

In [5]:
# One record per (region, time) group, carrying that group's file paths.
result = [
    {
        'region': region,
        'time': time,
        # Read the column directly instead of walking the rows with iterrows().
        'files': group['file_path'].tolist(),
    }
    for (region, time), group in df_big_group
]

In [6]:
def process_file_list(all_files:list) -> dict:
    '''
    Index file paths by the last two underscore-separated tokens of their name.

    For each path, the directory part and everything from the first "." of the
    basename onwards are dropped; the final two "_"-separated tokens of what is
    left (expected to be processing and code) are joined with "_" to form the
    key. When two paths produce the same key, the later path wins.

    Returns a dict of {processing_code: file_path}.
    '''
    by_code = {}
    for path in all_files:
        basename = path.rpartition('/')[2]
        stem = basename.partition('.')[0]
        # At most the last two tokens; shorter names yield whatever exists.
        tail_tokens = stem.rsplit('_', 2)[-2:]
        by_code['_'.join(tail_tokens)] = path
    return by_code

In [7]:
def get_values(fname:str, var_name:str='unknown') -> np.ndarray:
    '''
    Read one variable from a NetCDF file and return it as a numpy array.

    Parameters:
        fname: path of the file to open.
        var_name: name of the variable to read; defaults to 'unknown', the
            name this notebook reads from every file.

    Returns:
        The variable's values, fully loaded into memory.

    Raises:
        KeyError: if the file has no variable called var_name.
    '''
    # The context manager closes the underlying file once the data is in
    # memory; without it every call leaves a file handle open.
    with xr.open_dataset(fname) as ds:
        return ds[var_name].load().values

def load_data(list_dict:dict) -> dict:
    '''
    Replace every file path in the mapping with the array read from that file.

    Takes a {key: file_path} dict and returns a new {key: array} dict with the
    same keys in the same order; each array comes from get_values.
    '''
    return {code: get_values(path) for code, path in list_dict.items()}

Into the abyss. We currently have a list of dicts holding file paths; we want to convert that into a list of dicts holding the loaded arrays rather than just file paths.

In [8]:
def process_dict(d:dict) -> dict:
    '''
    Expand one group record into its per-code file paths and loaded arrays.

    Parameters:
        d: a record with at least a 'files' entry holding a list of file paths
           (as built from the region/time groupby).

    Returns:
        A new dict with the same entries as d, except that 'files' becomes a
        {processing_code: file_path} dict and 'arrays' holds the matching
        {processing_code: array} dict.

    The input is left untouched so the cell calling this can be re-run: writing
    the converted 'files' back into d would make a second call treat the dict
    keys as file paths.
    '''
    files_by_code = process_file_list(d['files'])
    return {**d, 'files': files_by_code, 'arrays': load_data(files_by_code)}

In [9]:
# Expand every group record into its files-by-code and arrays-by-code form.
p_list = [process_dict(group_record) for group_record in result]

In [10]:
# Inspect the first processed record.
p_list[0]

{'region': 'r01',
 'time': 'time000',
 'files': {'AVG_00033': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00033.nc',
  'AVG_16004': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_16004.nc',
  'STD_00010': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00010.nc',
  'STD_00033': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00033.nc',
  'STD_16004': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_16004.nc',
  'AVG_00010': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00010.nc',
  'AVG_00408': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00408.nc',
  'AVG_00030': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5

Turn into X,y pairs

In [11]:
def convert_pairs(d:dict) -> dict:
    '''
    Bundle the arrays of one processed record into three stacked arrays.

    The three stacks are (lm_avg, oro_avg, oro_std), (t_avg, qv_avg, p_avg)
    and (t_std, qv_std), stored under 'landmass', 'x' and 'y' respectively.

    Parameters:
        d: a record with 'region', 'time' and an 'arrays' dict keyed by
           processing_code (e.g. 'AVG_00033').

    Returns:
        dict with keys 'region', 'time', 'landmass', 'x' and 'y'; each stacked
        array has its members along a new leading axis.

    Raises:
        KeyError: if a required code is missing from d['arrays'].
        ValueError: if the arrays within one stack do not share a shape.
    '''
    arrays = d['arrays']

    def stack(*codes):
        # np.stack insists on equal shapes, so mismatched inputs fail loudly
        # instead of producing a ragged/object array.
        return np.stack([arrays[code] for code in codes])

    return {
        'region': d['region'],
        'time': d['time'],
        # lm_avg, oro_avg, oro_std
        'landmass': stack('AVG_00030', 'AVG_00033', 'STD_00033'),
        # t_avg, qv_avg, p_avg
        'x': stack('AVG_16004', 'AVG_00010', 'AVG_00408'),
        # t_std, qv_std
        'y': stack('STD_16004', 'STD_00010'),
    }

In [12]:
# Convert the first processed record into its landmass / x / y arrays.
p = convert_pairs(p_list[0])

In [13]:
# List the entries of the converted record.
p.keys()

dict_keys(['region', 'time', 'landmass', 'x', 'y'])

In [16]:
# Shape of the landmass array built by convert_pairs.
p['landmass'].shape

(3, 2, 2)