# NetCDF Explore

In [1]:
import numpy as np
import glob
import re
from netCDF4 import Dataset

import xarray as xr

import pandas as pd

from os.path import join

## Get Files

Get all files from a day

In [2]:
file_dir = '../../rp_weather_data/20200102'
to_get_path = join(file_dir, "**/*.nc")
nc_files = glob.glob(to_get_path, recursive=True)

Turn file paths into a dataframe with values so we can work with the files to get AVG and STD files for the values we need

In [3]:
df = pd.DataFrame(nc_files, columns=['file_path'])

# ChatGPT (so ???)
def extract_parts(filename):
    filename = re.sub(r'\.nc$', '', filename)
    # Split the string on _
    filename_parts = re.split('_', filename)
    filename_parts.remove('hence')
    return filename_parts

# Assuming df is your DataFrame and 'file_path' is the column with the filenames
df[['date_time', 'region', 'grid_length', 'config', 'region_size', 'sampling', 'time', 'processing', 'code']] = df['file_path'].apply(lambda x: pd.Series(extract_parts(x.split('/')[-1])))


Group so we have AVG and STD for the region and time and code (ie our X and Y data)

In [4]:
simple_df = df[['region', 'time', 'code', 'file_path', 'processing']]
df_big_group = simple_df.groupby(['region', 'time', 'code'])

Convert to list of values

In [5]:
result = []
for name, group in df_big_group:
    region, time, code = name
    file_dict = {}
    for index, row in group.iterrows():
        file_dict[row['processing']] = row['file_path']

    result.append({
        'region': region,
        'time': time,
        'code': code,
        'files': file_dict
    })

In [6]:
result[0]

{'region': 'r01',
 'time': 'time000',
 'code': '00010',
 'files': {'STD': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00010.nc',
  'AVG': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00010.nc'}}

## Read NetCDF files

In [7]:
ncdf_grouping = result[0]

In [8]:
test_file = ncdf_grouping['files']['STD']

In [9]:
ncdf_grouping

{'region': 'r01',
 'time': 'time000',
 'code': '00010',
 'files': {'STD': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00010.nc',
  'AVG': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00010.nc'}}

In [10]:
'''
nc = Dataset(test_file, 'r')
for var in nc.variables:
    print(f'Variable: {var}')
    print(f'Values: {nc.variables[var][:]}')

# Close the NetCDF file
nc.close()
'''
nc = Dataset(test_file, 'r')
var = nc.variables['unknown']
#print(var[:])
v = np.array(var[:])
nc.close()

print(v.shape)

#'''

(70, 2, 2)


Unkown field is what we want out of every file

In [11]:
def get_values(fname:str) -> np.ndarray:
    nc = Dataset(fname, 'r')
    var = nc.variables['unknown']
    v = np.array(var[:])
    nc.close()
    return v

In [12]:
def process_group(g:dict) -> dict:
    print(g)
    to_return = {'region': g['region'], 'time': g['time'], 'code': g['code']}
    to_return['AVG'] = get_values(g['files']['AVG'])
    to_return['STD'] = get_values(g['files']['STD'])
    return to_return

In [13]:
processed_list = [process_group(g) for g in result]
processed_df = pd.DataFrame(processed_list)

{'region': 'r01', 'time': 'time000', 'code': '00010', 'files': {'STD': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00010.nc', 'AVG': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00010.nc'}}
{'region': 'r01', 'time': 'time000', 'code': '00030', 'files': {'AVG': '../../rp_weather_data/20200102/20200102T0000Z_r01_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00030.nc'}}


KeyError: 'STD'

Notes:
    - Not every file has a STD included

    - `ls ./*_STD_* | wc -l` -> 5760

    - `ls ./* | wc -l` -> 15360

    - So about 1/3

```
20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00010.nc  20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_16004.nc
20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00030.nc  20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00010.nc
20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00033.nc  20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_00033.nc
20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_AVG_00408.nc  20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_STD_16004.nc
```

1. cd ~/Documents/data/rp_weather_data/scratch/frme/caramel_for_zenodo/20200102 

2. ls 20200102T0000Z_r02_km1p5_RA2T_224x224sampling_hence_2x2_time000_*

What exactly are we doing?