# Download wave data from the AWS bucket
Joshua Simmons 2022

Example code for using Google Colab to download significant wave height (Hsig) data from the public ERA5 bucket on AWS S3 for our time period of interest.

## Connect to AWS and load the data

See example [here](https://github.com/planet-os/notebooks/blob/master/aws/era5-s3-via-boto.ipynb).

In [None]:
# If using Colab, install the required packages into the runtime first.
# NOTE(review): the download cell opens files with engine='h5netcdf', which is
# not installed explicitly here - confirm the runtime already provides it.
!pip install xarray dask[complete] pandas intake-esm xmip cftime gcsfs s3fs
# Pin requests to 2.26.0 (presumably to work around a dependency conflict in
# Colab - TODO confirm the pin is still needed).
!pip install --upgrade requests==2.26.0

In [None]:
# Upgrade xarray to the newest release pip can find.
!pip install --upgrade xarray

In [None]:
# Mount Google Drive at /content/drive so the downloaded data can be written
# to a Drive folder (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

import numpy as np
import s3fs
import xarray as xr

# Work inside the Drive folder so the CSV files written below end up there.
os.chdir('/content/drive/My Drive/colab_connect/era5/')

# One NetCDF file per year/month; the placeholders are (year, zero-padded month).
s3_loc = 's3://era5-pds/{}/{}/data/significant_height_of_wind_and_swell_waves.nc'

# Period to download (both years inclusive).
start_year = 1985
end_year = 2022

get_years = np.arange(start_year, end_year + 1, 1)
get_months = ['{:02.0f}'.format(_) for _ in np.arange(1, 13, 1)]

# Anonymous (unsigned) access to the bucket.
fs = s3fs.S3FileSystem(anon=True)

# Locations of interest as matching (lat, lon) pairs: point_lats[i] goes with
# point_lons[i].
point_lats = [-27.0, -28.0, -29.0, -30.0, -31.0, -32.0, -33.0, -34.0, -35.0, -36.0, -37.0]
point_lons = [153.5, 153.5, 153.5, 153.5, 153.5, 153.0, 152.0, 151.5, 151.0, 150.5, 150.0]
ll_sel = list(zip(point_lats, point_lons))

# .sel() with one list per dimension returns the full lat x lon grid, not the
# individual pairs, so a repeated longitude would just write the same grid
# column several times. Request every coordinate once (order preserved); each
# pair in ll_sel is still contained in the resulting grid.
grid_lats = list(dict.fromkeys(point_lats))
grid_lons = list(dict.fromkeys(point_lons))

for this_year in get_years:
    for this_month in get_months:
        with fs.open(s3_loc.format(this_year, this_month)) as f:
            # Open lazily in chunks and close the dataset once the CSV has
            # been written, so handles do not pile up across iterations.
            with xr.open_dataset(
                f,
                engine='h5netcdf',
                chunks={'time0': 1000, 'lat_ocean': 60, 'lon_ocean': 60},
            ) as dataset:
                subset = dataset.sel(lat_ocean=grid_lats, lon_ocean=grid_lons)
                subset.to_dataframe().to_csv(
                    'era5_{}{}_{}.csv'.format(this_year, this_month, 'bulk')
                )

## Now convert the downloaded data into manageable chunks
This step can be run locally, once the downloaded CSV files have been copied into `data/waves/raw`.

In [1]:
import glob
import pandas as pd

In [2]:
# os is imported here because this part may be run in a fresh local session
# where the Colab cells above were never executed.
import os

# Folder holding the monthly CSV files produced by the download step.
data_loc = 'data/waves/raw'

# glob returns paths in arbitrary order; sort them so the files are processed
# in name order (year then month for the era5_<YYYY><MM>_bulk.csv names
# written by the download step).
fns = sorted(glob.glob(os.path.join(data_loc, '*.csv')))
if not fns:
    raise FileNotFoundError('No CSV files found in {!r}'.format(data_loc))

# Load only the first file as a quick preview; the per-location extraction
# reads every file in fns itself.
df = pd.read_csv(fns[0])

In [24]:

# (lat, lon) pairs to extract; one output CSV is written per pair.
ll_sel = list(zip(
    [-27.0, -28.0, -29.0, -30.0, -31.0, -32.0, -33.0, -34.0, -35.0, -36.0, -37.0],
    [153.5, 153.5, 153.5, 153.5, 153.5, 153.0, 152.0, 151.5, 151.0, 150.5, 150.0],
))

for lat_sel, lon_sel in ll_sel:
    location_query = 'lat_ocean == {} & lon_ocean == {}'.format(lat_sel, lon_sel)
    df = pd.concat([
        pd.read_csv(fn)
        .query(location_query)
        # De-duplicate on the timestamp while it is still a column.
        # drop_duplicates() ignores the index, so calling it after
        # set_index('time0') would compare only the wave-height values and
        # discard distinct time steps that happen to share a value.
        .drop_duplicates(subset='time0')
        .set_index('time0')
        .drop(columns=['lat_ocean', 'lon_ocean'])
        for fn in fns
    ])
    df.to_csv('data/waves/era5_{}_{}.csv'.format(lat_sel, lon_sel))