# AWS NetCDF Data Tests

### Import Modules

In [1]:
import s3fs
import numpy as np
import xarray as xr
import fsspec
import zarr
import timeit
import matplotlib.pyplot as plt
import requests
import netCDF4
from dask.distributed import Client, performance_report
from json import dumps
from io import StringIO
from os.path import dirname, join
from IPython.display import HTML

In [2]:
# Collection short name; used below as the folder component of the S3 glob patterns.
ShortName = "MUR-JPL-L4-GLOB-v4.1"

### Earthdata Login
<br/>
You should have a .netrc file in your home directory set up like:

    machine urs.earthdata.nasa.gov 
    login <username> 
    password <password>

### Direct Access to Data from S3

In [3]:
def begin_s3_direct_access(url: str = "https://archive.podaac.earthdata.nasa.gov/s3credentials",
                           timeout: float = 30.0):
    """Fetch temporary S3 credentials from ``url`` and return an S3 filesystem.

    Args:
        url: Endpoint that returns a JSON document containing the keys
            ``accessKeyId``, ``secretAccessKey`` and ``sessionToken``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        An ``s3fs.S3FileSystem`` configured with the returned credentials
        for the ``us-west-2`` region.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error (e.g. bad Earthdata login) instead of an
    # opaque JSON-decoding error or KeyError further down.
    response.raise_for_status()
    credentials = response.json()
    return s3fs.S3FileSystem(key=credentials['accessKeyId'],
                             secret=credentials['secretAccessKey'],
                             token=credentials['sessionToken'],
                             client_kwargs={'region_name': 'us-west-2'})

# Create the S3 filesystem handle reused by every fs.glob / fs.open call below.
fs = begin_s3_direct_access()

# Display the handle's class as a quick check that a filesystem object was returned.
type(fs)

s3fs.core.S3FileSystem

## Setup for Global Tests

### Open MUR Dataset
<br/>


In [None]:
start_time = timeit.default_timer()

# S3 object keys always use '/', so build the pattern explicitly rather than
# with os.path.join, whose separator depends on the local operating system.
# Sorting makes the positional cap below independent of listing order.
mur_files = sorted(fs.glob(f"podaac-ops-cumulus-protected/{ShortName}/*.nc"))
# Upper bound on the number of granules opened for the global test.
mur_files = mur_files[:6443]
print(len(mur_files))

# Only these variables are kept from each granule.
variables = [
    'analysed_sst',
    'mask',
]

def subset(ds):
    """Return ``ds`` restricted to the variables listed in ``variables``."""
    return ds[variables]

# Open every granule as one dask-backed dataset; ``subset`` runs on each file
# before the files are combined along their coordinates.
mur_global = xr.open_mfdataset(
    paths=[fs.open(f) for f in mur_files],
    preprocess=subset,
    combine='by_coords',
    mask_and_scale=True,
    decode_cf=True,
    cache=False,
    engine='h5netcdf',
    chunks='auto',
)

# Seconds spent listing the files and opening the combined dataset.
elapsed = timeit.default_timer() - start_time
print(elapsed)

In [None]:
mur_global

### Add in NAN Values for Land to MUR Data
<br/>
We use the `mask` variable to replace sea surface temperature values over land with NaN so that they are not factored into our calculations. The mask variable holds a value for each coordinate pair indicating the surface type the temperature was collected from (land, open sea, ice, etc.). The code below keeps only the cells where the mask equals 1; every other cell becomes NaN.

In [None]:
# Retain SST only at grid cells whose mask value is exactly 1; all other cells become NaN.
mur_global_sst = mur_global.analysed_sst.where(mur_global['mask'] == 1)

### Convert Temperatures to Celsius
<br/>
The dataset is stored with temperatures measured in Kelvin. This converts it to Celsius for ease of understanding and analysis.

In [None]:
# 273.15 is the Kelvin-to-Celsius offset; subtracting it yields degrees Celsius.
mur_global_sst = mur_global_sst - 273.15

### Find Daily Average Temperature for Time Series

In [None]:
# Collapse both spatial dimensions, leaving one mean value per time step.
mur_global_sst_mean_ts = mur_global_sst.mean(dim=['lat', 'lon'])

### Find Average Temperature for Each Coordinate Pair for Spatial Plot

In [None]:
# Collapse the time dimension, leaving one mean value per (lat, lon) cell.
mur_global_sst_mean_sp = mur_global_sst.mean(dim=['time'])

## Global Tests

### Global SST Averaged Time Series

In [None]:
start_time = timeit.default_timer()

# The dataset was opened with chunks='auto', so the mean is still lazy here;
# plotting is what forces it to be computed. The timing therefore covers the
# computation as well as drawing the figure.
mur_global_sst_mean_ts.plot()

# Elapsed time in seconds.
elapsed = timeit.default_timer() - start_time
print(elapsed)

### Global SST Averaged Spatial Plot

In [None]:
start_time = timeit.default_timer()

# The dataset was opened with chunks='auto', so the mean is still lazy here;
# plotting is what forces it to be computed. The timing therefore covers the
# computation as well as drawing the figure.
mur_global_sst_mean_sp.plot()

# Elapsed time in seconds.
elapsed = timeit.default_timer() - start_time
print(elapsed)

## Setup for Regional Tests

### Period and Region of Interest

In [4]:
# Study period as date strings (first day, last day).
start_date, end_date = "2019-08-01", "2020-1-20"

# Latitude and longitude bounds of the region of interest.
minlat, maxlat = 18, 23
minlon, maxlon = -160, -154

### Open MUR Dataset
<br/>


In [5]:
start_time = timeit.default_timer()

# S3 object keys always use '/', so the patterns are built explicitly rather
# than with os.path.join, whose separator depends on the local operating
# system. Sorting makes the positional slices below independent of listing order.

# 2019 granules; dropping the first 212 skips 1 Jan - 31 Jul
# (assumes one granule per day -- TODO confirm).
mur_files = sorted(fs.glob(f"podaac-ops-cumulus-protected/{ShortName}/*2019*.nc"))
mur_files = mur_files[212:]

# First 20 granules of January 2020 (same one-per-day assumption).
mur_files2020 = sorted(fs.glob(f"podaac-ops-cumulus-protected/{ShortName}/*202001*.nc"))
mur_files2020 = mur_files2020[:20]

mur_files.extend(mur_files2020)

# Only these variables are kept from each granule.
variables = [
    'analysed_sst',
    'mask',
]

def subset(ds):
    """Return ``ds`` limited to ``variables`` and the lat/lon bounding box."""
    return ds[variables].sel(
        lat=slice(minlat, maxlat),
        lon=slice(minlon, maxlon),
    )

# ``subset`` crops each granule to the region before the files are combined;
# the combined dataset is then re-chunked.
mur_hawaii = xr.open_mfdataset(
    paths=[fs.open(f) for f in mur_files],
    preprocess=subset,
    combine='by_coords',
#     mask_and_scale=True,
#     decode_cf=True,
#     cache=False,
#     parallel=True,
    engine='h5netcdf'
).chunk({"time": 30, "lat": 100, "lon": 100})

# Eagerly pull the whole subset into memory. This can raise MemoryError when
# RAM is limited; comment this line out to keep the dataset lazy instead.
mur_hawaii.load()

# Seconds spent listing, opening and loading the regional dataset.
elapsed = timeit.default_timer() - start_time
print(elapsed)

MemoryError: 

In [None]:
mur_hawaii

### Add in NAN Values for Land to MUR Data
<br/>
We use the `mask` variable to replace sea surface temperature values over land with NaN so that they are not factored into our calculations. The mask variable holds a value for each coordinate pair indicating the surface type the temperature was collected from (land, open sea, ice, etc.). The code below keeps only the cells where the mask equals 1; every other cell becomes NaN.

In [None]:
# Retain SST only at grid cells whose mask value is exactly 1; all other cells become NaN.
mur_hawaii_sst = mur_hawaii.analysed_sst.where(mur_hawaii['mask'] == 1)

### Convert Temperatures to Celsius
<br/>
The dataset is stored with temperatures measured in Kelvin. This converts it to Celsius for ease of understanding and analysis.

In [None]:
# 273.15 is the Kelvin-to-Celsius offset; subtracting it yields degrees Celsius.
mur_hawaii_sst = mur_hawaii_sst - 273.15

In [None]:
mur_hawaii_sst

### Open MUR Climatology for Hawaii

In [None]:
# Open the climatology from a path relative to the notebook, as a chunked
# DataArray. The chunk sizes match those applied to mur_hawaii.
mur_clim = xr.open_dataarray(
    "../data/MURClimatology.nc", 
    chunks={"time": 30, "lat": 100, "lon": 100}
)

In [None]:
mur_clim

### Drop the Leap Day

In [None]:
# Remove every 29 February entry. Matching on month and day, rather than on a
# single hard-coded timestamp, works whatever reference year or time of day the
# file stores, and plain selection avoids the masking step of .where().
mur_clim = mur_clim.sel(
    time=~((mur_clim["time"].dt.month == 2) & (mur_clim["time"].dt.day == 29))
)

### Create Subset Dataset

In [None]:
# First 20 entries along the leading axis -- presumably 1-20 January, the tail
# of the study period (assumes time is the first dimension; TODO confirm).
mur_clim_jan = mur_clim[0:20]

In [None]:
# Skip the first 212 entries (31+28+31+30+31+30+31, i.e. 1 Jan - 31 Jul of a
# 365-day year) so the slice starts on 1 August. Assumes daily steps beginning
# on 1 January with time as the first dimension -- TODO confirm.
mur_clim_subset = mur_clim[212:]

In [None]:
# Append the January slice after the August-onward slice along time, so the
# climatology follows the same order as the study period.
mur_clim_subset = xr.concat([mur_clim_subset, mur_clim_jan], dim="time")

In [None]:
# Replace the climatology's time labels with those of the observed SST so the
# two arrays share a time coordinate. Both must have the same number of time
# steps for this assignment to succeed.
mur_clim_subset = mur_clim_subset.assign_coords({"time": mur_hawaii_sst["time"]})

In [None]:
mur_clim_subset

### Create SST Anomaly Dataset

In [None]:
# Anomaly = observed SST minus the climatological value for the same time step and cell.
sst_anomaly = mur_hawaii_sst - mur_clim_subset

In [None]:
sst_anomaly

### Find Daily Average SST Anomaly for Time Series

In [None]:
# Collapse both spatial dimensions, leaving one mean anomaly per time step.
sst_anomaly_mean_ts = sst_anomaly.mean(dim=['lat', 'lon'])

In [None]:
sst_anomaly_mean_ts

### Find Average SST Anomaly for Each Coordinate Pair for Spatial Plot

In [None]:
# Collapse the time dimension, leaving one mean anomaly per (lat, lon) cell.
sst_anomaly_mean_sp = sst_anomaly.mean(dim=['time'])

In [None]:
sst_anomaly_mean_sp

## Regional Tests

### Regional SST Anomaly Averaged Time Series, August 1st, 2019 - January 20th, 2020
<br/>
Kernel dies on t3.small, likely due to limited memory (RAM)

In [None]:
start_time = timeit.default_timer()

# If the anomaly is still lazily evaluated at this point, plotting forces the
# computation, so the timing covers compute as well as drawing the figure.
sst_anomaly_mean_ts.plot()

# Elapsed time in seconds.
elapsed = timeit.default_timer() - start_time
print(elapsed)

### Regional SST Anomaly Averaged Spatial Plot, August 1st, 2019 - January 20th, 2020
<br/>
Kernel dies on t3.small, likely due to limited memory (RAM)

In [None]:
start_time = timeit.default_timer()

# If the anomaly is still lazily evaluated at this point, plotting forces the
# computation, so the timing covers compute as well as drawing the figure.
sst_anomaly_mean_sp.plot()

# Elapsed time in seconds.
elapsed = timeit.default_timer() - start_time
print(elapsed)