In [1]:
from google.cloud import storage
import os
import netCDF4
import numpy as np
import numpy.ma as ma
from dask import delayed
import dask.array as da
from dask.distributed import Client
import glob
import tempfile
import subprocess
import datetime, time
from urllib import request
from multiprocessing import Pool
import json
import gc
import boto3
import botocore
import itertools

# Google Cloud Storage client, authenticated from a service-account key file on disk.
storage_client = storage.Client.from_service_account_json('/home/jovyan/work/credentials.json')
# Handle to the 'nex-gddp' GCS bucket; generate_baseline uploads its results here.
bucket = storage_client.get_bucket('nex-gddp')
# Name of the S3 bucket that download_file reads from by default.
loca_bucket = 'nasanex'
# Not referenced by any of the cells visible here.
base_key_path = 'LOCA'
# Model names iterated by the baseline-generation loop.
all_models = ["ACCESS1-0", "BNU-ESM", "CCSM4", "CESM1-BGC", "CNRM-CM5", "CSIRO-Mk3-6-0", "CanESM2", "GFDL-CM3", "GFDL-ESM2G", "GFDL-ESM2M", "IPSL-CM5A-LR", "IPSL-CM5A-MR", "MIROC-ESM-CHEM", "MIROC-ESM", "MIROC5", "MPI-ESM-LR", "MPI-ESM-MR", "MRI-CGCM3", "NorESM1-M", "bcc-csm1-1", "inmcm4"]
# Short subset of model names; only used by the commented-out test call.
some_models = ["ACCESS1-0","BNU-ESM","CCSM4"]

# Dask distributed client connected to the scheduler at 'scheduler:8786'.
client = Client('scheduler:8786')
# boto3 S3 resource used by download_file.
s3 = boto3.resource('s3')


def gen_nex_netcdf_id(model, scenario, year, var):
    """Build the object key of one NEX-GDDP BCSD daily NetCDF file.

    Args:
        model: Model name, inserted into the file name.
        scenario: Scenario name, used in both the directory and the file name.
        year: Year of the file; converted with ``str`` before formatting.
        var: Variable name, used in both the directory and the file name.

    Returns:
        The key as a string, always built with the ``r1i1p1`` ensemble tag.
    """
    year_text = str(year)
    return f'NEX-GDDP/BCSD/{scenario}/day/atmos/{var}/r1i1p1/v1.0/{var}_day_BCSD_{scenario}_r1i1p1_{model}_{year_text}.nc'

def download_file(file_id, loca_bucket = loca_bucket, download_location = '/temp'):
    """Download one S3 object into ``download_location`` and return its local path.

    The local file is named after the last ``/``-separated component of
    ``file_id``. If S3 answers with a 404 for that key, the download is retried
    once with every ``r1i1p1`` in the key replaced by ``r6i1p1``. The retry
    writes to the same local path, so the file on disk keeps the original name.

    Args:
        file_id: Object key inside the bucket.
        loca_bucket: Name of the S3 bucket to read from.
        download_location: Directory the file is written to.

    Returns:
        The local path of the downloaded file.

    Raises:
        botocore.exceptions.ClientError: For any non-404 S3 error, or when the
            ``r6i1p1`` retry fails as well. Any other exception propagates
            unchanged.
    """
    filename = f'{download_location}/{file_id.split("/")[-1]}'
    print(f"Downloading {filename}")
    source_bucket = s3.Bucket(loca_bucket)
    try:
        source_bucket.download_file(file_id, filename)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            raise
        # Key not found under r1i1p1: try the r6i1p1 variant of the same key.
        fallback_id = file_id.replace('r1i1p1', 'r6i1p1')
        source_bucket.download_file(fallback_id, filename)
    return filename

def cleanup():
    """Remove every path matched by the glob pattern ``/temp/*``."""
    leftovers = glob.glob('/temp/*')
    for path in leftovers:
        os.remove(path)

## BASELINE


In [2]:
def generate_baseline(prefix, model, var, chunks, gen_netcdf_id = gen_nex_netcdf_id):
    """Compute, save and upload the 1971-2000 baseline for one model and variable.

    For each year in 1971..2000 the "historical" file is downloaded, the 99th
    percentile along axis 0 is computed, and the per-year results are averaged
    along a new leading axis. The result is written to
    ``/temp/{prefix}_baseline_{model}_{var}.npy``, uploaded to the module-level
    ``bucket`` under ``baselines/``, and the local file is then removed.

    Args:
        prefix: Leading part of the output file name.
        model: Model name passed to ``gen_netcdf_id``.
        var: Variable name; also the key read from each NetCDF dataset.
        chunks: Chunk sizes for the trailing dimensions; the leading dimension
            is always chunked at 366.
        gen_netcdf_id: Callable ``(model, scenario, year, var) -> key`` used to
            build the object keys to download.

    Returns:
        The local output path. The file at that path has already been deleted
        by the time this function returns.
    """
    # Getting all file ids
    ids = [gen_netcdf_id(model, "historical", year, var) for year in range(1971, 2001)]

    try:
        # Downloading all files; the context manager shuts the worker
        # processes down instead of leaking them on every call.
        with Pool() as pool:
            filenames = pool.map(download_file, ids)

        p99_percentiles = []
        for fname in filenames:
            # Close each dataset as soon as its percentile is materialised so
            # no file handle is still open when the file is deleted.
            with netCDF4.Dataset(fname) as dataset:
                darray = da.from_array(dataset[var], chunks = (366, *chunks))
                # NOTE(review): np.percentile does not honour masked-array
                # masks; if the variable carries fill values they may enter
                # the percentile -- confirm this is acceptable.
                p99_percentiles.append(
                    delayed(np.percentile)(darray, axis = 0, q = 99).compute()
                )

        final_avg = np.mean(np.stack(p99_percentiles), axis = 0)
    finally:
        # Always clear the downloaded inputs, even when a step above failed.
        cleanup()

    output = f'/temp/{prefix}_baseline_{model}_{var}.npy'
    filename = output.split('/')[-1]

    np.save(output, final_avg)
    try:
        blob = bucket.blob('baselines/' + filename)
        blob.upload_from_filename(output)
    finally:
        # Do not leave the local copy behind if the upload fails.
        os.remove(output)
    return output
#test_res = generate_baseline('nexgddp', some_models[1], 'pr', (360, 360))

In [None]:
# Generate the baseline of every model, 'tasmax' first and then 'pr'.
for model in all_models:
    for var in ('tasmax', 'pr'):
        # Start each run from an empty /temp and freshly restarted workers.
        cleanup()
        client.restart()
        generate_baseline('nexgddp', model, var, (360, 360))

Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1972.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1975.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1973.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1978.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1976.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1977.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1974.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1979.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1980.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1981.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1982.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1983.nc
Downloading /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1984.nc
Downloading /temp/ta