# Post-process the k-means and ACCESS model data 

- I am using the conda environment version 21.10 http://climate-cms.wikis.unsw.edu.au/Conda#21.10
- Model code is documented here (https://code.metoffice.gov.uk/trac/roses-u/log/b/x/4/0/0/trunk, accessible to UM users)
- CERES data has been downloaded from https://ceres.larc.nasa.gov/data/
- The data generated by this notebook can be found here: https://doi.org/10.5281/zenodo.6004062

### In this notebook: 
- Organise the data generated by the k-means clustering for both MODIS and ACCESS 
- Pull out the analysis data you need for the ACCESS model, perform any unit conversions needed
- Mirror the MODIS data to match the ACCESS data and add the CERES data to the array 


In [1]:
import sys
import os
import numpy as np
import pandas as pd
import xarray as xr

# K-Means data 

### Read in data from k-means clustering

Centers and labels

In [2]:
# Cluster centres produced by the k-means step (derived from MODIS).
centers = xr.open_dataarray('/g/data/jk72/slf563/ACCESS/clustering_data/modis_cluster_centres_2015-2019.nc')

# Cluster labels for MODIS (M) and for the ACCESS bx400 run (A). Both are
# named 'Cluster' so that they appear under that variable name once merged
# with the other fields further down.
M = xr.DataArray(
    xr.open_dataarray('/g/data/jk72/slf563/ACCESS/clustering_data/modis_cluster_labels_2015-2019.nc'),
    name='Cluster',
)
A = xr.DataArray(
    xr.open_dataarray('/g/data/jk72/slf563/ACCESS/clustering_data/bx400_cluster_labels_2015-2019.nc'),
    name='Cluster',
)

In [3]:
# Number of clusters produced by k-means = length of the 'k' dimension.
nclus = centers.sizes['k']

# Histogram axis definitions stored as attributes on the centres file.
# NOTE(review): presumably the cloud-top-pressure and optical-depth bins of
# the joint histograms - confirm against the clustering notebook.
plevs, tau = (
    centers.attrs['Cloud top pressure'],
    centers.attrs['Cloud optical depth'],
)

### Condense clusters 

From our notebook 'check_clusters.ipynb' we can see that the three Antarctic clusters aren't really of interest, so let's combine them into one.

In [4]:
# One (long name, short name) pair per condensed cluster, in condensed-label
# order. The final entry is the single cluster that replaces the three
# Antarctic ones.
_CLUSTER_TABLE = (
    ('Marine stratiform', 'MS'),
    ('Stratocumulus', 'StC'),
    ('Cirrus', 'Ci'),
    ('Cloud decks', 'CD'),
    ('Convective', 'Cv'),
    ('Frontal', 'Fr'),
    ('Thin cirrus', 'TC'),
    ('Shallow cumulus', 'ShC'),
    ('Mid-level', 'ML'),
    ('Antarctic', 'Ant'),
)

# Number of clusters after condensing (replaces the value read from the file).
nclus = len(_CLUSTER_TABLE)
cluster_names = np.array([long_name for long_name, _ in _CLUSTER_TABLE])
cluster_short_names = np.array([short_name for _, short_name in _CLUSTER_TABLE])

In [5]:
# Select the nine clusters that are kept as-is (in their original order) and
# append original cluster 2 as a placeholder for the tenth, Antarctic, slot.
centers10 = centers[[0, 1, 3, 4, 5, 6, 7, 9, 11, 2], :, :]

# Replace the placeholder with the sum over the three Antarctic centres.
# NOTE(review): this is a sum, not a mean, of the three centres - confirm
# that is the intended way to combine them.
centers10[-1] = centers[[2, 8, 10], :, :].sum('k')

# Number the condensed clusters 1..10 and record their names on the array.
centers10.coords['k'] = np.arange(1, 11)
centers10 = centers10.assign_attrs({
    'Cluster Long Names': cluster_names,
    'Cluster Short Names': cluster_short_names,
})
centers10.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/modis_cluster_centres_2015-2019_condensed.nc')

In [6]:
def _condense_labels(labels):
    """Return `labels` with the original cluster ids (0-11) renumbered to 0-9.

    Original ids 2, 8 and 10 (the Antarctic clusters) are pooled into the
    last condensed id, 9; the remaining ids keep their relative order.
    """
    condensed = labels
    # Pool the three Antarctic clusters under a temporary id of -1.
    for antarctic_id in (2, 8, 10):
        condensed = condensed.where(condensed != antarctic_id, -1)

    # Renumber one id at a time. Every new id is <= the id it replaces and the
    # old ids are visited in increasing order (with -1 last), so a value that
    # has already been renumbered is never matched again by a later step.
    new_nums = np.arange(0, 10)
    for new_id, old_id in zip(new_nums, [0, 1, 3, 4, 5, 6, 7, 9, 11, -1]):
        condensed = condensed.where(condensed != old_id, new_id)
    return condensed


# Same metadata is attached to the MODIS and the ACCESS label arrays.
_label_attrs = {'Cluster Long Names': cluster_names, 'Cluster Short Names': cluster_short_names}

M10 = _condense_labels(M).assign_attrs(_label_attrs)
A10 = _condense_labels(A).assign_attrs(_label_attrs)

M10.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/modis_cluster_labels_condensed_2015-2019.nc')
A10.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/bx400_cluster_labels_condensed_2015-2019.nc')

### RFOs 
(Relative Frequencies of Occurrence)

In [7]:
# Relative frequency of occurrence (RFO): for every grid point, the percentage
# of valid (non-NaN) time steps that were assigned to each condensed cluster.
# Axis 0 of the label arrays is assumed to be time (TODO confirm).

# Number of valid labels per grid point. This does not depend on the cluster,
# so it is computed once here instead of once per loop iteration.
valid_M = M10.count(axis=0)
valid_A = A10.count(axis=0)

RFO_M = np.zeros((nclus, M10.lon.shape[0], M10.lat.shape[0]))
RFO_A = np.zeros((nclus, A10.lon.shape[0], A10.lat.shape[0]))
for i in range(nclus):
    # Count the time steps labelled i, then express them as a percentage of
    # all valid time steps. Grid points with no valid labels give NaN (0/0).
    RFO_M[i, :, :] = (M10.where(M10 == i).count(axis=0) / valid_M) * 100
    RFO_A[i, :, :] = (A10.where(A10 == i).count(axis=0) / valid_A) * 100

# The cluster coordinate is 1-based (labels in M10/A10 are 0-based) and is
# derived from nclus rather than hard-coded.
cluster_ids = np.arange(1, nclus + 1)

RFO_M = xr.DataArray(RFO_M, coords=[cluster_ids, M10.coords['lon'], M10.coords['lat']],
                     dims=["Cluster", "lon", "lat"])

RFO_A = xr.DataArray(RFO_A, coords=[cluster_ids, A10.coords['lon'], A10.coords['lat']],
                     dims=["Cluster", "lon", "lat"])

RFO_A = RFO_A.assign_attrs({'Cluster Long Names': cluster_names, 'Cluster Short Names': cluster_short_names})
RFO_M = RFO_M.assign_attrs({'Cluster Long Names': cluster_names, 'Cluster Short Names': cluster_short_names})

RFO_M.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/modis_RFO_2015-2019_condensed.nc')
RFO_A.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/bx400_RFO_2015-2019_condensed.nc')

In [8]:
# Seasonal relative frequency of occurrence: as the RFO above, but computed
# separately for each meteorological season.
#
# groupby('time.season') returns its groups in alphabetical order
# (DJF, JJA, MAM, SON). The output is labelled in calendar order, so every
# grouped result is explicitly re-ordered with .sel(season=seasons) before it
# is copied into the plain numpy array; otherwise MAM and JJA end up swapped.
seasons = ['DJF', 'MAM', 'JJA', 'SON']

# Valid (non-NaN) label counts per season and grid point. Independent of the
# cluster, so computed once outside the loop.
valid_M_seas = M10.groupby('time.season').count()
valid_A_seas = A10.groupby('time.season').count()

# Grouped counts are assumed to come back as (season, lon, lat) - TODO confirm.
RFO_M_seas = np.zeros((nclus, len(seasons), M10.lon.shape[0], M10.lat.shape[0]))
RFO_A_seas = np.zeros((nclus, len(seasons), A10.lon.shape[0], A10.lat.shape[0]))

for i in range(nclus):
    hits_M = M10.where(M10 == i).groupby('time.season').count()
    hits_A = A10.where(A10 == i).groupby('time.season').count()
    RFO_M_seas[i, :, :, :] = (hits_M / valid_M_seas).sel(season=seasons)
    RFO_A_seas[i, :, :, :] = (hits_A / valid_A_seas).sel(season=seasons)

# Convert fractions to percentages and attach coordinates (1-based cluster ids).
RFO_M_seas = RFO_M_seas*100
RFO_M_seas = xr.DataArray(RFO_M_seas, coords=[np.arange(1, nclus+1), seasons, M10.coords['lon'], M10.coords['lat']],
                          dims=["Cluster", "season", "lon", "lat"])

RFO_A_seas = RFO_A_seas*100
RFO_A_seas = xr.DataArray(RFO_A_seas, coords=[np.arange(1, nclus+1), seasons, A10.coords['lon'], A10.coords['lat']],
                          dims=["Cluster", "season", "lon", "lat"])

RFO_M_seas.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/modis_RFO_2015-2019_condensed_seas.nc')
RFO_A_seas.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/bx400_RFO_2015-2019_condensed_seas.nc')

# Model and Observational data
Pull all the equivalent obs and model data into one place. 

### Model data first

A number of the COSP files need to be divided by a weighted mask (either a specific weight or the cloud fraction) to match the MODIS data 

In [9]:
# Daily COSP output from the bx400 run, opened lazily across all matching files.
fname = '/g/data/jk72/slf563/ACCESS/output/bx400/daily/bx400a.pdc2*'
data = xr.open_mfdataset(fname,parallel=True)

# COSP water paths are not used; the raw model LWP/IWP are read from the
# meteorological output instead.
#LWP_A = xr.DataArray((data['field2466']/data['field2330'])*1000,name='LWP') # convert from kg/m2 to g/m2
#IWP_A = xr.DataArray((data['field2467']/data['field2330'])*1000,name='IWP') # convert from kg/m2 to g/m2

# Liquid and ice cloud fractions, each divided by the same weighting field.
# NOTE(review): field2330 is presumably the COSP weight/mask - confirm
# against the model STASH documentation.
cosp_weight = data['field2330']
CFL_A = xr.DataArray(data['field2452'] / cosp_weight, name='CFL')
CFI_A = xr.DataArray(data['field2453'] / cosp_weight, name='CFI')

# Cloud top pressure, normalised by field2451 and then divided by 100 to
# convert to hPa.
ctp_normalised = data['field2465'] / data['field2451']
CTP_A = xr.DataArray(ctp_normalised / 100, name='CTP')

  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))


In [10]:
# Daily global meteorological output from the bx400 run.
fname = '/g/data/jk72/slf563/ACCESS/output/bx400/daily/bx400a.pd.glob20*'
data_met = xr.open_mfdataset(fname,parallel=True)

# Top-of-atmosphere shortwave: the all-sky flux, and the cloud radiative
# effect taken as clear-sky minus all-sky flux.
SWT_A = xr.DataArray(data_met['rsut'], name='SWtoa')
CRESWT_A = xr.DataArray(data_met['rsutcs'] - data_met['rsut'], name='SWCREtoa')

# Liquid and ice water paths come from the raw model fields rather than the
# COSP versions, because the COSP ones show unrealistically large biases.
kg_to_g = 1000  # kg/m2 -> g/m2
LWP_A = xr.DataArray(data_met['lwp'], name='LWP') * kg_to_g
IWP_A = xr.DataArray(data_met['clivi'], name='IWP') * kg_to_g

In [11]:
# Gather the condensed ACCESS cluster labels and all model diagnostics into a
# single dataset.
mod = xr.merge([A10, LWP_A, IWP_A, CFL_A, CFI_A, CTP_A, CRESWT_A, SWT_A])

# Blank out (NaN) every variable wherever no cluster label is available.
has_cluster = np.isfinite(mod['Cluster'])
mod = mod.where(has_cluster, np.nan)
mod.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/COSP_vars_bx400_2015-2019.nc')

# The source datasets are only closed once the merged file has been written.
for handle in (data, data_met):
    handle.close()

### Obs data

In [12]:
# Open the MODIS COSP-format cloud files as one dataset. The glob matches
# every file whose name starts with 'MCD06COSP_D3_MODIS.201' and ends in
# 'cloud.nc'.
fdir = '/g/data/p66/slf563/OBS/MCD06COSP_D3_MODIS/MCD06COSP_D3_MODIS.201*cloud.nc'
MODIS = xr.open_mfdataset(fdir)

In [13]:
# Keep only the variables that have a model counterpart.
MODIS = MODIS[['CTP','LWP','IWP','CFI','CFL']]

# Multiply each water path by the matching cloud fraction (liquid with CFL,
# ice with CFI) so that the values match the ACCESS data.
MODIS = MODIS.assign(
    LWP=MODIS['LWP'] * MODIS['CFL'],
    IWP=MODIS['IWP'] * MODIS['CFI'],
)

In [14]:
# CERES SYN1deg daily files: open them, restrict to 2015-2019 and interpolate
# onto the lat/lon grid of the model dataset.
fdir = '/g/data/jk72/slf563/OBS/CERES_SYN1D/CERES_SYN1deg-Day_Terra-Aqua-MODIS_Ed4.1_Subset_*.nc'
CERES = (
    xr.open_mfdataset(fdir,parallel=True)
    .sel(time=slice('20150101','20191231'))
    .interp(lat=mod.lat,lon=mod.lon)
)

# Observed counterparts of the model TOA shortwave fields: the all-sky flux,
# and the cloud radiative effect taken as clear-sky minus all-sky flux.
SWT_M = xr.DataArray(CERES['toa_sw_all_daily'], name='SWtoa')
CRESWT_M = xr.DataArray(CERES['toa_sw_clr_daily'] - CERES['toa_sw_all_daily'], name='SWCREtoa')

In [15]:
# Gather the condensed MODIS cluster labels, the MODIS cloud fields and the
# CERES radiation fields into a single dataset.
obs = xr.merge([M10, MODIS, CRESWT_M, SWT_M])

# Blank out (NaN) every variable wherever no cluster label is available, then
# put the dimensions in (time, lat, lon) order before writing.
obs = obs.where(np.isfinite(obs['Cluster']), np.nan).transpose('time','lat','lon')
obs.to_netcdf('/g/data/jk72/slf563/ACCESS/clustering_data/COSP_vars_MODIS_2015-2019.nc')