# Select sub-set of optimal predictions
Step of the codes :
- Import modules 
- Define settings 
- Define functions 
- Download pc p1, pc pred and pc obs
- Compute RMSE rec for all pc pred 
- Select members with RMSE rec lower than pc p1 
- build xarray with index in the LHS
- save this xarray as netCDF

# Import Module

In [None]:
# Computational modules 
%matplotlib inline
import xarray as xr
import glob
import os
import numpy as np
import netCDF4
from netCDF4 import Dataset
import pandas as pd
import re
from array import array
from pylab import *
#import geopandas
from eofs.xarray import Eof
from eofs.multivariate.standard import MultivariateEof
import random

# Plotting modules 
import matplotlib.pyplot as plt
#from mpl_toolkits.basemap import Basemap
import pandas.plotting
import matplotlib.ticker as ticker
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import BoundaryNorm
from cartopy.util import add_cyclic_point

# Scikit-learn
from sklearn import linear_model
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from scipy.optimize import minimize
from scipy.optimize import dual_annealing
from sklearn.decomposition import PCA

# Settings

### Variables

In [16]:
variables = ['tas', 'pr', 'psl', 'SW', 'LW']
var_ceres = ['rsdt','rsut', 'rlut']
truncations = [18, 18, 8, 22, 22]
TITLE = 'Multi-variate'
ylabel = '$E_{tot}$'

### Paths

In [3]:
path_official='/data/scratch/globc/peatier/CMIP6/CNRM-CM6-1/CFMIP/amip/'
path_PPE='/data/scratch/globc/peatier/PPE/CNRM-CM6-1_PPE/'
path_files='/data/home/globc/peatier/PPE/CNRMppe_error_decomposition/files/'
path_file_npy = '/data/home/globc/peatier/PPE/CNRMppe_save/PPE/ENSEMBLE2/files/npy/'

# Functions

In [4]:
def MSE(mod, obs, W_rmse_2D) :
    diff_tmp = (mod - obs)**2 * W_rmse_2D
    diff = (diff_tmp.sum(['lat', 'lon']))
    return diff

In [5]:
def reconstruct_X(eofs_combined, pc, nb_dims) :
    X_rec_tmp = np.dot(eofs_combined.transpose(),pc)
    if nb_dims == 3 :
        X_rec = xr.DataArray(X_rec_tmp, 
                        dims=["lon", "lat", "time"]).transpose('time', 'lat', 'lon')
    if nb_dims == 2 :
        X_rec = xr.DataArray(X_rec_tmp, 
                        dims=["lon", "lat"]).transpose('lat', 'lon')
    
    X_rec['lat'] = eofs_combined['lat']
    X_rec['lon'] = eofs_combined['lon']
    return X_rec

In [6]:
def MSE_rec(rec_anom_mod_w, rec_anom_obs_w, Mean, W_rmse_2D) :
    mod = rec_anom_mod_w/W_eof_2D + Mean
    obs = rec_anom_obs_w/W_eof_2D + Mean
    diff = MSE(mod, obs, W_rmse_2D)
    return diff

In [7]:
def get_3D_tas_xarr(path, filename, variables):
#    “”"
#    This function read the netCDF file of monthly data, compute the radiative budget, perform a yearly mean and 
#    return a dataframe
#    “”"
    # First step : download the data into dataframe
    file = xr.open_mfdataset(path+filename,combine='by_coords')
    #
    # Second step : compute the annual average 
    df = file[variables].mean('time', keep_attrs=True)
    tas = df['tas']
    #
    return tas

# Download PCs

In [8]:
path_files = '/data/home/globc/peatier/PPE/CNRMppe_error_decomposition/files/'

pc_PPE = {}
pc_obs = {}
pc_pred = {}
pc_p1 = {}
for var in variables :
    ## PPE
    path = path_files+'nc/'
    filename = 'pc_PPE_'+var+'.nc'
    pc_PPE_tmp = xr.open_mfdataset(path+filename,combine='by_coords')
    dims_dict = {'time' : 'members', 'mode' : 'modes'}
    pc_PPE[var] = pc_PPE_tmp.rename_dims(dims_dict)
    
    ## observations
    path = path_files+'nc/'
    filename = 'pc_obs_'+var+'.nc'
    pc_obs[var] = xr.open_mfdataset(path+filename,combine='by_coords')
    
    ## p1 - the first line of pc_PPE
    pc_p1[var] = pc_PPE[var]['pcs'][0,:]
    
    ## predictions
    path = path_files+'nc/'
    filename = 'pc_pred_'+var+'.nc'
    pc_pred_tmp = xr.open_mfdataset(path+filename,combine='by_coords')
    pc_pred[var] = pc_pred_tmp.rename({'__xarray_dataarray_variable__' : 'pcs'})

# Download EOF solvers

In [9]:
import pickle
path = path_files+'pkl/'

solver = {}
for var in variables :
    # open a file, where you stored the pickled data
    file = open(path+'solver_'+var+'.pkl', 'rb')

    # dump information to that file
    solver[var] = pickle.load(file)

    # close the file
    file.close()

In [10]:
eofs = {}
variances = {}
for var in variables :
    eofs[var] = solver[var].eofsAsCovariance(pcscaling=1)
    variances[var] = solver[var].varianceFraction() 

In [11]:
# Reference simulation
path = path_PPE+'ENSEMBLE1/CNRM-CM6-1_amip_PPE/CNRM-CM6-1_amip_r1i1p1f2/'
filename = 'tas_*_CNRM-CM6-1_amip_*.nc'
p1_amip = get_3D_tas_xarr(path, filename, ['tas'])

In [12]:
lat = p1_amip['lat']
lon = p1_amip['lon']
eofs_nb = arange(1,104,1)
#eofs_xr = {}
eofs_combined = {}

for var in variables :
    eofs_xr = xr.DataArray(eofs[var], 
                   coords={'eofs': eofs_nb,'lat': lat,'lon': lon}, 
                   dims=["eofs", "lat", "lon"])#.to_dataset(name=var)
    ## --Combine the modes for reconstruction
    eofs_combined[var] = eofs_xr

# Compute individual MSEs rec. of pc_pred

In [13]:
path_file = path_files+'npy/'
Mean={}
for var in variables :
    filename = 'CNRMppe_decomposition_mean_'+str(var)+'.npy'
    Mean_tmp =  pd.read_pickle(path_file+filename).to_xarray().to_array()
    Mean[str(var)] = Mean_tmp[0,:,:].rename({'variable':'mode'})

In [14]:
for var in variables :
    W_eof_2D = np.load(path_files+'npy/W_eof_2D_'+str(var)+'.npy')
    W_eof_3D = np.load(path_files+'npy/W_eof_3D_'+str(var)+'.npy')
    W_rmse_2D = np.load(path_files+'npy/W_rmse_2D_'+str(var)+'.npy')

In [26]:
variables = ['tas', 'pr', 'psl', 'SW', 'LW']

In [22]:
MSE_rec_pred = {}
cpt_trunc = 0
for var in variables :
    print(var)
    MSE_rec_pred[var] = {}
    trunc = truncations[cpt_trunc]
    rec_anom_obs_w = reconstruct_X(eofs_combined[var][0:trunc,:,:], pc_obs[var]['pseudo_pcs'][0:trunc], nb_dims=2)
    cpt=0
    for i in range(0,100000, 1) :
        #print(cpt)
        rec_anom_mod_w = reconstruct_X(eofs_combined[var][0:trunc,:,:], pc_pred[var]['pcs'][0:trunc,i], nb_dims=2)
        tmp = MSE_rec(rec_anom_mod_w, rec_anom_obs_w, Mean[str(var)], W_rmse_2D)
        MSE_rec_pred[var]['LHS_index = '+str(cpt)] = float(tmp)
        cpt+=1
    
    cpt_trunc+=1

tas
pr
psl


In [24]:
pd_MSE_rec_pred = pd.DataFrame(MSE_rec_pred)
pd_MSE_rec_pred

Unnamed: 0,SW,LW,tas,pr,psl
LHS_index = 0,23102.395051,50.180780,1.036283,2.154691,7303.058312
LHS_index = 1,23672.934333,71.770420,1.100781,1.859019,17824.450817
LHS_index = 2,26528.514017,76.813236,1.389311,2.399282,23894.937000
LHS_index = 3,24110.257924,152.825378,2.314204,2.908401,58342.422116
LHS_index = 4,23119.313310,56.829434,0.807454,2.310436,18417.172296
...,...,...,...,...,...
LHS_index = 99995,23182.076413,80.216578,1.127336,1.730458,15515.528265
LHS_index = 99996,26145.264958,79.358935,2.075135,2.728547,32881.808461
LHS_index = 99997,26415.821777,70.896057,1.370659,2.048138,41896.616551
LHS_index = 99998,27077.132222,47.886722,2.490433,1.506060,54928.313758


# Compute RMSE rec. of pc_p1 

In [1]:
rec_anom_obs_w = {}
cpt_trunc = 0
for var in variables :
    print(var)
    MSE_rec_pred[var] = {}
    trunc = truncations[cpt_trunc]
    rec_anom_obs_w[var] = reconstruct_X(eofs_combined[var][0:trunc,:,:], pc_obs[var]['pseudo_pcs'][0:trunc], nb_dims=2)
    cpt_trunc += 1

NameError: name 'variables' is not defined

In [31]:
MSE_rec_p1 = {}
for var in variables :
    rec_anom_mod_w = reconstruct_X(eofs_combined[var][0:trunc,:,:], pc_p1[var][0:trunc], nb_dims=2)
    MSE_rec_p1[var] = MSE_rec(rec_anom_mod_w, rec_anom_obs_w[var], Mean[var], W_rmse_2D)


# Compute multi-variate metric

In [37]:
MSE_rec_pred = pd_MSE_rec_pred.reset_index().drop('index', axis= 1)
MSE_rec_pred

Unnamed: 0,SW,LW,tas,pr,psl
0,23102.395051,50.180780,1.036283,2.154691,7303.058312
1,23672.934333,71.770420,1.100781,1.859019,17824.450817
2,26528.514017,76.813236,1.389311,2.399282,23894.937000
3,24110.257924,152.825378,2.314204,2.908401,58342.422116
4,23119.313310,56.829434,0.807454,2.310436,18417.172296
...,...,...,...,...,...
99995,23182.076413,80.216578,1.127336,1.730458,15515.528265
99996,26145.264958,79.358935,2.075135,2.728547,32881.808461
99997,26415.821777,70.896057,1.370659,2.048138,41896.616551
99998,27077.132222,47.886722,2.490433,1.506060,54928.313758


In [39]:
## Normaliser par p1 référence
for var in variables :
    print(var)
    MSE_rec_pred[var+'_norm'] = MSE_rec_pred[var]/float(MSE_rec_p1[var])

tas
pr
psl
SW
LW


In [41]:
Etot = []
for i in range(0,100000,1) :
    tmp = MSE_rec_pred.iloc[i]
    tmp_sum = tmp['tas_norm']+tmp['pr_norm']+tmp['psl_norm']+tmp['SW_norm']+tmp['LW_norm']
    tmp_mean = tmp_sum/5
    Etot.append(tmp_mean)

In [44]:
MSE_rec_pred['MSE multi'] = Etot
MSE_rec_pred

Unnamed: 0,SW,LW,tas,pr,psl,tas_norm,pr_norm,psl_norm,SW_norm,LW_norm,MSE multi
0,23102.395051,50.180780,1.036283,2.154691,7303.058312,0.957007,0.220408,0.444806,0.953866,1.242952,0.763808
1,23672.934333,71.770420,1.100781,1.859019,17824.450817,1.016571,0.190163,1.085631,0.977422,1.777716,1.009501
2,26528.514017,76.813236,1.389311,2.399282,23894.937000,1.283029,0.245428,1.455365,1.095325,1.902624,1.196354
3,24110.257924,152.825378,2.314204,2.908401,58342.422116,2.137167,0.297507,3.553453,0.995479,3.785405,2.153802
4,23119.313310,56.829434,0.807454,2.310436,18417.172296,0.745684,0.236340,1.121732,0.954564,1.407635,0.893191
...,...,...,...,...,...,...,...,...,...,...,...
99995,23182.076413,80.216578,1.127336,1.730458,15515.528265,1.041095,0.177012,0.945002,0.957155,1.986923,1.021438
99996,26145.264958,79.358935,2.075135,2.728547,32881.808461,1.916387,0.279109,2.002727,1.079501,1.965679,1.448681
99997,26415.821777,70.896057,1.370659,2.048138,41896.616551,1.265804,0.209509,2.551791,1.090672,1.756058,1.374767
99998,27077.132222,47.886722,2.490433,1.506060,54928.313758,2.299915,0.154058,3.345511,1.117977,1.186129,1.620718


# Select sub-set of RMSE mutli <= 1

In [59]:
optim = {}
index_list = {}
for var in variables :
    optim[var] = []
    index_list[var] = []
    trunc = truncation
    for i in range(0,100000, 1) :
        pred = MSE_rec_pred['MSE multi'][i]
        if pred < 1 :
            optim[var].append(pc_pred[var]['pcs'][0:trunc,i].values)
            index_list[var].append(i)

In [60]:
EOF_list = []
for i in range(1,(trunc+1), 1) :
    EOF_list.append('EOF '+str(i))

In [61]:
df_optim = {}
xr_optim = {}
for var in variables :
    df_optim[var] = pd.DataFrame(optim[var], columns=EOF_list)
    df_optim[var]['LHS index'] = index_list[var]
    
    xr_optim[var] = df_optim[var].to_xarray()

In [64]:
xr_optim['psl']

# Save data

In [65]:
path_files = '/data/home/globc/peatier/PPE/CNRMppe_error_decomposition/files/'
path = path_files+'nc/'
for var in variables :
    filename = 'optim_pc_PPE_'+var+'_multi.nc'
    xr_optim[var].to_netcdf(path+filename)