# Produce input for clustering analysis 
Steps of the code:
- Import [modules](#modules) 
- Define [settings](#settings) 
- Define [functions](#functions) 
- [Download Mean and weights](#down_mean) used for EOF analysis 
- [Download reference data](#down_p1) 
- [Download EOF solver](#down_EOF) 
- Get the [PPE PC](#PPE_pc) 
- [Download observations](#down_obs) (compute anomalies and weight)
- [Project observations](#proj_obs) to get pseudo-PC 
- [Predict new LHS pseudo-PC](#predict)
- Produce xarray
- [Save](#save) the xarray as netCDF 

# <a id = 'modules'>Import Module<a>

In [1]:
# Computational modules 
%matplotlib inline
import xarray as xr
import glob
import os
import numpy as np
import netCDF4
from netCDF4 import Dataset
import pandas as pd
import re
from array import array
from pylab import *
#import geopandas
from eofs.xarray import Eof
from eofs.multivariate.standard import MultivariateEof
import random

# Plotting modules 
import matplotlib.pyplot as plt
#from mpl_toolkits.basemap import Basemap
import pandas.plotting
import matplotlib.ticker as ticker
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import BoundaryNorm
from cartopy.util import add_cyclic_point

# Scikit-learn
from sklearn import linear_model
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from scipy.optimize import minimize
from scipy.optimize import dual_annealing
from sklearn.decomposition import PCA

  import pandas.util.testing as tm


# <a id='settings'>Settings<a>

### Variables

In [2]:
# Variables analysed throughout the notebook; used as dictionary keys in the cells below.
variables = ['tas', 'pr', 'psl', 'SW', 'LW']
# NOTE(review): presumably CERES-style flux names; not referenced in the visible cells.
var_ceres = ['rsdt','rsut', 'rlut']
# One truncation (number of leading modes kept) per entry of `variables`, matched by position.
truncations = [18, 18, 8, 28, 22]
# NOTE(review): TITLE and ylabel look like plot labels; neither is used in the visible cells.
TITLE = 'Multi-variate'
ylabel = '$E_{tot}$'

### Paths

In [3]:
# Hard-coded locations on the author's file system.
# NOTE(review): path_official and path_file_npy are not referenced in the visible cells.
path_official='/data/scratch/globc/peatier/CMIP6/CNRM-CM6-1/CFMIP/amip/'
# Root of the PPE simulations (the reference run is read from below this directory).
path_PPE='/data/scratch/globc/peatier/PPE/CNRM-CM6-1_PPE/'
# Root of the pre-computed inputs ('npy/', 'pkl/') and of the netCDF outputs ('nc/').
path_files='/data/home/globc/peatier/PPE/CNRMppe_error_decomposition/files/'
path_file_npy = '/data/home/globc/peatier/PPE/CNRMppe_save/PPE/ENSEMBLE2/files/npy/'

### List of members

In [4]:
# Identifiers of the ensemble members (the recorded cell output reports 102 entries).
# The list is not strictly sorted (e.g. 'p413' follows 'p416').
# NOTE(review): nb_p_list is not referenced again in the visible cells.
nb_p_list = ['p311', 'p312', 'p314', 'p316',
                    'p317', 'p319', 'p320', 'p321', 'p322', 'p324', 'p325', 'p326', 
                    'p329', 'p330', 'p331', 'p332', 'p335', 'p336', 'p337' ,'p338', 
                    'p340', 'p341', 'p343', 'p344', 'p346', 'p347', 'p348', 'p349', 
                    'p350', 'p353', 'p355', 'p357', 'p359', 'p360', 
                    'p361', 'p363', 'p365', 'p367', 'p368', 'p369', 
                    'p372', 'p373', 'p374', 'p375', 'p376', 'p378', 'p381', 'p382', 
                    'p384', 'p386', 'p388', 'p389', 'p391', 'p392', 'p393', 
                    'p394', 'p395', 'p396', 'p398', 'p399', 'p400', 'p404', 
                    'p406', 'p407', 'p409', 'p410', 'p411', 'p412',
                    'p414','p416',
                    'p413','p419','p424','p426','p428','p421','p423',
                    'p425','p427','p429','p430','p436','p438','p431','p433',
                    'p442','p446','p443','p445','p447',
                    'p452','p454','p456','p458','p457','p459',
                    'p460','p465','p467','p469',
                    'p470','p471']

# Display the number of members as the cell output.
len(nb_p_list)

102

# <a id='functions'>Functions<a>

In [5]:
def get_3D_tas_xarr(path, filename, variables):
    """Return the time-averaged 'tas' field read from netCDF file(s).

    The files matching ``path + filename`` are opened with
    ``xr.open_mfdataset``, the entries listed in ``variables`` are selected
    and averaged over the 'time' dimension (attributes kept), and the 'tas'
    entry of that result is returned.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with ``filename``.
    filename : str
        File name or glob pattern appended to ``path``.
    variables : list of str
        Names to select from the dataset; must include 'tas'.

    Returns
    -------
    The 'tas' entry of the time-mean selection.
    """
    dataset = xr.open_mfdataset(path + filename, combine='by_coords')
    time_mean = dataset[variables].mean('time', keep_attrs=True)
    return time_mean['tas']

In [6]:
def load_monthly_clim(path, filename, variables) :
    """Return the mean of each calendar month at every (lat, lon) point.

    The netCDF file(s) matching ``path + filename`` are opened, ``variables``
    is selected and converted to a pandas DataFrame, and the rows are
    averaged per ('month', 'lat', 'lon') group.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with ``filename``.
    filename : str
        File name or glob pattern appended to ``path``.
    variables : str or list of str
        Name(s) selected from the opened dataset.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('month', 'lat', 'lon'); the helper 'year' column is
        removed before returning.
    """
    dataset = xr.open_mfdataset(path + filename, combine='by_coords')
    table = dataset[variables].to_dataframe().reset_index(level=['time', 'lat', 'lon'])

    # Derive the calendar year and month of every row from its time stamp.
    stamps = pd.DatetimeIndex(table['time'])
    table['year'] = stamps.year
    table['month'] = stamps.month

    # Average all rows sharing the same calendar month and grid point.
    monthly_mean = table.groupby(['month', 'lat', 'lon']).mean()
    return monthly_mean.drop(columns='year')

In [7]:
def MultiLinReg_pred(LHS, X ,y, param_names) :
    """Fit a linear regression of ``y`` on ``X`` and evaluate it on ``LHS``.

    Parameters
    ----------
    LHS : array-like, 2-D
        Rows at which the fitted model is evaluated; one column per
        regression coefficient.
    X : array-like, 2-D
        Training predictors passed to ``LinearRegression.fit``.
    y : array-like, 1-D
        Training target passed to ``LinearRegression.fit``.
    param_names : sequence
        One name per regression coefficient. Only its length is used, as a
        consistency check against the fitted coefficients.

    Returns
    -------
    list
        One prediction (intercept + sum of coefficient * value) per row of
        ``LHS``, in row order.

    Raises
    ------
    ValueError
        If ``param_names`` does not have one entry per fitted coefficient.
    """
    regr = linear_model.LinearRegression()
    regr.fit(X, y)
    intercept = regr.intercept_
    coeffs = np.ravel(regr.coef_)

    # The previous implementation labelled the coefficients with param_names
    # (through a DataFrame) and failed on a length mismatch; keep that check.
    if len(param_names) != coeffs.size:
        raise ValueError(
            "param_names has %d entries but the model has %d coefficients"
            % (len(param_names), coeffs.size)
        )

    lhs = pd.DataFrame(LHS).values
    if lhs.shape[0] == 0:
        return []

    # One vectorised pass replaces the per-row Python loop. nansum mirrors
    # the NaN-skipping behaviour of pandas' Series.sum() used previously;
    # results may differ from the old loop at floating-point round-off level.
    y_pred = np.nansum(lhs * coeffs, axis=1) + intercept
    return list(y_pred)

# <a id = 'down_mean'>General data<a>

In [8]:
# Load, for every variable, the stored mean field that is later subtracted
# from the reference run and from the observations to form anomalies.
path_file = path_files+'npy/'
Mean={}
for var in variables :
    filename = 'CNRMppe_decomposition_mean_'+str(var)+'.npy'
    # Despite the '.npy' suffix the file is read with pandas' pickle reader,
    # then converted to an xarray DataArray.
    Mean_tmp =  pd.read_pickle(path_file+filename).to_xarray().to_array()
    # Keep the first entry along the leading axis and rename 'variable' to 'mode'.
    Mean[str(var)] = Mean_tmp[0,:,:].rename({'variable':'mode'})

In [9]:
# Load the weight arrays stored for each variable.
# NOTE(review): every iteration overwrites the same three names, so after the
# loop they hold the arrays of the LAST entry of `variables` only, and that
# single W_eof_2D is then applied to every variable further down. This is only
# correct if the weights are identical for all variables - TODO confirm.
# NOTE(review): W_eof_3D and W_rmse_2D are not used in the visible cells.
for var in variables :
    W_eof_2D = np.load(path_files+'npy/W_eof_2D_'+str(var)+'.npy')
    W_eof_3D = np.load(path_files+'npy/W_eof_3D_'+str(var)+'.npy')
    W_rmse_2D = np.load(path_files+'npy/W_rmse_2D_'+str(var)+'.npy')

# <a id='down_p1'>Reference p1<a> 

In [10]:
# Reference simulation
# Time-mean 'tas' of the r1i1p1f2 amip run, read from all files matching the pattern.
path = path_PPE+'ENSEMBLE1/CNRM-CM6-1_amip_PPE/CNRM-CM6-1_amip_r1i1p1f2/'
filename = 'tas_*_CNRM-CM6-1_amip_*.nc'
p1_amip = get_3D_tas_xarr(path, filename, ['tas'])

In [11]:
## --Anom and weight
# Anomaly of the reference run with respect to the stored 'tas' mean,
# then multiplied by the weight array W_eof_2D.
X_p1 = p1_amip - Mean['tas']
X_p1_w = X_p1*W_eof_2D

# <a id ='PPE_pc'>Download EOF solvers and get PPE pc and p1 pc<a> 

In [12]:
import pickle

# Load the pickled EOF solver of every variable into `solver`, keyed by
# variable name.
# NOTE: pickle.load can execute arbitrary code; only load solver files that
# come from a trusted source.
path = path_files+'pkl/'
solver = {}
for var in variables :
    # The context manager closes the file even if unpickling fails.
    with open(path+'solver_'+var+'.pkl', 'rb') as pkl_file:
        solver[var] = pickle.load(pkl_file)

In [13]:
# EOF patterns (covariance form, pcscaling=0) and variance fraction of each
# mode, taken from the solver of every variable.
eofs = {name: solver[name].eofsAsCovariance(pcscaling=0) for name in variables}
variances = {name: solver[name].varianceFraction() for name in variables}

In [14]:
# Variance fraction captured by the leading `trunc` modes of each variable.
# `variables` and `truncations` are paired by position.
# NOTE: `trunc` stays bound to the last truncation once the loop ends, and the
# projection cell further down reads that leftover value.
for var, trunc in zip(variables, truncations) :
    v = variances[var][0:trunc].sum()

In [15]:
# Re-wrap the EOFs of every variable as DataArrays on the (lat, lon) grid of
# the weighted reference anomaly, labelling the modes 1..103.
lat = X_p1_w['lat']
lon = X_p1_w['lon']
# np.arange is used explicitly instead of relying on `from pylab import *`.
# The hard-coded 104 requires exactly 103 modes along the first axis of each
# eofs[var]; xarray rejects any other length.
eofs_nb = np.arange(1, 104, 1)
eofs_xr = {}
eofs_combined = {}

for var in variables :
    eofs_xr[var] = xr.DataArray(eofs[var],
                   coords={'eofs': eofs_nb,'lat': lat,'lon': lon},
                   dims=["eofs", "lat", "lon"])
    ## --Combine the modes for reconstruction
    # No combination is performed here: the same object is stored under both names.
    eofs_combined[var] = eofs_xr[var]

In [16]:
### PPE simulations
# Variance fraction of the modes and unscaled principal components
# (pcscaling=0) given by the solver of every variable.
var_modes = {name: solver[name].varianceFraction() for name in variables}
pc_PPE = {name: solver[name].pcs(pcscaling=0) for name in variables}

In [17]:
## For the reference p1
# The first row of each PC array is taken as the reference simulation.
pc_p1 = {name: pc_PPE[name][0] for name in variables}


# <a id = 'down_obs'>Download observations<a>

### BEST observations - tas

In [18]:
# Location of the BEST temperature file and the key under which it is stored in `obs`.
path_observation = '/data/scratch/globc/peatier/obs/BEST/'
filename = 'Land_and_Ocean_LatLong1_regrid_1979-1981.nc'
var = 'tas'

In [19]:
# For the BEST observations

path = path_observation

file =  xr.open_mfdataset(path+filename,combine='by_coords')
# The file provides a 'climatology' field and a 'temperature' field.
clim = file['climatology'].to_dataframe()#.drop(columns='month_number')
# Repeat the climatology three times.
# NOTE(review): presumably one copy per year of the 1979-1981 record, so that
# it lines up row by row with the monthly 'temperature' table - TODO confirm.
clim = pd.concat([clim, clim, clim]).reset_index('month_number')

df_obs=file['temperature'].to_dataframe().reset_index('time')
# NOTE(review): the next assignment is a no-op (the column is assigned to itself).
df_obs['temperature'] = df_obs['temperature']
# NOTE(review): this relies on `clim` and `df_obs` having the same index in the
# same order; verify that the rows really correspond month by month.
df_obs['clim'] = clim['climatology']
# NOTE(review): 'temperature' is presumably an anomaly in degrees Celsius;
# adding the climatology and 273.15 would then give Kelvin - TODO confirm units.
df_obs['tas'] = df_obs['temperature']+df_obs['clim']+273.15
# NOTE(review): variable_obs is not used in the visible cells.
variable_obs = 'ta'

In [20]:
# Start the observation dictionary with the mean of the current variable at
# every (lat, lon) point.
obs = {var: df_obs[var].groupby(['lat','lon']).mean().to_xarray()}

## GPCP observations - pr

In [21]:
# Location of the GPCP precipitation file and the key under which it is stored in `obs`.
path_observation = '/data/scratch/globc/peatier/obs/GPCP/regrid_CNRM/'
filename_obs = 'pr_mon_mean_197901-198112_regrid.nc'
var = 'pr'

In [22]:
## --- Initial method
path = path_observation
# Monthly climatology of 'pr', then averaged over the months at every (lat, lon) point.
df_obs = load_monthly_clim(path, filename_obs, var)
obs[var] = df_obs[var].groupby(['lat','lon']).mean().to_xarray()

## CERES observations - fluxes

In [23]:
# Location of the CERES EBAF top-of-atmosphere flux file.
# NOTE(review): the file name indicates 2000-03 to 2019-10, a different period
# from the 1979-1981 files used for the other observations.
path_observation = '/data/scratch/globc/peatier/obs/CERES/'
filename = 'CERES_EBAF-TOA_Ed4.1_Subset_200003-201910_regrid.nc'

In [24]:
# Load data and compute Annual Cycle :
# The CERES field names are kept under their own name so that the
# notebook-wide `variables` list is not overwritten. The dataset is no longer
# opened a second time here, because load_monthly_clim opens it itself.
ceres_variables = ['toa_sw_all_mon','toa_lw_all_mon', 'toa_net_all_mon']
df_obs = load_monthly_clim(path_observation, filename, ceres_variables)

In [25]:
# Shortwave and longwave fluxes are stored under the short names used in `variables`.
df_obs['SW'] = df_obs['toa_sw_all_mon']
df_obs['LW'] = df_obs['toa_lw_all_mon']

# Average over the months at every (lat, lon) point, one column at a time.
# This is the same pattern used for 'tas' and 'pr', and avoids grouping and
# converting the whole table twice.
for flux in ('SW', 'LW'):
    obs[flux] = df_obs[flux].groupby(['lat','lon']).mean().to_xarray()

## NCEP observations - psl

In [26]:
# Location of the NCEP sea-level pressure file.
path_observation =  '/data/scratch/globc/peatier/obs/NCEP/regrid_CNRM/'
filename_obs = 'psl_1m_1979-1981_NCEP_regrid.nc'

In [27]:
# Load data and compute Annual Cycle :
# load_monthly_clim opens the dataset itself, so it is not opened separately here.
var = ['psl']
df_obs = load_monthly_clim(path_observation, filename_obs, var)

In [28]:
# Average over the months at every (lat, lon) point and keep the 'psl' field.
psl_mean = df_obs.groupby(['lat','lon']).mean().to_xarray()
obs['psl'] = psl_mean['psl']

## Weighted anomaly

In [29]:
# Restore the list of analysed variables, then build the observation anomaly
# of each variable (observation minus stored mean) and its weighted version.
variables = ['tas', 'pr', 'psl', 'SW', 'LW']
X_obs = {}
X_obs_w = {}
for var in variables :
    ## --Observations
    anomaly = obs[var] - Mean[var]
    X_obs[var] = anomaly
    X_obs_w[var] = anomaly * W_eof_2D

## <a id='proj_obs'>Project observations - get pseudo-PC</a>

In [30]:
variables = ['tas', 'pr', 'psl', 'SW', 'LW']
## --Project and reconstruct the observations
# Project the weighted observation anomaly of every variable onto the EOFs of
# that variable's solver to obtain pseudo-PCs.
# NOTE(review): `trunc` is not set in this cell. It is the value left over by
# the earlier loop over `truncations`, i.e. the last entry (22), so the same
# number of modes is used for every variable. If a per-variable truncation was
# intended, iterate over zip(variables, truncations) here - TODO confirm.
pc_obs = {}
for var in variables :
    print(var)
    pc_obs[var] = solver[var].projectField(X_obs_w[var], neofs=trunc, weighted=False, eofscaling=1)

tas
pr
psl
SW
LW


# <a id='predict'>Emulations 100 000<a>

In [31]:
# Variables for which pseudo-PCs are emulated below.
variables = ['tas', 'pr', 'psl', 'SW', 'LW']

In [32]:
# Inputs of the emulator: the sample to predict on (LHS), the training
# predictors (X) and the parameter names.
path = "/data/home/globc/peatier/PPE/CNRMppe/PPE/ENSEMBLE2/files/npy"
LHS = np.load(file=path+"/LHS100000_param_standard.npy")
X = np.load(file=path+"/X_EmulateurFeedbacksN.npy")
param_names = np.load(file=path+"/LHS_paramNames.npy")

# For every variable, emulate the first 25 PCs one mode at a time.
pc_pred = {}
for var in variables :
    print(var)
    pc_pred[var] = []
    for i in range(25) :
        print(i)
        # Row 0 of pc_PPE is skipped: it is the row used for the reference p1.
        y = pc_PPE[var][1:, i]
        pc_pred[var].append(MultiLinReg_pred(LHS, X ,y, param_names))

tas
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
pr
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
psl
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
SW
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
LW
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


   # <a id='save'>Save all the data<a>

In [33]:
path_files = '/data/home/globc/peatier/PPE/CNRMppe_error_decomposition/files/'

# All outputs go to the 'nc/' sub-directory, one file per variable and per kind.
path = path_files+'nc/'
for var in variables :
    print(var)

    ## PPE
    pc_PPE[var].to_netcdf(path+'pc_PPE_'+var+'_PCvariance.nc')

    ## observations
    pc_obs[var].to_netcdf(path+'pc_obs_'+var+'_PCvariance.nc')

    ## p1 - the first line of pc_PPE

    ## predictions
    # The list of per-mode predictions is wrapped as a (modes, members) array.
    tmp = xr.DataArray(pc_pred[var], dims=["modes", "members"])
    tmp.to_netcdf(path+'pc_pred_'+var+'_PCvariance.nc')

tas
pr
psl
SW
LW
