# 


# Validation ERA-Land Magdalena River Basin Daily: Standard metrics

In this document we calculate the classical errors between ERA-Land data and local rain gauge stations using standard metrics (MAE, MSE, RMSE, correlation, mutual information and cross entropy).

## 0. Import Libraries and define functions

In [None]:
import pandas as pd
import numpy as np
import netCDF4 as nc
from math import sqrt
from sklearn.metrics import mean_squared_error,mean_absolute_error, log_loss, mutual_info_score
import datetime as dt
import scipy
import warnings # Ignore not important warnings
from tifffile import imsave
warnings.filterwarnings("ignore")

In [None]:
def cross_entropy(targets, predictions, epsilon=1e-12):
    """Return the cross-entropy of ``predictions`` measured against ``targets``.

    ``predictions`` is first clipped to ``[epsilon, 1 - epsilon]`` so the
    logarithm never receives zero. The element-wise products
    ``targets * log(predictions)`` are summed over every element, negated and
    divided by the length of the first axis of ``predictions``.
    """
    clipped = np.clip(predictions, epsilon, 1. - epsilon)
    n_samples = clipped.shape[0]
    weighted_logs = targets * np.log(clipped)
    return -np.sum(weighted_logs) / n_samples

def calc_MI(x, y, bins=20):
    """Estimate the mutual information between two samples from a 2-D histogram.

    Parameters
    ----------
    x, y : array-like
        One-dimensional samples of equal length.
    bins : int or sequence, optional
        Bin specification forwarded unchanged to ``np.histogram2d``.
        Defaults to 20, the value that used to be hard-coded.

    Returns
    -------
    float
        The value returned by ``mutual_info_score`` for the joint histogram
        counts of ``x`` and ``y`` used as a contingency table.
    """
    # Joint counts of (x, y); only the count matrix is needed, not the edges.
    contingency = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=contingency)

## 1. Load Data

#### **Local Data CSV**

In [None]:
# Input files: gauge time series and the gauge coordinate table
data_path = "Data/Local CSV/Precipitation 1980-2020 data filled 1.csv"
coords_path = "Data/Local CSV/Precipitation 1980-2020 coord filled.csv"
# Gauge series: comma separated, indexed by the parsed "Fecha" column
stations = pd.read_csv(
    data_path,
    sep=',',
    index_col="Fecha",
    parse_dates=True,
)
# Gauge coordinates: semicolon separated, indexed by the "Station" column
coords = pd.read_csv(coords_path, sep=';', index_col="Station")
## Optional daily-to-monthly aggregation (disabled)
# stations=stations.resample('MS').sum()
# When any longitude is negative, shift the whole column by 360
# (the gridded datasets in this notebook apply the same rule).
if coords['Long'].lt(0).any():
    coords['Long'] = coords['Long'].add(360)
# Time axis taken from the gauge table
dates = stations.index

#### **Local Data NetCDF(IDW) - 0.1°x0.1°**

In [None]:
# Path of the IDW-interpolated gauge grid
idw10_path = "Data/Local IDW/IDW_daily_010.nc"
# The dataset handle is not closed here: the cell that writes the result
# files reads the coordinate `units` attributes from it.
idw10nc = nc.Dataset(idw10_path)
# Time axis: raw values plus the metadata needed to decode them
time_valh = idw10nc.variables['time'][:]
time_unit = idw10nc.variables['time'].units
time_cal = idw10nc.variables['time'].calendar
time_his = nc.num2date(time_valh, units=time_unit, calendar=time_cal)
# Round-trip through a minute-resolution string to obtain plain
# `datetime.datetime` objects, then sort them.
time_his = sorted(
    dt.datetime.strptime(stamp.strftime('%Y-%m-%d %H:%M'), '%Y-%m-%d %H:%M')
    for stamp in time_his
)
# Grid axes
nc_lat_idw010 = idw10nc.variables['latitude'][:]
nc_lon_idw010 = idw10nc.variables['longitude'][:]
# When any longitude is negative, shift the whole axis by 360
if (nc_lon_idw010 < 0).any():
    nc_lon_idw010 = nc_lon_idw010 + 360
# The last variable stored in the file is taken as the precipitation field
var_type2 = list(idw10nc.variables)[-1]
# Read the full cube and keep the raw `.data` array of the result
idw10 = idw10nc.variables[var_type2][:, :, :].data

#### **ERA Land Data - 0.1°x0.1°**

In [None]:
# Path of the ERA-Land precipitation file
eraland_path = "Data/ERALand NC/precipitation_eraland_d.nc"
eralandnc = nc.Dataset(eraland_path)
# Time axis: raw values plus the metadata needed to decode them
# (these names overwrite the ones set while loading the IDW file)
time_valh = eralandnc.variables['time'][:]
time_unit = eralandnc.variables['time'].units
time_cal = eralandnc.variables['time'].calendar
time_his = nc.num2date(time_valh, units=time_unit, calendar=time_cal)
# Round-trip through a minute-resolution string to obtain plain
# `datetime.datetime` objects, then sort them.
time_his = sorted(
    dt.datetime.strptime(stamp.strftime('%Y-%m-%d %H:%M'), '%Y-%m-%d %H:%M')
    for stamp in time_his
)
# Grid axes
nc_lat_eraland = eralandnc.variables['latitude'][:]
nc_lon_eraland = eralandnc.variables['longitude'][:]
# When any longitude is negative, shift the whole axis by 360
if (nc_lon_eraland < 0).any():
    nc_lon_eraland = nc_lon_eraland + 360
# The last variable stored in the file is taken as the precipitation field
var_type4 = list(eralandnc.variables)[-1]
# Read the full cube, keep the raw `.data` array, then release the file
eraland = eralandnc.variables[var_type4][:, :, :].data
eralandnc.close()

## 2. Error calculation to local stations

#### **Select datasets to compare**

In [None]:
# Gridded products to compare against the gauges, one tuple per product:
# (label, data cube, latitude axis, longitude axis).
# The label is also used to build the name of the results CSV.
_products = [
    ('IDW 010', idw10, nc_lat_idw010, nc_lon_idw010),
    ('ERA5', eraland, nc_lat_eraland, nc_lon_eraland),
]
# Parallel lists: position k of each list describes the same product
dataset_name = [product[0] for product in _products]
dataset = [product[1] for product in _products]
lats = [product[2] for product in _products]
longs = [product[3] for product in _products]

#### **Compute Errors**

In [None]:
def _nearest_cell(axis_values, target):
    """Return the index of the grid-cell centre in ``axis_values`` closest to ``target``.

    Handles ascending and descending axes and can select the first and last
    cells. When ``target`` is exactly midway between two centres the lower
    index is returned.

    Raises
    ------
    ValueError
        If ``target`` is farther than half of the largest grid step from
        every centre, i.e. it lies outside the grid.
    """
    # np.asarray keeps only the raw values if a masked array is passed.
    centres = np.asarray(axis_values, dtype=float)
    distances = np.abs(centres - target)
    nearest = int(distances.argmin())
    if centres.size > 1:
        half_step = np.abs(np.diff(centres)).max() / 2
        # Small relative slack so rounding on a cell edge is not rejected.
        if distances[nearest] > half_step * (1 + 1e-6):
            raise ValueError(
                f"coordinate {target} is outside the grid axis "
                f"[{centres.min()}, {centres.max()}]"
            )
    return nearest


# Metrics to evaluate
metrics=['MAE','MSE','RMSE','Correlation','Mutual Information','Cross Entropy']
# Possible datasets
opt=len(dataset_name)
# One results table per dataset, in the order of `dataset_name`
error_options=[]
for i, (name, data, lat_set, long_set) in enumerate(zip(dataset_name, dataset, lats, longs)):
    print(name)
    # Results table: station coordinates (without 'Length') plus one NaN
    # column per metric, filled station by station below.
    error_stations=coords.copy().drop(['Length'],axis=1)
    error_stations[metrics]=np.nan
    stations_names= error_stations.index
    for z in stations_names:
        # Grid cell containing the station. The previous hand-written scan
        # started at index 1 and read index cell+1, so it could never return
        # the first cell and raised IndexError near the end of the axis.
        cell_lat=_nearest_cell(lat_set, error_stations.loc[z,'Lat'])
        cell_lon=_nearest_cell(long_set, error_stations.loc[z,'Long'])
        # Time series of the cell where the station is located
        cell_values=data[:,cell_lat,cell_lon]
        observed=stations[z]
        # MSE is reused for the RMSE instead of being computed twice
        mse=mean_squared_error(observed, cell_values)
        # Single-step .loc writes: chained `df[col][row] = value` assignment
        # is not guaranteed to modify the DataFrame.
        error_stations.loc[z,'MAE']=mean_absolute_error(observed, cell_values)
        error_stations.loc[z,'MSE']=mse
        error_stations.loc[z,'RMSE']=sqrt(mse)
        error_stations.loc[z,'Correlation']=np.corrcoef(observed,cell_values)[0,1]
        error_stations.loc[z,'Mutual Information']=calc_MI(observed,cell_values)
        # NOTE(review): cross_entropy clips its second argument to
        # [1e-12, 1 - 1e-12]; confirm that is intended for these values.
        error_stations.loc[z,'Cross Entropy']=cross_entropy(observed, cell_values)
    # Save results to CSV
    error_stations.to_csv("Results/Error to local stations/"+name+'_daily.csv')
    # Keep the table in memory as well
    error_options.append(error_stations)

In [None]:
def _nearest_cell(axis_values, target):
    """Return the index of the grid-cell centre in ``axis_values`` closest to ``target``.

    Handles ascending and descending axes and can select the first and last
    cells. When ``target`` is exactly midway between two centres the lower
    index is returned.

    Raises
    ------
    ValueError
        If ``target`` is farther than half of the largest grid step from
        every centre, i.e. it lies outside the grid.
    """
    # np.asarray keeps only the raw values if a masked array is passed.
    centres = np.asarray(axis_values, dtype=float)
    distances = np.abs(centres - target)
    nearest = int(distances.argmin())
    if centres.size > 1:
        half_step = np.abs(np.diff(centres)).max() / 2
        # Small relative slack so rounding on a cell edge is not rejected.
        if distances[nearest] > half_step * (1 + 1e-6):
            raise ValueError(
                f"coordinate {target} is outside the grid axis "
                f"[{centres.min()}, {centres.max()}]"
            )
    return nearest


# This cell repeats the station-to-cell lookup without computing any metric.
# After it runs, `z` and `cell_values` hold the last station of the last
# dataset; the next cell reads both of them.
# Metrics to evaluate
metrics=['MAE','MSE','RMSE','Correlation','Mutual Information','Cross Entropy']
# Possible datasets
opt=len(dataset_name)
# NOTE(review): this resets `error_options` and nothing is appended to it
# in this cell - confirm that discarding earlier results is intended.
error_options=[]
for i, (name, data, lat_set, long_set) in enumerate(zip(dataset_name, dataset, lats, longs)):
    print(name)
    # Results table: station coordinates (without 'Length') plus one NaN
    # column per metric (left unfilled here).
    error_stations=coords.copy().drop(['Length'],axis=1)
    error_stations[metrics]=np.nan
    stations_names= error_stations.index
    for z in stations_names:
        # Grid cell containing the station. The previous hand-written scan
        # started at index 1 and read index cell+1, so it could never return
        # the first cell and raised IndexError near the end of the axis.
        cell_lat=_nearest_cell(lat_set, error_stations.loc[z,'Lat'])
        cell_lon=_nearest_cell(long_set, error_stations.loc[z,'Long'])
        # Time series of the cell where the station is located
        cell_values=data[:,cell_lat,cell_lon]

In [None]:
# Side-by-side table on the gauge time index: the series of station `z`
# next to the grid-cell series currently held in `cell_values`.
data_join = pd.DataFrame(index=stations.index).assign(
    Local=stations[z],
    ERA5=cell_values,
)

## 3. Error calculation to interpolation

### ERA-Land vs IDW010

In [None]:
### Basin mask on the 0.10 grid (.nc file)
rain_nc = "Data/Magdalena/GIS/Basin/Basin_ext_010.nc"
basin = nc.Dataset(rain_nc)
basin_lat = basin.variables['lat'][:]
basin_lon = basin.variables['lon'][:]
# The first variable stored in the file is taken as the mask raster
var_type = list(basin.variables)[0]
cells_x = np.array(basin_lat)
cells_y = np.array(basin_lon)
mascara = np.array(basin.variables[var_type][:, :])
# Cells whose value differs from the one at [1, 1] are flagged
# (presumably [1, 1] lies outside the basin - verify against the raster).
mask = mascara != mascara[1, 1]
# Convert to 1.0/0.0 and reverse the row order (axis 0)
mask = np.flip(mask * 1, axis=0).astype('float64')
# Unflagged cells become NaN, so the mask holds only 1.0 and NaN
mask[mask == 0] = np.nan
# Same mask with a leading length-1 axis, and back again as a 2-D array
mask_st = mask.reshape((1,) + (mask.shape[0], mask.shape[1]))
mask_basin = mask_st.reshape(mask_st.shape[1:3])
basin.close()
# (row, column) positions of the flagged cells
basin_coords = np.where(mask_basin == 1)
# Coordinates of those cells: longitudes shifted by 360, latitudes read
# from the reversed latitude axis to match the row flip above.
basin_true_longs = basin_lon.data[basin_coords[1]] + 360
basin_true_lats = np.flip(basin_lat.data)[basin_coords[0]]

In [None]:
# Metrics to evaluate
metrics=['MAE','MSE','RMSE','Correlation','Mutual Information','Cross Entropy']
# Pre-allocate one result grid per metric from the basin mask scaled by
# -9999; cells that are processed below are overwritten with the metric.
template=mask_basin*-9999
mae_res=template.copy()
mse_res=template.copy()
rmse_res=template.copy()
correlation_res=template.copy()
mutinf_res=template.copy()
cent_res=template.copy()
# Coordinates are matched within this tolerance (degrees) instead of with
# exact `==`: values stored at different float precisions differ in the last
# digits, and such pixels were silently skipped. The tolerance is far below
# the 0.1 degree grid step, so at most one cell can match.
coord_tol=1e-4
idw_lat_axis=np.asarray(nc_lat_idw010,dtype=float)
idw_lon_axis=np.asarray(nc_lon_idw010,dtype=float)
era_lat_axis=np.asarray(nc_lat_eraland,dtype=float)
era_lon_axis=np.asarray(nc_lon_eraland,dtype=float)
# Iterate over basin pixels.
# Despite their names, `x_long` is the row index and `y_lat` the column
# index of the pixel in the mask (they index axis 0 and axis 1 below).
for j, (x_long, y_lat) in enumerate(zip(basin_coords[0], basin_coords[1])):
    # Geographic position of the pixel
    look_long=basin_true_longs[j]
    look_lat=basin_true_lats[j]

    # Index of that position on each dataset's axes (empty when absent)
    lat_idw=np.where(np.isclose(idw_lat_axis,look_lat,rtol=0,atol=coord_tol))[0]
    long_idw=np.where(np.isclose(idw_lon_axis,look_long,rtol=0,atol=coord_tol))[0]
    lat_eraland=np.where(np.isclose(era_lat_axis,look_lat,rtol=0,atol=coord_tol))[0]
    long_eraland=np.where(np.isclose(era_lon_axis,look_long,rtol=0,atol=coord_tol))[0]
    # Skip pixels that are missing from either dataset; their result cells
    # keep the pre-allocated value.
    if 0 in (lat_idw.size, long_idw.size, lat_eraland.size, long_eraland.size):
        continue

    # Time series of both datasets at this pixel
    idw10_value=idw10[:,lat_idw,long_idw].flatten()
    eraland_value=eraland[:,lat_eraland,long_eraland].flatten()

    # Error calculation (MSE is reused for the RMSE instead of recomputed)
    mse=mean_squared_error(idw10_value, eraland_value)
    mae_res[x_long,y_lat]=mean_absolute_error(idw10_value, eraland_value)
    mse_res[x_long,y_lat]=mse
    rmse_res[x_long,y_lat]=sqrt(mse)
    correlation_res[x_long,y_lat]=np.corrcoef(idw10_value,eraland_value)[0,1]
    mutinf_res[x_long,y_lat]=calc_MI(idw10_value,eraland_value)
    # NOTE(review): cross_entropy clips its second argument to
    # [1e-12, 1 - 1e-12]; confirm that is intended for these values.
    cent_res[x_long,y_lat]=cross_entropy(idw10_value, eraland_value)

In [None]:
##### SAVE results TO NC FILE ######
# One NetCDF file per metric, each holding a single 2-D variable named after
# the metric on the basin-mask grid.
metrics=['MAE','MSE','RMSE','Correlation','Mutual Information','Cross Entropy']
res_list=[mae_res,mse_res,rmse_res,correlation_res,mutinf_res,cent_res]
for i, (metric, grid) in enumerate(zip(metrics, res_list)):
    # Define Path
    name=metric+'_daily_010.nc'
    # The `with` block closes the file even if one of the writes fails.
    with nc.Dataset("Error to interpolations/"+name,mode='w',format='NETCDF3_64BIT_OFFSET') as ncfile:
        # Creating dimensions
        lat_dim = ncfile.createDimension('latitude',len(basin_lat))
        lon_dim = ncfile.createDimension('longitude',len(basin_lon) )
        ncfile.title=metric
        # Coordinate variables; units are copied from the IDW dataset,
        # whose handle must still be open at this point.
        lat = ncfile.createVariable('latitude', np.float32, ('latitude',))
        lat.units = idw10nc.variables['latitude'].units
        lon = ncfile.createVariable('longitude', np.float32, ('longitude',))
        lon.units = idw10nc.variables['longitude'].units
        rest_nc= ncfile.createVariable(metric,np.float64,('latitude','longitude'))
        rest_nc.missing_value=-9999
        # Latitudes are written reversed, matching the row flip applied when
        # the basin mask was built.
        lat[:] = np.flip(basin_lat.data)
        lon[:] = basin_lon
        rest_nc[:,:] = grid