In [None]:
import tensorflow as tf
from tensorflow import keras
import keras.backend as K
import numpy as np
import numpy
import os
import math
import pandas as pd
import time

In [None]:
# helper functions, partly taken/adapted from  https://github.com/slerch/ppnn

# CRPS loss from Rasp and Lerch (2018)
def crps_cost_function(y_true, y_pred):
    """Mean closed-form Gaussian CRPS, usable as a Keras loss.

    ``y_pred[:, 0]`` is read as the predictive mean and ``y_pred[:, 1]`` as the
    raw (possibly negative) scale output; ``y_true[:, 0]`` holds the targets.
    Returns a scalar tensor: the CRPS averaged over all samples in the batch.
    """
    target = y_true[:, 0]
    mean_pred, raw_scale = y_pred[:, 0], y_pred[:, 1]

    # The network output is unconstrained, so the scale is squared and the
    # square root taken again to obtain a non-negative standard deviation.
    spread = K.sqrt(K.square(raw_scale))

    # Standardised forecast error and the standard normal pdf / cdf at it.
    z = (target - mean_pred) / spread
    density = 1.0 / numpy.sqrt(2.0 * numpy.pi) * K.exp(-K.square(z) / 2.0)
    cumulative = 0.5 * (1.0 + tf.math.erf(z / numpy.sqrt(2.0)))

    # CRPS for every forecast/target pair, then reduced to a scalar cost.
    per_sample = spread * (z * (2. * cumulative - 1.) + 2 * density - 1. / numpy.sqrt(numpy.pi))
    return K.mean(per_sample)

def normalize(data, method=None, shift=None, scale=None):
    """Rescale every column of ``data`` as ``(data - shift) / scale``.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Values to rescale; statistics are taken along axis 0.
    method : {"MINMAX", "STD", "MAX"} or None
        How to derive ``shift`` and ``scale`` from ``data``:
        "MINMAX" uses the column minimum and the column range,
        "STD" uses the column mean and standard deviation,
        "MAX" uses zero and the column maximum.
        For any other value the ``shift`` / ``scale`` arguments are used
        as given (e.g. statistics returned for the training data).
    shift, scale : array-like of length n_features, optional
        Precomputed offsets and divisors; ignored when ``method`` is one of
        the names above.

    Returns
    -------
    (result, shift, scale)
        ``result`` is a float64 array shaped like ``data``; ``shift`` and
        ``scale`` are the values that were applied.

    Raises
    ------
    ValueError
        If ``method`` does not select a known scheme and ``shift`` or
        ``scale`` is missing.

    Notes
    -----
    Columns whose ``scale`` is zero are not special-cased; they yield
    inf/nan exactly as a plain numpy division does.
    """
    data = numpy.asarray(data)

    if method == "MINMAX":
        shift = numpy.min(data, axis=0)
        scale = numpy.max(data, axis=0) - shift
    elif method == "STD":
        shift = numpy.mean(data, axis=0)
        scale = numpy.std(data, axis=0)
    elif method == "MAX":
        scale = numpy.max(data, axis=0)
        shift = numpy.zeros(scale.shape)

    if shift is None or scale is None:
        # Previously this surfaced as "'NoneType' object is not subscriptable".
        raise ValueError(
            f"normalize() needs method in ('MINMAX', 'STD', 'MAX') or explicit "
            f"shift and scale; got method={method!r}."
        )

    # Broadcasting applies the per-column statistics to all rows at once;
    # writing into a preallocated array keeps the float64 result dtype.
    result = numpy.empty(data.shape)
    result[...] = (data - shift) / scale
    return result, shift, scale

from scipy.stats import norm

# Note: needs to be adjusted if GaussianCRPS is used with exp-transformation
def crps_normal(mu, sigma, y):
    """
    Closed-form CRPS of a Gaussian forecast N(mu, sigma^2) for observation y.

    The sign of ``sigma`` is ignored (its absolute value is used). Inputs may
    be scalars or arrays that broadcast against each other; the result has the
    broadcast shape.
    """
    spread = numpy.abs(sigma)
    z = (y - mu) / spread
    cdf_term = z * (2 * norm.cdf(z) - 1)
    pdf_term = 2 * norm.pdf(z)
    return spread * (cdf_term + pdf_term - 1. / numpy.sqrt(numpy.pi))

def save_ensemble(preds, exp_name, save=True):
    """Score the member-averaged forecast on the test set and optionally store it.

    ``preds`` is converted to an array and indexed as
    (member, sample, [mean, std]); the std column is made non-negative before
    the members are averaged along axis 0. The averaged mean/std are scored
    against the module-level ``testY`` with ``crps_normal``.

    When ``save`` is true the averaged forecast is written to
    ``'<exp_name>.csv'`` using the module-level ``testDates`` and ``testIDs``.
    NOTE(review): ``create_results_df`` is not defined in the visible part of
    this notebook - confirm it is available before calling with ``save=True``.

    Returns the mean CRPS of the averaged forecast.
    """
    members = numpy.array(preds)
    # Only the magnitude of the predicted std is meaningful.
    members[:, :, 1] = numpy.abs(members[:, :, 1])

    ensemble_mean = members.mean(axis=0)
    mu, sigma = ensemble_mean[:, 0], ensemble_mean[:, 1]
    ens_score = crps_normal(mu, sigma, testY).mean()

    if not save:
        return ens_score

    results_df = create_results_df(testDates[:, 0], testIDs[:, 0], mu, sigma)
    results_df.to_csv(f'{exp_name}.csv')
    return ens_score

In [None]:
# import non-AE data
# Directory holding the raw post-processing data set and the station table.
DATA_DIR = '/home/sebastian/Projects/AE_postprocessing/local_tests/data'

dataRaw = numpy.load(os.path.join(DATA_DIR, 'ppnn.npy'))

# remove soil moisture forecasts due to missing values
# After this slice: column 0 = date (MJD), column 1 = station id,
# column 2 = observation, columns 3.. = NWP input features.
data = dataRaw[:, 2:39]

stations = numpy.genfromtxt(os.path.join(DATA_DIR, 'station_info.csv'),
                            delimiter=',', skip_header=1, usecols=[1, 2, 3, 5, 6])

# station id -> [row index in the station table, 4 station-specific features]
stationsMap = {
    int(station[0]): numpy.concatenate(([row], station[1:]))
    for row, station in enumerate(stations)
}

# Resolve each distinct station id once and broadcast the result to all rows
# instead of doing a dictionary lookup per data row in a Python loop.
unique_ids, inverse = numpy.unique(data[:, 1], return_inverse=True)
per_station = numpy.array([stationsMap[station_id] for station_id in unique_ids]).reshape(-1, 5)
stationID = per_station[inverse, :1]        # shape (n_rows, 1): row index of the station
stationColumns = per_station[inverse, 1:]   # shape (n_rows, 4): station features

obs = data[:, 2]
# remove date, station id and observation columns from data
data = data[:, 3:]

data_MJD = dataRaw[:, 2].reshape((-1, 1))

# output:
#    data: NWP input features
#    stationColumns: station-specific input features (columns 2, 3, 5, 6 of station_info.csv)
#        NOTE(review): this cell used to describe them as (lon, lat, altitude, orography),
#        while names_stationColumns lists (lat, lon, orog, alt) - confirm the actual order.
#    stationID: index of the station in station_info.csv (used as embedding id)
#    obs: observations (targets)
#    data_MJD: date in MJD format

In [None]:
# Forecast variables in the column order of the NWP feature matrix; each one
# contributes a '<var>_fc_mean' and a '<var>_fc_std' entry, in that order.
_nwp_variables = [
    't2m',
    'u_pl500', 'v_pl500', 'gh_pl500',
    'u_pl850', 'v_pl850', 'q_pl850',
    'cape', 'sp', 'tcc', 'sshf', 'slhf',
    'u10', 'v10', 'ssr', 'str', 'd2m',
]
names_data = np.array([f'{variable}_fc_{statistic}'
                       for variable in _nwp_variables
                       for statistic in ('mean', 'std')])

# Labels for the station-specific input columns.
_station_feature_labels = ['station_lat', 'station_lon', 'orog', 'station_alt']
names_stationColumns = np.array(_station_feature_labels)

In [None]:
# parameters
# Experiment configuration; edit here rather than further down the notebook.

# Which AE encoding is fed to the network; only "t2m" is handled where the
# AE inputs are combined.
this_AE_var_input = "t2m"
# NOTE(review): not referenced anywhere in the visible part of this notebook.
this_AE_variant = "simple"
# Encoding dimension; becomes part of the AE prediction file name
# ('ConvAE_t2m_<dim>.npy').
this_AE_enc_dim = 2
# NOTE(review): a string, not a bool, and not referenced in the visible part
# of this notebook.
this_AE_early_stopped = "True"
# Number of hidden Dense layers of the post-processing network.
this_hidden_layers = 2
# Units per hidden Dense layer.
this_nodes_hidden = 100
# Output size of the station-ID embedding.
this_emb_dim = 15
# Maximum number of training epochs passed to model.fit.
this_epochs = 100
# Presumably toggles early stopping during training - verify that the
# training loop actually consults this flag.
this_early_stopping = True

In [None]:
# load relevant AE data

AE_base_dir = '/home/sebastian/Projects/AE_postprocessing/results_paper/AE_results/predictions/ConvAE_'

fname_t2m = (AE_base_dir + 't2m_' + str(this_AE_enc_dim) + '.npy')

data_AE_t2m_raw = numpy.load(fname_t2m)
# Every AE encoding is repeated 537 times along axis 0 so that it lines up with
# the rows of `data`.
# NOTE(review): 537 is presumably the number of station rows per forecast date - confirm.
data_AE_t2m_repeated = np.repeat(data_AE_t2m_raw, repeats=537, axis=0)

# The encodings are sliced and masked with the same row indices as `data`,
# so a length mismatch would silently misalign (or later crash) the inputs.
if data_AE_t2m_repeated.shape[0] != data.shape[0]:
    raise ValueError(
        f'AE encodings have {data_AE_t2m_repeated.shape[0]} rows after repetition, '
        f'but the NWP data has {data.shape[0]} rows.'
    )

# Row ranges of the training and evaluation periods; rows in
# [train_end, eval_start) belong to neither set.
eval_start = 1764045
train_end = 1763507 # train until 2015-12-30


def _subset_without_missing_obs(rows, observations, *inputs):
    """Slice all arrays with `rows` and drop the rows whose observation is NaN.

    Returns a list: the filtered observations first, then the filtered
    `inputs` in the order they were passed.
    """
    target = observations[rows]
    keep = ~numpy.isnan(target)
    return [target[keep]] + [values[rows][keep] for values in inputs]


# remove missing values (rows without an observation) from both periods
(trainY, trainX_raw, trainStationData,
 trainDates, trainIDs, trainAEpreds_t2m) = _subset_without_missing_obs(
    slice(None, train_end), obs, data, stationColumns, data_MJD, stationID, data_AE_t2m_repeated)

(testY, testX_raw, testStationData,
 testDates, testIDs, testAEpreds_t2m) = _subset_without_missing_obs(
    slice(eval_start, None), obs, data, stationColumns, data_MJD, stationID, data_AE_t2m_repeated)

# scale input features (except IDs and AE inputs); the statistics come from the
# training period only and are re-used unchanged for the test period
trainX, train_shift, train_scale = normalize(trainX_raw[:,:], method="MAX")
trainStationData, train_shift_StationData, train_scale_StationData = normalize(trainStationData[:,:],
                                                                               method="MAX")

testX = normalize(testX_raw[:,:], shift=train_shift, scale=train_scale)[0]
testStationData = normalize(testStationData[:,:],
                            shift=train_shift_StationData,
                            scale=train_scale_StationData)[0]

# combine AE inputs
if this_AE_var_input == "t2m":
    trainAEpreds_combined = trainAEpreds_t2m
    testAEpreds_combined = testAEpreds_t2m
else:
    # Fail here instead of with a NameError when the model is built.
    raise ValueError(
        f"Unsupported this_AE_var_input {this_AE_var_input!r}; only 't2m' is implemented."
    )

In [None]:
from tqdm.notebook import tqdm

names_AE = np.array(['AE'])

nreps = 10          # number of independently trained models to average over
batch_size = 4096   # used for both training and evaluation

# Model / training settings (identical for every repetition)
n_features = trainX.shape[1] # 34
n_StationFeatures = trainStationData.shape[1] # 4
emb_size = this_emb_dim
n_hidden_layers = this_hidden_layers
max_id = int(numpy.max([trainIDs.max(), testIDs.max()]))
AE_enc_dim = trainAEpreds_combined.shape[1]

hidden_nodes = this_nodes_hidden
activation = 'relu'
lr = 0.002
loss = crps_cost_function
n_outputs = 2
this_patience = 10

# One label per permuted input, in the order the scores are concatenated below.
feature_names = np.concatenate((names_data,
                                names_stationColumns,
                                np.atleast_1d(['station ID']),
                                names_AE),
                               axis=0)


def _build_model():
    """Build and compile the network: NWP features, station features, an
    embedding of the station ID and the AE encoding are concatenated and
    passed through the hidden layers to a 2-unit linear output (mean, std)."""
    features_in = tf.keras.layers.Input(shape=(n_features,))
    stationdata_in = tf.keras.layers.Input(shape=(n_StationFeatures,))
    id_in = tf.keras.layers.Input(shape=(1,))
    AE_in = tf.keras.layers.Input(shape=(AE_enc_dim,))

    emb = tf.keras.layers.Embedding(max_id + 1, emb_size)(id_in)
    emb = tf.keras.layers.Flatten()(emb)

    x = tf.keras.layers.Concatenate()([features_in, stationdata_in, emb, AE_in])
    for _ in range(n_hidden_layers):
        x = tf.keras.layers.Dense(hidden_nodes, activation=activation, kernel_regularizer=None)(x)
    x = tf.keras.layers.Dense(n_outputs, activation='linear', kernel_regularizer=None)(x)

    network = tf.keras.models.Model(inputs=[features_in, stationdata_in, id_in, AE_in], outputs=x)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras versions.
    network.compile(loss=loss, optimizer=tf.keras.optimizers.Adam(learning_rate=lr))
    return network


def _test_score(network, features, station_data, ids, ae_preds):
    """Loss (mean CRPS) of `network` on the test targets for the given inputs."""
    return network.evaluate([features, station_data, ids, ae_preds],
                            testY, batch_size=batch_size, verbose=0)


def _with_permuted_column(values, column):
    """Copy of `values` in which a single column is randomly permuted."""
    shuffled = values.copy()
    shuffled[:, column] = np.random.permutation(shuffled[:, column])
    return shuffled


fimps = []

for i in tqdm(range(nreps)):

    tf.compat.v1.reset_default_graph()
    keras.backend.clear_session()

    model = _build_model()

    # A fresh callback per repetition, only if early stopping is requested.
    callbacks = []
    if this_early_stopping:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=this_patience,
                                                          restore_best_weights=True))
    model.fit([trainX, trainStationData, trainIDs, trainAEpreds_combined],
              trainY,
              epochs=this_epochs,
              batch_size=batch_size,
              verbose=0,
              validation_split=1/9,
              callbacks=callbacks)

    # Reference: test score with all inputs intact
    ref_score = _test_score(model, testX, testStationData, testIDs, testAEpreds_combined)

    # permute inputs for feature importance computations -> by type of input

    ## NWP inputs interpolated to stations (one column at a time)
    scores_permuted_NWPinputs = np.array([
        _test_score(model, _with_permuted_column(testX, j),
                    testStationData, testIDs, testAEpreds_combined)
        for j in range(len(names_data))
    ])

    ## station info (one column at a time)
    scores_permuted_stationinfo = np.array([
        _test_score(model, testX, _with_permuted_column(testStationData, j),
                    testIDs, testAEpreds_combined)
        for j in range(len(names_stationColumns))
    ])

    ## station ID
    scores_permuted_stationID = _test_score(model, testX, testStationData,
                                            _with_permuted_column(testIDs, 0),
                                            testAEpreds_combined)

    ## AE inputs: whole rows are permuted so all encoding dimensions move together
    shuf_rows = np.random.permutation(testAEpreds_combined.shape[0])
    scores_permuted_AE = _test_score(model, testX, testStationData, testIDs,
                                     testAEpreds_combined[shuf_rows, :])

    # combine all and compute importances (score increase caused by the permutation)
    scores_shuffled = np.concatenate((scores_permuted_NWPinputs,
                                      scores_permuted_stationinfo,
                                      np.atleast_1d(scores_permuted_stationID),
                                      np.atleast_1d(scores_permuted_AE)),
                                     axis=0)

    fimps.append(scores_shuffled - ref_score)

# compute mean feature importance and convert to data frame

fimps = np.array(fimps).transpose()   # shape (n_inputs, nreps)
mean_fimps = fimps.mean(axis=1)
df = pd.DataFrame({'Feature': feature_names, 'Mean_Importance': mean_fimps})

# sort_values returns a new frame; keep it, since that is what gets written out.
fimps_df = df.sort_values('Mean_Importance')

fimps_df.to_csv('.../PP_results/feature_importances_ConvAE.csv')