# Surrogate Construction for Genz Functions with regression as a function of the number of training points

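This notebook constructs a PC surrogate for Genz functions using random sampling and regression, giving the RMS error between the surrogate and the actual function. Both the Genz function and PCE are defined on [-1,1]. Note: in the code cells below, the function being fitted is itself a PCE with prescribed coefficients (chosen by `c_opt`), and the surrogate coefficients are obtained with `UQTkBCS`.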

In [1]:
# imports
import numpy as np
import math  
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import qmc

import PyUQTk.pce as uqtkpce
import PyUQTk.PyPCE.pce_tools as pce_tools
from PyUQTk.utils.func import *
import PyUQTk.uqtkarray as uqtkarray

PyMC is required for some of the MCMC postprocessing codes.
Will proceed without, but some convergence tests will not be available.


## Inputs

In [2]:
# --- PCE configuration ---
nord = 5          # polynomial order of the PCE
pc_type = "LU"    # polynomial type handed to PCSet
pc_alpha = 0.0    # free parameter (> -1) for Gamma-Laguerre and Beta-Jacobi PCs
pc_beta = 1.0     # free parameter (> -1) for Gamma-Laguerre and Beta-Jacobi PCs

# --- Experiment configuration ---
nSam = 10_000     # number of samples in each evaluation (test) set
c_opt = 'alt'     # coefficient pattern of the target PCE: "seq" or "alt"
max_dim = 6       # the sweep runs over dimensions 1..max_dim

The random sample generator (a Latin hypercube sampler) is created inside the main loop below, once per dimension.

## Collecting Errors

We loop through different numbers of random samples to feed into regression and obtain the normalized RMSE. We start with a number of samples equal to 50% of the number of basis terms and continue to 110%.

In [3]:
# Initial noise variance passed to the BCS routines; updated in BCS
sigma = np.array([1e-8])

# Fold count; multiplies the size of the combined data set drawn in the sweep
nfolds = 2

In [4]:
# Sweep over dimension and training-set size: for every (ndim, fraction) pair,
# fit a PCE with UQTkBCS to samples of a target PCE with known coefficients and
# record the normalized RMSE and the number of nonzero coefficients retained.
#
# NOTE(review): the saved output of this cell is
#   "TypeError: The first input argument needs to be a sequence".
# The argument order/types expected by UQTkOptimizeEta and UQTkBCS in the
# installed PyUQTk version should be confirmed; the calls are left unchanged.

percent=[.5, .6, .7, .8, .9, 1, 1.1] # fractions of the number of basis terms to use

# Candidate eta values 1e-1 ... 1e-12 passed to UQTkOptimizeEta. They do not
# depend on the loop variables, so they are built once here. The float base
# keeps 10**k from overflowing where NumPy's default integer is 32-bit.
e_tr=1/np.power(10.0, np.arange(1,13))

n_trials=10  # repeated draws, used both for eta selection and for error evaluation

# Result tables: one row per dimension, one column per entry of `percent`
mean_dim_errors=np.zeros((max_dim, len(percent)))
std_dim_errors=np.zeros((max_dim, len(percent)))
nonz_m_dim=np.zeros((max_dim, len(percent)))
nonz_s_dim=np.zeros((max_dim, len(percent)))
e=np.zeros((max_dim, len(percent)))  # eta used for the fit

for ndim in range(1, max_dim+1):
    # instantiate random generator
    rng = qmc.LatinHypercube(d=ndim, seed=42)

    # instantiate PC object
    pc_model = uqtkpce.PCSet("NISPnoq", nord, ndim, pc_type, pc_alpha, pc_beta)
    npce=pc_model.GetNumberPCTerms()

    # coefficients of the target PCE: 1/(i+1) on every term ("seq")
    # or on the even-indexed terms only ("alt")
    if c_opt=="seq":
        coef=1.0/np.arange(1, npce+1)
        sparse=''
    elif c_opt=="alt":
        coef=np.zeros(npce)
        coef[::2]=1.0/np.arange(1, npce+1, 2)
        sparse='sparse'
    else:
        # previously this fell through and failed later with a NameError on `coef`
        raise ValueError("c_opt must be 'seq' or 'alt', got %r" % (c_opt,))

    n_true_nonzero=np.count_nonzero(coef)

    # lists to store stats for each percent
    mean_per_errors=[]
    std_per_errors=[]
    nonz_m=[]
    nonz_s=[]

    for j, per in enumerate(percent):
        nTest=int(npce*per)

        # lists to store stats for the trials
        NRMSE_list=[]
        retain=[]

        # x and y training and validation data
        # NOTE(review): x and y are not used below. The draw is kept so the
        # sequence of sampler calls is unchanged; confirm whether these were
        # meant to be the inputs of a fold-based eta optimization.
        x=2*rng.random(n=nTest*nfolds)-1
        y=pce_tools.UQTkEvaluatePCE(pc_model, coef, x)

        # optimize eta
        # NOTE(review): each pass overwrites eta, x_tr and y_tr, so only the
        # last trial feeds the fit below; confirm whether the trials were
        # meant to be aggregated (e.g. averaged).
        for _ in range(n_trials):
            x_tr=2*rng.random(n=nTest)-1
            y_tr=pce_tools.UQTkEvaluatePCE(pc_model, coef, x_tr)

            x_val=2*rng.random(n=nTest)-1
            y_val=pce_tools.UQTkEvaluatePCE(pc_model, coef, x_val)

            eta = pce_tools.UQTkOptimizeEta(pc_model, x_tr, y_tr, x_val, y_val, e_tr, sigma)

        e[ndim-1, j]=eta

        # find coefficients
        c_k = pce_tools.UQTkBCS(pc_model, y_tr, x_tr, sigma, eta)
        # c_k is fixed for all evaluation trials, so the retained count is
        # identical in every trial and its standard deviation is zero
        n_retained=np.count_nonzero(c_k)

        for _ in range(n_trials):
            # evaluate surrogate and target at fresh random samples
            x_test=2*rng.random(n=nSam)-1
            pce_evals=pce_tools.UQTkEvaluatePCE(pc_model,c_k,x_test)
            y_test=pce_tools.UQTkEvaluatePCE(pc_model, coef, x_test)

            # find error
            # NOTE(review): RMSE is divided by the 2-norm of y_test, which
            # grows with nSam; confirm this normalization is the intended one.
            MSE = np.square(np.subtract(y_test,pce_evals)).mean()
            RMSE=math.sqrt(MSE)
            NRMSE_list.append(RMSE/np.linalg.norm(y_test))

            retain.append(n_retained)

        # add the results of the trials
        mean_per_errors.append(np.mean(NRMSE_list))
        std_per_errors.append(np.std(NRMSE_list))
        nonz_m.append(np.mean(retain)/n_true_nonzero)
        nonz_s.append(np.std(retain)/n_true_nonzero)

    mean_dim_errors[ndim-1]=mean_per_errors
    std_dim_errors[ndim-1]=std_per_errors
    nonz_m_dim[ndim-1]=nonz_m
    nonz_s_dim[ndim-1]=nonz_s

TypeError: The first input argument needs to be a sequence

## Summary
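The figures below summarize the results for each number of training points and each dimension. The first shows the normalized RMSE; the second shows the number of nonzero coefficients retained relative to the number of nonzero coefficients in the target. The number of training points varies as a fraction of the number of basis terms.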

In [None]:
# Plot the mean NRMSE against the training-set fraction, one curve per dimension.
# Error bars are the standard deviation over the evaluation sets.

# Create figure
fig, ax = plt.subplots(figsize=(10,10))

# Plot Error Data
for ndim in range(1, max_dim+1):
    ax.errorbar(percent, mean_dim_errors[ndim-1], yerr=std_dim_errors[ndim-1],
                linewidth=2, markersize=8, capsize=10, label='ndim'+str(ndim))

# Change y scale
ax.set_yscale('log')

# Line where number of samples = number of basis terms.
# axvline spans the whole axis whatever the scale; a vlines segment from 0 to 10
# starts at a non-positive value on the log axis and stretches the y-limits to 10.
ax.axvline(x=1, color='black', linestyle='dashed')

# Label Axes
ax.set_xlabel("Fraction of the number of basis terms", fontsize=20)
ax.set_ylabel("NRMSE", fontsize=20)

# Create legend
ax.legend(loc='lower left')

# Add title (polynomial type taken from the configuration instead of hard-coded)
fig.suptitle("BCS with different numbers of sample points\nfor %s %s polynomial with PC Order %s"
             % (pc_type, sparse, str(nord)), fontsize=22)

# Change size of tick labels
ax.tick_params(axis='both', labelsize=16)

# Show figure
plt.show()

In [None]:
# Plot the number of nonzero coefficients kept by BCS, relative to the number of
# nonzero coefficients in the target PCE, against the training-set fraction.

# Create figure
fig, ax = plt.subplots(figsize=(10,10))

# Plot data: one curve per dimension
for ndim in range(1, max_dim+1):
    ax.errorbar(percent, nonz_m_dim[ndim-1], yerr=nonz_s_dim[ndim-1],
                linewidth=2, markersize=8, capsize=10, label='ndim'+str(ndim))

# Label Axes
ax.set_xlabel("Fraction of the number of basis terms", fontsize=20)
# The plotted values are ratios (retained / true nonzero terms), not percentages
ax.set_ylabel("Nonzero terms retained / true nonzero terms", fontsize=20)

# Create legend
ax.legend(loc='lower left')

# Add title (polynomial type taken from the configuration instead of hard-coded)
fig.suptitle("BCS with different numbers of sample points\nfor %s %s polynomial with PC Order %s"
             % (pc_type, sparse, str(nord)), fontsize=22)

# Change size of tick labels
ax.tick_params(axis='both', labelsize=16)

# Show figure
plt.show()

In [None]:
    ## ARCHIVED DRAFT (entirely commented out): select eta by the validation error E_v for 10 sets of training and validation data
#         eta_master=np.empty((10, 2))
#         eta_list=[]
#         for i in range(10):
#             E_v=[]
#             for j, eta in enumerate(e_tr):
#                 x_tr=2*rng.random(n=nTest)-1
#                 y_tr=pce_tools.UQTkEvaluatePCE(pc_model, coef, x_tr) 

#                 x_val=2*rng.random(n=nTest)-1
#                 y_val=pce_tools.UQTkEvaluatePCE(pc_model, coef, x_val)

#                 c_tr=pce_tools.UQTkBCS(pc_model, y_tr, x_tr, sigma, eta, lambda_init)
#                 phi_v_c_tr=pce_tools.UQTkEvaluatePCE(pc_model, c_tr, x_val)
#                 E_v.append(math.sqrt(np.square(np.subtract(y_val, phi_v_c_tr)).mean()))
#             eta_list.append( e_tr[E_v.index(np.min(E_v))])
            
#         eta_opt=np.array(eta_list).mean()
#         eta_std=np.std(np.array(eta_list))