Sascha Spors,
Professorship Signal Theory and Digital Signal Processing,
Institute of Communications Engineering (INT),
Faculty of Computer Science and Electrical Engineering (IEF),
University of Rostock,
Germany

# Data Driven Audio Signal Processing - A Tutorial with Computational Examples

Winter Semester 2022/23 (Master Course #24512)

- lecture: https://github.com/spatialaudio/data-driven-audio-signal-processing-lecture
- tutorial: https://github.com/spatialaudio/data-driven-audio-signal-processing-exercise

Feel free to contact the lecturer at frank.schultz@uni-rostock.de

# Exercise 8: Model Complexity Bias / Variance Problem

## Objectives


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.linalg import lstsq
# a nice homework: add Ridge regression using
from sklearn.linear_model import Ridge
# and check what this is doing on our simple examples

# machine learning routine shown with the
# simple linear model X theta = y
# no gradient descent is used here, see exercise 09 


def train(X, y_meas, print_theta):
    """Fit the linear model X @ theta = y_meas in the least-squares sense.

    Parameters
    ----------
    X : feature matrix handed to scipy.linalg.lstsq
    y_meas : measured target data handed to scipy.linalg.lstsq
    print_theta : if truthy, the fitted coefficients are printed as well

    Returns
    -------
    theta : the least-squares solution returned by lstsq

    The shape of theta is always printed.
    """
    # lstsq additionally returns residues, rank and singular values,
    # only the solution (first entry) is used here
    theta = lstsq(X, y_meas)[0]
    print('number of theta coeff:', theta.shape)
    if print_theta:
        print('theta', theta)
    return theta


def predict(X, theta):
    """Return the model output X @ theta.

    For a two-column X this is
    theta[0] * X[:,0] + theta[1] * X[:,1]
    """
    return X @ theta


def get_Rsq(y_meas, y_pred):
    """Return the coefficient of determination R^2 = 1 - RSS/TSS.

    Parameters
    ----------
    y_meas : measured data
    y_pred : model prediction, same shape as y_meas

    Returns
    -------
    Rsq : 1 - RSS/TSS

    Notes
    -----
    The overall variation of the data (TSS) is a superposition of the
    residual part not explained by the model (RSS) and the explained part
    np.sum((y_pred - ym)**2). For that case R^2 can also be written as
    explained part / TSS (see lecture slides). Only the RSS based form is
    evaluated here, the formerly computed (and immediately overwritten)
    explained-variance ratio was removed.

    There is no guard for constant y_meas (TSS == 0).
    """
    ym = np.mean(y_meas)
    # total sum of squares, i.e. N times the variance of y_meas
    TSS = np.sum((y_meas - ym)**2)
    # sum of squares of residuals == res from lstsq() for the training data
    RSS = np.sum((y_meas - y_pred)**2)
    return 1 - RSS/TSS


def plot(X1, y_meas, y_pred, Rsq, title_str):
    """Plot measured data, prediction and ideal model over X1.

    A new figure is opened for each call. The ideal model curve is not
    passed in, it is read from the module-level variable y_ideal_model.
    The title is composed of title_str and the given Rsq value.
    """
    caption = ''.join((title_str, r', $R^2=$', str(Rsq)))

    plt.figure()
    # curves, the measured data gets a thicker line
    plt.plot(X1, y_meas, lw=3, label='measured')
    plt.plot(X1, y_pred, label='predicted')
    plt.plot(X1, y_ideal_model, 'gold', label='ideal model')
    # annotation
    plt.title(caption)
    plt.xlabel('X[:,1]')
    plt.ylabel('y')
    plt.legend()
    plt.grid()

    
def check_model(X, y_train, y_test, print_theta=True):
    """Fit the model on y_train and evaluate it on training and test data.

    theta is estimated once from (X, y_train). The prediction X @ theta is
    then compared against y_train and against y_test, i.e. the same feature
    matrix X is used for both data sets. For each data set R^2 is calculated
    and a figure is created. The x-axis data X1 is taken from module scope.
    """
    theta = train(X, y_train, print_theta)  # fit model
    # first pass: training data, second pass: test data
    for y_ref, caption in ((y_train, 'training data'),
                           (y_test, 'test data')):
        y_pred = predict(X, theta)
        plot(X1, y_ref, y_pred, get_Rsq(y_ref, y_pred), caption)

In [None]:
# fixed seed for reproducible noise
rng = np.random.default_rng(1)

N = 2**12  # number of data points
xmax = 3.
# N equidistant samples in the range [-xmax, +xmax)
X1 = np.arange(-N//2, N//2) * 2. / N * xmax
# noise-free ground truth: offset + cubic + cosine
y_ideal_model = 1 + 3*X1**3 + 20*np.cos(2*np.pi*X1)

# training data: ideal model with additive Gaussian noise
mean, stdev = 0, 3
noise = rng.normal(mean, stdev, (N, 1)).ravel()
y_train = y_ideal_model + noise

# test data, slightly different from the training data
# 'good' models should act robust on this data
mean, stdev = 0, 4
noise = rng.normal(mean, stdev, (N, 1)).ravel()
pct = 1.075  # we achieve R^2 = 0.9 for no-noise data with best models
# all model parameters are scaled by pct, the noise is weighted with 0
y_test = (1*pct
          + 3*pct * X1**3
          + 20*pct * np.cos(2*pct * np.pi*X1)
          + 0*noise)

### check models of different complexity

In [None]:
# linear univariate regression: y = n + m x
# columns of X: ones(N) for the intercept n, X1 for the slope m
X = np.vstack((np.ones(N), X1)).T
check_model(X, y_train, y_test)

In [None]:
# linear regression with intercept and x^2 as the only feature
X = np.vstack((np.ones(N), X1**2)).T
check_model(X, y_train, y_test)

In [None]:
# we know what y_meas is composed of (offset, x^3, cosine)
# -> use exactly these terms as features
# for practical data we don't have this ground truth :-(
X = np.vstack((np.ones(N), X1**3, np.cos(2*np.pi*X1))).T
check_model(X, y_train, y_test)

In [None]:
# intercept and x^3 part only, the cosine part is not modeled
X = np.vstack((np.ones(N), X1**3)).T
check_model(X, y_train, y_test)

In [None]:
# intercept and cosine part only, the x^3 part is not modeled
# this is a very poor model
X = np.vstack((np.ones(N), np.cos(2*np.pi*X1))).T
check_model(X, y_train, y_test)

In [None]:
# polynomial features x^0, x^1, x^2, x^3, no cosine
X = np.vstack((np.ones(N), X1**1, X1**2, X1**3)).T
check_model(X, y_train, y_test)

In [None]:
# the Vandermonde matrix with increasing powers yields the same model as
# X = np.array([np.ones(N), X1**1, X1**2, X1**3]).T
pol_order = 3
X = np.vander(X1, N=pol_order + 1, increasing=True)
check_model(X, y_train, y_test)

In [None]:
# too many polynomials (x^0 ... x^20) plus the exact cosine part
# the cosine is appended as last column of the Vandermonde matrix
pol_order = 20
X = np.hstack((np.vander(X1, pol_order + 1, increasing=True),
               np.cos(2*np.pi*X1)[:, np.newaxis]))
check_model(X, y_train, y_test, print_theta=True)

In [None]:
# we model a fourier series with number of coeff == data points - 1
# this leads to almost exact fit onto data
# we never ever do this when we want to learn from data such that
# we can make robust predictions on unseen data
# number of model coeff << model input data
# otherwise we just 'store' the data in the coefficients
#
# harmonics 1 ... N//2-1, each with a cos and a sin column, plus the
# intercept -> N-1 coefficients for even N
# the harmonic count is derived from N instead of the hard-coded 2**11,
# so the cell still does what it claims if N is changed above
tmp = (X1/xmax*np.pi)[:, np.newaxis] * np.arange(1, N//2)
X = np.hstack((np.ones((N, 1)),  # for intercept
               np.cos(tmp),
               np.sin(tmp)))
print(N, 'data points vs.', X.shape[1], 'model coefficients')
check_model(X, y_train, y_test, print_theta=False)

In [None]:
# fourier series with many coeff:
# harmonics 1 ... 6, each with a cos and a sin column, plus intercept
# this yields fair model performance but actually
# other models above that use x^3 and cos as regressors
# are better because:
# - less theta coeff
# - higher or about the same R^2
# - more robust training vs. test data
tmp = (X1/xmax*np.pi)[:, np.newaxis] * np.arange(1, 7)
X = np.hstack((np.ones((N, 1)),  # for intercept
               np.cos(tmp),
               np.sin(tmp)))
print(f'{N} data points vs. {X.shape[1]} model coefficients')
check_model(X, y_train, y_test, print_theta=False)

## Copyright

- the notebooks are provided as [Open Educational Resources](https://en.wikipedia.org/wiki/Open_educational_resources)
- the text is licensed under [Creative Commons Attribution 4.0](https://creativecommons.org/licenses/by/4.0/)
- the code of the IPython examples is licensed under the [MIT license](https://opensource.org/licenses/MIT)
- feel free to use the notebooks for your own purposes
- please attribute the work as follows: *Frank Schultz, Data Driven Audio Signal Processing - A Tutorial Featuring Computational Examples, University of Rostock* ideally with relevant file(s), github URL https://github.com/spatialaudio/data-driven-audio-signal-processing-exercise, commit number and/or version tag, year.