#### Introduction to Statistical Learning, Lab 11.3

# Function Fitting Example

We use the PyTorch library:
 
  - PyTorch [homepage](https://pytorch.org/)
  - PyTorch [documentation](https://pytorch.org/docs/stable/index.html)

In this example we train a fully connected neural network (FCNN) to fit a function $\mathbb{R} \rightarrow \mathbb{R}$.

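It can be shown that FCNNs can approximate *any* continuous function $\mathbb{R}^n \rightarrow \mathbb{R}^m$ on a bounded domain to arbitrary accuracy (the universal approximation theorem). Unfortunately, the proof is not constructive, so we still have to find a good network layout ourselves.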

We use the most straightforward approach: an FCNN with `MSE` loss and the `SGD` optimiser for training. The output layer activation is simply linear (i.e. `None`).

#### Setup
Import the required modules and make sure our (and only our!) modules are reloaded before code execution.

In [None]:
%reload_ext autoreload
%aimport dataprovider, classification
%autoreload 1

# framework modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import torch
import torch.utils.data as utils_data
import torch.nn as nn
import torch.nn.functional as fun

from islpy import neuro

#### Data Sets

We generate simple toy data sets for the true model

$$
    y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + \beta_3 x_i^3 + \epsilon_i
$$

with
$$
\beta_0 = -0.5
$$
$$
\beta_1 = 1.0
$$
$$
\beta_2 = -3.0
$$
$$
\beta_3 = 0.6
$$

In [None]:
# Coefficients (beta_0 .. beta_3) of the true cubic model.
beta = np.array([-0.5, 1.0, -3.0, 0.6])
n = 2000

# Inputs: an even grid over [-5, 5], jittered with Gaussian noise (sd 0.5).
x = np.linspace(-5, 5, num=n) + np.random.normal(scale=0.5, size=n)
# Targets: the cubic evaluated at x, plus Gaussian noise (sd 10).
y = beta[0] + beta[1] * x + beta[2] * x**2 + beta[3] * x**3 + np.random.normal(scale=10, size=n)

# The grid leaves x (almost) sorted.  Later cells split the data by position
# (first half = training, second half = test) and iterate over it in storage
# order, so without shuffling the network would be trained on x < 0 only and
# evaluated on x > 0 only.  Shuffle x and y with the same permutation so that
# every positional slice is a random sample of the whole input range.
order = np.random.permutation(n)

# Column vectors of shape (n, 1): one sample per row, one feature per column.
x = x[order].reshape((n, -1))
y = y[order].reshape((n, -1))


#### A Graphical Look at the Data

We show a `sns.regplot` with a quick fit. 

In [None]:
# Raw data with quick cubic (order=3) and quadratic (order=2) polynomial fits
# overlaid.  The data vectors are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
ax = sns.scatterplot(x=x[:, 0], y=y[:, 0])
ax = sns.regplot(x=x[:, 0], y=y[:, 0], order=3, scatter=False, color='C1')
ax = sns.regplot(x=x[:, 0], y=y[:, 0], order=2, scatter=False, color='C2')

#### The FCNN Model

Using the `torch` library, we specify the *network topology* and activation functions to create our model.

In [None]:
class FCNN(nn.Module):
    """Small fully connected network mapping one input feature to one output.

    Topology: 1 -> 32 -> 8 -> 1.  ReLU follows each of the two hidden layers;
    the output layer is left linear.
    """

    def __init__(self):
        super().__init__()
        self.activation = fun.relu
        # Layers are created in network order (input to output).
        self.layer1 = nn.Linear(in_features=1, out_features=32)
        self.layer2 = nn.Linear(in_features=32, out_features=8)
        self.layer3 = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension of size 1)."""
        hidden = x
        for hidden_layer in (self.layer1, self.layer2):
            hidden = self.activation(hidden_layer(hidden))

        # No activation on the output layer.
        return self.layer3(hidden)

#### Training

We use the `nn.MSELoss` function and the `SGD` optimiser in a training loop with a fixed number of epochs.

We first have to wrap our data into a `torch` style interface and create a `torch` data loader for sampling the data.

In [None]:
# Show the first ten input rows and the first ten target rows.
x[:10], y[:10]

In [None]:
# Peek at the first ten (index, batch) pairs of the training loader.  islice
# stops after ten batches instead of materialising the whole loader first.
# NOTE: `train_loader` is created in the next cell, so run that cell first.
from itertools import islice

list(islice(enumerate(train_loader), 10))

In [None]:
# Split positions: the first half of the rows is used for training, the
# second half for testing.
x_split = x.size // 2
y_split = y.size // 2

# Convert each half to a float32 tensor.
x_train, x_test = (torch.tensor(part).float() for part in (x[:x_split], x[x_split:]))
y_train, y_test = (torch.tensor(part).float() for part in (y[:y_split], y[y_split:]))

# Wrap the (input, target) pairs in data loaders with default settings.
train_dataset = utils_data.TensorDataset(x_train, y_train)
train_loader = utils_data.DataLoader(train_dataset)

test_dataset = utils_data.TensorDataset(x_test, y_test)
test_loader = utils_data.DataLoader(test_dataset)

In [None]:
# Build the network and train it for a fixed number of epochs with plain SGD
# on the MSE loss, recording the mean training and test loss per epoch.
nn_model = FCNN()
print(nn_model)

max_epochs = 50
train_history = []  # mean training loss per recorded epoch (plain floats)
test_history = []   # mean test loss per recorded epoch (plain floats)
loss_function = nn.MSELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.00002)

for epoch in range(max_epochs):
    # Training pass: one optimiser step per batch.
    mean_train_loss = 0.0
    for i, (xs, ys) in enumerate(train_loader):
        optimizer.zero_grad()  # reset gradients
        outputs = nn_model(xs)
        train_loss = loss_function(outputs, ys)
        train_loss.backward()  # gradient back propagation
        optimizer.step()
        # Running mean over the batches seen so far in this epoch.
        mean_train_loss = (mean_train_loss * i + train_loss.item()) / (i + 1)

    # Evaluation pass.  Gradients are not needed here, so autograd is switched
    # off and the loss is reduced to a Python float.  Averaging the loss
    # tensors directly would chain the autograd graphs of all test batches
    # together and leave grad-requiring tensors in `test_history`, which
    # matplotlib cannot convert to NumPy for plotting.
    mean_test_loss = 0.0
    with torch.no_grad():
        for i, (xs, ys) in enumerate(test_loader):
            outputs = nn_model(xs)
            eval_loss = loss_function(outputs, ys)
            mean_test_loss = (mean_test_loss * i + eval_loss.item()) / (i + 1)

    # Epoch 0 is neither recorded nor printed, so entry k of each history
    # belongs to epoch k + 1.
    if epoch:
        train_history.append(mean_train_loss)
        test_history.append(mean_test_loss)
        print('Epoch {}, mean train/test loss: {:.4f}/{:.4f}'.format(epoch, mean_train_loss, mean_test_loss))

print('Maximum number of epochs ({}) reached. Training terminated.'.format(max_epochs))

In [None]:
# Plot the recorded training and test loss curves on one set of axes,
# restricting the x-axis to the range 10..50.
loss_fig, loss_ax = plt.subplots()
for history, curve_label in ((train_history, 'Training Loss'), (test_history, 'Test Loss')):
    loss_ax.plot(history, label=curve_label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.set_xlim((10, 50))
loss_ax.legend()
plt.show()


In [None]:
# 100 evenly spaced inputs on [-5, 5] as a (100, 1) float32 tensor.
grid = np.linspace(-5, 5, 100).reshape((100, -1))
xs = torch.tensor(grid).float()

# Evaluate the trained network without tracking gradients:
#   ys - predictions on the regular grid (for drawing the fitted curve)
#   yh - predictions on the test inputs
with torch.no_grad():
    ys = nn_model(xs)
    yh = nn_model(x_test)

In [None]:
# Draw the fitted curve: network predictions over the regular input grid.
plt.plot(xs[:, 0], ys[:, 0], color='C1', linewidth=2)

In [None]:
# True test targets (x-axis) against the network's predictions (y-axis).
# seaborn 0.12+ requires the data vectors as keywords; the tensors are
# converted to NumPy arrays so seaborn receives plain array data.
ax = sns.scatterplot(x=y_test[:, 0].numpy(), y=yh[:, 0].numpy())

# TensorFlow version

In [None]:
import tensorflow as tf

In [None]:
# Keras counterpart of the PyTorch network: 1 -> 32 -> 8 -> 1, ReLU on the
# two hidden layers and a linear output layer.
tfmodel = tf.keras.models.Sequential()
tfmodel.add(tf.keras.layers.Dense(32, activation="relu", input_shape=(1,)))
tfmodel.add(tf.keras.layers.Dense(8, activation="relu"))
tfmodel.add(tf.keras.layers.Dense(1, activation="linear"))

In [None]:
# Plain SGD with the same learning rate as the PyTorch optimiser, minimising
# the mean squared error.  `learning_rate` is the supported keyword; the old
# `lr` alias is deprecated and rejected by current Keras optimizers.
tfopt = tf.keras.optimizers.SGD(learning_rate=0.00002)
tfmodel.compile(optimizer=tfopt, loss="mse")

In [None]:
# Print Keras' textual summary of the model.
tfmodel.summary()

In [None]:
# Train the Keras model on the first half of the data, in storage order.
# Author's observation: batch_size=4 reproduces the pattern seen with the
# PyTorch model, whereas batch_size=1 (used here) leaves the MSE at about 500
# after 50 epochs.
n_train = x.size // 2
tfmodel.fit(x[:n_train], y[:n_train], batch_size=1, epochs=50, shuffle=False)

In [None]:
# All data points (gray), overlaid with both fitted curves.
# seaborn 0.12+ requires the data vectors as keywords.
ax = sns.scatterplot(x=x[:, 0], y=y[:, 0], color="gray", alpha=0.5)
tfxs = np.linspace(-6, 6, 100).reshape((100, -1))
# Thick dark-gray curve: PyTorch predictions (xs, ys) from the earlier cell.
# Only `linewidth` is passed: `lw` is an alias of it, and matplotlib raises a
# TypeError when both are given.
ax.plot(xs[:, 0], ys[:, 0], color='#555555', linewidth=3)
# Thin red curve: Keras predictions on a slightly wider grid, [-6, 6].
ax.plot(tfxs, tfmodel.predict(tfxs), linewidth=1, color="red")