In [None]:
# Mount to my Google Drive
from google.colab import drive
import os
import pickle
# Attach Google Drive (forcing a re-mount) and make the project folder the
# working directory, so the relative file names used below resolve inside it.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = "/content/drive/MyDrive/###"

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(PROJECT_DIR)

In [None]:
# Import necessary packages
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Import linear regression code 
from model import Linear_Regression
from linearized_model import Linearized_Model, LinearizedCallback
from weights_callback import WeightsCallback

# Generating Training Data and Automatic Fitting

In [None]:
# Problem size: N training points, each of dimension d
N, d = 10, 20

# Initialization: scale alpha and shape w0 (an all-ones column of length 2*d)
alpha = 1
w0 = tf.ones(shape=(2 * d, 1))

In [None]:
# Seed both random number generators used below so that a fresh
# "Restart & Run All" regenerates exactly the same beta and training points
# (otherwise every run silently overwrites train_data.pkl with new data).
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Generate beta by taking each coordinate to be an iid Unif(0,1) random variable
beta = tf.random.uniform([d, 1], dtype=tf.float32)

# As in Woodworth et al., suppose our training points are drawn from a d-dimensional
# standard multivariate normal distribution (zero mean, identity covariance)
train_x = np.random.multivariate_normal(np.zeros((d)), np.identity(d), size=N)
train_x = tf.convert_to_tensor(train_x, dtype=tf.float32)

# Compute the corresponding (noise-free) y-values y = x * beta as a column vector
train_y = tf.reshape(tf.matmul(train_x, beta), (-1, 1))

In [None]:
# Save the training data for future use
# x and y are stored as lists of per-example NumPy arrays; beta is converted to
# a NumPy array as well, so the pickle holds no TensorFlow objects and all three
# entries are stored consistently. The loading cell converts every entry back
# with tf.convert_to_tensor, which accepts NumPy input.
x_list = list(train_x.numpy())
y_list = list(train_y.numpy())
beta_array = beta.numpy()

with open('train_data.pkl', 'wb') as f:
  pickle.dump([x_list, y_list, beta_array], f)

In [None]:
# Load in existing training data
# NOTE: pickle.load executes arbitrary code; only load files you wrote yourself.
with open('train_data.pkl', 'rb') as f:
    data = pickle.load(f)

# The pickle holds [x, y, beta]; turn each entry back into a float32 tensor
train_x, train_y, beta = (
    tf.convert_to_tensor(entry, dtype=tf.float32) for entry in data[:3]
)

In [None]:
# Initialize our models
# Linear regression: built from the initialization shape w0 and the
# initialization scale alpha defined above
linreg = Linear_Regression(w0, alpha=alpha)

# Linearized model:
# A second, separately constructed Linear_Regression instance (same w0 and
# alpha) is handed to Linearized_Model together with the training inputs, so
# the `linreg` object trained below is not the one wrapped here.
linreg_const = Linear_Regression(w0, alpha=alpha)
# Recall that the linear regression model is 2-homogeneous
# NOTE(review): unlike the manual-fitting cell further down, alpha and hom=2 are
# not passed here; presumably Linearized_Model's defaults cover this (confirm
# against linearized_model.py).
linearized = Linearized_Model(linreg_const, train_x)

In [None]:
# Parameters for network training
# `epochs` is passed to `fit(epochs=...)`, which expects an integer, so the
# float literal 1e3 is cast explicitly (as is done for num_epochs further down).
epochs = int(1e3)
# Learning rate for the SGD optimizers
lr = 1e-3

In [None]:
# One plain gradient-descent (SGD) optimizer per model, both with learning rate lr
optimizer_linreg, optimizer_linearized = (
    tf.keras.optimizers.SGD(learning_rate=lr) for _ in range(2)
)

# Both models are trained against the mean-squared error
MSE = tf.keras.losses.MeanSquaredError()

In [None]:
# Instantiate callback objects for our models
# One WeightsCallback per model, both constructed with the argument 10;
# presumably this is the recording interval in epochs (confirm in
# weights_callback.py). The recorded values are read back later through the
# callback's `weight_evals` attribute.
weights_linear = WeightsCallback(10)
weights_linearized = WeightsCallback(10)
# The linearized model's callback is constructed from linreg_const, the same
# model instance that was passed to Linearized_Model above.
linearized_callback = LinearizedCallback(linreg_const)

In [None]:
# Train the linear regression model: compile it with its own SGD optimizer and
# the MSE loss, then fit on the full training set while recording the weights.
linreg.compile(optimizer_linreg, loss=MSE)
linreg.fit(
    train_x,
    train_y,
    epochs=epochs,
    verbose=1,
    callbacks=[weights_linear],
)

In [None]:
# Train the linearized model in the same way, with its own optimizer and with
# both the weight-recording callback and the linearized-model callback attached.
linearized.compile(optimizer_linearized, loss=MSE)
linearized.fit(
    train_x,
    train_y,
    epochs=epochs,
    verbose=1,
    callbacks=[weights_linearized, linearized_callback],
)

In [None]:
# Persist the weights recorded while fitting the linearized model.
# The file is named "linearized_weights", so dump the callback that was attached
# to the linearized model (weights_linearized) rather than the one attached to
# the linear regression model (weights_linear).
with open('linearized_weights.pkl', 'wb') as f:
  pickle.dump(weights_linearized.weight_evals, f)

# Manual Model Fitting

Code adapted from [TensorFlow custom training walkthrough](https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough).

In [None]:
# The manual fit reuses the training data from above. Print the shapes and the
# residual y - x*beta as a sanity check.
for message in (
    f"training data x: {tf.shape(train_x)}",
    f"training data y: {tf.shape(train_y)}",
    f"y - beta*x:\n{train_y - tf.matmul(train_x, beta)}",
):
    tf.print(message)

In [None]:
### REMOVE THE BELOW LINE TO PRINT OUTPUT
%%capture

# Initialization scale and shape
alpha = 10
w0 = tf.ones([2*tf.shape(train_x)[1], 1])

# Instantiate the linear regression model
linreg = Linear_Regression(w0, alpha)

# Instantiate the linearized model
linreg_const = Linear_Regression(w0, alpha)
linearized = Linearized_Model(linreg_const, train_x, alpha, hom=2)

tf.print(f"w0:\n{linearized.linearized_layer_1.init}")
tf.print(f"gradients:\n{linearized.linearized_layer_1.grads}")
tf.print(f"bias:\n{linearized.linearized_layer_1.bias}")

In [None]:
# Number of epochs and learning rate for the manual training loop
num_epochs = int(1e4)
lr = 1e-3

# Fresh SGD optimizers (one per model) using this learning rate
optimizer_linreg, optimizer_linearized = (
    tf.keras.optimizers.SGD(learning_rate=lr) for _ in range(2)
)

In [None]:
# Loss used throughout the manual fit: mean-squared error
MSE = tf.keras.losses.MeanSquaredError()

# One full-batch evaluation of a model: loss, loss gradient and output Jacobian.
# Decorated with tf.function for faster training.
@tf.function
def grad(model, x_train, y_train):
  """Evaluate the loss and its derivatives for `model` on the training data.

  Args:
    model: callable model exposing `trainable_variables` / `trainable_weights`.
    x_train: training inputs; the first axis indexes the training points.
    y_train: training targets compared against `model(x_train)`.

  Returns:
    A tuple `(loss, loss_grads, jacobian)` where
      * `loss` is the MSE between `y_train` and `model(x_train)`,
      * `loss_grads` is the gradient of that loss with respect to
        `model.trainable_variables`,
      * `jacobian` is the Jacobian of the model output with respect to
        `model.trainable_variables`, reshaped to `[N, p]` (N training points,
        p entries in the flattened trainable weights).
  """
  # Number of training points and number of entries in the flattened weights
  num_points = int(tf.shape(x_train)[0])
  flat_weights = tf.reshape(model.trainable_weights, [-1, 1])
  num_weights = int(tf.shape(flat_weights)[0])

  # persistent=True because the tape is queried twice (gradient and jacobian)
  with tf.GradientTape(persistent=True) as tape:
    predictions = model(x_train)
    loss = MSE(y_train, predictions)

  loss_grads = tape.gradient(loss, model.trainable_variables)
  output_jacobian = tape.jacobian(predictions, model.trainable_variables)

  return loss, loss_grads, tf.reshape(output_jacobian, [num_points, num_weights])

In [None]:
# Smoke test: take one gradient-descent step with each model and report the
# loss before and after it.

INITIAL_LOSS_FMT = "Step: {}, Initial Loss: {}"
STEP_LOSS_FMT = "Step: {}, Loss: {}"


def _take_single_step(model, optimizer):
  """Apply one gradient step to `model`, printing the loss before and after.

  Returns the `(loss, gradients, jacobian)` triple that `grad` computed
  before the step was applied.
  """
  loss_before, step_grads, step_jacobian = grad(model, train_x, train_y)

  # Loss before the step
  print(INITIAL_LOSS_FMT.format(optimizer.iterations.numpy(),
                                loss_before.numpy()))

  # Loss after the step
  optimizer.apply_gradients(zip(step_grads, model.trainable_variables))
  print(STEP_LOSS_FMT.format(optimizer.iterations.numpy(),
                             MSE(train_y, model(train_x)).numpy()))

  return loss_before, step_grads, step_jacobian


print("Linear Regression:")
loss_val, linreg_grads, jacobian = _take_single_step(linreg, optimizer_linreg)

# For the linearized model
print("\nLinearized:")
loss_val, linearized_grads, jacobian = _take_single_step(linearized, optimizer_linearized)

In [None]:
# By definition of the linearized model, its Jacobian with respect to the weights
# stays constant throughout training. So the largest absolute entry of the
# difference between the gradients stored on the linearized layer and the Jacobian
# returned by the last call to `grad` should be (numerically) zero.
jacobian_gap = tf.norm(linearized.linearized_layer_1.grads - jacobian, ord=np.inf, axis=None)
jacobian_gap.numpy()

In [None]:
# Train both models for num_epochs full-batch gradient-descent steps

# Weight snapshots collected during training
linreg_weights = []
linearized_weights = []


def _flat_weights(model):
  """Return a copy of the model's trainable variables as a single column."""
  return tf.reshape(model.trainable_variables, [-1, 1])


for epoch in range(num_epochs):

  # Loss, loss gradient and output Jacobian at the current weights
  linreg_loss, linreg_grads, linreg_jacobian = grad(linreg, train_x, train_y)
  linearized_loss, linearized_grads, linearized_jacobian = grad(linearized, train_x, train_y)

  # Every 10th iteration: report both losses and snapshot both weight vectors
  if epoch % 10 == 0:
    print("Step: {}".format(optimizer_linearized.iterations.numpy()))
    print("(Linear regression) Loss: {}".format(MSE(train_y, linreg(train_x))))
    print("(Linearized) Loss: {}\n".format(MSE(train_y, linearized(train_x))))

    linreg_weights.append(_flat_weights(linreg))
    linearized_weights.append(_flat_weights(linearized))

  # Gradient-descent update for each model
  optimizer_linreg.apply_gradients(zip(linreg_grads, linreg.trainable_variables))
  optimizer_linearized.apply_gradients(zip(linearized_grads, linearized.trainable_variables))

# Final snapshot once training has finished
linreg_weights.append(_flat_weights(linreg))
linearized_weights.append(_flat_weights(linearized))

In [None]:
# Save the weight snapshots collected during training.
# (Despite the "loss" in the file names, the pickles hold the lists of weights;
# the names are kept because the loading cell below expects them.)
# The file suffix is derived from the alpha that was actually used for this run,
# so it cannot drift out of sync with the model and overwrite another run's
# results. The ":g" format drops a trailing ".0" (1.0 -> "1", 0.05 -> "0.05").
alpha_tag = f"{alpha:g}"

with open(f'linreg_loss_{alpha_tag}.pkl', 'wb') as f:
  pickle.dump(linreg_weights, f)

with open(f'linearized_loss_{alpha_tag}.pkl', 'wb') as g:
  pickle.dump(linearized_weights, g)

# Visualizing the Weights During Training

In [None]:
# Load the stored weight snapshots of both models, one entry per alpha.
# After this cell, linreg_weights[k] / linearized_weights[k] hold the snapshot
# lists of the run trained with alphas[k].
# NOTE: pickle.load executes arbitrary code; only load files you wrote yourself.
alphas = ["0.05", "0.1", "0.5", "1", "5"]

linreg_weights = []
linearized_weights = []

for alpha_label in alphas:
    linreg_path = 'linreg_loss_' + alpha_label + '.pkl'
    linearized_path = 'linearized_loss_' + alpha_label + '.pkl'

    with open(linreg_path, 'rb') as f:
        linreg_weights.append(pickle.load(f))

    with open(linearized_path, 'rb') as g:
        linearized_weights.append(pickle.load(g))

In [None]:
# Plot the \ell_2 norm of the difference between the nonlinear and linearized model weights throughout training
# As \alpha \rightarrow \infty, we should observe that this norm goes to 0 for all times t
# See Chizat et al. 2018 Theorem 2.2
fig = plt.figure(0)

# Iterate over whatever runs were loaded instead of a hard-coded count of 5,
# so that changing the `alphas` list does not require editing this cell
for alpha_label, nonlinear_run, linearized_run in zip(alphas, linreg_weights, linearized_weights):

  # ||w(t) - \bar{w}(t)||_2 at every stored snapshot, as plain NumPy scalars
  diffs = [tf.norm(w - w_bar, ord=2).numpy() for w, w_bar in zip(nonlinear_run, linearized_run)]

  # Snapshots are spaced 10 epochs apart, hence the factor 10 on the x-axis
  plt.plot(10*np.arange(len(diffs)), diffs, label=f"{alpha_label}")

plt.xlabel(r"Epoch $t$")
plt.ylabel(r"$\left\Vert w(t) - \bar{w}(t) \right\Vert_2$")
plt.legend()

fig.show()
fig.savefig('visualize_weights.png', dpi=300)

In [None]:
# Similarly, we should observe that the \ell_2 norm of the difference between the initialization w(0) and the model weights w(t)
# of the nonlinear model goes to 0 as \alpha \rightarrow \infty for all times t during training
fig = plt.figure(1)

# Iterate over whatever runs were loaded instead of a hard-coded count of 5,
# so that changing the `alphas` list does not require editing this cell
for alpha_label, nonlinear_run in zip(alphas, linreg_weights):

  # ||w(t) - w(0)||_2 at every stored snapshot; the first stored snapshot
  # stands in for w(0)
  delta = [tf.norm(w - nonlinear_run[0], ord=2).numpy() for w in nonlinear_run]

  # Snapshots are spaced 10 epochs apart, hence the factor 10 on the x-axis
  plt.plot(10*np.arange(len(delta)), delta, label=f"{alpha_label}")

plt.xlabel(r"Epoch $t$")
plt.ylabel(r"$\left\Vert w(t) - w(0) \right\Vert_2$")
plt.legend()

fig.show()
fig.savefig('visualize_change.png', dpi=300)