In [None]:
# Library imports.
import numpy as np
import matplotlib.pyplot as plt
import sys
import torch
import torchvision.datasets
sys.path.append('../')
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and 3.8 removed the
# bare 'seaborn' alias, so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Repository imports.
from FFBrainNet import FFBrainNet
from LocalNetBase import Options, UpdateScheme
from DataGenerator import random_halfspace_data
from train import *

Decide which form of plasticity rule to analyze:

In [None]:
# Select the plasticity-rule network class. Every PlasRuleClass(...) instantiation further
# down uses this alias, so changing the import/alias here switches the rule form analyzed.
from FFLocalTableRules.FFLocalTable_PrePostCount import FFLocalTable_PrePostCount
PlasRuleClass = FFLocalTable_PrePostCount

---
# Train an FFBrainNet using regular gradient descent on all parameters:

In [None]:
# Generate 4-dimensional halfspace data (requested with n=3*N), then hold out the
# first N rows for testing and keep everything after them for training.
N = 1000
X, y = random_halfspace_data(dim=4, n=3*N)
X_test, y_test = X[:N], y[:N]
X_train, y_train = X[N:], y[N:]

In [None]:
# Build a multi-layer, randomly connected, and capped feed-forward net.
# full_gd=True: this network is the plain gradient-descent baseline of the notebook.
brain = FFBrainNet(
    n=4,
    m=2,
    l=4,
    w=20,
    p=0.5,
    cap=5,
    full_gd=True,
)

In [None]:
# Train the net using regular backprop on the weights (vanilla=True).
# The five returned values are unpacked for the stats and plotting cells that follow.
all_losses, all_train_acc, all_test_acc, sample_counts, other_stats = train_downstream(
    X_train,
    y_train,
    brain,
    num_epochs=10,
    batch_size=100,
    vanilla=True,
    learn_rate=0.1,
    X_test=X_test,
    y_test=y_test,
    verbose=True,
)

In [None]:
# Print essential stats: the last recorded loss and train/test accuracy of the backprop run.
print("Last loss: {0:.4f}".format(all_losses[-1]))
print("Last train accuracy: {0:.4f}".format(all_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(all_test_acc[-1]))

In [None]:
# Plot downstream training curves on one explicit Axes.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
fig, ax = plt.subplots()
ax.plot(sample_counts[1:], all_losses[1:], label='loss')
ax.plot(sample_counts, all_train_acc, label='train')
ax.plot(sample_counts, all_test_acc, label='test')
ax.set_xlabel('Cumulative number of training samples')
ax.set_ylabel('Accuracy')
ax.set_title('Regular backprop learning curves')
ax.grid(True)
ax.legend()
plt.show()

---
# Meta-Learn an Output-layer Plasticity Rule on halfspace data:

In [None]:
# Generate halfspace data, hold out the first N rows as the test split, and keep
# the rest under the names X / y for meta-learning.
dimension = 4
N = 1000
X, y = random_halfspace_data(dim=dimension, n=3*N)
X_test, y_test = X[:N], y[:N]
X, y = X[N:], y[N:]

In [None]:
# Options: meta-learn the output layer plasticity rule while the input weights are
# trained directly with regular gradient descent.
opts = Options(
    gd_input=True,
    use_output_rule=True,
    gd_output_rule=True,
)
scheme = UpdateScheme(
    update_misclassified_only=False,
    update_all_edges=True,
)

In [None]:
# Instantiate an FFLocalNet with 1 hidden layer (l=1) of width 10 (w=10).
brain = PlasRuleClass(
    n=4,
    m=2,
    l=1,
    w=10,
    p=0.5,
    cap=5,
    options=opts,
    update_scheme=scheme,
)

# Print the output rule before any meta-learning (zero initialized).
print('brain output_rule:')
print(brain.get_output_rule())

In [None]:
# Meta-learn a plasticity rule for the output layer; the five returned values feed
# the stats/plot cell below.
meta_losses, meta_train_acc, meta_test_acc, meta_sample_counts, meta_stats = metalearn_rules(
    X,
    y,
    brain,
    num_rule_epochs=20,
    num_epochs=2,
    batch_size=100,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=True,
)

In [None]:
# Show updated output layer rule
# Print tensors with 4 decimals and no scientific notation. This is a global torch print
# setting, so it also applies to tensors printed by later cells.
torch.set_printoptions(precision=4, sci_mode=False)
print('brain output_layer_rule:')
print(brain.get_output_rule())

In [None]:
# Print essential stats of the meta-learning (upstream) run.
print("Last loss: {0:.4f}".format(meta_losses[-1]))
print("Last train accuracy: {0:.4f}".format(meta_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(meta_test_acc[-1]))

# Plot upstream training curves on one explicit Axes.
fig, ax = plt.subplots()
ax.plot(meta_sample_counts, meta_losses, label='loss')
ax.plot(meta_sample_counts, meta_train_acc, label='train')
ax.plot(meta_sample_counts, meta_test_acc, label='test')
ax.set_xlabel('Cumulative number of training samples')
ax.set_ylabel('Accuracy')
ax.set_title('Plasticity-based upstream meta-learning curves')
ax.grid(True)
ax.legend()
plt.show()

## Retrain on the **same data** using the learned plasticity rule:

In [None]:
# Downstream run with the learned rule (vanilla=False), one sample per update step.
# NOTE: The output weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of the downstream (plasticity-rule) run. These must be read from
# the plas_* series returned by train_downstream, not from the meta_* meta-learning series.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

## Train with the already learned rule on a **different** halfspace:

**NOTE**: The input weights learned from previous data will still be used.

In [None]:
# Generate a fresh halfspace dataset; first N rows are the test split, the rest
# replace X / y as the new training data.
dimension = 4
N = 3000
X, y = random_halfspace_data(dim=dimension, n=3*N)
X_test, y_test = X[:N], y[:N]
X, y = X[N:], y[N:]

In [None]:
# Downstream run on the new halfspace with the rule already stored in `brain`.
# NOTE: The output weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of the downstream run on the new halfspace. These must be read from
# the plas_* series returned by train_downstream, not from the meta_* meta-learning series.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

## Does re-learning **input weights** for this new data improve the performance of the output plasticity rule?

In [None]:
# Create a second network with the same constructor arguments as `brain` above.
brain2 = PlasRuleClass(
    n=4,
    m=2,
    l=1,
    w=10,
    p=0.5,
    cap=5,
    options=opts,
    update_scheme=scheme,
)

In [None]:
# Learn input weights (via GD) for the new data.
# NOTE: This also meta-learns a new output layer rule for brain2, which is discarded later.
meta_losses, meta_train_acc, meta_test_acc, meta_sample_counts, meta_stats = metalearn_rules(
    X,
    y,
    brain2,
    num_rule_epochs=20,
    num_epochs=2,
    batch_size=100,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=True,
)

In [None]:
# Replace the output layer plasticity rule with the rule we learned from the original data.
# clone().detach() copies the values so brain2's rule is a separate tensor with no autograd
# history from brain. torch.nn is spelled out because this notebook never imports `nn`
# itself; the bare name would only resolve if a star-import happened to provide it.
brain2.output_rule = torch.nn.Parameter(brain.output_rule.clone().detach())

In [None]:
# Downstream run on brain2: original output layer plasticity rule, but input weights
# that were GD-learned for this data.
# NOTE: The output weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain2,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of brain2's downstream run. These must be read from the plas_*
# series returned by train_downstream; the meta_* series here belong to brain2's
# meta-learning pass, whose learned rule was discarded.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

If this performance is significantly better than the previous plot, then the input weights are important. \
So, **YES** - the input weights make a big difference!

---
# Meta-Learn a Hidden-layer Plasticity Rule on halfspace data:

In [None]:
# Generate halfspace data for the hidden-layer experiments: first N rows are the
# test split, the remainder is kept as X / y for meta-learning.
dimension = 4
N = 1000
X, y = random_halfspace_data(dim=dimension, n=3*N)
X_test, y_test = X[:N], y[:N]
X, y = X[N:], y[N:]

In [None]:
# Options: meta-learn a hidden layer plasticity rule while input and output weights
# are trained directly with regular gradient descent.
opts = Options(
    use_graph_rule=True,
    gd_graph_rule=True,
    gd_input=True,
    gd_output=True,
)
scheme = UpdateScheme(
    update_misclassified_only=False,
    update_all_edges=True,
)

In [None]:
# Instantiate an FFLocalNet with 2 hidden layers (l=2) of width 20 (w=20).
brain = PlasRuleClass(
    n=4,
    m=2,
    l=2,
    w=20,
    p=0.5,
    cap=10,
    options=opts,
    update_scheme=scheme,
)

# Print the hidden layer rule before any meta-learning (randomly initialized).
print('brain hidden_layer_rule:')
print(brain.get_hidden_layer_rule())

In [None]:
# Meta-learn a single plasticity rule between the two hidden layers; the five
# returned values feed the stats/plot cell below.
meta_losses, meta_train_acc, meta_test_acc, meta_sample_counts, meta_stats = metalearn_rules(
    X,
    y,
    brain,
    num_rule_epochs=20,
    num_epochs=2,
    batch_size=100,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
)

In [None]:
# all_true_y, all_pred_y = meta_stats[:2]
# all_true_y = np.array(all_true_y, dtype=np.int32)
# all_pred_y = np.array(all_pred_y, dtype=np.int32)
# plt.hist(all_true_y, bins=2)
# plt.hist(all_pred_y, bins=2)

In [None]:
# Show updated hidden layer rule (compare with the values printed right after instantiation).
print('brain hidden_layer_rule:')
print(brain.get_hidden_layer_rule())

In [None]:
# Print essential stats of the hidden-layer meta-learning (upstream) run.
print("Last loss: {0:.4f}".format(meta_losses[-1]))
print("Last train accuracy: {0:.4f}".format(meta_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(meta_test_acc[-1]))

# Plot upstream training curves on one explicit Axes.
fig, ax = plt.subplots()
ax.plot(meta_sample_counts, meta_losses, label='loss')
ax.plot(meta_sample_counts, meta_train_acc, label='train')
ax.plot(meta_sample_counts, meta_test_acc, label='test')
ax.set_xlabel('Cumulative number of training samples')
ax.set_ylabel('Accuracy')
ax.set_title('Plasticity-based upstream meta-learning curves')
ax.grid(True)
ax.legend()
plt.show()

## Retrain on the **same data** using the learned plasticity rule:

In [None]:
# Downstream run with the learned hidden-layer rule (vanilla=False), one sample per step.
# NOTE: The hidden layer weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of the downstream (plasticity-rule) run. These must be read from
# the plas_* series returned by train_downstream, not from the meta_* meta-learning series.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

## Train with the already learned rule on a **different** halfspace:

**NOTE**: the input and output weights learned from previous data will still be used.

In [None]:
# Generate a fresh halfspace dataset; first N rows are the test split, the rest
# replace X / y as the new training data.
dimension = 4
N = 5000
X, y = random_halfspace_data(dim=dimension, n=3*N)
X_test, y_test = X[:N], y[:N]
X, y = X[N:], y[N:]

In [None]:
# Downstream run on the new halfspace with the hidden-layer rule already stored in `brain`.
# NOTE: The hidden layer weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of the downstream run on the new halfspace. These must be read from
# the plas_* series returned by train_downstream, not from the meta_* meta-learning series.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

## Does re-learning **input & output weights** for this new data improve the performance of the hidden layer plasticity rule?

In [None]:
# Create a second network with the same constructor arguments as `brain` above.
brain2 = PlasRuleClass(
    n=4,
    m=2,
    l=2,
    w=20,
    p=0.5,
    cap=10,
    options=opts,
    update_scheme=scheme,
)

In [None]:
# Learn input & output weights (via GD) for the new data.
# NOTE: This also meta-learns a new hidden layer rule for brain2, which is discarded later.
meta_losses, meta_train_acc, meta_test_acc, meta_sample_counts, meta_stats = metalearn_rules(
    X,
    y,
    brain2,
    num_rule_epochs=20,
    num_epochs=2,
    batch_size=100,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
)

In [None]:
# Replace the hidden layer plasticity rule with the rule we learned from the original data.
# clone().detach() copies the values so brain2's rule is a separate tensor with no autograd
# history from brain. torch.nn is spelled out because this notebook never imports `nn`
# itself; the bare name would only resolve if a star-import happened to provide it.
brain2.hidden_layer_rule = torch.nn.Parameter(brain.hidden_layer_rule.clone().detach())

In [None]:
# Downstream run on brain2: original hidden layer plasticity rule, but input & output
# weights that were GD-learned for this data.
# NOTE: The hidden layer weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X,
    y,
    brain2,
    num_epochs=1,
    batch_size=1,
    vanilla=False,
    learn_rate=1e-2,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=500,
)

In [None]:
# Print essential stats of brain2's downstream run. These must be read from the plas_*
# series returned by train_downstream; the meta_* series here belong to brain2's
# meta-learning pass, whose learned rule was discarded.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves')
plt.grid(True)
plt.legend()
plt.show()

If this performance is significantly better than the previous plot, then the input & output weights are important. \
So, **YES** - the input & output weights make a big difference!

---

# Validate Output Layer meta-learning by recreating **Figure 2**

In [None]:
# Bring in MNIST (downloaded into ./data when not already present).
mnist_train = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=None)
mnist_test = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=None)

# Read the raw uint8 image tensors and the label tensors straight from the datasets
# instead of indexing every sample (which builds a PIL image per access, and did so
# twice per sample: once for X and once for y). Each image is flattened to one row
# and pixel values are scaled to [0, 1].
X_train = mnist_train.data.numpy().reshape(len(mnist_train), -1) / 255.0
y_train = mnist_train.targets.numpy()
X_test = mnist_test.data.numpy().reshape(len(mnist_test), -1) / 255.0
y_test = mnist_test.targets.numpy()

Train with full GD on MNIST with one hidden layer, w=1000 \
This should be similar to the benchmark GD case for Figure 2 (GD-Trained w/ batch=200)

In [None]:
# Gradient-descent baseline on MNIST: 784 inputs, 10 outputs, one hidden layer of width 1000.
brain = FFBrainNet(
    n=784,
    m=10,
    l=1,
    w=1000,
    p=0.1,
    cap=100,
    full_gd=True,
)
gd_losses, gd_train_acc, gd_test_acc, gd_sample_counts, gd_stats = train_downstream(
    X_train,
    y_train,
    brain,
    num_epochs=2,
    batch_size=200,
    vanilla=True,
    learn_rate=1e-3,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=3000,
)

In [None]:
# Print essential stats of the gradient-descent MNIST baseline.
print("Last loss: {0:.4f}".format(gd_losses[-1]))
print("Last train accuracy: {0:.4f}".format(gd_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(gd_test_acc[-1]))

# Plot downstream training curves on one explicit Axes.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
fig, ax = plt.subplots()
ax.plot(gd_sample_counts[1:], gd_losses[1:], label='loss')
ax.plot(gd_sample_counts, gd_train_acc, label='train')
ax.plot(gd_sample_counts, gd_test_acc, label='test')
ax.set_xlabel('Cumulative number of training samples')
ax.set_ylabel('Accuracy')
ax.set_title('Backprop learning curves on MNIST (batch size = 200)')
ax.grid(True)
ax.legend()
plt.show()

Now compare this to learning via an output plasticity rule. \
We'll first meta-learn an output rule using MNIST:

In [None]:
# Learn an output layer plasticity rule on MNIST, with input weights trained by GD.
opts = Options(
    use_output_rule=True,
    gd_output_rule=True,
    gd_input=True,
)
scheme = UpdateScheme(
    update_misclassified_only=False,
    update_all_edges=True,
)
# Same layer sizes and connectivity arguments as the gradient-descent MNIST baseline.
local_brain = PlasRuleClass(
    n=784,
    m=10,
    l=1,
    w=1000,
    p=0.1,
    cap=100,
    options=opts,
    update_scheme=scheme,
)

meta_losses, meta_train_acc, meta_test_acc, meta_sample_counts, meta_stats = metalearn_rules(
    X_train,
    y_train,
    local_brain,
    num_rule_epochs=10,
    num_epochs=1,
    batch_size=100,
    learn_rate=1e-3,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
)

In [None]:
# Print essential stats of the MNIST meta-learning (upstream) run.
print("Last loss: {0:.4f}".format(meta_losses[-1]))
print("Last train accuracy: {0:.4f}".format(meta_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(meta_test_acc[-1]))

# Plot upstream training curves on one explicit Axes.
fig, ax = plt.subplots()
ax.plot(meta_sample_counts, meta_losses, label='loss')
ax.plot(meta_sample_counts, meta_train_acc, label='train')
ax.plot(meta_sample_counts, meta_test_acc, label='test')
ax.set_xlabel('Cumulative number of training samples')
ax.set_ylabel('Accuracy')
ax.set_title('Plasticity-based upstream meta-learning curves on MNIST')
ax.grid(True)
ax.legend()
plt.show()

Now try plasticity-based training using the learned output rule:

In [None]:
# Downstream MNIST run with the learned output rule (vanilla=False), using the same
# epochs, batch size and stats interval as the gradient-descent baseline.
# NOTE: The output weights will be automatically reset during the first sample within train_given_rule()
plas_losses, plas_train_acc, plas_test_acc, plas_sample_counts, plas_stats = train_downstream(
    X_train,
    y_train,
    local_brain,
    num_epochs=2,
    batch_size=200,
    vanilla=False,
    learn_rate=1e-3,
    X_test=X_test,
    y_test=y_test,
    verbose=False,
    stats_interval=3000,
)

In [None]:
# Print essential stats of the rule-based downstream MNIST run. These must be read from
# the plas_* series returned by train_downstream, not from the meta_* meta-learning series.
print("Last loss: {0:.4f}".format(plas_losses[-1]))
print("Last train accuracy: {0:.4f}".format(plas_train_acc[-1]))
print("Last test accuracy: {0:.4f}".format(plas_test_acc[-1]))

# Plot downstream training curves.
# The loss curve is drawn from the second recorded point on; accuracies from the first.
plt.figure()
plt.plot(plas_sample_counts[1:], plas_losses[1:], label='loss')
plt.plot(plas_sample_counts, plas_train_acc, label='train')
plt.plot(plas_sample_counts, plas_test_acc, label='test')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Plasticity-based downstream learning curves on MNIST')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Compare the learning curves.
# Overlays the train-accuracy series of the rule-trained run (plas_*) and the
# gradient-descent run (gd_*) against their cumulative sample counts.
# NOTE(review): unlike the other plotting cells, no plt.figure() is called first, so the
# curves go onto whatever pyplot considers the current figure.
plt.plot(plas_sample_counts, plas_train_acc, label='Rule-Trained')
plt.plot(gd_sample_counts, gd_train_acc, label='GD-Trained (batch 200)')
plt.xlabel('Cumulative number of training samples')
plt.ylabel('Accuracy')
plt.title('Convergence of Accuracy')
plt.grid(True)
plt.legend()
plt.show()

---