## Importing necessary libraries and packages

In [71]:
import torch
import torch.nn as nn
from torch import optim
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from torch.optim import lr_scheduler
import math
import wandb
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import random_split, Dataset
from torch.optim.lr_scheduler import LambdaLR
from sklearn.model_selection import train_test_split

## Making the most of my M2 MacBook Pro

In [72]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")

device = 'mps' # 'cuda' if torch.cuda.is_available() else 'cpu'

## Getting the path to some relevent folders

In [73]:
checkpoints_path = '/Users/sienkadounia/lab/ai-futures/Project/ewdd/'
label_noise_path = '/Users/sienkadounia/lab/ai-futures/Project/label_noise/'
rlcts_path = '/Users/sienkadounia/lab/checkpoints/rlcts/ewdd/'

## List all hyperparameters in one cell for easy use

In [81]:
lr = 0.1
num_epochs = 2000
data_seed = 42
N = 1000 # Number of samples
D = 5  # Number of features

sigma = 0.5  # Noise
# Set the random seed and test size
RAND_ST = 42
test_size = 0.2
data_seed = 42
epochs = 1000

## Generating the training and testing data

In [82]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return x, y

In [83]:
# Let's create the data to train our DLN
# Generate feature matrix X and weight vector w
np.random.seed(data_seed)
X = np.random.randn(N, D)  # Samples from standard normal distribution
X = torch.from_numpy(X).float()

np.random.seed(data_seed)
W_true = np.random.randn(N, D) @ np.random.randn(D, N) # Weights from standard normal distribution
W_true = torch.from_numpy(W_true).float()

'''# Generate noise vector epsilon
np.random.seed(data_seed)
epsilon = sigma * np.random.randn(N)  # Noise from standard normal distribution
epsilon = torch.from_numpy(epsilon)'''

# Compute target variable y
#y = torch.matmul(X, w) + epsilon
y = W_true @ X

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=RAND_ST)

train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

## Building Deep Linear Network model architecture

In [84]:
# Define the neural network model
class RidgeRegressionNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RidgeRegressionNN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size, bias=False)
        self.linear2 = nn.Linear(hidden_size, output_size, bias=False)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x

In [85]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def lr_lambda(epoch):
    base_lr = 0.1
    return base_lr/math.sqrt(epoch+1)

## Training loop

In [86]:
# Train the model for a range of hidden layer sizes
hidden_layer_sizes = list(range(0, 200, 20))
train_errors = []
test_errors = []
params = []

for hidden_layer_size in hidden_layer_sizes:
    # Update the model architecture
    torch.manual_seed(data_seed)
    input_size = X_train.shape[-1]
    model = RidgeRegressionNN(input_size=input_size,
    hidden_size = input_size + hidden_layer_size,
    output_size=input_size).to(device)
    param = count_parameters(model)
    params.append(param)
    print(f'Number of model parameters is:  {param}')

    criterion = nn.MSELoss()
    # Define the optimizer
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda)

    wandb.init(project="SLT of Double Descent", 
    config = {'epochs': num_epochs,
              'learning_rate': lr,
              'use_label_noise': 'False',
              'use_test_label_noise': 'False',
              'noise_level': '0',
              'dataset': "Toy Uniform Linear Regression",
              'architecture': "DLN",
              'model_width': count_parameters(model),
              'augmented': 'False',
              'optimizer': 'SGD'})

    wandb.watch(model)
    config = wandb.config

    # Training the model
    step = 0
    for epoch in range(epochs):
        model.train()
        model_train_errors = 0
        for _, data in enumerate(train_loader):
            X_train, y_train = data
            X_train = X_train.to(device)
            y_train = y_train.to(device)
            optimizer.zero_grad()
            y_pred = model(X_train)
            loss = criterion(y_pred.squeeze(), y_train)
            model_train_errors+=loss.item()

            loss.backward()
            optimizer.step()
            
            step += 1
            wandb.log({'batch_loss': loss.item()}, step=step)
        
        model_train_error = model_train_errors / len(train_loader)
        train_errors.append(model_train_error)

        # Computing the training and testing errors
        model.eval()
        model_test_errors = 0
        with torch.no_grad():
            for _, data in enumerate(test_loader):
                X_test, y_test = data
                X_test = X_test.to(device)
                y_test = y_test.to(device)

                y_test_pred = model(X_test)
                test_error = criterion(y_test_pred, y_test)
                model_test_errors+=test_error.item()
                
        model_test_error = model_test_errors / len(test_loader)
        test_errors.append(model_test_error)

        wandb.log({'epoch': epoch,
                    'loss/train': model_train_error,
                    'loss/test': model_test_error,
                }, step=step)

torch.save(model.state_dict(), checkpoints_path + 'dln'+str(hidden_layer_size)+'.pth')

Number of model parameters is:  50




0,1
batch_loss,█▇▇▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,███▇▅▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,███▇▆▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,9.10723
epoch,80.0
loss/test,10.86866
loss/train,11.03247


Number of model parameters is:  250




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  450




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  650




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  850




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  1050




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  1250




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  1450




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  1650




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


Number of model parameters is:  1850




0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/test,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_loss,0.0
epoch,999.0
loss/test,0.0
loss/train,0.0


In [None]:
# Ploting the training and testing errors as a function of hidden layer size
plt.plot(params, test_errors, label='Testing error')
plt.plot(params, train_errors, label='Training error')
plt.axvline(x=N, color='r', linestyle='--', label='Model parameters= Number of samples')
plt.xlabel('Parameters')
plt.ylabel('MSE')
plt.xscale('log')
#plt.yscale('log')
plt.legend()
plt.show()