In [1]:
import torch

from model import Model
import utils

# Getting data
For this task we will use a sample salary dataset, generated by `utils.create_data` from a set of weights and a bias.

In [2]:
# Seed torch's global RNG so that Restart & Run All gives the same run each time.
# NOTE(review): this only covers code that draws from torch's RNG; if `utils`
# uses another generator (e.g. NumPy or `random`), seed that one too — confirm.
torch.manual_seed(0)

# Dataset sizes, input dimensionality and mini-batch size.
n_train, n_test = 1000, 200
n_inputs, batch_size = 200, 10

# Weights and bias passed to both create_data calls below.
weights, bias = utils.get_weights_and_bias(n_inputs)

# Train and test sets are built from the same weights and bias.
train_data = utils.create_data(weights, bias, n_train)
test_data = utils.create_data(weights, bias, n_test)

# Loaders for both splits, using the same batch size.
train_loader = utils.get_dataloader(train_data, batch_size=batch_size)
test_loader = utils.get_dataloader(test_data, batch_size=batch_size)

# Training the model 
Now we will create the model and train it.

In [3]:
# Number of full passes over the training loader used by each train() call.
epochs = 100

# Model under training, constructed from the weights and bias defined above.
model = Model(weights, bias)

# Mean-squared-error objective, minimised with plain SGD.
loss_fn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [7]:
def train(epochs, lambda_):
    """Train the notebook-level ``model`` with an explicit L2 penalty.

    On every mini-batch the minimised objective is
    ``loss_fn(outputs, targets) + lambda_ * sum(model.weights ** 2)``, so
    ``lambda_ = 0`` switches weight decay off. The function reads the
    notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``train_loader``
    and updates ``model`` in place through ``optimizer.step()``.

    Args:
        epochs: Number of full passes over ``train_loader``.
        lambda_: Coefficient multiplying the sum of squared ``model.weights``.

    Prints:
        Every 10th epoch, the objective of the last batch of that epoch
        (data term plus penalty); at the end, the L2 norm of ``model.weights``.
    """
    for epoch in range(epochs):
        # Reset per epoch so the logging below never reads an unbound or stale
        # value when the loader yields no batches.
        loss = None
        for inputs, targets in train_loader:
            # Clear gradients left over from the previous step before building
            # the new graph.
            optimizer.zero_grad()
            outputs = model(inputs.detach())
            penalty = lambda_ * torch.sum(model.weights ** 2)
            loss = loss_fn(outputs, targets) + penalty
            # NOTE(review): retain_graph=True is kept from the original;
            # presumably part of the autograd graph is shared across batches.
            # Confirm against utils/Model before dropping it.
            loss.backward(retain_graph=True)
            optimizer.step()

        if (epoch + 1) % 10 == 0 and loss is not None:
            print(f'Epoch {epoch + 1}: loss: {loss.item()}')

    print('L2 norm of w:', torch.norm(model.weights).item())


In [8]:
# Baseline run: lambda_ = 0 removes the L2 penalty, i.e. no weight decay.
train(epochs=epochs, lambda_=0.0)

Epoch 10: loss: 0.00017794388986658305
Epoch 20: loss: 0.00018841915880329907
Epoch 30: loss: 0.00018843093130271882
Epoch 40: loss: 0.00018838970572687685
Epoch 50: loss: 0.00018840693519450724
Epoch 60: loss: 0.00018842447025235742
Epoch 70: loss: 0.00018840274424292147
Epoch 80: loss: 0.00018841258133761585
Epoch 90: loss: 0.00018838851246982813
Epoch 100: loss: 0.00018843838188331574
L2 norm of w: 13.886139869689941


In [10]:
# Training with weight decay.
# Rebuild the model and its optimizer first, so this run does not continue from
# the parameters left behind by an earlier train() call and gives the same
# result regardless of which cells were executed before it.
model = Model(weights, bias)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
train(epochs, lambda_=2)

Epoch 10: loss: 219.04275512695312
Epoch 20: loss: 219.0427703857422
Epoch 30: loss: 219.0427703857422
Epoch 40: loss: 219.0427703857422
Epoch 50: loss: 219.0427703857422
Epoch 60: loss: 219.0427703857422
Epoch 70: loss: 219.0427703857422
Epoch 80: loss: 219.04278564453125
Epoch 90: loss: 219.0427703857422
Epoch 100: loss: 219.0427703857422
L2 norm of w: 5.704485893249512


# Conclusion
We have implemented weight decay from scratch with an adjustable factor lambda. Thus we can use the same script to train the model with different values of lambda, or even without weight decay (lambda = 0).

## What we found
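In the first training run we did not use weight decay, and the training loss is very small. In the second run, with weight decay ($\lambda = 2$), the reported training loss is much larger. Note that the printed loss includes the penalty term $\lambda \lVert w \rVert^2$ (about $2 \times 5.70^2 \approx 65$ of the final $\approx 219$), so it is not the data-fit error alone. Comparing the L2 norms of the weights, the norm after the first run ($\approx 13.89$) is about 2.4 times the norm after the second run ($\approx 5.70$). This is the effect of weight decay: penalising the squared norm of the weights pushes them toward smaller values.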