In [1]:
# Reload imported modules (such as the local `utils`) before each cell runs,
# so edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exercise 4

<img src="./images/04.png" width=800>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader, Subset
import numpy as np
from tqdm.autonotebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from utils import train_network, View, set_seed, weight_reset
import mlflow
from torchinfo import summary
import os
from collections import defaultdict

  from tqdm.autonotebook import tqdm


In [None]:
# Store this exercise's MLflow runs in a local directory. The URI is published
# through the environment first and then handed to the MLflow client.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns07_4'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [None]:
# Select (creating it if necessary) the experiment that groups this exercise's runs.
mlflow.set_experiment(experiment_name='Exercise07_4')

<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_07/mlruns07_1/143507330168611334', creation_time=1750415411076, experiment_id='143507330168611334', last_update_time=1750415411076, lifecycle_stage='active', name='Exercise07_1', tags={}>

In [3]:
# Ask cuDNN for deterministic kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True
# `set_seed` comes from the local `utils` module; presumably it seeds the
# random number generators used below — confirm in utils.
set_seed(42)

In [4]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [None]:
class AutoencodDataset(Dataset):
    """Turn a labelled dataset into an autoencoding one.

    Every item of the wrapped dataset must be an ``(input, label)`` pair.
    The label is dropped and the input is returned twice, as
    ``(input, target)``, so the model is trained to reproduce its input.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def __len__(self):
        # Same number of items as the wrapped dataset.
        return len(self.dataset)

    def __getitem__(self, index):
        sample, _ = self.dataset[index]
        return sample, sample

In [None]:
batch_size = 128

# Training split, wrapped so that every item is (image, image).
train_data = AutoencodDataset(
    torchvision.datasets.MNIST(
        root="./data",
        train=True,
        transform=transforms.ToTensor(),
        download=True,
    )
)

# Test split: the labelled version is kept as `test_data_xy`, and the
# (image, image) version is what the validation loader iterates over.
test_data_xy = torchvision.datasets.MNIST(
    root="./data",
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)
test_data_xx = AutoencodDataset(test_data_xy)
test_loader = DataLoader(dataset=test_data_xx, batch_size=batch_size)


## Model

In [None]:
D = 28 ** 2    # number of values in one flattened 28x28 image (784)
n = 64         # width of the encoder output (the bottleneck code)
C = 1          # image channels
classes = 10   # not referenced by the cells shown in this notebook

In [None]:
def count_trainable_parameters(model):
    """Return how many scalar values in ``model`` are trainable.

    A parameter tensor contributes its element count when its
    ``requires_grad`` flag is True; frozen parameters are ignored.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

### Non-Weight Sharing

In [None]:
def get_layer(in_size, out_size):
    """Build one hidden block: Linear -> BatchNorm1d -> ReLU.

    ``in_size`` and ``out_size`` are the input and output widths of the
    linear layer; the batch norm is sized to ``out_size``.
    """
    stages = [
        nn.Linear(in_size, out_size),
        nn.BatchNorm1d(out_size),
        nn.ReLU(),
    ]
    return nn.Sequential(*stages)

In [None]:
# Baseline autoencoder in which the encoder and the decoder own separate weights.

# Encoder half: image -> flat vector -> D//2 -> D//3 -> n.
auto_encoder = nn.Sequential(
    nn.Flatten(),
    *(get_layer(a, b) for a, b in [(D, D // 2), (D // 2, D // 3)]),
    nn.Linear(D // 3, n),
)

# Decoder half: n -> D//3 -> D//2 -> D, squashed by a sigmoid and reshaped
# back to single-channel 28x28 images.
auto_decoder = nn.Sequential(
    *(get_layer(a, b) for a, b in [(n, D // 3), (D // 3, D // 2)]),
    nn.Linear(D // 2, D),
    nn.Sigmoid(),
    View(-1, 1, 28, 28),
)

auto_encoder_non_sharing = nn.Sequential(auto_encoder, auto_decoder)

### With Weight Sharing

In [None]:
class TransposeLinear(nn.Module):
    """Linear layer that reuses the transpose of another layer's weight.

    Given ``linearlayer`` with a weight of shape ``(out, in)``, this module
    maps inputs of width ``out`` to outputs of width ``in`` by computing
    ``x @ weight (+ bias)``. The weight is the very same Parameter object as
    ``linearlayer.weight``, so both layers train one shared matrix.

    Args:
        linearlayer: module whose ``weight`` attribute is shared.
        bias: when True, add a separate learnable bias of width ``in``.
    """

    def __init__(self, linearlayer, bias=False):
        super().__init__()
        # Same Parameter object as the source layer: shared, not copied.
        self.weight = linearlayer.weight
        if bias:
            # torch.empty only allocates; the values are set in reset_parameters().
            self.bias = nn.Parameter(torch.empty(self.weight.shape[1]))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """(Re)initialise the bias only.

        The shared weight is left untouched because it belongs to the source
        layer. The bias is drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)),
        where fan_in is this layer's input width (``weight.shape[0]``).
        """
        if self.bias is None:
            return
        fan_in = self.weight.shape[0]
        bound = fan_in ** -0.5 if fan_in > 0 else 0.0
        nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        # F.linear computes x @ W.T, so passing weight.t() yields x @ weight.
        return F.linear(x, self.weight.t(), self.bias)

In [None]:
def get_layer(layer):
    """Wrap ``layer`` as layer -> BatchNorm1d -> ReLU.

    The batch norm width is read from ``layer.weight``. ``nn.Linear`` stores
    its weight as (out_features, in_features), so its output width is axis 0.
    ``TransposeLinear`` applies that weight transposed, so its output width
    is axis 1.

    Raises:
        ValueError: if ``layer`` is neither of those two types.
    """
    if isinstance(layer, nn.Linear):
        width_axis = 0
    elif isinstance(layer, TransposeLinear):
        width_axis = 1
    else:
        raise ValueError("Unsupported layer type for get_layer")

    num_features = layer.weight.shape[width_axis]
    return nn.Sequential(layer, nn.BatchNorm1d(num_features), nn.ReLU())

In [None]:
# Encoder linear layers; each decoder layer reuses one of these weights transposed.
layer_1 = nn.Linear(D, D // 2)
layer_2 = nn.Linear(D // 2, D // 3)
layer_3 = nn.Linear(D // 3, n)

# Encoder: image -> D -> D//2 -> D//3 -> n.
encoder = nn.Sequential(
    nn.Flatten(),
    get_layer(layer_1),
    get_layer(layer_2),
    get_layer(layer_3),
)

# Decoder: the encoder is undone in reverse, so the transposed layers must be
# applied last-to-first: n -> D//3 -> D//2 -> D. Applying them in encoder
# order would feed an n-wide code into a layer expecting D//2 inputs.
decoder = nn.Sequential(
    get_layer(TransposeLinear(layer_3)),
    get_layer(TransposeLinear(layer_2)),
    TransposeLinear(layer_1),  # output layer: no BatchNorm/ReLU before the sigmoid
    nn.Sigmoid(),
    View(-1, 1, 28, 28),
)

auto_encoder_weight_sharing = nn.Sequential(
    encoder,
    decoder,
)

In [None]:
def get_layer(linear_or_transposelinear_module):
    """Wrap a linear-type module as module -> BatchNorm1d -> ReLU.

    The batch norm width is taken from ``out_features`` for an ``nn.Linear``
    and from ``output_features`` for a ``TransposeLinear``.

    Raises:
        ValueError: if the module is neither of those two types.
    """
    module = linear_or_transposelinear_module

    if isinstance(module, nn.Linear):
        width = module.out_features
    elif isinstance(module, TransposeLinear):
        width = module.output_features
    else:
        raise ValueError("Unsupported layer type for get_layer")

    return nn.Sequential(module, nn.BatchNorm1d(width), nn.ReLU())


class TransposeLinear(nn.Module):
    """Linear layer that applies the transpose of a weight owned elsewhere.

    For a shared weight ``W`` of shape ``(rows, cols)`` the layer maps inputs
    of width ``rows`` to outputs of width ``cols`` by computing
    ``x @ W (+ decoder_bias)``.

    Args:
        shared_weight_tensor: 2-D weight to reuse, typically ``some_linear.weight``.
        output_features: output width; must equal ``shared_weight_tensor.shape[1]``.
            It is stored so that ``get_layer`` can size the following BatchNorm1d.
        decoder_bias: when True, add a bias that belongs to this layer alone.

    Raises:
        ValueError: if the weight is not 2-D or ``output_features`` does not
            match its second dimension.
    """

    def __init__(self, shared_weight_tensor, output_features, decoder_bias=True):
        super().__init__()
        if shared_weight_tensor.dim() != 2 or shared_weight_tensor.shape[1] != output_features:
            raise ValueError(
                f"output_features={output_features} does not match shared weight "
                f"of shape {tuple(shared_weight_tensor.shape)}"
            )

        # Note: when this is an nn.Parameter, nn.Module registers it on this
        # module as well, so the same Parameter object is reachable from both
        # layers. Module.parameters() yields each Parameter object only once,
        # so an optimizer built from the whole model still updates it once.
        self._shared_weight_ref = shared_weight_tensor

        self.output_features = output_features

        if decoder_bias:
            # Only allocated here; the values are drawn in reset_parameters().
            self.decoder_bias = nn.Parameter(torch.empty(output_features))
        else:
            self.register_parameter('decoder_bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the bias from U(-0.1, 0.1); the shared weight is not touched.

        Exposed under the conventional name so that reset helpers which call
        ``reset_parameters`` on each submodule also reinitialise this bias.
        """
        if self.decoder_bias is not None:
            nn.init.uniform_(self.decoder_bias, -0.1, 0.1)

    def forward(self, x):
        # F.linear computes x @ W.T, so passing the transpose yields x @ W.
        return F.linear(x, self._shared_weight_ref.t(), self.decoder_bias)
# 1. Encoder linear layers. These own the weight matrices; the decoder only
#    borrows them. The widths mirror the non-sharing baseline
#    (D -> D//2 -> D//3 -> n) so that the two models are comparable.
encoder_linear_1 = nn.Linear(D, D // 2)       # 784 -> 392
encoder_linear_2 = nn.Linear(D // 2, D // 3)  # 392 -> 261
encoder_linear_3 = nn.Linear(D // 3, n)       # 261 -> 64

# 2. Encoder: hidden layers get BatchNorm + ReLU; the bottleneck layer is a
#    bare linear layer, as in the baseline.
encoder = nn.Sequential(
    nn.Flatten(),
    get_layer(encoder_linear_1),
    get_layer(encoder_linear_2),
    encoder_linear_3,
)

# 3. Decoder: the encoder weights transposed, applied last-to-first.
decoder = nn.Sequential(
    # Transpose of encoder_linear_3: 64 -> 261
    get_layer(TransposeLinear(encoder_linear_3.weight, output_features=D // 3, decoder_bias=True)),

    # Transpose of encoder_linear_2: 261 -> 392
    get_layer(TransposeLinear(encoder_linear_2.weight, output_features=D // 2, decoder_bias=True)),

    # Transpose of encoder_linear_1: 392 -> 784.
    # Output layer: no BatchNorm/ReLU before the sigmoid.
    TransposeLinear(encoder_linear_1.weight, output_features=D, decoder_bias=True),

    nn.Sigmoid(),
    View(-1, 1, 28, 28),
)

auto_encoder_weight_sharing = nn.Sequential(
    encoder,
    decoder,
)
# torchinfo's keyword is `input_size`; a misspelt keyword is not treated as the input shape.
summary(auto_encoder_weight_sharing, input_size=(batch_size, C, 28, 28))

## Training

In [None]:
# Reconstruction objective and the settings that are logged with every run.
epochs = 50
loss_func = nn.MSELoss()

params = dict(
    device=device,
    loss_func=type(loss_func).__name__,  # the class name is logged, not the module object
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Models under comparison, keyed by the name used for the MLflow runs.
models = dict(
    auto_encoder_non_sharing=auto_encoder_non_sharing,
    auto_encoder_weight_sharing=auto_encoder_weight_sharing,
)

In [None]:
# Train every model on growing prefixes of the training set and log each
# (model, sample count) combination as its own MLflow run.
for samples in [1024, 8192, 32768, 60000]:
    params['samples'] = samples
    # Use the first `samples` training images.
    subset_indices = list(range(samples))
    train_subset = Subset(train_data, subset_indices)
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)

    for experiment, model in models.items():
        # The same model objects are reused for every sample count, so their
        # weights are reset first. `weight_reset` comes from the local utils
        # module — confirm which layers it reinitialises.
        model.apply(weight_reset)
        params['experiment'] = experiment
        optimizer = optim.AdamW(model.parameters())

        # torchinfo's table contains non-ASCII tree characters, so the file
        # is written as UTF-8 rather than the platform default encoding.
        with open('model_summary.txt', 'w', encoding='utf-8') as f:
            f.write(str(summary(model, input_size=(batch_size, C, 28, 28))))

        with mlflow.start_run(nested=True, run_name=f'{experiment}_{samples}'):
            # MLflow has no `mlflow.log`; a single key/value pair is logged with log_param.
            mlflow.log_param('optimizer', optimizer.defaults)
            mlflow.log_artifact('model_summary.txt')
            mlflow.log_params(params)

            results = train_network(
                model=model,
                optimizer=optimizer,
                loss_func=loss_func,
                train_loader=train_loader,
                valid_loader=test_loader,
                epochs=epochs,
                device=device,
            )

<img src="./images/E4_train_loss.png">

<img src="./images/E4_valid_loss.png">

<img src="./images/E4_time.png">

#