<a href="https://colab.research.google.com/github/rallm/IUST-DL-Fall2025/blob/main/HW5/helper/hw5_p1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

# Define the network architecture
class Net(nn.Module):
    """Four-layer stack: two tanh RNN layers followed by two linear layers.

    Only the layers are declared; no ``forward`` method is defined here, so
    the class is suited to inspecting layer definitions rather than to
    running inputs through the network.
    """

    def __init__(self):
        super().__init__()

        # Options shared by both recurrent layers.
        rnn_options = {"num_layers": 1, "batch_first": True, "nonlinearity": "tanh"}

        # Recurrent stage: feature width goes 64 -> 32 -> 64.
        self.layer1 = nn.RNN(input_size=64, hidden_size=32, **rnn_options)
        self.layer2 = nn.RNN(input_size=32, hidden_size=64, **rnn_options)

        # Dense stage: feature width goes 64 -> 512 -> 10.
        self.layer3 = nn.Linear(64, 512)
        self.layer4 = nn.Linear(512, 10)

# Build the single Net instance whose layers are inspected below.
model = Net()

In [5]:
# --- Part A: Output Shapes & Parameters ---
def count_params(layer: nn.Module) -> tuple[int, int]:
    """Count the weight and bias elements of a module.

    Each parameter is classified by the last component of its dotted name
    (e.g. ``weight_ih_l0`` or ``bias``), not by the full path. This keeps a
    submodule whose own name happens to contain "weight" or "bias" from
    pushing its parameters into the wrong bucket or into both.

    Args:
        layer: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``.

    Returns:
        ``(weights, biases)``: total element counts of parameters whose leaf
        name contains "weight" and "bias" respectively. Parameters matching
        neither are not counted.
    """
    weights = 0
    biases = 0
    # Single pass over the parameters instead of one traversal per category.
    for name, param in layer.named_parameters():
        leaf = name.rpartition('.')[2]
        if 'weight' in leaf:
            weights += param.numel()
        elif 'bias' in leaf:
            biases += param.numel()
    return weights, biases

# Report, layer by layer, how many weight and bias elements the model holds.
print("--- Part A: Parameters ---")

# The RNN layers are tallied through count_params, which groups their
# parameters by name.
l1_w, l1_b = count_params(model.layer1)
l2_w, l2_b = count_params(model.layer2)

# The linear layers are read directly from their weight and bias tensors.
l3_w, l3_b = model.layer3.weight.numel(), model.layer3.bias.numel()
l4_w, l4_b = model.layer4.weight.numel(), model.layer4.bias.numel()

print(f"Layer 1 (RNN): Weights={l1_w}, Biases={l1_b}, Total={l1_w + l1_b}")
print(f"Layer 2 (RNN): Weights={l2_w}, Biases={l2_b}, Total={l2_w + l2_b}")
print(f"Layer 3 (Linear): Weights={l3_w}, Biases={l3_b}, Total={l3_w + l3_b}")
print(f"Layer 4 (Linear): Weights={l4_w}, Biases={l4_b}, Total={l4_w + l4_b}")

--- Part A: Parameters ---
Layer 1 (RNN): Weights=3072, Biases=64, Total=3136
Layer 2 (RNN): Weights=6144, Biases=128, Total=6272
Layer 3 (Linear): Weights=32768, Biases=512, Total=33280
Layer 4 (Linear): Weights=5120, Biases=10, Total=5130


In [4]:
# --- Part B: FLOPs Estimation ---
# Formula for RNN Cell (per step):
# Muls = H * D (W_ih) + H * H (W_hh)
# Adds = H * D (W_ih) + H * H (W_hh) + H (bias_ih) + H (bias_hh)
# Total ops per step = 2 * H * (D + H) + 2 * H, where the 2 * H term is the two bias additions; the tanh activation cost is not counted.
# Here we calculate Muls and Adds specifically.

def calculate_rnn_flops(seq_len: int, input_size: int, hidden_size: int) -> int:
    """Estimate the multiply and add count of a single-layer RNN over a sequence.

    Per time step the cell performs two matrix-vector products, with W_ih of
    shape (H, D) and W_hh of shape (H, H), plus two bias additions. The
    activation function is not counted.

    Args:
        seq_len: Number of time steps processed.
        input_size: Input feature width D.
        hidden_size: Hidden state width H.

    Returns:
        Total multiplications plus additions across all time steps.
    """
    # One multiply per weight entry across both matrices.
    matvec_products = hidden_size * (input_size + hidden_size)

    multiplications = matvec_products
    # One accumulate per product, then one add for each of the two bias vectors.
    additions = matvec_products + 2 * hidden_size

    return seq_len * (multiplications + additions)

def calculate_linear_flops(input_size: int, output_size: int) -> int:
    """Estimate the multiply and add count of one fully connected layer application.

    Args:
        input_size: Number of input features.
        output_size: Number of output features.

    Returns:
        Multiplications plus additions for a single input vector, counting
        one multiply and one add per weight entry and one add per bias entry.
    """
    weight_entries = input_size * output_size
    # Each weight entry costs a multiply and an accumulate; the bias adds one per output.
    return weight_entries + (weight_entries + output_size)

# Report the estimated operation count of every layer for one input sequence.
print("\n--- Part B: FLOPs ---")
seq_len = 50

# The recurrent layers are charged once per time step, with (input, hidden) sizes per layer.
l1_flops, l2_flops = (
    calculate_rnn_flops(seq_len, in_dim, hid_dim)
    for in_dim, hid_dim in ((64, 32), (32, 64))
)

# Assumption: the linear layers see only the final hidden state (many-to-one),
# so each one is charged for a single application rather than seq_len of them.
l3_flops, l4_flops = (
    calculate_linear_flops(in_dim, out_dim)
    for in_dim, out_dim in ((64, 512), (512, 10))
)

print(f"L1 FLOPs: {l1_flops}")
print(f"L2 FLOPs: {l2_flops}")
print(f"L3 FLOPs: {l3_flops}")
print(f"L4 FLOPs: {l4_flops}")


--- Part B: FLOPs ---
L1 FLOPs: 310400
L2 FLOPs: 620800
L3 FLOPs: 66048
L4 FLOPs: 10250
