# PyTorch Tensors: Vectors & Matrices

This notebook covers the fundamentals of PyTorch tensors - the building blocks of neural networks.

## Table of Contents
1. [Core Mental Model](#core-mental-model)
2. [Vector/Matrix Operations](#vectormatrix-operations)
3. [One-Hot vs Embedding Vectors](#one-hot-vs-embedding-vectors)
4. [Tiny Exercise: Batch Operations](#tiny-exercise-batch-operations)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Seed both RNGs so every random tensor created below is reproducible across runs
torch.manual_seed(0)
np.random.seed(0)

# Pin all work to the CPU so results do not depend on whether a GPU is present.
# `device` is defined before it is reported so the "Device" line always names
# the device actually in use; CUDA availability is shown separately.
device = torch.device('cpu')
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {device}")

## Core Mental Model

Tensors are n-dimensional arrays that carry data — and, when `requires_grad=True`, gradients — through neural networks. Think of them as generalized matrices that record the operations applied to them so that autograd can compute derivatives automatically.

In [None]:
# A rank-1 tensor (vector) built directly from Python floats
x = torch.tensor((1.0, 2.0, 3.0))
print(f"x: {x}, shape: {x.shape}, dtype: {x.dtype}")

# Factory functions for the usual cases: constant fills, random draws, ranges
zeros = torch.zeros((3, 4))                      # 3x4 matrix filled with 0
ones = torch.ones((2, 3, 4))                     # 2x3x4 tensor filled with 1
randn = torch.randn((2, 5))                      # 2x5 draws from a standard normal
arange = torch.arange(start=0, end=10, step=2)   # 0, 2, 4, 6, 8 (end is exclusive)

# Metadata carried by every tensor: shape, device, and autograd flag
print(
    f"zeros shape: {zeros.shape}",
    f"zeros device: {zeros.device}",
    f"zeros requires_grad: {zeros.requires_grad}",
    sep="\n",
)

# Shapes / contents of the remaining tensors
print(
    f"\nones shape: {ones.shape}",
    f"randn shape: {randn.shape}",
    f"arange: {arange}",
    sep="\n",
)

## Vector/Matrix Operations

In [None]:
# Sizes for a toy linear layer: B samples, D_in input features, D_out hidden units
batch_size, d_input, d_hidden = 4, 6, 8
X = torch.randn((batch_size, d_input))   # inputs  [B, D_in]
W = torch.randn((d_input, d_hidden))     # weights [D_in, D_out]
b = torch.randn((d_hidden,))             # bias    [D_out]

# Affine map y = XW + b; the [D_out] bias is broadcast over the B rows of XW
y = torch.matmul(X, W) + b
print(f"X shape: {X.shape}, W shape: {W.shape}, y shape: {y.shape}")

# Indexing and slicing on the [B, D_in] input
first_sample = X[0, :]   # row 0           -> [D_in]
first_two = X[0:2]       # rows 0 and 1    -> [2, D_in]
last_dim = X[:, -1]      # final column    -> [B]

print(
    f"\nFirst sample shape: {first_sample.shape}",
    f"First two samples shape: {first_two.shape}",
    f"Last dimension across batch shape: {last_dim.shape}",
    sep="\n",
)

# stack inserts a new leading axis; cat extends an axis that already exists
X1 = torch.randn((4, 6))
X2 = torch.randn((4, 6))
stacked = torch.stack((X1, X2), dim=0)   # [2, 4, 6]
concat = torch.cat((X1, X2), dim=0)      # [8, 6]

print(
    f"\nX1 shape: {X1.shape}, X2 shape: {X2.shape}",
    f"Stacked shape: {stacked.shape}",
    f"Concatenated shape: {concat.shape}",
    sep="\n",
)

## One-Hot vs Embedding Vectors

In [None]:
# One-hot: a vocab_size-long vector with a single 1 at the token's index.
# Sparse and large, so inefficient for big vocabularies.
vocab_size = 1000
token_id = 42
onehot = torch.zeros(vocab_size)
onehot[token_id] = 1.0
print(f"One-hot vector size: {onehot.shape}, mostly zeros: {torch.sum(onehot == 0).item()}")

# Embedding: a learnable [vocab_size, embedding_dim] table; a token ID selects one row
embedding_dim = 64
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedded = embedding_layer(torch.tensor([token_id]))  # [1, embedding_dim]
print(f"Embedding vector size: {embedded.shape}, all meaningful values")

# The two representations are linked: multiplying the one-hot vector by the
# embedding table picks out exactly the row that the lookup returns.
with torch.no_grad():  # pure comparison, no need to track gradients
    onehot_lookup = onehot @ embedding_layer.weight  # [embedding_dim]
assert torch.allclose(onehot_lookup, embedded[0]), "one-hot matmul should equal embedding lookup"
print("One-hot @ embedding table matches the embedding lookup: True")

# Why embeddings are better:
print("\nWhy embeddings are better:")
# The comparison is per token vector; the table itself holds vocab_size * embedding_dim floats
print(f"1. Memory (per token vector): {embedding_dim} floats vs {vocab_size} floats")
print("2. Computation: Dense ops vs sparse ops")
print("3. Learning: Embeddings learn representations, one-hot is fixed")

# Batched lookup: each token ID selects its own row of the embedding table
batch_tokens = torch.tensor([1, 5, 10, 42, 100])  # Batch of token IDs [N]
batch_embedded = embedding_layer(batch_tokens)    # [N, embedding_dim]
print(f"\nBatch tokens shape: {batch_tokens.shape}")
print(f"Batch embeddings shape: {batch_embedded.shape}")

# The table is a learnable parameter: it requires gradients, so an optimizer can update it
print(f"\nEmbedding layer has {embedding_layer.weight.numel():,} learnable parameters")
print(f"Embedding weight shape: {embedding_layer.weight.shape}")
print(f"Embedding weight requires_grad: {embedding_layer.weight.requires_grad}")

## Tiny Exercise: Batch Operations

In [None]:
# Construct a small batch and the parameters of a square linear layer
batch_size, d_model = 3, 4
X_batch = torch.randn(batch_size, d_model)  # [B, D]
W_linear = torch.randn(d_model, d_model)    # [D, D]
b_linear = torch.randn(d_model)             # [D]

# Compute linear transformation; the [D] bias is broadcast over the batch rows
y_batch = X_batch @ W_linear + b_linear     # [B, D]

# Verify shapes
assert X_batch.shape == (batch_size, d_model)
assert y_batch.shape == (batch_size, d_model)
print("✓ Shape assertions passed")

# Element-wise operations (each output has the same shape as y_batch)
relu_output = torch.relu(y_batch)          # Apply ReLU activation
squared = torch.pow(y_batch, 2)            # Element-wise square
exp_output = torch.exp(y_batch)            # Element-wise exponential

print(f"\nOriginal output:\n{y_batch}")
print(f"\nAfter ReLU (negative values -> 0):\n{relu_output}")
print(f"\nSquared:\n{squared}")
print(f"\nExponential:\n{exp_output}")

# Reduction operations: `dim` names the axis that is collapsed
sum_all = torch.sum(y_batch)                    # Sum all elements -> scalar
sum_dim0 = torch.sum(y_batch, dim=0)            # Collapse batch axis -> [D]
sum_dim1 = torch.sum(y_batch, dim=1)            # Collapse feature axis -> [B]
mean_batch = torch.mean(y_batch, dim=0)         # Mean across batch -> [D]

# Check the reduced shapes instead of only printing them
assert sum_dim0.shape == (d_model,)
assert sum_dim1.shape == (batch_size,)
assert mean_batch.shape == (d_model,)

print("\nReduction operations:")
print(f"Sum all elements: {sum_all.item():.3f}")
print(f"Sum along batch dim: {sum_dim0} (shape: {sum_dim0.shape})")
print(f"Sum along feature dim: {sum_dim1} (shape: {sum_dim1.shape})")
print(f"Mean across batch: {mean_batch} (shape: {mean_batch.shape})")

# Broadcasting example: size-1 axes are stretched to match the other operand
print("\n=== Broadcasting Demo ===")
A = torch.randn(3, 1)    # [3, 1]
B = torch.randn(1, 4)    # [1, 4]
C = A + B                # Broadcasting -> [3, 4]

print(f"A shape: {A.shape}")
print(f"B shape: {B.shape}")
print(f"A + B shape: {C.shape} (broadcasted!)")

# This is equivalent to expanding both operands explicitly before adding
A_expanded = A.expand(3, 4)  # [3, 1] -> [3, 4]
B_expanded = B.expand(3, 4)  # [1, 4] -> [3, 4]
C_manual = A_expanded + B_expanded

# Fail loudly if implicit and manual broadcasting ever disagree
assert torch.allclose(C, C_manual), "implicit broadcasting should match manual expansion"
print(f"Manual expansion result matches: {torch.allclose(C, C_manual)}")

print("\n🎉 Tensor basics completed successfully!")