# Lab 5: NLP Basics

**Day 3 - From Deep Learning to LLMs**

| Duration | Difficulty | Prerequisites |
|----------|------------|---------------|
| 75 min | Intermediate | Labs 1-4 |

## Learning Objectives

- Understand text preprocessing and tokenization
- Learn about word embeddings
- Implement a basic attention mechanism
- Understand transformer architecture concepts

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import re
from collections import Counter

# Seed the NumPy and PyTorch random number generators with a fixed value
# so that randomly generated values are repeatable from run to run.
np.random.seed(42)
torch.manual_seed(42)

---

## Exercise 1: Text Preprocessing

Before feeding text to models, we need to clean and normalize it.

**Your Task:** Implement basic text preprocessing functions.

In [None]:
def clean_text(text):
    """
    Normalize raw text before tokenization.

    Steps:
    1. Convert to lowercase
    2. Remove special characters (keep only a-z, 0-9 and whitespace)
    3. Collapse runs of whitespace into one space and strip both ends

    Args:
        text: Input string.

    Returns:
        The cleaned string; '' when no characters survive cleaning.
    """
    # Lowercase first so the character class below only needs a-z.
    text = text.lower()

    # Drop everything that is not a lowercase letter, digit, or whitespace.
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Removing punctuation can leave doubled spaces; normalize them.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
def tokenize_simple(text):
    """
    Simple word-level tokenization.

    Splits text on any run of whitespace.

    Args:
        text: Input string (may be empty or None).

    Returns:
        List of word tokens; an empty list for empty or None input.
    """
    # Treat missing/empty input as "no tokens" instead of raising.
    if not text:
        return []
    # str.split() with no argument splits on whitespace runs and never
    # yields empty strings.
    return text.split()

In [None]:
def build_vocabulary(texts, min_freq=1):
    """
    Build a vocabulary from a list of texts.

    Each text is passed through clean_text() and tokenize_simple(); words
    seen at least `min_freq` times get an index, in first-seen order.
    Indices 0 and 1 are reserved for the '<PAD>' and '<UNK>' special tokens.

    Args:
        texts: Iterable of raw text strings.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        word2idx: Dictionary mapping words to indices
        idx2word: Dictionary mapping indices to words
    """
    # Count all words across all texts.
    word_counts = Counter()
    for text in texts:
        tokens = tokenize_simple(clean_text(text))
        # Skip texts that produce no tokens (empty after cleaning).
        if tokens:
            word_counts.update(tokens)

    # Special tokens always occupy the first two indices.
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for word, count in word_counts.items():
        # Filter by minimum frequency; never overwrite a special token.
        if count >= min_freq and word not in word2idx:
            word2idx[word] = len(word2idx)

    # Reverse mapping for decoding.
    idx2word = {idx: word for word, idx in word2idx.items()}

    return word2idx, idx2word

In [None]:
# Test Exercise 1
# Smoke test: runs clean_text, tokenize_simple and build_vocabulary on a few
# sample sentences and prints the results for visual inspection.
sample_texts = [
    "Hello, World! This is a TEST.",
    "Machine Learning is AMAZING!!!",
    "Neural networks learn from data."
]

# Test clean_text
cleaned = clean_text(sample_texts[0])
print(f"Original: {sample_texts[0]}")
print(f"Cleaned: {cleaned}")

# Test tokenize
# Skipped when clean_text returned None or an empty string.
if cleaned:
    tokens = tokenize_simple(cleaned)
    print(f"Tokens: {tokens}")

# Test vocabulary
# word2idx / idx2word stay defined at module level after this cell runs.
word2idx, idx2word = build_vocabulary(sample_texts)
print(f"\nVocabulary size: {len(word2idx)}")
# Show only the first five (word, index) pairs.
print(f"Sample: {dict(list(word2idx.items())[:5])}")

---

## Exercise 2: Text to Sequences

Convert text to numerical sequences for model input.

**Your Task:** Implement text encoding/decoding functions.

In [None]:
def encode_text(text, word2idx, max_length=None):
    """
    Convert text to a sequence of vocabulary indices.

    The text is cleaned with clean_text() and split with tokenize_simple();
    words missing from the vocabulary map to the '<UNK>' index.

    Args:
        text: Input text
        word2idx: Vocabulary mapping (must contain '<UNK>')
        max_length: If provided, pad or truncate to exactly this length

    Returns:
        List of token indices

    Raises:
        KeyError: If word2idx has no '<UNK>' entry.
        ValueError: If max_length is negative.
    """
    # Look the special indices up once instead of per token.
    unk_idx = word2idx['<UNK>']
    pad_idx = word2idx.get('<PAD>', 0)

    # Clean and tokenize; fall back to no tokens if the helpers yield nothing.
    tokens = tokenize_simple(clean_text(text)) or []

    # Convert tokens to indices, using UNK for unknown words.
    indices = [word2idx.get(token, unk_idx) for token in tokens]

    # Compare against None so that max_length=0 is honoured as well.
    if max_length is not None:
        if max_length < 0:
            raise ValueError("max_length must be non-negative")
        indices = indices[:max_length]
        indices.extend([pad_idx] * (max_length - len(indices)))

    return indices

In [None]:
def decode_sequence(indices, idx2word):
    """
    Convert a sequence of indices back to text.

    Padding tokens (index 0) are skipped. Indices that are missing from
    idx2word are rendered as '<UNK>' instead of raising.

    Args:
        indices: Iterable of integer token indices
        idx2word: Dictionary mapping indices to words

    Returns:
        The decoded words joined by single spaces ('' if nothing remains).
    """
    words = [idx2word.get(idx, '<UNK>') for idx in indices if idx != 0]
    return ' '.join(words)

In [None]:
# Test Exercise 2
# Smoke test: encodes one sentence with the vocabulary built in the
# Exercise 1 test cell (word2idx / idx2word) and decodes it again.
test_text = "Neural networks are powerful models."

encoded = encode_text(test_text, word2idx, max_length=10)
print(f"Text: {test_text}")
print(f"Encoded: {encoded}")

# Decoding is skipped when encode_text returned None or an empty list.
if encoded:
    decoded = decode_sequence(encoded, idx2word)
    print(f"Decoded: {decoded}")

---

## Exercise 3: Word Embeddings

Embeddings convert sparse indices to dense vectors.

**Your Task:** Understand and use embedding layers.

In [None]:
class SimpleEmbedding:
    """
    Manual implementation of word embeddings.

    Holds a (vocab_size, embedding_dim) NumPy matrix; row i is the vector
    for token index i.
    """
    def __init__(self, vocab_size, embedding_dim):
        """
        Initialize embedding matrix with small random values.

        Args:
            vocab_size: Number of rows (one per vocabulary entry)
            embedding_dim: Length of each embedding vector

        Shape: (vocab_size, embedding_dim)
        """
        # Scale standard-normal samples by 0.1 to keep initial values small.
        self.embeddings = np.random.randn(vocab_size, embedding_dim) * 0.1

    def __call__(self, indices):
        """
        Look up embeddings for given indices.

        Args:
            indices: List or array of word indices

        Returns:
            Array of shape (len(indices), embedding_dim)
        """
        # Force an integer dtype so an empty list indexes cleanly and
        # yields a (0, embedding_dim) result.
        indices = np.asarray(indices, dtype=np.intp)
        return self.embeddings[indices]

In [None]:
def compute_cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors.

    Formula: cos(a,b) = (a · b) / (||a|| * ||b||)

    Args:
        vec1: First vector (1-D array-like)
        vec2: Second vector (1-D array-like, same length as vec1)

    Returns:
        Similarity as a Python float in [-1, 1]; 0.0 if either vector has
        zero norm (the cosine is undefined there).
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)

    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Avoid a division by zero (and a NaN result) for zero vectors.
    if norm_product == 0:
        return 0.0

    return float(dot_product / norm_product)

In [None]:
# Test Exercise 3
# Smoke test: builds the manual embedding and a PyTorch nn.Embedding of the
# same size and prints their shapes.
# Uses the vocabulary (word2idx) built in the Exercise 1 test cell.
vocab_size = len(word2idx)
embedding_dim = 50

# Manual embedding
simple_emb = SimpleEmbedding(vocab_size, embedding_dim)

# Only exercise the lookup once the embedding matrix has been created.
if simple_emb.embeddings is not None:
    print(f"Embedding matrix shape: {simple_emb.embeddings.shape}")
    
    # Look up embeddings
    test_indices = [2, 3, 4]
    embeddings = simple_emb(test_indices)
    print(f"Embeddings for indices {test_indices}: shape {embeddings.shape}")

# PyTorch embedding layer
torch_emb = nn.Embedding(vocab_size, embedding_dim)
print(f"\nPyTorch Embedding: {torch_emb}")
print(f"Parameters: {torch_emb.weight.shape}")

---

## Exercise 4: Self-Attention Mechanism

Attention allows models to focus on relevant parts of the input.

**Your Task:** Implement scaled dot-product attention.

In [None]:
def scaled_dot_product_attention(query, key, value):
    """
    Compute scaled dot-product attention.

    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Leading batch dimensions are supported as well, because only the last
    two dimensions are used for the matrix products.

    Args:
        query: Shape (seq_len, d_k)
        key: Shape (seq_len, d_k)
        value: Shape (seq_len, d_v)

    Returns:
        output: Shape (seq_len, d_v)
        attention_weights: Shape (seq_len, seq_len)
    """
    d_k = query.shape[-1]

    # Similarity of every query position with every key position.
    scores = torch.matmul(query, key.transpose(-2, -1))

    # Scale by sqrt(d_k) so the softmax does not saturate as d_k grows.
    scores = scores / (d_k ** 0.5)

    # Normalize over the key axis: each row becomes a distribution.
    attention_weights = F.softmax(scores, dim=-1)

    # Weighted average of the value vectors.
    output = torch.matmul(attention_weights, value)

    return output, attention_weights

In [None]:
def visualize_attention(attention_weights, tokens):
    """
    Visualize attention weights as a heatmap.

    Args:
        attention_weights: 2-D array of shape (len(tokens), len(tokens));
            row i holds the weights query token i assigns to each key token.
        tokens: List of token strings used as tick labels on both axes.
    """
    fig, ax = plt.subplots(figsize=(6, 5))

    # Attention weights are probabilities, so pin the color scale to [0, 1].
    image = ax.imshow(attention_weights, cmap='viridis', vmin=0.0, vmax=1.0)

    # Label both axes with the tokens.
    positions = list(range(len(tokens)))
    ax.set_xticks(positions)
    ax.set_xticklabels(tokens, rotation=45, ha='right')
    ax.set_yticks(positions)
    ax.set_yticklabels(tokens)

    ax.set_xlabel('Key (attended to)')
    ax.set_ylabel('Query (attending)')
    ax.set_title('Attention Weights')
    fig.colorbar(image, ax=ax, label='Attention weight')

    fig.tight_layout()
    plt.show()

In [None]:
# Test Exercise 4
# Smoke test: runs self-attention on random embeddings, prints the shapes
# and the weight matrix, and draws the heatmap.
# Simulate embeddings for a sentence
seq_len = 5
d_model = 8

# Random embeddings (in practice, these would be learned)
embeddings = torch.randn(seq_len, d_model)

# For self-attention, Q=K=V=embeddings
output, weights = scaled_dot_product_attention(embeddings, embeddings, embeddings)

# A None output means scaled_dot_product_attention is not implemented yet.
if output is not None:
    print(f"Input shape: {embeddings.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Attention weights shape: {weights.shape}")
    print(f"\nAttention weights (each row sums to 1):")
    print(weights.numpy().round(3))
    
    # Visualize
    # One label per sequence position (seq_len = 5).
    tokens = ['The', 'cat', 'sat', 'on', 'mat']
    visualize_attention(weights.numpy(), tokens)
else:
    print("Implement scaled_dot_product_attention()")

---

## Exercise 5: Simple Transformer Block

**Your Task:** Build a basic transformer encoder block.

In [None]:
class SelfAttention(nn.Module):
    """
    Single-head self-attention layer using PyTorch.

    Projects the input to queries, keys and values with three linear layers
    and applies scaled dot-product attention.
    """
    def __init__(self, embed_dim):
        """
        Args:
            embed_dim: Size of the input and output feature dimension
        """
        super().__init__()
        # Linear projections for Q, K, V.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        # 1 / sqrt(d_k), computed once.
        self.scale = embed_dim ** -0.5

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch, seq_len, embed_dim)

        Returns:
            Tensor of shape (batch, seq_len, embed_dim)
        """
        # Project inputs to Q, K, V.
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # Scaled dot-product attention, implemented manually so it does not
        # depend on a recent PyTorch version.
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        weights = F.softmax(scores, dim=-1)
        return torch.matmul(weights, v)

In [None]:
class TransformerBlock(nn.Module):
    """
    Basic transformer encoder block.

    Structure:
    x -> Self-Attention -> Add & Norm -> FFN -> Add & Norm -> output
    """
    def __init__(self, embed_dim, ff_dim):
        """
        Args:
            embed_dim: Size of the input and output feature dimension
            ff_dim: Hidden size of the feed-forward network
        """
        super().__init__()
        # Self-attention: a single-head nn.MultiheadAttention keeps this
        # block self-contained; batch_first matches (batch, seq_len, embed_dim).
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads=1, batch_first=True
        )

        # Feed-forward network applied independently at every position.
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )

        # Layer normalization after each residual connection.
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch, seq_len, embed_dim)

        Returns:
            Tensor of shape (batch, seq_len, embed_dim)
        """
        # Attention with residual connection (query = key = value = x).
        attn_out, _ = self.attention(x, x, x, need_weights=False)
        x = self.norm1(x + attn_out)

        # FFN with residual connection.
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x

In [None]:
# Test Exercise 5
# Smoke test: builds a TransformerBlock and checks that a forward pass
# runs, printing the input and output shapes.
embed_dim = 64
ff_dim = 128
batch_size = 2
seq_len = 10

# Create transformer block
transformer = TransformerBlock(embed_dim, ff_dim)

# The forward pass is only attempted once __init__ has set an `attention`
# attribute on the block.
if hasattr(transformer, 'attention'):
    print("TransformerBlock:")
    print(transformer)
    
    # Test forward pass
    test_input = torch.randn(batch_size, seq_len, embed_dim)
    output = transformer(test_input)
    print(f"\nInput shape: {test_input.shape}")
    print(f"Output shape: {output.shape}")
else:
    print("Implement TransformerBlock")

---

## Exercise 6: Text Classification with Attention

**Your Task:** Build a simple text classifier using attention.

In [None]:
class TextClassifier(nn.Module):
    """
    Simple text classifier:
    Embedding -> Self-Attention -> Global Average Pooling -> Classification
    """
    def __init__(self, vocab_size, embed_dim, num_classes):
        """
        Args:
            vocab_size: Number of entries in the vocabulary
            embed_dim: Size of the token embeddings
            num_classes: Number of output classes
        """
        super().__init__()
        # Embedding layer: token index -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Attention layer: single-head self-attention over the sequence;
        # batch_first matches the (batch, seq_len, embed_dim) layout.
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads=1, batch_first=True
        )

        # Classification head: pooled vector -> class logits.
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        Args:
            x: Token indices of shape (batch, seq_len)

        Returns:
            Unnormalized class scores (logits) of shape (batch, num_classes)
        """
        # Embed tokens: (batch, seq_len, embed_dim).
        embedded = self.embedding(x)

        # Apply self-attention (query = key = value).
        attended, _ = self.attention(
            embedded, embedded, embedded, need_weights=False
        )

        # Global average pool over the sequence axis: (batch, embed_dim).
        pooled = attended.mean(dim=1)

        # Classify.
        return self.classifier(pooled)

In [None]:
# Test Exercise 6
# Smoke test: builds a TextClassifier and runs one batch of random token
# indices through it, printing the input and output shapes.
vocab_size = 1000
embed_dim = 32
num_classes = 2

classifier = TextClassifier(vocab_size, embed_dim, num_classes)

# The forward pass is only attempted once __init__ has set an `embedding`
# attribute on the classifier.
if hasattr(classifier, 'embedding'):
    print("TextClassifier:")
    print(classifier)
    
    # Test
    batch = torch.randint(0, vocab_size, (4, 20))  # 4 samples, 20 tokens
    output = classifier(batch)
    print(f"\nInput shape: {batch.shape}")
    print(f"Output shape: {output.shape}")
else:
    print("Implement TextClassifier")

---

## Checkpoint

Congratulations! You've completed Lab 5.

### Key Takeaways:
- Tokenization converts text to sequences
- Embeddings map tokens to dense vectors
- Attention lets a model focus on the most relevant parts of its input
- Transformers use attention + feed-forward layers

**Next:** Lab 6 - LLM APIs