# Lab 5: NLP Basics - SOLUTIONS

**Day 3 - From Deep Learning to LLMs**

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import re
from collections import Counter

# Seed NumPy and PyTorch up front so the random values drawn in the cells
# below are the same on every run.
np.random.seed(42)
torch.manual_seed(42)

## Exercise 1: Text Preprocessing - SOLUTION

In [None]:
def clean_text(text):
    """Normalize raw text before tokenization.

    The text is lowercased, every character other than a-z, 0-9 or
    whitespace is deleted, and each run of whitespace is collapsed to a
    single space. Leading and trailing whitespace is removed.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Punctuation and symbols are deleted outright, not replaced by a space.
    letters_digits_spaces = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_digits_spaces)
    return single_spaced.strip()

def tokenize_simple(text):
    """Split text into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated pieces; empty for blank input.
    """
    tokens = text.split()
    return tokens

def build_vocabulary(texts, min_freq=1):
    """Build token<->index lookup tables from a collection of texts.

    Every text is cleaned and tokenized, token frequencies are counted over
    the whole collection, and each token seen at least ``min_freq`` times is
    assigned the next free index in first-seen order.

    Args:
        texts: Iterable of raw strings.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        A ``(word2idx, idx2word)`` pair of dicts. Index 0 is '<PAD>' and
        index 1 is '<UNK>'; corpus tokens start at index 2.
    """
    frequencies = Counter(
        token
        for text in texts
        for token in tokenize_simple(clean_text(text))
    )

    # The two reserved entries come first so their indices are fixed.
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for word, count in frequencies.items():
        if count >= min_freq:
            word2idx[word] = len(word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    return word2idx, idx2word

# Test: clean the first sample sentence, then build a vocabulary from all three.
sample_texts = [
    "Hello, World! This is a TEST.",
    "Machine Learning is AMAZING!!!",
    "Neural networks learn from data."
]
print(f"Cleaned: {clean_text(sample_texts[0])}")
# word2idx / idx2word are module-level names reused by the later exercise cells.
word2idx, idx2word = build_vocabulary(sample_texts)
# The reported size includes the two reserved entries '<PAD>' and '<UNK>'.
print(f"Vocabulary size: {len(word2idx)}")

## Exercise 2: Text to Sequences - SOLUTION

In [None]:
def encode_text(text, word2idx, max_length=None):
    """Convert raw text into a list of vocabulary indices.

    The text is cleaned and tokenized, each token is mapped through
    ``word2idx`` (unknown tokens map to the '<UNK>' index), and the result
    is optionally padded or truncated to a fixed length.

    Args:
        text: The raw input string.
        word2idx: Mapping from token to integer index. It must contain
            '<UNK>'; '<PAD>' is used for padding when present, otherwise
            index 0 is used.
        max_length: Target sequence length. ``None`` (the default) returns
            the sequence unchanged. ``0`` returns an empty list.

    Returns:
        A list of integer indices, of length ``max_length`` when given.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    # Compare against None explicitly: a plain truthiness check would treat
    # max_length=0 as "no limit" and return the full sequence.
    if max_length is not None and max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    unk_idx = word2idx['<UNK>']
    # Take the pad index from the vocabulary so it cannot drift from it.
    pad_idx = word2idx.get('<PAD>', 0)

    tokens = tokenize_simple(clean_text(text))
    indices = [word2idx.get(token, unk_idx) for token in tokens]

    if max_length is None:
        return indices

    # A non-positive repeat count yields an empty list, so this one
    # expression both truncates long sequences and pads short ones.
    padding = [pad_idx] * (max_length - len(indices))
    return indices[:max_length] + padding

def decode_sequence(indices, idx2word):
    """Turn a sequence of indices back into a space-separated string.

    Entries equal to 0 (the padding index) are dropped; every other index is
    looked up in ``idx2word``.

    Args:
        indices: Iterable of integer indices.
        idx2word: Mapping from index to token.

    Returns:
        The decoded tokens joined by single spaces.

    Raises:
        KeyError: If a non-zero index is missing from ``idx2word``.
    """
    words = []
    for idx in indices:
        if idx != 0:
            words.append(idx2word[idx])
    return ' '.join(words)

# Test: "are", "powerful" and "models" do not occur in sample_texts, so they
# encode to the '<UNK>' index; the 5-token sequence is then padded to length 10.
test_text = "Neural networks are powerful models."
encoded = encode_text(test_text, word2idx, max_length=10)
print(f"Encoded: {encoded}")
# Decoding skips the padding entries, so only the 5 real positions are printed.
print(f"Decoded: {decode_sequence(encoded, idx2word)}")

## Exercise 3: Word Embeddings - SOLUTION

In [None]:
class SimpleEmbedding:
    """A NumPy embedding table with one row per vocabulary index.

    The table is filled with standard-normal samples scaled by 0.1, drawn
    from NumPy's global random state.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create a ``(vocab_size, embedding_dim)`` table of random vectors."""
        table_shape = (vocab_size, embedding_dim)
        self.embeddings = 0.1 * np.random.randn(*table_shape)

    def __call__(self, indices):
        """Return the table rows selected by ``indices`` (NumPy indexing)."""
        table = self.embeddings
        return table[indices]

def compute_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the similarity is undefined and 0.0 is returned instead.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the division: 0/0 would give NaN plus a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Test: build a 50-dimensional table with one row per vocabulary entry,
# then look up three rows at once.
simple_emb = SimpleEmbedding(len(word2idx), 50)
print(f"Embedding shape: {simple_emb.embeddings.shape}")
print(f"Lookup shape: {simple_emb([2, 3, 4]).shape}")

## Exercise 4: Self-Attention - SOLUTION

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Scores are ``query @ key^T`` divided by ``sqrt(d_k)``, where ``d_k`` is
    the size of the last dimension of ``query``. A softmax over the last
    (key) axis turns the scores into weights, which are applied to ``value``.

    Args:
        query: Tensor whose last two dimensions are ``(seq_q, d_k)``.
        key: Tensor whose last two dimensions are ``(seq_k, d_k)``.
        value: Tensor whose last two dimensions are ``(seq_k, d_v)``.
        mask: Optional tensor broadcastable to the score shape
            ``(..., seq_q, seq_k)``. Positions where ``mask == 0`` are
            excluded from attention (e.g. padding or future tokens).
            ``None`` (the default) attends to every position.

    Returns:
        A tuple ``(output, attention_weights)``: the weighted sum of values
        and the softmax weights.
    """
    d_k = query.shape[-1]
    scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        # -inf becomes exactly 0 after the softmax. A row whose positions
        # are all masked has no valid target and yields NaN weights.
        scores = scores.masked_fill(mask == 0, float('-inf'))

    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, value)
    return output, attention_weights

def visualize_attention(attention_weights, tokens):
    """Show an attention-weight matrix as a heatmap labelled with tokens.

    Args:
        attention_weights: 2-D array-like of weights accepted by
            ``plt.imshow``.
        tokens: Sequence of labels used for the ticks on both axes.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    tick_positions = range(len(tokens))

    plt.figure(figsize=(8, 6))
    plt.imshow(attention_weights, cmap='Blues')
    plt.colorbar()

    # The x-axis is labelled as keys and the y-axis as queries; both reuse
    # the same token labels.
    plt.xticks(tick_positions, tokens, rotation=45)
    plt.xlabel('Keys')
    plt.yticks(tick_positions, tokens)
    plt.ylabel('Queries')

    plt.title('Self-Attention Weights')
    plt.tight_layout()
    plt.show()

# Test: self-attention over 5 random vectors of size 8 -- the same tensor is
# passed as query, key and value.
embeddings = torch.randn(5, 8)
output, weights = scaled_dot_product_attention(embeddings, embeddings, embeddings)
print(f"Output shape: {output.shape}")
print(f"Attention weights:\n{weights.numpy().round(3)}")

# One label per row/column of the 5x5 weight matrix.
tokens = ['The', 'cat', 'sat', 'on', 'mat']
visualize_attention(weights.numpy(), tokens)

## Exercise 5: Transformer Block - SOLUTION

In [None]:
class SelfAttention(nn.Module):
    """Single-head self-attention with learned query/key/value projections.

    Each projection is a linear layer from ``embed_dim`` to ``embed_dim``;
    scores are divided by ``sqrt(embed_dim)`` before the softmax.
    """

    def __init__(self, embed_dim):
        """Create the three projection layers and the score scale factor."""
        super().__init__()
        proj_shape = (embed_dim, embed_dim)
        # Created in query, key, value order; changing the order would
        # change which random initial weights each layer receives.
        self.query = nn.Linear(*proj_shape)
        self.key = nn.Linear(*proj_shape)
        self.value = nn.Linear(*proj_shape)
        self.scale = np.sqrt(embed_dim)

    def forward(self, x):
        """Attend over the second-to-last axis of ``x``.

        Args:
            x: Tensor whose last dimension is ``embed_dim``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        similarity = (queries @ keys.transpose(-2, -1)) / self.scale
        attn = torch.softmax(similarity, dim=-1)
        return attn @ values

class TransformerBlock(nn.Module):
    """Self-attention followed by a two-layer ReLU feed-forward network.

    Each sub-layer's output is added back to its input and the sum is
    passed through its own LayerNorm.
    """

    def __init__(self, embed_dim, ff_dim):
        """Build the attention layer, feed-forward network and two norms.

        Args:
            embed_dim: Size of the last dimension of the input and output.
            ff_dim: Hidden width of the feed-forward network.
        """
        super().__init__()
        self.attention = SelfAttention(embed_dim)

        expand = nn.Linear(embed_dim, ff_dim)
        contract = nn.Linear(ff_dim, embed_dim)
        self.ffn = nn.Sequential(expand, nn.ReLU(), contract)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply both sub-layers and return a tensor shaped like ``x``."""
        # Sub-layer 1: attention, residual add, then normalize.
        x = self.norm1(x + self.attention(x))
        # Sub-layer 2: feed-forward, residual add, then normalize.
        return self.norm2(x + self.ffn(x))

# Test: the block should return a tensor with the same shape as its input
# (here 2 x 10 x 64, with embed_dim=64 and ff_dim=128).
transformer = TransformerBlock(64, 128)
test_input = torch.randn(2, 10, 64)
output = transformer(test_input)
print(f"Input: {test_input.shape}, Output: {output.shape}")

## Exercise 6: Text Classifier - SOLUTION

In [None]:
class TextClassifier(nn.Module):
    """Embed token ids, apply self-attention, mean-pool, then classify."""

    def __init__(self, vocab_size, embed_dim, num_classes):
        """Create the embedding table, attention layer and output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each token vector.
            num_classes: Number of output scores per example.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attention = SelfAttention(embed_dim)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return one row of class scores per input sequence.

        Args:
            x: Integer token ids, shaped (batch, seq).

        Returns:
            Tensor shaped (batch, classes).
        """
        token_vectors = self.embedding(x)            # (batch, seq, embed)
        contextual = self.attention(token_vectors)   # (batch, seq, embed)
        # NOTE(review): the mean runs over every position along dim 1, so
        # any padding ids present in x contribute to the pooled vector.
        sentence_vector = torch.mean(contextual, dim=1)  # (batch, embed)
        logits = self.classifier(sentence_vector)    # (batch, classes)
        return logits

# Test: a batch of 4 random id sequences of length 20, drawn from a
# 1000-entry vocabulary, should produce a 4 x 2 output.
classifier = TextClassifier(1000, 32, 2)
batch = torch.randint(0, 1000, (4, 20))
output = classifier(batch)
print(f"Input: {batch.shape}, Output: {output.shape}")

## Checkpoint

Lab 5 complete! **Next:** Lab 6 - LLM APIs