In [1]:
import numpy as np
import math

#### Encoder-decoder transformers and the purpose of the attention mechanism

In [2]:
class SimpleEncoder:
    """Toy encoder that relabels each input token to mimic an encoding pass."""

    def __init__(self):
        # Human-readable label identifying this component.
        self.name = "Encoder"

    def encode(self, input_tokens):
        """Print the incoming tokens, tag each one, and return the tagged list.

        Args:
            input_tokens: Iterable of tokens; each one is interpolated into
                an ``"encoded_<token>"`` string.

        Returns:
            A new list holding one ``"encoded_<token>"`` string per input token,
            in the original order.
        """
        print(f"Encoder processing: {input_tokens}")

        # A real transformer encoder would run self-attention and feed-forward
        # layers at this point; this stand-in only prefixes the tokens.
        tagged = []
        for tok in input_tokens:
            tagged.append(f"encoded_{tok}")

        print(f"Encoder output: {tagged}")
        return tagged

class SimpleDecoder:
    """Toy decoder that emits numbered placeholder tokens one at a time."""

    def __init__(self):
        # Human-readable label identifying this component.
        self.name = "Decoder"

    def decode(self, encoded_input, target_length=3):
        """Print the encoded input, then generate ``target_length`` placeholder tokens.

        Args:
            encoded_input: The encoder's output. It is only echoed to stdout;
                the generated tokens do not depend on it.
            target_length: How many tokens to generate (default 3).

        Returns:
            The list ``["output_1", ..., "output_<target_length>"]``.
        """
        print(f"Decoder using encoded input: {encoded_input}")

        # Tokens are produced strictly one after another, numbered from 1.
        generated = []
        for step in range(1, target_length + 1):
            token = f"output_{step}"
            print(f"Generated token {step}: {token}")
            generated.append(token)
        return generated

In [3]:
# Run the toy pipeline end to end: encode the input tokens, then hand the
# encoded list to the decoder.
print("=== ENCODER-DECODER DEMO ===")
encoder = SimpleEncoder()
decoder = SimpleDecoder()

input_sequence = ["hello", "world"]
encoded_sequence = encoder.encode(input_sequence)
# target_length is left at its default of 3, so three placeholder tokens come back.
output_sequence = decoder.decode(encoded_sequence)
print(f"Final output: {output_sequence}")
# Blank line to separate this demo's output from whatever follows.
print()

=== ENCODER-DECODER DEMO ===
Encoder processing: ['hello', 'world']
Encoder output: ['encoded_hello', 'encoded_world']
Decoder using encoded input: ['encoded_hello', 'encoded_world']
Generated token 1: output_1
Generated token 2: output_2
Generated token 3: output_3
Final output: ['output_1', 'output_2', 'output_3']



#### How self-attention works

In [4]:
def simple_self_attention(tokens):
    """Walk through a toy version of self-attention, printing every step.

    For each token the function prints a raw score towards every position
    (``1 / (distance + 1)``, so nearer positions score higher), then the
    scores divided by their sum. The weights are for illustration only:
    they are printed, not applied to the returned placeholders.

    Scores are tracked per *position*, so a sentence that repeats a word
    (e.g. ``["the", "cat", "the"]``) keeps one weight per occurrence. In the
    printed "Normalized attention" mapping, repeated tokens are shown as
    ``"<token>[<position>]"``; tokens that occur once keep their plain text.

    Args:
        tokens: Re-iterable sequence of hashable tokens (typically strings).

    Returns:
        A list with one ``"attended_<token>"`` string per input token.
    """
    print("=== SELF-ATTENTION STEP-BY-STEP ===")
    print(f"Input tokens: {tokens}")

    # Step 1: Create Query, Key, Value for each token (simplified)
    queries = [f"Q_{token}" for token in tokens]
    keys = [f"K_{token}" for token in tokens]
    values = [f"V_{token}" for token in tokens]

    print(f"Queries: {queries}")
    print(f"Keys: {keys}")
    print(f"Values: {values}")
    print()

    # Build one display label per position. Keying the scores by raw token text
    # would let a repeated token overwrite its earlier entry, dropping weights
    # and skewing the normalisation total.
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1
    labels = [
        token if occurrences[token] == 1 else f"{token}[{position}]"
        for position, token in enumerate(tokens)
    ]

    # Step 2: Calculate attention scores (how much each token should attend to others)
    print("Attention process for each token:")
    attention_outputs = []

    for i, token in enumerate(tokens):
        print(f"\nToken '{token}' attending to all tokens:")

        # In a real implementation this involves dot products and softmax;
        # here the score is simulated purely from positional distance.
        attention_scores = []
        for j, other_token in enumerate(tokens):
            score = 1.0 / (abs(i - j) + 1)  # Closer tokens get higher scores
            attention_scores.append(score)
            print(f"  Attention to '{other_token}': {score:.2f}")

        # Normalize scores so they sum to 1 (a stand-in for softmax).
        total_score = sum(attention_scores)
        normalized_scores = {
            label: score / total_score
            for label, score in zip(labels, attention_scores)
        }

        print(f"  Normalized attention: {normalized_scores}")

        # Placeholder for the attended representation (a weighted combination
        # of the values in a real model).
        attention_outputs.append(f"attended_{token}")

    print(f"\nSelf-attention output: {attention_outputs}")
    return attention_outputs

# Demo: Self-attention
# Runs the step-by-step walkthrough on a three-token sentence and keeps the
# returned placeholder list; the trailing print() adds a blank line after it.
tokens = ["I", "love", "AI"]
attended_tokens = simple_self_attention(tokens)
print()

=== SELF-ATTENTION STEP-BY-STEP ===
Input tokens: ['I', 'love', 'AI']
Queries: ['Q_I', 'Q_love', 'Q_AI']
Keys: ['K_I', 'K_love', 'K_AI']
Values: ['V_I', 'V_love', 'V_AI']

Attention process for each token:

Token 'I' attending to all tokens:
  Attention to 'I': 1.00
  Attention to 'love': 0.50
  Attention to 'AI': 0.33
  Normalized attention: {'I': 0.5454545454545455, 'love': 0.27272727272727276, 'AI': 0.18181818181818182}

Token 'love' attending to all tokens:
  Attention to 'I': 0.50
  Attention to 'love': 1.00
  Attention to 'AI': 0.50
  Normalized attention: {'I': 0.25, 'love': 0.5, 'AI': 0.25}

Token 'AI' attending to all tokens:
  Attention to 'I': 0.33
  Attention to 'love': 0.50
  Attention to 'AI': 1.00
  Normalized attention: {'I': 0.18181818181818182, 'love': 0.27272727272727276, 'AI': 0.5454545454545455}

Self-attention output: ['attended_I', 'attended_love', 'attended_AI']



#### Transformer vs RNN

In [5]:
class SimpleRNN:
    """Toy recurrent network that illustrates strictly sequential processing."""

    def __init__(self):
        # No token has been consumed yet, so there is no hidden state.
        self.hidden_state = None

    def process_sequence(self, tokens):
        """Consume the tokens one by one, printing the state after each step.

        ``self.hidden_state`` is overwritten at every step with a label naming
        the token just processed, so after the call it reflects the last token
        (or is left untouched when ``tokens`` is empty).

        Args:
            tokens: Iterable of tokens to process in order.

        Returns:
            The list ``["rnn_output_1", ..., "rnn_output_<n>"]``, one entry per token.
        """
        print("=== RNN PROCESSING (Sequential) ===")
        print("Processing one token at a time, left to right:")

        results = []
        for step, tok in enumerate(tokens, start=1):
            print(f"Step {step}: Processing '{tok}'")
            # Each step replaces the previous hidden state, which is what makes
            # the computation inherently sequential.
            self.hidden_state = f"hidden_after_{tok}"
            step_output = f"rnn_output_{step}"
            results.append(step_output)
            print(f"  Hidden state: {self.hidden_state}")
            print(f"  Output: {step_output}")

        print(f"Final RNN outputs: {results}")
        return results

class SimpleTransformer:
    """Toy transformer that illustrates processing every token in one pass."""

    def process_sequence(self, tokens):
        """Run the self-attention walkthrough, then emit one placeholder per token.

        Args:
            tokens: Sequence of tokens; it is passed to ``simple_self_attention``
                and its length determines how many outputs are produced.

        Returns:
            The list ``["transformer_output_1", ..., "transformer_output_<n>"]``.
        """
        print("=== TRANSFORMER PROCESSING (Parallel) ===")
        print("Processing ALL tokens simultaneously:")

        # Every token goes through self-attention together. Only the printed
        # walkthrough matters here; the returned placeholders are not used.
        print("Step 1: Self-attention on all tokens")
        simple_self_attention(tokens)

        print("Step 2: Feed-forward processing (parallel)")
        results = []
        for position in range(1, len(tokens) + 1):
            results.append(f"transformer_output_{position}")
        print(f"Final Transformer outputs: {results}")
        return results

In [6]:
# Comparison Demo
# Feeds the same three tokens to both toy models so their printed traces can be
# read side by side: the RNN logs one step per token, the transformer logs a
# single self-attention pass over all tokens.
print("=== ADVANTAGE: TRANSFORMER vs RNN ===")
test_tokens = ["The", "cat", "sat"]

rnn = SimpleRNN()
transformer = SimpleTransformer()

print("\n--- RNN Approach ---")
rnn_outputs = rnn.process_sequence(test_tokens)

print("\n--- Transformer Approach ---")
transformer_outputs = transformer.process_sequence(test_tokens)

=== ADVANTAGE: TRANSFORMER vs RNN ===

--- RNN Approach ---
=== RNN PROCESSING (Sequential) ===
Processing one token at a time, left to right:
Step 1: Processing 'The'
  Hidden state: hidden_after_The
  Output: rnn_output_1
Step 2: Processing 'cat'
  Hidden state: hidden_after_cat
  Output: rnn_output_2
Step 3: Processing 'sat'
  Hidden state: hidden_after_sat
  Output: rnn_output_3
Final RNN outputs: ['rnn_output_1', 'rnn_output_2', 'rnn_output_3']

--- Transformer Approach ---
=== TRANSFORMER PROCESSING (Parallel) ===
Processing ALL tokens simultaneously:
Step 1: Self-attention on all tokens
=== SELF-ATTENTION STEP-BY-STEP ===
Input tokens: ['The', 'cat', 'sat']
Queries: ['Q_The', 'Q_cat', 'Q_sat']
Keys: ['K_The', 'K_cat', 'K_sat']
Values: ['V_The', 'V_cat', 'V_sat']

Attention process for each token:

Token 'The' attending to all tokens:
  Attention to 'The': 1.00
  Attention to 'cat': 0.50
  Attention to 'sat': 0.33
  Normalized attention: {'The': 0.5454545454545455, 'cat': 0.27272

In [7]:
def attention_math_demo():
    """Print a worked example of dot-product similarity between toy word vectors.

    Three hard-coded 3-dimensional vectors stand in for the words "cat", "sat"
    and "mat". The function prints them, then prints the dot product of the
    "cat" vector with each vector (including itself). It takes no arguments
    and returns ``None``.
    """
    print("\n=== ATTENTION MATH (SIMPLIFIED) ===")

    # Tiny hand-picked vectors representing words.
    embeddings = {
        "cat": [1, 0, 1],
        "sat": [0, 1, 1],
        "mat": [1, 1, 0],
    }

    print("Word vectors:")
    for name in embeddings:
        print(f"  {name}: {embeddings[name]}")

    print("\nCalculating attention between 'cat' and other words:")

    query = embeddings["cat"]
    for name, vector in embeddings.items():
        # The dot product serves as the similarity measure.
        dot = 0
        for q_i, v_i in zip(query, vector):
            dot += q_i * v_i
        print(f"  'cat' ⋅ '{name}' = {dot}")

    print("\nHigher dot product = higher attention weight")
    print("This helps model understand which words are most relevant!")

# Run the demo so its printed walkthrough appears as this cell's output.
attention_math_demo()


=== ATTENTION MATH (SIMPLIFIED) ===
Word vectors:
  cat: [1, 0, 1]
  sat: [0, 1, 1]
  mat: [1, 1, 0]

Calculating attention between 'cat' and other words:
  'cat' ⋅ 'cat' = 2
  'cat' ⋅ 'sat' = 1
  'cat' ⋅ 'mat' = 1

Higher dot product = higher attention weight
This helps model understand which words are most relevant!
