In [None]:
# Our two "meanings" (classes)
LABEL_MAP = {
    "Financial": 0,
    "River": 1
}

# Each entry is (sentence, target_index, label).
# target_index is the zero-based position of the word "bank" in
# sentence.lower().split() -- the same tokenization used by the rest of the
# notebook -- and label is one of the LABEL_MAP values.
data = [
    ("I went to the bank to deposit money", 4, LABEL_MAP["Financial"]),
    ("He sat on the river bank", 5, LABEL_MAP["River"]),
    ("The savings bank is closed today", 2, LABEL_MAP["Financial"]),
    ("The boat is near the bank", 5, LABEL_MAP["River"]),
    ("I need a loan from the bank", 6, LABEL_MAP["Financial"]),
    ("The river bank was muddy", 2, LABEL_MAP["River"]),

    # --- "Rule Breaker" Sentences ---
    ("The river bank is high today", 2, LABEL_MAP["River"]),
    ("The bank is open for business", 1, LABEL_MAP["Financial"]),
    ("This bank is my favorite for loans", 1, LABEL_MAP["Financial"]),
    ("We went to the river bank", 5, LABEL_MAP["River"]),

    # --- More "Financial" Examples ---
    ("He opened a new bank account", 4, LABEL_MAP["Financial"]),
    ("I need to transfer money at the bank", 7, LABEL_MAP["Financial"]),
    ("The bank approved my mortgage", 1, LABEL_MAP["Financial"]),
    ("My paycheck goes to the bank", 5, LABEL_MAP["Financial"]),
    ("The bank charged an overdraft fee", 1, LABEL_MAP["Financial"]),
    ("Online bank security is important", 1, LABEL_MAP["Financial"]),
    ("The ATM at the bank is broken", 4, LABEL_MAP["Financial"]),
    ("He works at the bank", 4, LABEL_MAP["Financial"]),

    # --- More "River" Examples ---
    ("We had a picnic on the river bank", 7, LABEL_MAP["River"]),
    ("The fishing boat passed the bank", 5, LABEL_MAP["River"]),
    ("Erosion is affecting the bank", 4, LABEL_MAP["River"]),
    ("We saw ducks on the bank", 5, LABEL_MAP["River"]),
    ("The bank was steep and grassy", 1, LABEL_MAP["River"]),
    ("Flood waters rose over the bank", 5, LABEL_MAP["River"]),
    ("The north bank of the river is flooded", 2, LABEL_MAP["River"]),
    ("Trees line the bank of the stream", 3, LABEL_MAP["River"]),
]

# Sanity check: every target_index must land on the word "bank". Slicing
# (instead of plain indexing) makes an out-of-range index show up in the
# failure message rather than raising IndexError.
misindexed = [
    (text, position)
    for text, position, _sense in data
    if text.lower().split()[position:position + 1] != ["bank"]
]
assert not misindexed, f"target_index does not point at 'bank' for: {misindexed}"

print(f"Corrected dataset size: {len(data)} sentences")

Corrected dataset size: 26 sentences


In [None]:
# 1. Gather every distinct token. Sentences are lowercased and split on
#    whitespace, so "The" and "the" collapse into a single entry.
all_words = {
    token
    for text, _position, _sense in data
    for token in text.lower().split()
}

# 2. Build the word -> integer ID mapping.
#    ID 0 is reserved for the PAD token; the real words follow in alphabetical
#    order so the mapping is identical on every run.
vocab = {"<PAD>": 0}
vocab.update(
    (token, token_id)
    for token_id, token in enumerate(sorted(all_words), start=1)
)

print("\nYour vocabulary:")
print(vocab)


Your vocabulary:
{'<PAD>': 0, 'a': 1, 'account': 2, 'affecting': 3, 'an': 4, 'and': 5, 'approved': 6, 'at': 7, 'atm': 8, 'bank': 9, 'boat': 10, 'broken': 11, 'business': 12, 'charged': 13, 'closed': 14, 'deposit': 15, 'ducks': 16, 'erosion': 17, 'favorite': 18, 'fee': 19, 'fishing': 20, 'flood': 21, 'flooded': 22, 'for': 23, 'from': 24, 'goes': 25, 'grassy': 26, 'had': 27, 'he': 28, 'high': 29, 'i': 30, 'important': 31, 'is': 32, 'line': 33, 'loan': 34, 'loans': 35, 'money': 36, 'mortgage': 37, 'muddy': 38, 'my': 39, 'near': 40, 'need': 41, 'new': 42, 'north': 43, 'of': 44, 'on': 45, 'online': 46, 'open': 47, 'opened': 48, 'over': 49, 'overdraft': 50, 'passed': 51, 'paycheck': 52, 'picnic': 53, 'river': 54, 'rose': 55, 'sat': 56, 'savings': 57, 'saw': 58, 'security': 59, 'steep': 60, 'stream': 61, 'the': 62, 'this': 63, 'to': 64, 'today': 65, 'transfer': 66, 'trees': 67, 'was': 68, 'waters': 69, 'we': 70, 'went': 71, 'works': 72}


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # You'll need this for the optimizer
import math

In [None]:
class SimpleAttentionModel(nn.Module):
    """Single-head self-attention classifier for one target position.

    The sentence is embedded, passed through one scaled dot-product
    self-attention step, and the attended vector at the target position is fed
    to a linear layer that produces one score per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of the embeddings and of the Q/K/V projections.
        num_classes: Number of output scores.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes):
        super().__init__()

        # Kept so forward() can scale the scores by sqrt(embedding_dim).
        self.embedding_dim = embedding_dim

        # Word ID -> dense vector lookup table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Query / key / value projections used by the attention step.
        self.q_layer = nn.Linear(embedding_dim, embedding_dim)
        self.k_layer = nn.Linear(embedding_dim, embedding_dim)
        self.v_layer = nn.Linear(embedding_dim, embedding_dim)

        # Maps the attended target-word vector to per-class scores.
        self.output_layer = nn.Linear(embedding_dim, num_classes)

    def forward(self, sentence_indices, target_word_index):
        """Score the classes for the word at ``target_word_index``.

        Args:
            sentence_indices: Tensor of word IDs. The shape notes below assume
                the 1-D ``[seq_len]`` tensor that ``sentence_to_tensor``
                returns.
            target_word_index: Position whose attended vector is classified.

        Returns:
            A ``(final_scores, attention_weights)`` pair: class scores of shape
            ``[num_classes]`` and the ``[seq_len, seq_len]`` attention matrix.
            The same matrix is also stored on ``self.Attention_Weights``.
        """
        # [seq_len, embedding_dim]
        token_vectors = self.embedding(sentence_indices)

        queries = self.q_layer(token_vectors)
        keys = self.k_layer(token_vectors)
        values = self.v_layer(token_vectors)

        # Query/key dot products for every pair of positions, divided by
        # sqrt(embedding_dim) before the softmax.
        raw_scores = queries @ keys.transpose(-2, -1)
        scaled_scores = raw_scores / math.sqrt(self.embedding_dim)

        # Row i holds the weights position i places on every position.
        # Stored on the module so it can be inspected after the call.
        self.Attention_Weights = F.softmax(scaled_scores, dim=-1)

        # Each row is the attention-weighted sum of the value vectors.
        attended = self.Attention_Weights @ values

        # Only the target position's vector feeds the classifier.
        final_scores = self.output_layer(attended[target_word_index])

        return final_scores, self.Attention_Weights

### Explanation of the Attention Mechanism in `SimpleAttentionModel`

The `SimpleAttentionModel` uses a simplified form of self-attention to determine the meaning of the target word ("bank" in most cases) based on the other words in the sentence. Here's how it works within the `forward` method:

1.  **Input Embeddings**:
    *   The input `sentence_indices` is a tensor of integer IDs representing the words in the sentence.
    *   `self.embedding(sentence_indices)` converts these IDs into dense numerical vectors. This `X` tensor has the shape `[seq_len, embedding_dim]`, where `seq_len` is the number of words in the sentence and `embedding_dim` is the size of each word vector.

2.  **Query, Key, and Value Projections**:
    *   The `X` tensor is passed through three separate linear layers: `self.q_layer`, `self.k_layer`, and `self.v_layer`.
    *   These layers transform the input embeddings into three new tensors: `Q` (Query), `K` (Key), and `V` (Value). Each also has the shape `[seq_len, embedding_dim]`.
    *   Intuitively, the Query vector for a word represents "what I'm looking for", the Key vector represents "what I contain", and the Value vector represents "what information I provide".

3.  **Calculating Attention Scores**:
    *   `Scores = torch.matmul(Q, K.transpose(-2, -1))` calculates the raw attention scores. This is a matrix multiplication between the Query matrix `Q` and the transpose of the Key matrix `K`.
    *   The resulting `Scores` tensor has the shape `[seq_len, seq_len]`. Each element `Scores[i, j]` represents the dot product between the Query vector of word `i` and the Key vector of word `j`. A higher dot product means word `i` finds word `j` more relevant.

4.  **Scaling**:
    *   `scale_factor = math.sqrt(self.embedding_dim)` is calculated. Dividing the scores by the square root of the embedding dimension keeps the dot products from growing too large; very large scores saturate the softmax, which leads to vanishing gradients during training.
    *   `Scaled_Scores = Scores / scale_factor` applies this scaling.

5.  **Calculating Attention Weights**:
    *   `self.Attention_Weights = F.softmax(Scaled_Scores, dim=-1)` applies the softmax function along the last dimension (`dim=-1`).
    *   Softmax converts the scaled scores into probability-like weights that sum up to 1 for each row. The `Attention_Weights` tensor, also `[seq_len, seq_len]`, now indicates how much each word *attends* to every other word in the sentence. `Attention_Weights[i, j]` is the weight word `i` places on word `j`.
    *   These weights are stored in `self.Attention_Weights` so they can be accessed and visualized later.

6.  **Calculating the Output (Weighted Sum of Values)**:
    *   `Output = torch.matmul(self.Attention_Weights, V)` performs another matrix multiplication. This time, the `Attention_Weights` matrix is multiplied by the Value matrix `V`.
    *   The resulting `Output` tensor has the shape `[seq_len, embedding_dim]`. The vector for each word `i` in this `Output` tensor is a weighted sum of all the Value vectors in the sentence, where the weights are determined by how much word `i` attended to each word (the `i`-th row of `Attention_Weights`).

7.  **Focusing on the Target Word**:
    *   `target_word_vector = Output[target_word_index]` extracts the vector from the `Output` tensor that corresponds to the `target_word_index`. This vector now contains information about the target word, influenced by the context of the entire sentence through the attention mechanism.

8.  **Final Classification**:
    *   `final_scores = self.output_layer(target_word_vector)` passes the target word's context-aware vector through a final linear layer.
    *   This layer produces the final scores (`[num_classes]`) for each possible class (Financial or River), which are then used to make the final prediction.

In essence, the attention mechanism allows the model to dynamically weigh the importance of different words in the sentence when trying to understand the meaning of a specific target word, making it particularly useful for tasks like word sense disambiguation. The inference cell later in the notebook (cell ID `PY6IyGENoKC5`) prints exactly these calculated attention weights for the target word.

In [None]:
def sentence_to_tensor(sentence, vocab_map):
    """Encode a sentence as a 1-D ``torch.long`` tensor of vocabulary IDs.

    The sentence is lowercased and split on whitespace; each token is looked up
    in ``vocab_map``. Tokens missing from the map are encoded as 0, the ID this
    notebook assigns to ``<PAD>``.
    """
    token_ids = []
    for token in sentence.lower().split():
        # Unknown tokens fall back to ID 0.
        token_ids.append(vocab_map.get(token, 0))

    return torch.tensor(token_ids, dtype=torch.long)

# Smoke test: encode the first dataset sentence and show the text next to its IDs.
test_sentence = data[0][0]  # "I went to the bank to deposit money"
test_tensor = sentence_to_tensor(test_sentence, vocab)
print(
    f"Sentence: {test_sentence}",
    f"Tensor:   {test_tensor}",
    sep="\n",
)

Sentence: I went to the bank to deposit money
Tensor:   tensor([30, 71, 64, 62,  9, 64, 15, 36])


In [None]:
# Hyperparameters
# These are the "settings" for our model
SEED = 42                          # Fixes the random weight initialization
VOCAB_SIZE = len(vocab)            # We get this from the vocab we just built
EMBEDDING_DIM = 32                 # You can pick any size. 32 is small and fast.
NUM_CLASSES = len(LABEL_MAP)       # This is 2 (Financial or River)
LEARNING_RATE = 0.001
EPOCHS = 200                       # How many times to loop over the data

# Initialization

# Seed PyTorch's random number generator *before* the model is built. The
# embedding and linear layers are randomly initialized, so without a seed every
# "Restart & Run All" would train a different model and print different
# attention weights.
torch.manual_seed(SEED)

# 1. Create the model
model = SimpleAttentionModel(VOCAB_SIZE, EMBEDDING_DIM, NUM_CLASSES)

# 2. Create the loss function (measures how wrong a prediction is)
loss_function = nn.CrossEntropyLoss()

# 3. Create the "mechanic" (updates the model to make it better)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print("Model, Loss, and Optimizer are ready.")

Model, Loss, and Optimizer are ready.


In [None]:
print("--- Starting Training ---")
for epoch_number in range(1, EPOCHS + 1):
    epoch_loss = 0.0

    # One optimizer step per sentence, i.e. an effective batch size of 1.
    for text, bank_position, sense_id in data:

        # Build the inputs: word IDs for the sentence and the label wrapped
        # in a one-element tensor.
        token_ids = sentence_to_tensor(text, vocab)
        expected = torch.tensor([sense_id], dtype=torch.long)

        # Gradients accumulate across backward() calls, so clear the ones left
        # over from the previous sentence.
        optimizer.zero_grad()

        # Forward pass; the attention matrix is not needed while training.
        class_scores, _attention = model(token_ids, bank_position)

        # CrossEntropyLoss expects scores as [Batch, Classes] and labels as
        # [Batch]; unsqueeze(0) adds the missing batch dimension of 1.
        example_loss = loss_function(class_scores.unsqueeze(0), expected)

        # Backward pass, then let the optimizer update the weights.
        example_loss.backward()
        optimizer.step()

        epoch_loss += example_loss.item()

    # Report the mean per-sentence loss every 20 epochs.
    if epoch_number % 20 == 0:
        print(f"Epoch {epoch_number}/{EPOCHS}, Loss: {epoch_loss / len(data):.4f}")

print("--- Training Complete ---")

--- Starting Training ---
Epoch 20/200, Loss: 0.0057
Epoch 40/200, Loss: 0.0010
Epoch 60/200, Loss: 0.0004
Epoch 80/200, Loss: 0.0002
Epoch 100/200, Loss: 0.0001
Epoch 120/200, Loss: 0.0001
Epoch 140/200, Loss: 0.0000
Epoch 160/200, Loss: 0.0000
Epoch 180/200, Loss: 0.0000
Epoch 200/200, Loss: 0.0000
--- Training Complete ---


In [None]:
print("\n--- Model Predictions and Attention ---")

# Invert LABEL_MAP (name -> class ID) so class IDs can be turned back into
# names. Decoding through the map keeps this cell correct if the label set is
# ever reordered or extended, instead of hard-coding "0 means Financial".
ID_TO_MEANING = {class_id: meaning for meaning, class_id in LABEL_MAP.items()}

# Use torch.no_grad() to turn off gradient calculations (we're just testing)
with torch.no_grad():

    # Loop over our data one more time
    for sentence, target_index, label in data:
        print("\n---------------------------------")
        print(f"Sentence: '{sentence}'")

        # Get the original words (same tokenization as sentence_to_tensor)
        words = sentence.lower().split()

        # Prepare inputs
        input_indices = sentence_to_tensor(sentence, vocab)

        # Run the model
        scores, attention_weights = model(input_indices, target_index)

        # Get the prediction: the class with the highest score
        predicted_class = torch.argmax(scores).item()
        predicted_meaning = ID_TO_MEANING[predicted_class]
        actual_meaning = ID_TO_MEANING[label]

        print(f"  Target Word: '{words[target_index]}'")
        print(f"  Prediction:  {predicted_meaning} (Actual: {actual_meaning})")

        # --- This is the key part ---
        # The 'attention_weights' matrix is [seq_len, seq_len]
        # We want the weights for our *target word*
        # This row shows how much the target word "paid attention" to every
        # word in the sentence (itself included)

        target_word_attention = attention_weights[target_index]

        print(f"  Attention weights for '{words[target_index]}':")
        for word, weight in zip(words, target_word_attention):
            # Print the word and its corresponding attention weight
            print(f"    {word:<10} : {weight.item():.4f}")


--- Model Predictions and Attention ---

---------------------------------
Sentence: 'I went to the bank to deposit money'
  Target Word: 'bank'
  Prediction:  Financial (Actual: Financial)
  Attention weights for 'bank':
    i          : 0.0000
    went       : 0.0000
    to         : 0.0001
    the        : 0.0042
    bank       : 0.0000
    to         : 0.0001
    deposit    : 0.9952
    money      : 0.0003

---------------------------------
Sentence: 'He sat on the river bank'
  Target Word: 'bank'
  Prediction:  River (Actual: River)
  Attention weights for 'bank':
    he         : 0.0399
    sat        : 0.0000
    on         : 0.0247
    the        : 0.0004
    river      : 0.9349
    bank       : 0.0000

---------------------------------
Sentence: 'The savings bank is closed today'
  Target Word: 'bank'
  Prediction:  Financial (Actual: Financial)
  Attention weights for 'bank':
    the        : 0.0050
    savings    : 0.9825
    bank       : 0.0000
    is         : 0.0005
   

# Simple Word Sense Disambiguation with Self-Attention

This notebook demonstrates a basic implementation of a self-attention mechanism to perform word sense disambiguation. The goal is to determine the correct meaning of the word "bank" in different sentences, classifying it as either "Financial" or "River".

## Process

The notebook follows these steps:

1.  **Data Preparation**: A small dataset is defined containing sentences with the word "bank", the index of the target word "bank" (or a word related to the meaning in some cases), and the corresponding label (Financial or River).

2.  **Vocabulary Creation**: A vocabulary is built from all the unique words in the dataset, mapping each word to a unique integer ID. A special `<PAD>` token is included at index 0.

3.  **Model Definition**: A simple self-attention model is defined using PyTorch.
    *   An embedding layer converts word IDs into dense vectors.
    *   Linear layers (`q_layer`, `k_layer`, `v_layer`) are used to create the Query, Key, and Value matrices for the self-attention calculation.
    *   The attention scores are calculated by multiplying the Query matrix by the transpose of the Key matrix, then dividing the result by the square root of the embedding dimension.
    *   A softmax function is applied to the scaled scores to get attention weights, indicating how much each word attends to every other word in the sentence.
    *   The output of the attention mechanism is a weighted sum of the Value vectors.
    *   A final linear layer (`output_layer`) takes the vector corresponding to the target word and predicts the class (Financial or River).

4.  **Model Initialization**: The model is initialized with the vocabulary size, embedding dimension, and number of output classes. A loss function (CrossEntropyLoss) and an optimizer (Adam) are set up for training.

5.  **Training**: The model is trained on the prepared dataset for a specified number of epochs. In each training step:
    *   Gradients are zeroed.
    *   The sentence and its label are converted to tensors (the target word index is passed to the model as a plain integer).
    *   The model performs a forward pass to get predicted scores and attention weights.
    *   The loss is calculated based on the predicted scores and the actual label.
    *   The gradients are computed using the backward pass.
    *   The model's weights are updated by the optimizer.

6.  **Inference and Attention Visualization**: After training, the model's predictions are evaluated on the training data. For each sentence, the predicted meaning is shown alongside the actual meaning. Crucially, the attention weights for the target word are displayed, showing which words the model focused on when making its classification decision.

This example highlights how a simple attention mechanism can learn to weigh the importance of different words in a sentence to understand the context and disambiguate word senses.