# Positional Encoding vs No Positional Encoding
This notebook reproduces the experiment comparing token embeddings **with** and **without** sinusoidal Positional Encoding (PE). Starting from randomly initialised token embeddings, it generates cosine-similarity heatmaps, a PCA projection, a bar chart of the metrics, and a summary table.

Sentence: *"The quick brown fox jumps over the lazy dog"*


In [None]:
# Setup: imports and helper functions
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pandas as pd
import math

# Seed NumPy's global RNG so the random token embeddings drawn later in the
# notebook are the same on every run.
np.random.seed(42)

# Positional encoding (sinusoidal)
def positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    For every position ``pos`` and every even column index ``i`` the table
    holds ``sin(pos / 10000 ** (i / d_model))`` in column ``i`` and the
    matching cosine in column ``i + 1``.

    Args:
        seq_len: Number of positions (rows) to encode.
        d_model: Embedding width (columns). May be odd, in which case the
            last column is a sine with no cosine partner.

    Returns:
        A float ``numpy.ndarray`` of shape ``(seq_len, d_model)``.
    """
    pe = np.zeros((seq_len, d_model))

    # Column vector of positions broadcast against a row of even column
    # indices gives every (pos, i) angle in a single array expression.
    positions = np.arange(seq_len)[:, np.newaxis]
    even_cols = np.arange(0, d_model, 2)
    angles = positions / np.power(10000.0, even_cols / d_model)

    pe[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so drop the trailing angle that has no cosine slot.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return pe

def cosine_similarity_matrix(X):
    """Return the pairwise cosine similarities between the rows of ``X``.

    Each row is scaled by its L2 norm plus 1e-9, so an all-zero row yields
    zeros rather than a division by zero. Entry ``[i, j]`` of the result is
    the dot product of scaled rows ``i`` and ``j``.
    """
    row_lengths = np.linalg.norm(X, axis=1, keepdims=True) + 1e-9
    unit_rows = X / row_lengths
    return np.matmul(unit_rows, unit_rows.T)

def pairwise_position_diff_matrix(n):
    """Return the ``n x n`` integer matrix whose ``[i, j]`` entry is ``|i - j|``."""
    positions = np.arange(n)
    return np.abs(np.subtract.outer(positions, positions))

def compute_metrics(sim_matrix, pos_diff):
    """Summarise the off-diagonal entries of a similarity matrix.

    Args:
        sim_matrix: Square array of pairwise similarities.
        pos_diff: Array of the same shape holding pairwise position
            differences.

    Returns:
        A tuple ``(mean, std, corr)``: the mean and standard deviation of the
        off-diagonal similarities, and the Pearson correlation between the
        off-diagonal position differences and those similarities.
    """
    size = sim_matrix.shape[0]
    # Drop the diagonal: it only holds each row's similarity with itself.
    off_diagonal = np.logical_not(np.eye(size, dtype=bool))
    similarities = sim_matrix[off_diagonal]
    distances = pos_diff[off_diagonal]
    correlation = np.corrcoef(distances, similarities)[0, 1]
    return similarities.mean(), similarities.std(), correlation

print('helpers ready')


In [None]:
# Experiment parameters and embeddings
sentence = "The quick brown fox jumps over the lazy dog"
tokens = sentence.lower().split()
n_tokens = len(tokens)

# Vocabulary: the unique lowercase tokens in sorted order, each mapped to a row index
vocab = sorted(set(tokens))
vocab_size = len(vocab)
token_to_idx = {word: row for row, word in enumerate(vocab)}
indices = [token_to_idx[word] for word in tokens]

d_model = 64

# Random token embeddings (simulated learned embeddings); a token that occurs
# twice in the sentence reuses the same row of the embedding matrix
embedding_matrix = 0.5 * np.random.randn(vocab_size, d_model)
embeddings = embedding_matrix[indices]

# Positional encoding and combined embeddings
pe = positional_encoding(n_tokens, d_model)
emb_no_pe = np.copy(embeddings)
emb_with_pe = embeddings + pe

print('embeddings created')


In [None]:
# Compute similarity matrices and metrics
pos_diff = pairwise_position_diff_matrix(n_tokens)

# Baseline: token embeddings only
sim_no_pe = cosine_similarity_matrix(emb_no_pe)
avg_no, std_no, corr_no = compute_metrics(sim_no_pe, pos_diff)

# Token embeddings with the positional encoding added
sim_with_pe = cosine_similarity_matrix(emb_with_pe)
avg_pe, std_pe, corr_pe = compute_metrics(sim_with_pe, pos_diff)

# One row per setting, one column per metric
results = pd.DataFrame(
    {
        'Setting': ['No Positional Encoding', 'With Positional Encoding'],
        'Avg Cosine Similarity': [avg_no, avg_pe],
        'Std Cosine Similarity': [std_no, std_pe],
        'Corr(position_diff, similarity)': [corr_no, corr_pe],
    }
)

results


In [None]:
# Plot 1: similarity heatmap (No PE)
# Rows and columns are labelled with the tokens in sentence order
tick_positions = np.arange(n_tokens)

plt.figure(figsize=(6, 5))
plt.imshow(sim_no_pe, aspect='auto', interpolation='nearest')
plt.colorbar()
plt.title('Cosine similarity (No Positional Encoding)')
plt.yticks(tick_positions, tokens)
plt.xticks(tick_positions, tokens, rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Plot 2: similarity heatmap (With PE)
# Rows and columns are labelled with the tokens in sentence order
tick_positions = np.arange(n_tokens)

plt.figure(figsize=(6, 5))
plt.imshow(sim_with_pe, aspect='auto', interpolation='nearest')
plt.colorbar()
plt.title('Cosine similarity (With Positional Encoding)')
plt.yticks(tick_positions, tokens)
plt.xticks(tick_positions, tokens, rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Plot 3: PCA 2D projection comparing embeddings
# A single PCA is fitted on both embedding sets stacked together, so the two
# groups of points are drawn in the same 2D coordinate system.
all_emb = np.concatenate((emb_no_pe, emb_with_pe), axis=0)
pca = PCA(n_components=2)
pca2 = pca.fit_transform(all_emb)
proj_no, proj_pe = pca2[:n_tokens], pca2[n_tokens:]

plt.figure(figsize=(8, 6))
plt.title('PCA 2D projection: tokens (No PE) and tokens (With PE)')
# Draw each group as a scatter followed by its token labels
for points, marker, label in ((proj_no, 'o', 'No PE'), (proj_pe, 'x', 'With PE')):
    plt.scatter(points[:, 0], points[:, 1], marker=marker, label=label)
    for (px, py), t in zip(points, tokens):
        plt.text(px, py, f' {t}', fontsize=9)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Plot 4: Bar chart comparing metrics
labels = ['Avg Cosine', 'Std Cosine', 'PosDiff-Corr']
no_vals = [avg_no, std_no, corr_no]
pe_vals = [avg_pe, std_pe, corr_pe]

width = 0.35
x = np.arange(len(labels))

plt.figure(figsize=(7, 5))
plt.title('Comparison of metrics: No PE vs With PE')
# Two bars per metric, shifted half a bar width to either side of the tick
for offset, vals, label in ((-width / 2, no_vals, 'No PE'), (width / 2, pe_vals, 'With PE')):
    plt.bar(x + offset, vals, width, label=label)
plt.xticks(x, labels)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Summary and interpretation
print('Summary metrics:')
print(results.to_string(index=False))

# Fixed reading guide printed below the metrics table, one line per print
interpretation_lines = (
    '\nInterpretation:',
    '- A negative correlation between position difference and similarity indicates that tokens farther apart tend to be less similar; with PE we expect a stronger negative correlation.',
    '- Compare heatmaps and PCA to visually inspect how PE changes embedding geometry.',
)
for line in interpretation_lines:
    print(line)


## Next steps (optional)
- Replace random embeddings with pre-trained word embeddings (e.g., GloVe) for more realistic behavior.
- Use trainable positional embeddings instead of sinusoidal PE and compare.
- Run a small downstream task (e.g., distance-based classifier) to measure empirical performance change.
