$$\Large\boxed{\text{AME 5202 Deep Learning, Even Semester 2026}}$$

$$\large\text{Theme}: \underline{\text{Problem Set 1}}$$

---

Load essential libraries

---

In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns

---

Mount Google Drive folder if running Google Colab

---

In [2]:
## Mount Google drive folder if running in Colab
if('google.colab' in sys.modules):
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2026MAHE'
    DATA_DIR = DIR+'/Data/'
else:
    DATA_DIR = 'Data/'

In [3]:
# Load the Wikipedia-trained GLoVe word vectors (50-dimensional) from the pickle file
with open(DATA_DIR + 'glove_wiki_gigaword_50.pkl', 'rb') as f:
    loaded_word_vectors = pickle.load(f)

---


Basic building blocks of a transformer: form the embeddings matrix $\mathbf{X}$

---

In [None]:
sentence = 'i swam quickly across the river to get to the other bank'
nltk.download('punkt_tab')
tokens = word_tokenize(sentence)
X = np.empty((len(tokens), 50))
X = torch.stack([torch.tensor(loaded_word_vectors.get(token, None), dtype = torch.float64) for token in tokens])
print(tokens)
print(X.shape)

---

Form the non-normalized pairwise similarities matrix $\mathbf{X}\mathbf{X}^\mathrm{T}.$

---

In [None]:
S = torch.matmul(X, X.T)
print(S[-1]) # similarities associated with the word 'bank'

tensor([11.2500, -1.5239, 12.9733, 14.1935, 18.0800, 16.6162, 19.2234, 14.3078,
        19.2234, 18.0800, 15.7850, 36.3920], dtype=torch.float64)


---

Form the softmax-normalized pairwise similarities matrix $\texttt{softmax}(\mathbf{X}\mathbf{X}^\mathrm{T}).$

---

In [None]:
softmax = torch.nn.Softmax(dim = 1)
S = softmax(torch.matmul(X, X.T))
#S = torch.nn.functional.softmax(torch.matmul(X, X.T), dim = 1)
print(S[-1]) # scaled similarities associated with the word 'bank'

tensor([1.2049e-11, 3.4143e-17, 6.7510e-11, 2.2871e-10, 1.1148e-08, 2.5790e-09,
        3.4976e-08, 2.5642e-10, 3.4976e-08, 1.1148e-08, 1.1233e-09, 1.0000e+00],
       dtype=torch.float64)


---

 Which word is most similar to the last word *bank*?

 ---

In [16]:
tokens[torch.argmax(S[-1])]

'bank'

${\mathbf{Y}} = \text{softmax}\left(\mathbf{X}\mathbf{X}^\mathrm{T}\right)\mathbf{X}.$

---

Embeddings matrix updated using softmax-normalized similarities

---

In [17]:
Y = torch.matmul(S, X)
print(Y.shape)

torch.Size([12, 50])


---

Query and key matrix representations of the embeddings 

---

In [18]:
W_q = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
W_k = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
Q = torch.matmul(X, W_q) # query representation of the tokens
K = torch.matmul(X, W_k) # key representation of the tokens
print(Q.shape)
print(K.shape)

torch.Size([12, 8])
torch.Size([12, 8])


---

Using trial and error, find which query and key matrix representations lead to the largest similarity between the last work 'bank' and the word 'swam'

---

In [22]:
max_similarity = -np.inf
for k in range(10000):
    W_q = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
    W_k = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
    Q = torch.matmul(X, W_q) # query representation of the tokens
    K = torch.matmul(X, W_k) # key representation of the tokens
    S = softmax(torch.matmul(Q, K.T))
    if S[-1, 1] > max_similarity:
        max_similarity = S[-1, 1]
        W_q_best = W_q
        W_k_best = W_k

print(max_similarity)
print(W_q_best)
print(W_k_best) 

tensor(1., dtype=torch.float64)
tensor([[ 0.5121, -2.2863, -0.7100, -1.6716,  1.3397,  1.2927,  0.7662, -1.1501],
        [-0.0162, -0.2855, -0.0701, -0.4488, -0.4321, -0.6852, -0.8879, -0.1945],
        [-1.8284, -1.6213,  0.6755, -1.4297,  0.6428, -0.6726,  0.2025, -0.3942],
        [-0.2171,  0.4269,  0.4915,  2.1167, -0.0191,  0.2080, -1.0152, -0.7633],
        [-1.3826, -0.9824, -0.7788, -0.9405, -0.5523, -1.3775,  1.6169, -0.4378],
        [ 0.7077,  0.3358, -0.5310,  1.3346,  2.0044,  0.0132, -0.7951,  0.6220],
        [-1.7504,  1.1676, -1.0791,  0.7659,  0.5480,  0.8782, -0.7841,  0.6917],
        [ 0.2380, -1.4674,  0.4133, -0.1393, -1.0199, -0.3409, -0.3654, -1.5718],
        [ 0.9145, -0.4800,  0.0068, -1.2686,  0.4795, -0.9410,  0.1942, -0.2478],
        [-2.0898, -1.6384, -1.9453,  0.2956,  0.0892, -1.3726,  1.3026, -0.4432],
        [-1.4114, -0.3548,  0.6401,  0.4576, -0.9763,  1.8653, -0.6325,  2.1513],
        [ 0.0491,  1.2467, -0.9593,  1.1594, -0.2259, -0.2397, -0.

---

Recalculate matrix $\mathbf{S}$ using the best query and key matrices and use that to find to which word is the word river most similar to

---

In [23]:
Q = torch.matmul(X, W_q_best)
K = torch.matmul(X, W_k_best)
S = softmax(torch.matmul(Q, K.T))
tokens[torch.argmax(S[-1])]

'swam'