$$\Large\boxed{\text{AME 5202 Deep Learning, Even Semester 2026}}$$

$$\large\text{Theme}: \underline{\text{Problem Set 1}}$$

In [None]:
# Mount Google Drive only when the notebook is actually running inside Colab.
# Importing google.colab unconditionally raises ImportError on any other kernel.
import sys

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


---

Load essential libraries

---

In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns

---

Mount the Google Drive folder if running in Google Colab

---

In [None]:
## Resolve where the data lives; mount Google Drive first when running in Colab
GLOVE_FILENAME = 'glove_wiki_gigaword_50.pkl'
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Folder on the mounted drive that holds the pickled word vectors
    DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/datasets/'
else:
    # NOTE(review): assumes the pickle sits under ./Data/ for local runs - adjust if it is stored elsewhere
    DATA_DIR = 'Data/'

# DIR is the full path of the pickled GloVe vectors. It is defined in BOTH branches
# (the loading cell opens DIR) and keeps its previous value when running in Colab.
DIR = DATA_DIR + GLOVE_FILENAME

Mounted at /content/drive


In [None]:
# Read the pickled 50-dimensional GloVe word vectors (trained on Wikipedia) from the path in DIR.
# pickle.load can execute arbitrary code while deserializing, so only open a file from a trusted source.
with open(DIR, 'rb') as glove_file:
    loaded_word_vectors = pickle.load(glove_file)

---


Basic building blocks of a transformer: form the embeddings matrix $\mathbf{X}$

---

In [None]:
sentence = 'i swam quickly across the river to get to the other bank'
nltk.download('punkt_tab')  # tokenizer data that word_tokenize needs
tokens = word_tokenize(sentence)

# Look every token up once; .get() yields None for an out-of-vocabulary word
token_vectors = [loaded_word_vectors.get(token, None) for token in tokens]
missing_tokens = [token for token, vec in zip(tokens, token_vectors) if vec is None]
if missing_tokens:
    # Fail with the offending words instead of torch's opaque error on torch.tensor(None)
    raise KeyError(f'No GloVe vector found for token(s): {missing_tokens}')

# Embeddings matrix X: one row per token, one column per embedding dimension
X = torch.stack([torch.tensor(vec, dtype=torch.float64) for vec in token_vectors])
print(tokens)
print(X.shape)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['i', 'swam', 'quickly', 'across', 'the', 'river', 'to', 'get', 'to', 'the', 'other', 'bank']
torch.Size([12, 50])


---

Form the non-normalized pairwise similarities matrix $\mathbf{X}\mathbf{X}^\mathrm{T}.$

---

In [None]:
# Raw (unnormalized) similarity scores: entry (i, j) is the dot product of the embeddings of tokens i and j
S = X @ X.T
print(S)
print(S.shape)

tensor([[39.0197,  4.5574, 20.2995, 14.5481, 19.4420, 12.0143, 22.6571, 29.8528,
         22.6571, 19.4420, 17.8858, 11.2500],
        [ 4.5574, 24.7064,  4.2668,  7.2292,  0.9357,  8.8437,  2.3160,  5.4718,
          2.3160,  0.9357, -0.6112, -1.5239],
        [20.2995,  4.2668, 21.2841, 16.5796, 16.1261, 12.0684, 18.8431, 19.4523,
         18.8431, 16.1261, 15.2439, 12.9733],
        [14.5481,  7.2292, 16.5796, 28.8804, 19.3706, 22.9543, 17.9298, 16.6827,
         17.9298, 19.3706, 19.6938, 14.1935],
        [19.4420,  0.9357, 16.1261, 19.3706, 24.6793, 18.2549, 21.1141, 17.5502,
         21.1141, 24.6793, 19.8692, 18.0800],
        [12.0143,  8.8437, 12.0684, 22.9543, 18.2549, 44.2210, 14.7193, 12.0669,
         14.7193, 18.2549, 12.5996, 16.6162],
        [22.6571,  2.3160, 18.8431, 17.9298, 21.1141, 14.7193, 24.5706, 22.2478,
         24.5706, 21.1141, 20.6623, 19.2234],
        [29.8528,  5.4718, 19.4523, 16.6827, 17.5502, 12.0669, 22.2478, 30.1930,
         22.2478, 17.5502, 19.

In [None]:
# 'bank' is the final token, so the last row of S holds its similarity to every token in the sentence
bank_similarities = S[-1]
print(bank_similarities)

tensor([11.2500, -1.5239, 12.9733, 14.1935, 18.0800, 16.6162, 19.2234, 14.3078,
        19.2234, 18.0800, 15.7850, 36.3920], dtype=torch.float64)


---

Form the softmax-normalized pairwise similarities matrix $\texttt{softmax}(\mathbf{X}\mathbf{X}^\mathrm{T}).$

---

In [None]:
# Row-wise softmax: every row of S becomes a set of non-negative weights that sum to 1.
# `softmax` stays a named module because later cells call it again.
softmax = torch.nn.Softmax(dim = 1)
# Always start from X, so re-running this cell never applies softmax to an already
# normalized S. Equivalent to torch.nn.functional.softmax(torch.matmul(X, X.T), dim=1).
S = softmax(torch.matmul(X, X.T))
print(S[-1])

tensor([1.2049e-11, 3.4143e-17, 6.7510e-11, 2.2871e-10, 1.1148e-08, 2.5790e-09,
        3.4976e-08, 2.5642e-10, 3.4976e-08, 1.1148e-08, 1.1233e-09, 1.0000e+00],
       dtype=torch.float64)


---

 Which word is most similar to the last word *bank*?

 ---

In [None]:
# Position of the largest weight in bank's row (the last row of S), mapped back to its token
bank_best_match = int(torch.argmax(S[-1]))
tokens[bank_best_match]

'bank'

---

Embeddings matrix updated using softmax-normalized similarities

---

In [None]:
# Each row of Y is a weighted combination of the rows of X, using the weights in the matching row of S
Y = S @ X
print(Y)
print(Y.shape)

tensor([[ 1.1891e-01,  1.5251e-01, -8.1998e-02, -7.4143e-01,  7.5912e-01,
         -4.8325e-01, -3.1015e-01,  5.1475e-01, -9.8697e-01,  6.4455e-04,
         -1.5045e-01,  8.3766e-01, -1.0796e+00, -5.1453e-01,  1.3188e+00,
          6.2006e-01,  1.3783e-01,  4.7104e-01, -7.2844e-02, -7.2679e-01,
         -7.4111e-01,  7.5261e-01,  8.8176e-01,  2.9561e-01,  1.3547e+00,
         -2.5700e+00, -1.3522e+00,  4.5877e-01,  1.0068e+00, -1.1856e+00,
          3.4737e+00,  7.7902e-01, -7.2928e-01,  2.5101e-01, -2.6152e-01,
         -3.4679e-01,  5.5837e-01,  7.5094e-01,  4.9831e-01, -2.6830e-01,
         -2.7820e-03, -1.8266e-02, -2.8093e-01,  5.5320e-01,  3.7693e-02,
          1.8551e-01, -1.5025e-01, -5.7506e-01, -2.6670e-01,  9.2118e-01],
        [-3.5303e-01,  3.6953e-01,  4.7266e-01,  1.3832e-02, -1.6484e-01,
         -5.2687e-01, -7.3986e-01,  1.2058e+00,  1.1147e+00, -4.6772e-01,
          1.7928e-01, -5.9239e-01,  2.5257e-01,  4.3449e-01,  6.7023e-01,
         -2.8594e-01,  7.3105e-01,  3

---

Query and key matrix representations of the embeddings

---

In [None]:
# Seed the global RNG so the random projections are identical on every Restart & Run All
torch.manual_seed(0)

embed_dim = X.shape[1]  # width of the embeddings, read from X instead of hard-coding 50
proj_dim = 8            # width of the query / key representations

# Random projection matrices with standard-normal entries
W_q = torch.normal(mean = 0, std = 1, size = (embed_dim, proj_dim), dtype = torch.float64)
W_k = torch.normal(mean = 0, std = 1, size = (embed_dim, proj_dim), dtype = torch.float64)

# Query and key representation of every token: one row per token
Q = torch.matmul(X, W_q)
K = torch.matmul(X, W_k)
print(Q.shape)
print(K.shape)

torch.Size([12, 8])
torch.Size([12, 8])


---

Using trial and error, find which query and key matrix representations lead to the largest similarity between the last word 'bank' and the word 'swam'

---

In [None]:
# Baseline before the search: entry of S in bank's row (last) and swam's column (index 1)
bank_row, swam_col = -1, 1
S[bank_row, swam_col]

tensor(3.4143e-17, dtype=torch.float64)

In [None]:
# Random search: draw many (W_q, W_k) pairs and keep the pair that gives the
# largest softmax-normalized similarity from 'bank' (query row) to 'swam' (key column).
N_TRIALS = 10000
bank_idx = len(tokens) - 1       # 'bank' is the final token of the sentence
swam_idx = tokens.index('swam')  # looked up rather than hard-coded

# Seed so the search, and therefore the reported best matrices, is reproducible
torch.manual_seed(0)

max_similarity = -np.inf
for trial in range(N_TRIALS):
    W_q = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
    W_k = torch.normal(mean = 0, std = 1, size = (50, 8), dtype = torch.float64)
    Q = torch.matmul(X, W_q)
    K = torch.matmul(X, W_k)
    # Functional softmax over each row, so this cell does not rely on an object created in another cell
    S = torch.nn.functional.softmax(torch.matmul(Q, K.T), dim = 1)
    # Strict '>' keeps the first pair that reaches the best value seen so far
    if S[bank_idx, swam_idx] > max_similarity:
        max_similarity = S[bank_idx, swam_idx]
        W_q_best = W_q
        W_k_best = W_k

print(max_similarity)
print(W_q_best)
print(W_k_best)

tensor(1., dtype=torch.float64)
tensor([[ 4.2227e-02, -8.2224e-01,  8.0329e-01,  4.4207e-01,  8.9340e-01,
          7.6784e-01,  1.9494e+00,  1.2373e+00],
        [ 6.6148e-01,  7.0856e-01, -7.7646e-01, -1.2664e+00, -3.8334e-01,
         -4.5948e-01, -1.0708e+00, -1.2484e+00],
        [-7.8507e-01,  1.4995e+00, -1.4451e+00, -8.0298e-01,  1.1567e+00,
          1.8339e+00, -1.6501e+00, -2.3492e-01],
        [ 7.8473e-01,  7.2872e-01, -1.3325e+00,  4.1067e-01,  1.5609e+00,
          6.2829e-01,  3.3232e-02, -1.9008e-01],
        [-7.3750e-01, -3.9683e-01, -1.5846e+00, -5.3113e-02,  3.0105e-02,
          4.5472e-02,  6.7885e-01,  5.6149e-01],
        [ 1.2867e+00,  1.5496e-01, -4.7439e-01, -1.4122e+00, -7.6336e-01,
         -7.1954e-02, -8.1696e-01, -1.2752e-01],
        [ 2.0946e-01, -4.3587e-02, -9.7411e-01, -5.1895e-01, -1.2282e-02,
         -2.2301e+00,  1.5288e+00, -5.3583e-01],
        [ 3.1739e-01, -2.9016e-02,  1.2230e+00,  6.0498e-01,  2.8497e-01,
         -6.4933e-01,  8.6745e-01

---

Recalculate matrix $\mathbf{S}$ using the best query and key matrices, and use it to find the word to which the word *river* is most similar

---

In [None]:
# Rebuild the attention weights from the best projections found by the search
Q = torch.matmul(X, W_q_best)
K = torch.matmul(X, W_k_best)
S = softmax(torch.matmul(Q, K.T))
# The question is about 'river', so read river's row of S; the last row belongs to 'bank'
river_idx = tokens.index('river')
tokens[torch.argmax(S[river_idx])]

'swam'