$$\Large\boxed{\text{AME 5202 Deep Learning, Even Semester 2026}}$$

$$\large\text{Theme}: \underline{\text{linear algebra computational foundation for deep learning}}$$

In [None]:
# Mount Google Drive, but only when the notebook really runs on Google Colab.
# The google.colab package exists only there, so an unguarded import would
# stop a local "Restart & Run All" at this very first cell.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: there is no Drive to mount.
    pass
else:
    drive.mount('/content/drive')

Mounted at /content/drive


---

Load essential libraries

---

In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns

---

Mount Google Drive folder if running Google Colab

---

In [None]:
## Mount Google drive folder if running in Colab, then resolve the data paths.
# Both branches define DATA_DIR (folder holding the datasets) and DIR (full path
# of the pickled GloVe vectors), so the loading cell works in either environment.
GLOVE_FILE = 'glove_wiki_gigaword_50.pkl'

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/datasets/'
else:
    # NOTE(review): assumes the pickle sits in the local Data/ folder — confirm.
    DATA_DIR = 'Data/'

# Full path of the word-vector pickle opened later in the notebook
DIR = DATA_DIR + GLOVE_FILE

Mounted at /content/drive





---

**We will now use PyTorch to create tensors**

The patient data matrix:

![patient data matrix](https://1drv.ms/i/s!AjTcbXuSD3I3hsxIkL4V93-CGq8RkQ?embed=1&width=1000)

**Notation**:

Zeroth patient vector $\mathbf{x}^{(0)}= \begin{bmatrix}72\\120\\37.3\\104\\32.5\end{bmatrix}$ and zeroth feature (heart rate vector) $\mathbf{x}_0 = \begin{bmatrix}72\\85\\68\\90\\84\\78\end{bmatrix}.$

---



In [None]:
## Patient data matrix as a constant tensor: one row per patient, one column per feature
X = torch.tensor(
    [
        [72, 120, 37.3, 104, 32.5],
        [85, 130, 37.0, 110, 14],
        [68, 110, 38.5, 125, 34],
        [90, 140, 38.0, 130, 26],
        [84, 132, 38.3, 146, 30],
        [78, 128, 37.2, 102, 12],
    ],
    dtype=torch.float64,
)

# The whole matrix, its shape and its Python type
print(X)
print(X.size())
print(type(X))

# Rows are patients: a single index returns a rank-1 tensor
print(X[0])      # patient 0
print(X[0, :])   # patient 0 again, with the column slice written out
print(X[0, 2])   # feature 2 (temperature) of patient 0
print(X[-1])     # the last patient (patient 5)

# Columns are features
print(X[:, 1])   # feature 1 (BP) of all the patients


tensor([[ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000],
        [ 85.0000, 130.0000,  37.0000, 110.0000,  14.0000],
        [ 68.0000, 110.0000,  38.5000, 125.0000,  34.0000],
        [ 90.0000, 140.0000,  38.0000, 130.0000,  26.0000],
        [ 84.0000, 132.0000,  38.3000, 146.0000,  30.0000],
        [ 78.0000, 128.0000,  37.2000, 102.0000,  12.0000]],
       dtype=torch.float64)
torch.Size([6, 5])
<class 'torch.Tensor'>
tensor([ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000], dtype=torch.float64)
tensor([ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000], dtype=torch.float64)
tensor(37.3000, dtype=torch.float64)
tensor([ 78.0000, 128.0000,  37.2000, 102.0000,  12.0000], dtype=torch.float64)
tensor([120., 130., 110., 140., 132., 128.], dtype=torch.float64)


---

**Convert a PyTorch object into a numpy array**

---

In [None]:
# Detach the tensor from the autograd graph, then view its data as a NumPy array
X.detach().numpy()

array([[ 72. , 120. ,  37.3, 104. ,  32.5],
       [ 85. , 130. ,  37. , 110. ,  14. ],
       [ 68. , 110. ,  38.5, 125. ,  34. ],
       [ 90. , 140. ,  38. , 130. ,  26. ],
       [ 84. , 132. ,  38.3, 146. ,  30. ],
       [ 78. , 128. ,  37.2, 102. ,  12. ]])

---

**Addition and subtraction of vectors, scalar multiplication (apply operation componentwise)**

![vector addition](https://1drv.ms/i/c/37720f927b6ddc34/IQQ03G17kg9yIIA3NokBAAAAAZLAaAoWwhtn8Vk26NotALo?width=256)

![vector subtracton](https://1drv.ms/i/c/37720f927b6ddc34/IQQ03G17kg9yIIA3M4kBAAAAAU_n_mAEv006QFZm_sUj2Dc?width=256)

![vector multiplication](https://1drv.ms/i/c/37720f927b6ddc34/IQQ03G17kg9yIIA3NIkBAAAAAa_qL04bLT4kWoNeHcrR9LQ?width=256)

![vector geometry1](https://1drv.ms/i/c/37720f927b6ddc34/IQSGNMr5z3SSRry7LSKL7LybAcGYuzgw5smabV8-6DudXIs?width=230)

![vector geometry2](https://1drv.ms/i/c/37720f927b6ddc34/IQQ03G17kg9yIIA3WokBAAAAAQi8FPV9YCebl5WnyEKJ3vg?width=213&height=192)


---

Rank 0: a scalar value

Rank 1: a 1-dimensional array (vector)

Rank 2: a 2-dimensional array (matrix)

In [None]:
# Convert the temperature column (feature 2) to Fahrenheit: F = (9/5) * C + 32
(9/5) * X[:,2] +32      # the scalars 9/5 and 32 are broadcast to every entry of the column

tensor([ 99.1400,  98.6000, 101.3000, 100.4000, 100.9400,  98.9600],
       dtype=torch.float64)

In [None]:
# Average patient "by hand": add the six patient vectors, then scale by 1/6
patient_sum = X[0] + X[1] + X[2] + X[3] + X[4] + X[5]
x_avg = (1/6) * patient_sum
print(x_avg)

tensor([ 79.5000, 126.6667,  37.7167, 119.5000,  24.7500], dtype=torch.float64)


In [None]:
# Average over dim 0 (the patients): one mean per feature column
torch.mean(X, dim=0)

tensor([ 79.5000, 126.6667,  37.7167, 119.5000,  24.7500], dtype=torch.float64)

In [None]:
# Vector addition: patient 1 + patient 2, component by component
print(X[1, :] + X[2, :])
print("#############################")

# Vector subtraction: patient 1 - patient 2
print(X[1, :] - X[2, :])
print("#############################")

# Scalar-vector multiplication: temperature in Fahrenheit.
# The result is printed explicitly because a notebook only displays a bare
# expression when it is the last statement of the cell.
print((9/5) * X[:, 2] + 32)
print("#############################")

# Average patient, computed by hand
x_avg = (1/6)*(X[0, :] + X[1, :] + X[2, :] + X[3, :] + X[4, :] + X[5, :])
print(x_avg)
print("#############################")

# Average patient, using the in-built torch function
x_avg = torch.mean(X, dim=0)
print(x_avg)
print("#############################")

# Another broadcasting example: x_avg (5 entries) is subtracted from every row of X
print(X - x_avg)
print("#############################")

tensor([153.0000, 240.0000,  75.5000, 235.0000,  48.0000], dtype=torch.float64)
#############################
tensor([ 17.0000,  20.0000,  -1.5000, -15.0000, -20.0000], dtype=torch.float64)
#############################
tensor([ 79.5000, 126.6667,  37.7167, 119.5000,  24.7500], dtype=torch.float64)
#############################
tensor([ 79.5000, 126.6667,  37.7167, 119.5000,  24.7500], dtype=torch.float64)
#############################
tensor([[ -7.5000,  -6.6667,  -0.4167, -15.5000,   7.7500],
        [  5.5000,   3.3333,  -0.7167,  -9.5000, -10.7500],
        [-11.5000, -16.6667,   0.7833,   5.5000,   9.2500],
        [ 10.5000,  13.3333,   0.2833,  10.5000,   1.2500],
        [  4.5000,   5.3333,   0.5833,  26.5000,   5.2500],
        [ -1.5000,   1.3333,  -0.5167, -17.5000, -12.7500]],
       dtype=torch.float64)
#############################


---

Application of vector subtraction in natural language processing (NLP): download the word embedding model trained on Wikipedia articles.

---

In [None]:
# Load the Wikipedia-trained GLoVe word vectors (50-dimensional) from the pickle file
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only open a pickle that comes from a trusted source.
with open(DIR , 'rb') as f:
    loaded_word_vectors = pickle.load(f)

In [None]:
# Look up the embedding stored under the key 'king'
# (presumably a dict-like mapping from word to vector — TODO confirm)
loaded_word_vectors.get('king')

array([ 0.50451 ,  0.68607 , -0.59517 , -0.022801,  0.60046 , -0.13498 ,
       -0.08813 ,  0.47377 , -0.61798 , -0.31012 , -0.076666,  1.493   ,
       -0.034189, -0.98173 ,  0.68229 ,  0.81722 , -0.51874 , -0.31503 ,
       -0.55809 ,  0.66421 ,  0.1961  , -0.13495 , -0.11476 , -0.30344 ,
        0.41177 , -2.223   , -1.0756  , -1.0783  , -0.34354 ,  0.33505 ,
        1.9927  , -0.04234 , -0.64319 ,  0.71125 ,  0.49159 ,  0.16754 ,
        0.34344 , -0.25663 , -0.8523  ,  0.1661  ,  0.40102 ,  1.1685  ,
       -1.0137  , -0.21585 , -0.15155 ,  0.78321 , -0.91241 , -1.6106  ,
       -0.64426 , -0.51042 ], dtype=float32)

In [None]:
# Component-wise difference between the 'king' and 'queen' embeddings
loaded_word_vectors.get('king') - loaded_word_vectors.get('queen')

array([ 0.12596998, -1.1372299 ,  0.66962993,  0.081499  ,  0.24217   ,
       -0.73527   ,  0.08725001, -0.36390004, -0.561182  ,  0.44783002,
       -0.30347598,  0.50713   , -0.640059  , -0.66753995,  0.39352003,
        0.25708997,  0.25581998, -0.386451  ,  0.01601005,  0.45079002,
       -0.38064003, -0.52175   ,  0.01098001, -0.58356   ,  0.13042   ,
       -0.41770005, -0.03350008, -0.88575   ,  0.21020997,  0.389576  ,
        0.4353    , -0.43530002, -0.39569002,  0.36874   ,  0.03794   ,
        0.00517   , -0.18120003, -0.186358  , -0.01485997,  1.1987001 ,
       -0.05844   ,  0.91547996, -0.83533   ,  0.51813   ,  0.0487    ,
        0.54850996, -0.35146004,  0.6733    , -0.6535353 ,  0.09241998],
      dtype=float32)

---

Now we will see what embedding vector comes as a result of applying the model for the words *cricket* and *football*.

Next, we will do an *intuitive* subtraction of word embeddings as in

1. Cricket without Tendulkar
2. Football without Messi

Note that the embedding vectors have 50 components corresponding to the 50-dimensional embedding of model suggested by the name '**glove-wiki-gigaword-50**'

---

In [None]:
# Cricket without Tendulkar
cricket_vec = loaded_word_vectors.get('cricket', None)
tendulkar_vec = loaded_word_vectors.get('tendulkar', None)
a = cricket_vec - tendulkar_vec

# Football without Messi
football_vec = loaded_word_vectors.get('football', None)
messi_vec = loaded_word_vectors.get('messi', None)
b = football_vec - messi_vec

print(a)
print(b)

# Component-wise gap between cricket-without-tendulkar and
# football-without-messi
gap = a - b
print(gap)

[-0.7716      0.41267997 -1.725968   -0.10445005 -1.1475699  -0.854661
 -1.089      -0.08342999  0.62349    -1.67822    -0.2488078  -0.49199998
  0.18756002 -1.67098     0.6117872   0.42784432  1.05656     0.91583097
 -0.03299999 -0.04422501  0.200326   -0.33737004  0.31068     1.37842
 -1.13689    -0.57445    -0.70685995  0.41552    -0.28937     0.54485
  1.0492998   0.62732    -0.8105     -1.27723    -0.02612001  0.53963
 -0.14065999 -0.738244   -0.30487    -1.18129     0.05651999 -0.993618
 -0.911399   -0.09289992  0.535432    0.26259995 -0.63031     0.64473
  0.77843     0.15099996]
[-2.06898     0.66804904 -1.077512    0.79964995 -0.27109998 -0.26289004
 -0.881       0.377503   -0.10869002 -2.47329    -0.23453003 -0.58438
  0.10404003 -0.52671003 -0.03030002  0.237764    0.19168997  1.60344
 -0.42980003  0.59058     0.59800005 -0.67075     0.45888     1.4538
 -1.15642    -1.63534    -1.1248189  -0.20879    -0.00812     0.25545004
  1.92044     0.30049008  0.19949001 -0.675167   -0

---

Understanding pen & paper versions of tensors w.r.t. their representations in the code

---

In [None]:
# Pen & paper: 3-vector     -> code: rank-1 tensor (one pair of brackets)
a_vector = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float64)
# Pen & paper: 1x3-matrix   -> code: rank-2 tensor (one row holding three entries)
a_matrix_version1 = torch.tensor([[1.0, 2.0, 3.0]], dtype=torch.float64)
# Pen & paper: 3x1-matrix   -> code: rank-2 tensor (three rows holding one entry each)
a_matrix_version2 = torch.tensor([[1.0], [2.0], [3.0]], dtype=torch.float64)

# Show each tensor followed by its shape
print(a_vector, a_vector.shape, '-------', sep='\n')
print(a_matrix_version1, a_matrix_version1.shape, '-------', sep='\n')
print(a_matrix_version2, a_matrix_version2.shape, sep='\n')

tensor([1., 2., 3.], dtype=torch.float64)
torch.Size([3])
-------
tensor([[1., 2., 3.]], dtype=torch.float64)
torch.Size([1, 3])
-------
tensor([[1.],
        [2.],
        [3.]], dtype=torch.float64)
torch.Size([3, 1])


---

A tensor of rank 3 corresponding to 4 time stamps (hourly), 3 samples (patients), 2 features (HR and BP). Assume that admission time is 9AM.

---

In [None]:
# Rank-3 patient tensor of shape (4, 3, 2):
#   dim-0 -> 4 hourly timestamps
#   dim-1 -> 3 patients
#   dim-2 -> 2 features, in the order [HR, BP]
# Each outer entry below is one timestamp; inside it, each [HR, BP] pair is one patient.
T = torch.tensor([
    [[74., 128], [79, 116], [71, 116]],   # timestamp 0
    [[78, 118], [82, 124], [72, 128]],    # timestamp 1
    [[84, 138], [84, 130], [74, 120]],    # timestamp 2
    [[82, 126], [76, 156], [82, 132]],    # timestamp 3
])
print(T)
print(T.size())

tensor([[[ 74., 128.],
         [ 79., 116.],
         [ 71., 116.]],

        [[ 78., 118.],
         [ 82., 124.],
         [ 72., 128.]],

        [[ 84., 138.],
         [ 84., 130.],
         [ 74., 120.]],

        [[ 82., 126.],
         [ 76., 156.],
         [ 82., 132.]]])
torch.Size([4, 3, 2])


---

**Accessing elements of a tensor**

---

In [None]:
## Indexing into the rank-3 tensor T, whose axes are (timestamps, patients, features)

# One index per axis picks a single number:
# timestamp 3, patient 2, feature 1 -> the BP of patient-2 at noon
print(T[3][2][1])

# Fixing only dim-0 returns every patient's info at that timestamp
print(T[0, :, :])    # patients' info at admission time
print(T[-1, :, :])   # patients' info at the last timestamp

# Fixing dim-0 and dim-1: the last admitted patient's info at noon
print(T[3, 2, :])

tensor(132.)
tensor([[ 74., 128.],
        [ 79., 116.],
        [ 71., 116.]])
tensor([[ 82., 126.],
        [ 76., 156.],
        [ 82., 132.]])
tensor([ 82., 132.])


---

Understanding shapes

---

In [None]:
# The nesting depth of the brackets sets the rank. Alternatives to try:
#   a = torch.tensor([1.0, 2.0, 3.0])        # rank-1 tensor, a 3-vector in pen & paper
#   a = torch.tensor([[1.0, 2.0, 3.0]])      # rank-2 tensor, a 1x3-matrix in pen & paper
#   a = torch.tensor([[1.0], [2.0], [3.0]])  # rank-2 tensor, a 3x1-matrix in pen & paper
# NOTE: this rebinds `a`, which an earlier cell used for the
# cricket-without-tendulkar word-vector difference.
a = torch.tensor([[[1.0, 2.0, 3.0]]])  # rank-3 tensor, a 1x1x3-tensor in pen & paper
print(a, a.shape, sep='\n')

tensor([[[1., 2., 3.]]])
torch.Size([1, 1, 3])


---

Broadcasting example that does not work

---

In [None]:
# Goal: compare every patient with patient-0 at every timestamp
T_patient0 = T[:, 0]   # patient 0 across all timestamps and both features

# Show both operands, then their shapes
print(T, T_patient0, sep='\n')
print(T.shape, T_patient0.shape, sep='\n')

# Subtracting directly does not work: shapes (4, 3, 2) and (4, 2) are aligned
# from the right, which pairs the patient axis (3) with the timestamp axis (4).
#print(T - T_patient0)

tensor([[[ 74., 128.],
         [ 79., 116.],
         [ 71., 116.]],

        [[ 78., 118.],
         [ 82., 124.],
         [ 72., 128.]],

        [[ 84., 138.],
         [ 84., 130.],
         [ 74., 120.]],

        [[ 82., 126.],
         [ 76., 156.],
         [ 82., 132.]]])
tensor([[ 74., 128.],
        [ 78., 118.],
        [ 84., 138.],
        [ 82., 126.]])
torch.Size([4, 3, 2])
torch.Size([4, 2])


---

Broadcasting example that works

---

In [None]:
# Patient 0 across all timestamps: shape (4, 2)
T_patient0 = T[:, 0, :]

# Add a new dimension to a tensor using the unsqueeze() function.
# Inserting a size-1 axis at position 1 restores the patient axis: shape (4, 1, 2)
T_patient0_new = torch.unsqueeze(T_patient0, 1)
print(T_patient0_new.shape)

# Now (4, 3, 2) - (4, 1, 2) broadcasts: the size-1 patient axis is stretched to 3,
# so patient 0's values are subtracted from every patient at each timestamp.
print(T - T_patient0_new)

torch.Size([4, 1, 2])


---

**Exercise**: interpret $\texttt{T[:, -1, :]}$

---

In [None]:
# Interpret T[:, -1, :]
# -1 on dim-1 (patients) selects the last patient; the two ':' keep every
# timestamp and both features, so the result has shape (4, 2).
T[:, -1, :]

tensor([[ 71., 116.],
        [ 72., 128.],
        [ 74., 120.],
        [ 82., 132.]])

---

$l_2$ norm or the geometric length of a vector denoted as $\lVert \mathbf{a}\rVert_2$ tells us how long a vector is. In 2-dimensions, $$\mathbf{a}=\begin{bmatrix}a_1\\a_2\end{bmatrix}\Rightarrow \lVert\mathbf{a}\rVert_2 = \sqrt{a_1^2+a_2^2}$$ and in $n$-dimensions, $$\mathbf{a}=\begin{bmatrix}a_1\\a_2\\\vdots\\a_n\end{bmatrix}\Rightarrow\lVert \mathbf{a}\rVert_2 = \sqrt{a_1^2+a_2^2+\cdots+a_n^2}.$$

![vector norm](https://1drv.ms/i/c/37720f927b6ddc34/IQT817WmpQjlRqZ1R0d5Cfv6AUW6c4robL-gk06i9wmCaFU?width=500)

---

In [None]:
## l2 norm (geometric length) of a vector
x = torch.tensor([76.0, 124.0], dtype=torch.float64)
print(x)

# sqrt(76^2 + 124^2); as the cell's last expression, the value is displayed
x.norm()

tensor([ 76., 124.], dtype=torch.float64)


tensor(145.4373, dtype=torch.float64)

---

Application of norm: how different is 'cricket-without-tendulkar' compared to 'football-without-messi'?

---

In [None]:
# Back to 'cricket-without-tendulkar' and 'football-without-messi'.
# The differences are rebuilt here instead of being read from `a` and `b`:
# the "Understanding shapes" cell rebinds `a` to the tensor [[[1., 2., 3.]]],
# so on a top-to-bottom run `a` no longer holds the word-vector difference.
cricket_wo_tendulkar = (loaded_word_vectors.get('cricket', None)
                        - loaded_word_vectors.get('tendulkar', None))
football_wo_messi = (loaded_word_vectors.get('football', None)
                     - loaded_word_vectors.get('messi', None))

print(torch.norm(torch.tensor(cricket_wo_tendulkar))) # norm of 'cricket-without-tendulkar'
print(torch.norm(torch.tensor(football_wo_messi))) # norm of 'football-without-messi'

tensor(3.7417)
tensor(6.0760)


  print(torch.norm(torch.tensor(a))) # norm of 'cricket-without-tendulkar'



---

**Dot Product of Vectors**

A scalar resulting from an elementwise multiplication and addition: $$\mathbf{a}{\color{cyan}\cdot}\mathbf{b} = {\color{red}{a_1b_1}}+{\color{green}{a_2b_2}}+\cdots+{\color{magenta}{a_nb_n}}$$

The <font color="cyan">dot</font> ${\color{cyan}\cdot}$ represents the computation of the dot product.


---

In [None]:
## Dot product of vectors
a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float64)
b = torch.tensor([4.0, 5.0, 6.0], dtype=torch.float64)

# Hadamard product: multiply matching components, keep them as a vector
print(torch.mul(a, b))

# Dot product: multiply matching components, then add them up into a scalar
print(a.dot(b))

tensor([ 4., 10., 18.], dtype=torch.float64)
tensor(32., dtype=torch.float64)


---

The dot product is a measure of similarity between vectors (or, how aligned they are geometrically).

![dot product](https://1drv.ms/i/c/37720f927b6ddc34/IQTbcGSjdbhSTJ7J39d5BCWAAWS6-y5U6J87vHuDWeAqGwM?width=6000)

---

In [None]:
# Reference vector
a = torch.tensor([1.0, 2.0])

# Three vectors to compare against a
b = torch.tensor([2.0, 4.0])     # same direction as a
c = torch.tensor([-2.0, 1.0])    # perpendicular to a
d = torch.tensor([-1.0, -2.0])   # opposite direction to a

# Positive, zero and negative dot products respectively
for other in (b, c, d):
    print(torch.dot(a, other))

tensor(10.)
tensor(0.)
tensor(-5.)


---

Cauchy-Schwarz inequality: for any two nonzero vectors $\mathbf{x}$ and $\mathbf{y},$ it is always true that $$-1\leq\frac{\mathbf{x}\cdot{\mathbf{y}}}{\lVert\mathbf{x}\rVert_2\lVert\mathbf{y}\rVert_2}\leq1.$$

This is a normalized measure of similarity (or extent of alignment) between vectors, which is also referred to as the cosine similarity.

This helps define the angle between two vectors $\mathbf{x}$ and $\mathbf{y}$ as $$\angle(\mathbf{x},\mathbf{y}) = \cos^{-1}\left(\frac{\mathbf{x}\cdot{\mathbf{y}}}{\lVert\mathbf{x}\rVert_2\lVert\mathbf{y}\rVert_2}\right)$$ which is a value from $0$ through $\pi$ radians.

Two ways to measure the difference between two vectors:

![angle](https://1drv.ms/i/c/37720f927b6ddc34/IQQ03G17kg9yIIA3WokBAAAAAQi8FPV9YCebl5WnyEKJ3vg?width=213&height=400)


---

In [None]:
x = torch.tensor([1.0, 2.0])
y = torch.tensor([2.0, 1.0])

# Linear difference between x & y
print(torch.norm(x - y))

# Cosine similarity of x and y. It is clamped to [-1, 1] because floating-point
# rounding can push the ratio marginally outside that range for (anti-)parallel
# vectors, in which case acos would return nan.
cos_xy = torch.clamp(torch.dot(x, y) / (torch.norm(x) * torch.norm(y)), -1.0, 1.0)

# Angle between x and y in radians
angle_rad = torch.acos(cos_xy)
print(angle_rad)

# Angle between x and y in degrees
print((180.0/torch.pi)*angle_rad)


tensor(1.4142)
tensor(0.6435)
tensor(36.8699)


---

Application of the Cauchy-Schwarz inequality: how different is "cricket without tendulkar" from "football without messi"?

---

In [None]:
# 'cricket-without-tendulkar' and 'football-without-messi' as float64 tensors
a = torch.tensor(loaded_word_vectors.get('cricket', None) - loaded_word_vectors.get('tendulkar', None),
                 dtype = torch.float64)
b = torch.tensor(loaded_word_vectors.get('football', None) - loaded_word_vectors.get('messi', None),
                 dtype = torch.float64)

# Linear difference between a and b
print(torch.norm(a-b))

# Cosine similarity of a and b. It is clamped to [-1, 1] because floating-point
# rounding can push the ratio marginally outside that range for (anti-)parallel
# vectors, in which case acos would return nan.
cos_ab = torch.clamp(torch.dot(a, b) / (torch.norm(a) * torch.norm(b)), -1.0, 1.0)

# Angle difference between a and b in radians
angle_rad = torch.acos(cos_ab)
print(angle_rad)

# Angle difference between a and b in degrees
print((180.0/torch.pi)*angle_rad)

tensor(4.2349, dtype=torch.float64)
tensor(0.7420, dtype=torch.float64)
tensor(42.5126, dtype=torch.float64)



---

**Hadamard Product of Vectors**

A vector resulting from an elementwise multiplication: $$\mathbf{a}{\color{cyan}\otimes}\mathbf{b} = \begin{bmatrix}{\color{red}{a_1\times b_1}}\\{\color{green}{a_2\times b_2}}\\\vdots\\{\color{magenta}{a_n\times b_n}}\end{bmatrix}.$$

The $\color{cyan}\otimes$ represents the computation of the Hadamard product.

---

In [None]:
## Hadamard product
a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float64)
b = torch.tensor([4.0, 5.0, 6.0], dtype=torch.float64)

# The * operator multiplies tensors element by element (same as torch.mul)
print(a * b)

tensor([ 4., 10., 18.], dtype=torch.float64)


---


A matrix-vector product is simply a sequence of dot products of the rows of the matrix (seen as vectors) with the vector

![matvec product](https://1drv.ms/i/c/37720f927b6ddc34/IQQ1cQ8fZdFmS4cnGkBlsZbAAaL2zMtzWdjHe-HCMt4UTA0?width=700)

---

In [None]:
## Matrix-vector product
A = torch.tensor([
    [1.0, 2.0, 4.0],
    [2.0, -1.0, 3.0],
])
x = torch.tensor([4.0, 2.0, -2.0])

# Each entry of the result is the dot product of one row of A with x
# (the @ operator is shorthand for torch.matmul)
print(A @ x)


tensor([0., 0.])


---

A matrix-matrix product is simply a sequence of matrix-vector products.

![matmatprod](https://1drv.ms/i/c/37720f927b6ddc34/IQQ-B3z7tbWHQqBrW9k2ElDVAUc5fWzM24txLkgBK7f8Yac?width=550)


---

In [None]:
## Matrix-matrix product
# A is 2x3
A = torch.tensor([
    [1.0, 2.0, 4.0],
    [2.0, -1.0, 3.0],
], dtype=torch.float64)

# B is 3x2
B = torch.tensor([
    [4, -1],
    [2, 0],
    [-2, 3],
], dtype=torch.float64)

# (2x3) times (3x2) gives a 2x2 result
print(A @ B)

tensor([[ 0., 11.],
        [ 0.,  7.]], dtype=torch.float64)


---

Matrix-matrix product using patient data matrix and a weights matrix:

![patient dataset](https://1drv.ms/i/s!AjTcbXuSD3I3hspfrgklysOtJMOjaA?embed=1&width=800)

$$\mathbf{Z} = \mathbf{XW}.$$

$\mathbf{Z}$ is called the raw scores matrix.

---

In [None]:
# Patients data matrix: 6 patients (rows) x 5 features (columns)
X = torch.tensor(
    [
        [72, 120, 37.3, 104, 32.5],
        [85, 130, 37.0, 110, 14],
        [68, 110, 38.5, 125, 34],
        [90, 140, 38.0, 130, 26],
        [84, 132, 38.3, 146, 30],
        [78, 128, 37.2, 102, 12],
    ],
    dtype=torch.float64,
)
print(f'Patient data matrix X:\n {X}')

# Weights matrix: 5 features (rows) x 3 neurons (columns)
W = torch.tensor(
    [
        [-0.1, 0.5, 0.3],
        [0.9, 0.3, 0.5],
        [-1.5, 0.4, 0.1],
        [0.1, 0.1, -1.0],
        [-1.2, 0.5, -0.8],
    ],
    dtype=torch.float64,
)
print(f'Weights matrix:\n {W}')

# Raw scores: (6x5) times (5x3) gives one row per patient, one column per neuron
Z = X @ W
print("#################")
print(Z)

Patient data matrix X:
 tensor([[ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000],
        [ 85.0000, 130.0000,  37.0000, 110.0000,  14.0000],
        [ 68.0000, 110.0000,  38.5000, 125.0000,  34.0000],
        [ 90.0000, 140.0000,  38.0000, 130.0000,  26.0000],
        [ 84.0000, 132.0000,  38.3000, 146.0000,  30.0000],
        [ 78.0000, 128.0000,  37.2000, 102.0000,  12.0000]],
       dtype=torch.float64)
Weights matrix:
 tensor([[-0.1000,  0.5000,  0.3000],
        [ 0.9000,  0.3000,  0.5000],
        [-1.5000,  0.4000,  0.1000],
        [ 0.1000,  0.1000, -1.0000],
        [-1.2000,  0.5000, -0.8000]], dtype=torch.float64)
#################
tensor([[ 16.2500, 113.5700, -44.6700],
        [ 47.2000, 114.3000, -27.0000],
        [  6.1500, 111.9000, -72.9500],
        [ 41.8000, 128.2000, -50.0000],
        [ 31.5500, 126.5200, -74.9700],
        [ 47.4000, 108.4800, -20.4800]], dtype=torch.float64)


- neuron 0: diabetic
- neuron 1: non-diabetic
- neuron 2: pre-diabetic

Neurons are assigned to the output labels $y$ in alphabetical order.

---

**Version-1** view of the matrix-matrix product $\mathbf{Z} = \mathbf{XW}$:

*What a particular neuron understands about a particular patient.*

![matrix-matrix product version-1](https://1drv.ms/i/c/37720f927b6ddc34/IQQdAOCwtndURKA-h4yvpTqlAYjBjlcweRSeMYkPvf7dwmQ?width=660)

$$\begin{align*}[\mathbf{Z}]_{i,j} &= (i,j)\text{-th element of }\mathbf{Z}\\&=\text{what the }j\text{th neuron learns about the } i\text{th patient}\\&=\mathbf{x}^{(i)}\cdot\mathbf{w}_j\\& = {\mathbf{x}^{(i)}}^\mathrm{T}\mathbf{w}_j\\\Rightarrow \underbrace{[\mathbf{Z}]_{{\color{yellow}0},{\color{cyan}2}}}_{{\color{yellow}0}\text{th patient},\,{\color{cyan}2}\text{nd neuron}} &= \mathbf{x}^{({\color{yellow}0})}\cdot\mathbf{w}_{{\color{cyan}2}}\\ &= \begin{bmatrix}72\\120\\37.3\\104\\32.5\end{bmatrix}\cdot\begin{bmatrix}0.3\\0.5\\0.1\\-1.0\\-0.8\end{bmatrix}\\ &= -44.67.\end{align*}$$

---

In [None]:
## The (0, 2)-th element of the matrix-matrix product XW
# Dot product of row 0 of X (patient 0) with column 2 of W (neuron 2)
torch.dot(X[0,:], W[:,2])

tensor(-44.6700, dtype=torch.float64)

---

**Version-2** view of the matrix-matrix product $\mathbf{Z} = \mathbf{XW}$:

*What a particular neuron understands about all the patients.*

![matrix-matrix product version-2](https://1drv.ms/i/c/37720f927b6ddc34/IQRm1-w-6TG0R4C4J4BizyzyAWIbcHzbEjgmx-0JFREdHsE?width=660)

$$\begin{align*}\mathbf{z}_j &= \mathbf{X}\mathbf{w}_j\\&=\text{what the } j\text{th neuron learns about the all the patients}\\&=w_{j,0}\times\textbf{HR}+w_{j,1}\times\textbf{BP}+w_{j,2}\times\textbf{Temp}+w_{j,3}\times\textbf{Sugar}+w_{j,4}\times\textbf{Vitamin D}\\&= w_{j,0}\mathbf{x}_0+w_{j,1}\mathbf{x}_1+w_{j,2}\mathbf{x}_2+w_{j,3}\mathbf{x}_3+w_{j,4}\mathbf{x}_4\\\Rightarrow\underbrace{\mathbf{z}_{{\color{cyan}0}}}_{{\color{cyan}0}\text{th neuron understanding}} &= \underbrace{\mathbf{X}}_{\color{yellow}{\text{all patients}}}\ \underbrace{\mathbf{w}_{{\color{cyan}0}}}_{{\color{cyan}0}\text{th neuron weights}}\\&= {\color{cyan}{-0.1}}\times\begin{bmatrix}{\color{yellow}{72}}\\{\color{yellow}{85}}\\{\color{yellow}{68}}\\{\color{yellow}{90}}\\{\color{yellow}{84}}\\{\color{yellow}{78}}\end{bmatrix}+{\color{cyan}{0.9}}\times\begin{bmatrix}{\color{yellow}{120}}\\{\color{yellow}{130}}\\{\color{yellow}{110}}\\{\color{yellow}{140}}\\{\color{yellow}{132}}\\{\color{yellow}{128}}\end{bmatrix}+({\color{cyan}{-1.5}})\times\begin{bmatrix}{\color{yellow}{37.3}}\\{\color{yellow}{37.0}}\\{\color{yellow}{38.5}}\\{\color{yellow}{38.0}}\\{\color{yellow}{38.3}}\\{\color{yellow}{37.2}}\end{bmatrix}+{\color{cyan}{0.1}}\times\begin{bmatrix}{\color{yellow}{104}}\\{\color{yellow}{110}}\\{\color{yellow}{125}}\\{\color{yellow}{130}}\\{\color{yellow}{146}}\\{\color{yellow}{102}}\end{bmatrix}+({\color{cyan}{-1.2}})\times\begin{bmatrix}{\color{yellow}{32.5}}\\{\color{yellow}{14}}\\{\color{yellow}{34}}\\{\color{yellow}{26}}\\{\color{yellow}{30}}\\{\color{yellow}{12}}\end{bmatrix}\\&=\begin{bmatrix}16.25\\47.20\\6.15\\41.80\\31.55\\47.40\end{bmatrix}.\end{align*}$$



---

In [None]:
## The 0-th column of the matrix-matrix product XW
# All of X times column 0 of W (neuron 0's weights): one score per patient
torch.matmul(X, W[:,0])

tensor([16.2500, 47.2000,  6.1500, 41.8000, 31.5500, 47.4000],
       dtype=torch.float64)

---

**Version-3** view of the matrix-matrix product $\mathbf{Z} = \mathbf{XW}$:

*What all neurons understand about a particular patient.*

![matrix-matrix product version-3](https://1drv.ms/i/c/37720f927b6ddc34/IQRfO-qEJQ9mQYLH_f-lyjeQAaWV4FrDjTjaEHJpPB1PmCg?width=660)

$$\begin{align*}{\mathbf{z}^{(i)}}^\mathrm{T}&={\mathbf{x}^{(i)}}^\mathrm{T}\mathbf{W}\\&= \text{what is learned about the }i\text{th patient by all the neurons}\\&=i\text{th HR }\times{\mathbf{w}^{(0)}}^\mathrm{T}+i\text{th BP }\times{\mathbf{w}^{(1)}}^\mathrm{T}+i\text{th Temp }\times{\mathbf{w}^{(2)}}^\mathrm{T}+i\text{th Sugar }\times{\mathbf{w}^{(3)}}^\mathrm{T}+i\text{th Vitamin D }\times{\mathbf{w}^{(4)}}^\mathrm{T}\\&=x^{(i)}_0\times{\mathbf{w}^{(0)}}^\mathrm{T}+x^{(i)}_1\times{\mathbf{w}^{(1)}}^\mathrm{T}+x^{(i)}_2\times{\mathbf{w}^{(2)}}^\mathrm{T}+x^{(i)}_3\times{\mathbf{w}^{(3)}}^\mathrm{T}+x^{(i)}_4\times{\mathbf{w}^{(4)}}^\mathrm{T}\\\underbrace{\Rightarrow{{\mathbf{z}^{({\color{yellow}0})}}^\mathrm{T}}}_{{\color{yellow}{0}}\text{th patient understanding}}&=\underbrace{{{\mathbf{x}^{({\color{yellow}0})}}^\mathrm{T}}}_{{\color{yellow}{0}}\text{th patient}}\ \underbrace{\mathbf{W}}_{{\color{cyan}{\text{all neurons}}}}\\ &= {\color{yellow}{72}}\times\begin{bmatrix}{\color{cyan}{-0.1}} & {\color{cyan}{0.5}} & {\color{cyan}{0.3}}\end{bmatrix} \\&+ {\color{yellow}{120}}\times\begin{bmatrix}{\color{cyan}{0.9}} & {\color{cyan}{0.3}} & {\color{cyan}{0.5}}\end{bmatrix}\\&+{\color{yellow}{37.3}}\times\begin{bmatrix}{\color{cyan}{-1.5}} & {\color{cyan}{0.4}} & {\color{cyan}{0.1}}\end{bmatrix}\\&+{\color{yellow}{104}}\times\begin{bmatrix}{\color{cyan}{0.1}} & {\color{cyan}{0.1}} & {\color{cyan}{-1.0}}\end{bmatrix}\\&+{\color{yellow}{32.5}}\times\begin{bmatrix}{\color{cyan}{-1.2}} & {\color{cyan}{0.5}} & {\color{cyan}{-0.8}}\end{bmatrix}\\&=\begin{bmatrix}16.25 & 113.57 & -44.67\end{bmatrix}.\end{align*}$$


---

In [None]:
## The 0-th row of the matrix-matrix product XW
# Row 0 of X (patient 0) times all of W: one score per neuron
torch.matmul(X[0],W)

tensor([ 16.2500, 113.5700, -44.6700], dtype=torch.float64)

---

The softmax function: takes a $k$-vector $\mathbf{z}$ as input and returns a vector $\mathbf{a}$ of the same shape as the output which is referred to as the softmax-activated scores.

$$\begin{align*}\mathbf{a}&=\text{softmax}(\mathbf{z})=\begin{bmatrix}\dfrac{e^{z_1}}{e^{z_1}+e^{z_2}+\cdots+e^{z_k}}\\\dfrac{e^{z_2}}{e^{z_1}+e^{z_2}+\cdots+e^{z_k}}\\\vdots\\\dfrac{e^{z_k}}{e^{z_1}+e^{z_2}+\cdots+e^{z_k}}\end{bmatrix}.\end{align*}$$

In the following example, we consider a raw scores vector $\mathbf{z}$ with 3 components, which leads to the softmax-activated scores vector $\mathbf{a}$. Its components can be interpreted as the predicted probabilities that the sample belongs to each one of the output classes:

![softmax](https://1drv.ms/i/s!AjTcbXuSD3I3hscmdol7J2G4GDo5WQ?embed=1&width=660)


---

In [None]:
# Softmax taken along dim 0, the only axis of a rank-1 tensor
softmax = torch.nn.Softmax(dim=0)

# Raw scores vector with 3 components
z = torch.tensor([5.6, 6.4, -4.5], dtype=torch.float64)
print(z)

# Softmax-activated scores: non-negative entries that add up to 1
a = softmax(z)
print(a)

tensor([ 5.6000,  6.4000, -4.5000], dtype=torch.float64)
tensor([3.1002e-01, 6.8997e-01, 1.2736e-05], dtype=torch.float64)


---

Calculating the raw scores followed by the softmax-activated scores for the patient data matrix.

---

In [None]:
# Raw scores matrix (matrix-matrix multiplication)
Z = torch.matmul(X,W)
print(f'Raw scores matrix:\n {Z}')

# Calculate the softmax scores.
# dim = 1 normalizes across the columns (neurons), so each row — one patient's
# scores — becomes a set of values that add up to 1.
softmax = torch.nn.Softmax(dim = 1)
A = softmax(Z)
print(A)

Raw zcores matrix:
 tensor([[ 16.2500, 113.5700, -44.6700],
        [ 47.2000, 114.3000, -27.0000],
        [  6.1500, 111.9000, -72.9500],
        [ 41.8000, 128.2000, -50.0000],
        [ 31.5500, 126.5200, -74.9700],
        [ 47.4000, 108.4800, -20.4800]], dtype=torch.float64)
tensor([[5.4258e-43, 1.0000e+00, 1.8934e-69],
        [7.2250e-30, 1.0000e+00, 4.3071e-62],
        [1.1840e-46, 1.0000e+00, 5.2561e-81],
        [2.9989e-38, 1.0000e+00, 4.0618e-78],
        [5.6892e-42, 1.0000e+00, 3.1189e-88],
        [2.9737e-27, 1.0000e+00, 9.8488e-57]], dtype=torch.float64)


---

Standardization of data to get rid of the effects of units.

The standard deviation of a vector is a measure of how much the components or elements of that vector typically deviate from their average value. For an $n$-vector $\mathbf{x},$ the standard deviation is denoted and calculated as
$$\mathbf{x} = \begin{bmatrix}x_1\\x_2\\\vdots\\x_n\end{bmatrix}\Rightarrow\text{std}(\mathbf{x}) = \sqrt{\frac{\left[x_1-\text{avg}(\mathbf{x})\right]^2+\cdots+\left[x_n-\text{avg}(\mathbf{x})\right]^2}{n}}.$$ The quantity inside the square root above is the average squared deviation which is also called the variance denoted as $$\text{var}(\mathbf{x}) = \frac{\left[x_1-\text{avg}(\mathbf{x})\right]^2+\cdots+\left[x_n-\text{avg}(\mathbf{x})\right]^2}{n}.$$

This means $\text{std}(\mathbf{x}) = \sqrt{\text{var}(\mathbf{x})}.$

A large standard deviation indicates that the components of the vector typically deviate a lot from their average value or mean.

The following component plot of a vector of heart rate values has the 1-standard deviation-above and below the mean represented as red-dotted lines:

![standard deviation](https://1drv.ms/i/c/37720f927b6ddc34/IQQB_uF-TUO8SpoodLWz7sQPAc4POmYfY3hPjlX3vpYfKlY?width=540)

---

In [None]:
# Heart rate vector
a = X[:, 0]
print(f'Heart rate vector:\n {a}')

# BP vector
b = X[:, 1]
#print(f'Blood pressure vector:\n {b}')

# Average heart rate
print(f'Average heart rate: {torch.mean(a)}')

# Average BP
#print(torch.mean(b))

# Mean-centered heart rate vector or the de-meaned heart rate vector or the
# deviations in heart rate vectors
a_mc = a-torch.mean(a)
print(f'Deviations in heart rate vector:\n {a_mc}')

# The average of the components of the mean-centered heart rate vector is zero
print(torch.mean(a_mc))

# The squared deviations vector
print(f'Squared-deviations in heart rate vector:\n {a_mc**2}')

# The average of the squared deviations vector a.k.a. the variance in
# the heart rate
v = torch.mean(a_mc**2)
print(f'Average squared deviation or variance in the heart rate: {v}')

# Square-root of the average of the squared deviations vector
# which is the same as the square root of the variance a.k.a. the
# standard deviation in the heart rate
s = torch.sqrt(v)
print(f'Standard deviation of the heart rate: {s}')

# Standardized heart rate vector a.k.a. the z-scores of the heart rate is
# obtained by subtracting the mean heart rate and dividing by the
# deviation of the heart rates
z = a_mc/s #same as doing (a-torch.mean(a))/torch.std(a)
print(f'Standardized heart rate vector:\n{z}')

Heart rate vector:
 tensor([72., 85., 68., 90., 84., 78.], dtype=torch.float64)
Average heart rate: 79.5
Deviations in heart rate vector:
 tensor([ -7.5000,   5.5000, -11.5000,  10.5000,   4.5000,  -1.5000],
       dtype=torch.float64)
tensor(0., dtype=torch.float64)
Squared-deviations in heart rate vector:
 tensor([ 56.2500,  30.2500, 132.2500, 110.2500,  20.2500,   2.2500],
       dtype=torch.float64)
Average squared deviation or variance in the heart rate: 58.583333333333336
Standard deviation of the heart rate: 7.65397500213669
Standardized heart rate vector:
tensor([-0.9799,  0.7186, -1.5025,  1.3718,  0.5879, -0.1960],
       dtype=torch.float64)


---

Suppose heart rate is measured in beats per hour instead of beats per minute. What do the z-scores look like now?

---

In [None]:
# Same standardization as before, with heart rate now expressed in beats per
# hour (60 x beats per minute)
a = 60 * X[:, 0]
print(f'Heart rate vector:\n {a}')

# Average heart rate
hr_mean = a.mean()
print(f'Average heart rate: {hr_mean}')

# Deviations from the average (the mean-centered / de-meaned heart rate vector);
# their own average is zero
a_mc = a - a.mean()
print(f'Deviations in heart rate vector:\n {a_mc}')

# Squared deviations
sq_dev = a_mc.pow(2)
print(f'Squared-deviations in heart rate vector:\n {sq_dev}')

# Variance: the average of the squared deviations
v = a_mc.pow(2).mean()
print(f'Average squared deviation or variance in the heart rate: {v}')

# Standard deviation: the square root of the variance
s = v.sqrt()
print(f'Standard deviation of the heart rate: {s}')

# z-scores: deviations measured in units of the standard deviation.
# The factor of 60 scales a_mc and s alike, so it cancels and the z-scores
# match the ones obtained with beats per minute.
z = a_mc / s
print(f'Standardized heart rate vector:\n{z}')

Heart rate vector:
 tensor([4320., 5100., 4080., 5400., 5040., 4680.], dtype=torch.float64)
Average heart rate: 4770.0
Deviations in heart rate vector:
 tensor([-450.,  330., -690.,  630.,  270.,  -90.], dtype=torch.float64)
Squared-deviations in heart rate vector:
 tensor([202500., 108900., 476100., 396900.,  72900.,   8100.],
       dtype=torch.float64)
Average squared deviation or variance in the heart rate: 210900.0
Standard deviation of the heart rate: 459.23850012820134
Standardized heart rate vector:
tensor([-0.9799,  0.7186, -1.5025,  1.3718,  0.5879, -0.1960],
       dtype=torch.float64)
