$$\Large\boxed{\text{AME 5202 Deep Learning, Even Semester 2026}}$$

$$\large\text{Theme}: \underline{\text{computational foundations of the softmax classifier}}$$

---

Load essential libraries

---

In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns

---

Mount Google Drive folder if running Google Colab

---

In [2]:
## Pick the data folder: the mounted Google Drive when running in Colab,
## otherwise a 'Data/' folder relative to the current working directory
if 'google.colab' not in sys.modules:
    # Not running in Colab: use the local data folder
    DATA_DIR = 'Data/'
else:
    # Running in Colab: mount Google Drive and point DATA_DIR at the course folder
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2026MAHE'
    DATA_DIR = f'{DIR}/Data/'

---

**The patient data matrix with output labels and initial weight matrix**

![patient dataset](https://1drv.ms/i/s!AjTcbXuSD3I3hspfrgklysOtJMOjaA?embed=1&width=800)

---



In [3]:
## Patient data matrix: 6 rows x 5 columns, stored as a float64 tensor
X = torch.tensor(
    [
        [72, 120, 37.3, 104, 32.5],
        [85, 130, 37.0, 110, 14],
        [68, 110, 38.5, 125, 34],
        [90, 140, 38.0, 130, 26],
        [84, 132, 38.3, 146, 30],
        [78, 128, 37.2, 102, 12],
    ],
    dtype = torch.float64,
)
print(X)

# True output label for each row of X (same row order), kept as a NumPy
# array of strings
y = np.array([
    'non-diabetic',
    'diabetic',
    'non-diabetic',
    'pre-diabetic',
    'diabetic',
    'pre-diabetic',
])
print(y)

# Initial weights matrix: 5 rows x 3 columns, float64.
# requires_grad = True makes it a tensor that autograd tracks.
W = torch.tensor(
    [
        [-0.1, 0.5, 0.3],
        [0.9, 0.3, 0.5],
        [-1.5, 0.4, 0.1],
        [0.1, 0.1, -1.0],
        [-1.2, 0.5, -0.8],
    ],
    dtype = torch.float64,
    requires_grad = True,
)
print(W)

tensor([[ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000],
        [ 85.0000, 130.0000,  37.0000, 110.0000,  14.0000],
        [ 68.0000, 110.0000,  38.5000, 125.0000,  34.0000],
        [ 90.0000, 140.0000,  38.0000, 130.0000,  26.0000],
        [ 84.0000, 132.0000,  38.3000, 146.0000,  30.0000],
        [ 78.0000, 128.0000,  37.2000, 102.0000,  12.0000]],
       dtype=torch.float64)
['non-diabetic' 'diabetic' 'non-diabetic' 'pre-diabetic' 'diabetic'
 'pre-diabetic']
tensor([[-0.1000,  0.5000,  0.3000],
        [ 0.9000,  0.3000,  0.5000],
        [-1.5000,  0.4000,  0.1000],
        [ 0.1000,  0.1000, -1.0000],
        [-1.2000,  0.5000, -0.8000]], dtype=torch.float64, requires_grad=True)


---

One-hot encoding of the true output labels

---

In [6]:
# Labels are strings, and a string tensor cannot be created in PyTorch,
# so the following does not work:
#y = torch.tensor(['non-diabetic', 'diabetic'])

# The labels therefore live in the 1D NumPy array y (a 1D NumPy array is
# equivalent to a rank-1 tensor in PyTorch, i.e. a vector in pen & paper).
# y is reshaped into a single column before it is handed to the encoder,
# which returns one column per distinct label.
ohe = OneHotEncoder(sparse_output = False)
Y = ohe.fit_transform(np.reshape(y, (-1, 1)))
print(Y)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


---

Loss for each sample can be quantified using the categorical crossentropy (CCE) loss function which is defined as $$\color{yellow}{-\log(\text{predicted probability that the sample belongs to its correct class})}$$

For example, consider a sample with

- true label = [$\color{yellow}{1}$ 0 0]
- predicted probabilities = [$\color{yellow}{0.05}$, 0.99, 0.05]

$\Rightarrow$ categorical crossentropy loss = $-\log(\color{yellow}{0.05}).$

---

In [9]:
# Worked CCE example for a single sample.
# Dedicated names are used so that the notebook-level label array `y`
# (the class-name strings) is not overwritten by this toy example.
y_true = torch.tensor([1.0, 0.0, 0.0])     # one-hot true label
a_pred = torch.tensor([0.05, 0.99, 0.05])  # predicted scores (illustrative; they sum to 1.09, not 1)

# Multiplying by the one-hot label zeroes every entry except the one for the
# true class, so the sum picks out that single predicted probability (0.05).
cce_loss = -torch.log(torch.sum(y_true * a_pred))
cce_loss

tensor(2.9957)

---

The forward propagation

---

In [11]:
# Show the raw data matrix first, for comparison with the standardized one
print(X)

# Standardize the data: create a standard scaler object, then fit it on X
# and transform X in a single call
sc = StandardScaler()
X_std = sc.fit_transform(X)
print(f'The standardized data matrix:\n{X_std}')

tensor([[ 72.0000, 120.0000,  37.3000, 104.0000,  32.5000],
        [ 85.0000, 130.0000,  37.0000, 110.0000,  14.0000],
        [ 68.0000, 110.0000,  38.5000, 125.0000,  34.0000],
        [ 90.0000, 140.0000,  38.0000, 130.0000,  26.0000],
        [ 84.0000, 132.0000,  38.3000, 146.0000,  30.0000],
        [ 78.0000, 128.0000,  37.2000, 102.0000,  12.0000]],
       dtype=torch.float64)
The standardized data matrix:
[[-0.979883   -0.70186241 -0.72380201 -0.98707429  0.89204786]
 [ 0.71858087  0.3509312  -1.24493946 -0.60498101 -1.2373567 ]
 [-1.50248727 -1.75465602  1.36074779  0.35025217  1.06470228]
 [ 1.3718362   1.40372481  0.49218537  0.66866323  0.14387869]
 [ 0.5879298   0.56148993  1.01332282  1.68757862  0.60429048]
 [-0.1959766   0.14037248 -0.8975145  -1.11443871 -1.4675626 ]]


In [None]:
# Exercise template: every `?` below is a placeholder that must be replaced
# with working code before this cell can run.

# Standardize the data
sc = StandardScaler() # create a standard scaler object
# X_std: the standardized version of the patient data matrix X, produced with `sc`
X_std = ?
print(f'The standardized data matrix:\n{X_std}')

# The one-hot encoded true output labels matrix
ohe = OneHotEncoder(sparse_output = False)
# Y: one-hot encoding of the class-label strings, produced with `ohe`
Y = ?
print(f'One-hot encoded true output labels matrx:\n{Y}')

# Calculate the raw scores using the standardized data matrix
# and the weights matrix
print(f'The weights matrix:\n{W}')
# NOTE(review): W is a float64 torch tensor, so X_std presumably has to be a
# float64 torch tensor as well before the two can be combined -- confirm
Z = ?
print(f'The raw scores matrix:\n{Z}')

# Calculate the softmax-activated scores matrix
# `softmax` is meant to hold the softmax operation and `A` the result of
# applying it to Z (presumably per sample, i.e. across the class scores of
# each row -- confirm)
softmax = ?
A = ?
print(f'The softmax-activated scores matrix:\n{A}')

# Quantify the unhappiness w.r.t. the current set of weights
# The `?` inside the f-string is the loss expression; the notebook defines the
# per-sample loss as -log(predicted probability of the correct class)
print(f'Loss = {?}')

---

Using PyTorch, calculate the optimal weights for the patient data matrix

---

In [None]:
# Patients data matrix: 6 rows x 5 columns, float64
X = torch.tensor(
    [
        [72, 120, 37.3, 104, 32.5],
        [85, 130, 37.0, 110, 14],
        [68, 110, 38.5, 125, 34],
        [90, 140, 38.0, 130, 26],
        [84, 132, 38.3, 146, 30],
        [78, 128, 37.2, 102, 12],
    ],
    dtype = torch.float64,
)
# Uncomment to inspect:
#print(f'Patient data matrix X:\n {X}') #f-string in Python

# Initial Weights matrix: 5 rows x 3 columns, float64, created with
# requires_grad = True so that it is a trainable tensor
W = torch.tensor(
    [
        [-0.1, 0.5, 0.3],
        [0.9, 0.3, 0.5],
        [-1.5, 0.4, 0.1],
        [0.1, 0.1, -1.0],
        [-1.2, 0.5, -0.8],
    ],
    dtype = torch.float64,
    requires_grad = True,
)
# Uncomment to inspect:
#print(f'Weights matrix:\n {W}')

# True output labels as a 1D NumPy array of strings, one per row of X
# (a 1D NumPy array is equivalent to a rank-1 tensor in PyTorch, i.e. a
# vector in pen & paper)
y = np.array([
    'non-diabetic',
    'diabetic',
    'non-diabetic',
    'pre-diabetic',
    'diabetic',
    'pre-diabetic',
])

# One-hot encode the labels (reshaped into a single column for the encoder)
# and store the result as a float64 tensor
ohe = OneHotEncoder(sparse_output = False)
Y = torch.tensor(
    ohe.fit_transform(y.reshape(-1, 1)),
    dtype = torch.float64,
)
# Uncomment to inspect:
#print(f'One-hot encoded output labels matrix:\n {Y}')

# Standardize the data with a standard scaler object and store the result
# as a float64 tensor
sc = StandardScaler()
X_std = torch.tensor(
    sc.fit_transform(X),
    dtype = torch.float64,
)
# Uncomment to inspect:
#print(f'The standardized data matrix:\n{X_std}')


# Exercise template: every `?` below is a placeholder that must be replaced
# with working code before this cell can run.

# Define optimizer
# The loop below calls optimizer.zero_grad(), so whatever is assigned here
# must expose that method (presumably a torch.optim optimizer built over
# the trainable tensor W -- confirm)
optimizer = ?

# Loss function
def loss_fn(W):
  """Return the average training loss L for the weights matrix W.

  The body is still an exercise template: Z, softmax, A and L are `?`
  placeholders. Presumably the finished body reads the standardized data
  and the one-hot labels from the notebook scope -- confirm when filling
  it in.
  """
  # Raw scores
  Z = ?

  # Softmax-activated scores
  softmax = ?
  A = ?

  # Calculate the average training loss
  L = ?
  return L

# Optimization loop
# loss_train is written as loss_train[epoch] for every epoch in
# range(num_epochs), so it must support item assignment at those indices
num_epochs = ?
loss_train = ?
for epoch in range(num_epochs):
  # Zero out the gradients
  optimizer.zero_grad()

  # Forward propagation (loss calculation)
  # The value assigned here must expose .item(), which is read below
  loss = ?

  # Backward propagation and optimization
  ?

  # Store this epoch's loss value, then print it
  loss_train[epoch] = loss.item()
  print(f'Epoch {epoch}, loss = {loss.item()}')

---

Plot loss curve

---

In [None]:
## Plot train loss as a function of epoch:
# loss_train holds one loss value per epoch (filled in by the optimization loop)
fig, ax = plt.subplots(1, 1, figsize = (4, 4))
fig.tight_layout(pad = 4.0)
# Give the curve a label: ax.legend() only shows artists that have one, and
# without it the legend is empty and matplotlib emits a warning
ax.plot(loss_train, 'b', label = 'Training loss')
ax.set_xlabel('Epoch', fontsize = 12)
ax.set_ylabel('Loss value', fontsize = 12)
ax.legend()
# Trailing semicolon suppresses the text repr of the returned Text object
ax.set_title('Loss vs. Epoch', fontsize = 14);

In [None]:
# Print the weights matrix W as it stands after the optimization cell above
# has been run
print(W)