$$\Large\boxed{\text{AME 5202 Deep Learning, Even Semester 2026}}$$

$$\large\text{Theme}: \underline{\text{computational foundations of the softmax classifier}}$$

---

Load essential libraries

---

In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import seaborn as sns

---

Mount Google Drive folder if running Google Colab

---

In [None]:
## Choose the data directory; when running in Colab, mount Google Drive first
if 'google.colab' not in sys.modules:
    # Not in Colab: use a path relative to the current working directory
    DATA_DIR = 'Data/'
else:
    # In Colab: mount Drive (forcing a remount) and point at the course folder
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2026MAHE'
    DATA_DIR = f'{DIR}/Data/'

---

**The patient data matrix with output labels and initial weight matrix**

![patient dataset](https://1drv.ms/i/s!AjTcbXuSD3I3hspfrgklysOtJMOjaA?embed=1&width=800)

---



In [None]:
## Patient data matrix (6 rows x 5 columns) stored as a constant,
## i.e. non-trainable, double-precision tensor
X = torch.tensor(
    [
        [72, 120, 37.3, 104, 32.5],
        [85, 130, 37.0, 110, 14],
        [68, 110, 38.5, 125, 34],
        [90, 140, 38.0, 130, 26],
        [84, 132, 38.3, 146, 30],
        [78, 128, 37.2, 102, 12],
    ],
    dtype = torch.float64,
)
print(X)

# True output label of each sample, listed in the same order as the rows of X
y = np.array([
    'non-diabetic',
    'diabetic',
    'non-diabetic',
    'pre-diabetic',
    'diabetic',
    'pre-diabetic',
])
print(y)

# Initial weights matrix (5 rows x 3 columns); gradient tracking is switched
# on so that the weights can be trained later
W = torch.tensor(
    [
        [-0.1, 0.5, 0.3],
        [0.9, 0.3, 0.5],
        [-1.5, 0.4, 0.1],
        [0.1, 0.1, -1.0],
        [-1.2, 0.5, -0.8],
    ],
    dtype = torch.float64,
).requires_grad_(True)
print(W)

---

One-hot encoding of the true output labels

---

In [None]:
# String labels cannot be placed in a PyTorch tensor directly; for example,
# the following does not work:
#   y = torch.tensor(['non-diabetic', 'diabetic'])
# The labels are therefore one-hot encoded with scikit-learn first, and the
# resulting numeric matrix is converted to a tensor.

# The 1D labels array is reshaped into a single column before encoding
labels_2d = y.reshape(-1, 1)

# Fit the encoder on the labels, then encode them as a dense (non-sparse) matrix
ohe = OneHotEncoder(sparse_output = False).fit(labels_2d)
Y = torch.tensor(ohe.transform(labels_2d), dtype = torch.float64)
print(Y)

---

The loss for each sample can be quantified using the categorical crossentropy (CCE) loss function, which is defined as $$\color{yellow}{-\log(\text{predicted probability that the sample belongs to its correct class})}$$

For example, consider a sample with

- true label = [$\color{yellow}{1}$ 0 0]
- predicted probabilities (the softmax-activated scores) = [$\color{yellow}{0.05}$, 0.99, 0.05]

$\Rightarrow$ categorical crossentropy loss = $-\log(\color{yellow}{0.05}).$

---

In [None]:
# Worked example: categorical crossentropy (CCE) loss for a single sample.
# Example-specific names are used on purpose: binding these tensors to `y`
# would overwrite the labels array `y` that the one-hot encoding cell reads,
# so re-running that cell afterwards would encode the wrong data.
y_onehot_example = torch.tensor([1.0, 0.0, 0.0])  # one-hot true label
# NOTE: illustrative values only; they sum to 1.09, whereas real softmax
# outputs sum to 1. The loss below depends only on the first entry.
a_example = torch.tensor([0.05, 0.99, 0.05])      # predicted probabilities

# Multiplying by the one-hot label and summing picks out the probability
# predicted for the correct class; the loss is its negative logarithm.
cce_example = -torch.log(torch.sum(y_onehot_example*a_example))
cce_example

---

The forward propagation without and with standardization of the data

---

In [None]:
# Forward propagation on the raw (unstandardized) data matrix

# Raw scores: data matrix times weights matrix
Z = X @ W
print(f'The raw scores matrix:\n{Z}')

# Softmax applied along dim 1, i.e. across the scores of each sample
softmax = torch.nn.Softmax(dim = 1)
A = softmax(Z)
print(f'The softmax-activated scores matrix:\n{A}')
print(f'One-hot encoded true output labels:\n{Y}')

print()

# Unhappiness w.r.t. the current weights: the one-hot mask selects the
# probability predicted for each sample's true class, and the negative log
# of that probability is averaged over all the samples
prob_true_class = (Y * A).sum(dim = 1)
avg_loss = (-torch.log(prob_true_class)).mean()
print(f'Average training loss = {avg_loss}')


In [None]:
# Forward propagation on the standardized data matrix

# Standardize the data with scikit-learn's StandardScaler and convert the
# result back to a double-precision tensor
sc = StandardScaler() # create a standard scaler object
X_std = torch.tensor(sc.fit_transform(X), dtype = torch.float64)
print(f'The standardized data matrix:\n{X_std}')

# Calculate the raw scores using the standardized data matrix
# and the weights matrix
Z = torch.matmul(X_std, W)
print(f'The raw scores matrix:\n{Z}')

# Calculate the softmax-activated scores matrix (softmax along dim 1,
# i.e. across the scores of each sample)
softmax = torch.nn.Softmax(dim = 1)
A = softmax(Z)
print(f'The softmax-activated scores matrix:\n{A}')
# Label text corrected: the original printed the misspelling "matrx"
print(f'One-hot encoded true output labels matrix:\n{Y}')

# Quantify the unhappiness w.r.t. the current set of weights a.k.a. the
# training loss: the one-hot mask Y selects the probability predicted for
# each sample's true class; its negative log is averaged over the samples
print(f'Average training loss = {torch.mean(-torch.log(torch.sum(Y*A, dim = 1)))}')

---

Using PyTorch, calculate the optimal weights for the patient data matrix

---

In [None]:
# Train the softmax classifier's weights on the standardized patient data.
# Everything the optimization needs is (re)defined here so that this cell
# always starts from the same data, labels and initial weights.

# Patients data matrix (6 samples x 5 features)
X = torch.tensor([[72, 120, 37.3, 104, 32.5],
                 [85, 130, 37.0, 110, 14],
                 [68, 110, 38.5, 125, 34],
                 [90, 140, 38.0, 130, 26],
                 [84, 132, 38.3, 146, 30],
                 [78, 128, 37.2, 102, 12]], dtype = torch.float64)
#print(f'Patient data matrix X:\n {X}') #f-string in Python

# Initial Weights matrix (trainable tensor)
# NOTE(review): W[0, 0] is 0.9 here but -0.1 in the earlier cell that first
# creates W -- confirm which initial value is intended.
W = torch.tensor([[0.9, 0.5, 0.3],
                  [0.9, 0.3, 0.5],
                  [-1.5, 0.4, 0.1],
                  [0.1, 0.1, -1.0],
                  [-1.2, 0.5, -0.8]], dtype = torch.float64,
                 requires_grad = True)
#print(f'Weights matrix:\n {W}')

# Create a 1D-numpy array of output labels (equivalent to a rank-1 tensor in
# PyTorch which itself is equivalent to a vector in pen & paper)
y = np.array(['non-diabetic',
              'diabetic',
              'non-diabetic',
              'pre-diabetic',
              'diabetic',
              'pre-diabetic'])
# Creating a one-hot encoder object
ohe = OneHotEncoder(sparse_output = False)
# Create the one-hot encoded true output labels matrix
Y = torch.tensor(ohe.fit_transform(y.reshape(-1, 1)), dtype = torch.float64)
#print(f'One-hot encoded output labels matrix:\n {Y}')

# Standardize the data
sc = StandardScaler() # create a standard scaler object
X_std = torch.tensor(sc.fit_transform(X), dtype = torch.float64)
#print(f'The standardized data matrix:\n{X_std}')

# Define optimizer (Adam updates W, the only trainable tensor)
optimizer = torch.optim.Adam([W], lr = 1e-02)

# Loss function
def loss_fn(W):
  """Return the average categorical crossentropy loss for weights W.

  Reads the standardized data matrix X_std and the one-hot labels matrix Y
  from the enclosing notebook scope.

  Parameters
  ----------
  W : torch.Tensor
      Weights matrix that is multiplied with X_std to give the raw scores.

  Returns
  -------
  torch.Tensor
      Scalar tensor: the mean over samples of
      -log(predicted probability of the sample's true class).
  """
  # Raw scores
  Z = torch.matmul(X_std, W)

  # Log of the softmax-activated scores, computed in one numerically stable
  # step instead of taking log(softmax(Z)), which can underflow to log(0)
  log_A = torch.log_softmax(Z, dim = 1)

  # The one-hot mask Y keeps only the log-probability of each sample's true
  # class; negate and average over the samples
  L = torch.mean(-torch.sum(Y*log_A, dim = 1))
  return L

# Optimization loop
num_epochs = 1000
print_every = 100  # report progress every `print_every` epochs
loss_train = np.empty(num_epochs)
for epoch in range(num_epochs):
  # Zero out the gradients
  optimizer.zero_grad()

  # Forward propagation (loss calculation)
  loss = loss_fn(W)

  # Backward propagation and optimization
  loss.backward()
  optimizer.step()

  # Record the loss every epoch (used for the loss curve), but print it only
  # periodically and on the final epoch to avoid flooding the cell output
  loss_train[epoch] = loss.item()
  if epoch % print_every == 0 or epoch == num_epochs - 1:
    print(f'Epoch {epoch}, loss = {loss.item()}')

---

Plot training loss curve

---

In [None]:
## Training loss curve: recorded loss value against epoch index
fig, ax = plt.subplots(figsize = (4, 4))
fig.tight_layout(pad = 4.0)

# Blue line through the per-epoch loss values
ax.plot(loss_train, color = 'b')

# Title and axis labels
ax.set_title('Training Loss vs. Epoch', fontsize = 14)
ax.set_ylabel('Loss value', fontsize = 12)
ax.set_xlabel('Epoch', fontsize = 12);

In [None]:
# Print the weights matrix W as it stands after the optimization loop
# (W is the tensor that was handed to the Adam optimizer, whose step() calls
# update it during training)
print(W)

---

How good are the weights when applied to the training data?

---

In [None]:
# Forward pass on the standardized training data with the current weights;
# gradient tracking is switched off because nothing is being trained here
with torch.no_grad():
  # Raw scores
  Z = X_std @ W

  # Softmax-activated scores (softmax along dim 1)
  softmax = torch.nn.Softmax(dim = 1)
  A = softmax(Z)

# Predicted label of each sample = index of its largest softmax score
labels_pred = torch.argmax(A, dim = 1)
print(f'Predicted labels for training data: {labels_pred}')

# True label of each sample = index of the 1 in its one-hot row
labels_true = torch.argmax(Y, dim = 1)
print(f'True output labels: {labels_true}')