In [9]:
# by using this below code, we dont need to download the kaggle dataset , we can directly use from kaggle
!pip install opendatasets --quiet
import opendatasets as od
od.download("https://www.kaggle.com/datasets/zalando-research/fashionmnist", quiet=True)

Skipping, found downloaded files in "./fashionmnist" (use force=True to force download)


# This will be our architectural flow of ANN -

input layer (784) ->  hidden layer-1 ( 128 neurons) ->  relu activation function -> hidden layer-2 ( 64 neurons) ->  output layer (10 neurons)

In [2]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torch.optim as optim

In [3]:
# every run will give (almost) the same results
torch.manual_seed(42) # 42 random numbers

# check for GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [4]:
df = pd.read_csv("/content/fashionmnist/fashion-mnist_train.csv")
df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# train and test split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:],df.iloc[:,0],test_size=0.2,random_state=42)

In [8]:
# scaling the training datset (to keep all the values in the range of 0 and 1)
X_train = X_train/255.0
X_test = X_test/255.0

pixel1      float64
pixel2      float64
pixel3      float64
pixel4      float64
pixel5      float64
             ...   
pixel780    float64
pixel781    float64
pixel782    float64
pixel783    float64
pixel784    float64
Length: 784, dtype: object


In [None]:
class CustomDataset(Dataset):
  def __init__(self,input_data,output_data):
    self.input_data = torch.tensor(input_data.to_numpy(),dtype=torch.float32) # here we change the input_data dtype to numpy array and then changed the dtype to PyTorch Tensors
    # as directly from X_train or X_test we cant convert it to tensors.
    self.output_data = torch.tensor(output_data.to_numpy(),dtype=torch.long)
  def __len__(self):
    return len(self.input_data)
  def __getitem__(self,index):
    return self.input_data[index],self.output_data[index]

# we now need to create object of this class and it will automatically call the constructor that is __init__ method
train_dataset = CustomDataset(X_train,y_train)
test_dataset = CustomDataset(X_test,y_test)

# now we need to call the DataLoader class --> which will create mini batches (in our case we took 32 batches )-> like this --> DataLoader(Dataset)
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=32,shuffle=False) # because when we predict output , we dont want to shuffle the data

In [None]:
# now we will define our model
class MyNN(nn.Module):
  def __init__(self, input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate):
    super().__init__() # this means we are calling the nn.Module class constructor here
    layers = []
    for i in range(num_hidden_layers):
      layers.append(nn.Linear(input_dim, neurons_per_layer))
      layers.append(nn.BatchNorm1d(neurons_per_layer))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(dropout_rate))
      input_dim = neurons_per_layer
    layers.append(nn.Linear(neurons_per_layer, output_dim))
    self.model = nn.Sequential(*layers)

  def forward(self, x):
    return self.model(x)

In [None]:
# objective function
def objective(trial):

  # next hyperparameter values from the search space
  num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 5)
  neurons_per_layer = trial.suggest_int("neurons_per_layer", 8, 128, step=8)
  epochs = trial.suggest_int("epochs", 10, 50, step=10)
  learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
  dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
  batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
  optimizer_name = trial.suggest_categorical("optimizer", ['Adam', 'SGD', 'RMSprop'])
  weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)

  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

  # model init
  input_dim = 784
  output_dim = 10

  model = MyNN(input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate)
  model.to(device)

  # optimizer selection
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=1e-4)

  if optimizer_name == 'Adam':
    optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  elif optimizer_name == 'SGD':
    optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  else:
    optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

  # training loop

  for epoch in range(epochs):

    for batch_features, batch_labels in train_loader:

      # move data to gpu
      batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

      # forward pass
      outputs = model(batch_features)

      # calculate loss
      loss = criterion(outputs, batch_labels)

      # back pass
      optimizer.zero_grad()
      loss.backward()

      # update grads
      optimizer.step()


  # evaluation
  model.eval()
  # evaluation on test data
  total = 0
  correct = 0

  with torch.no_grad():

    for batch_features, batch_labels in test_loader:

      # move data to gpu
      batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

      outputs = model(batch_features)

      _, predicted = torch.max(outputs, 1)

      total = total + batch_labels.shape[0]

      correct = correct + (predicted == batch_labels).sum().item()

    accuracy = correct/total

  return accuracy

What does *layers do?

The * is Python argument unpacking.

layers is a list, but nn.Sequential expects separate arguments:

nn.Sequential(layer1, layer2, layer3, ...)


Using *layers converts the list into separate arguments:

nn.Sequential(*layers) == nn.Sequential(layers[0], layers[1], layers[2], ...)

In [None]:
# Optuna is hyperparameter optimization library where we try different hyperparamters and run multiple trials and find the combination that maximize the metric.
!pip install optuna
import optuna
study = optuna.create_study(direction="maximize")



[I 2025-10-21 19:43:54,433] A new study created in memory with name: no-name-144519b6-3113-4cdd-90fa-5f70dbbd37ad


In [None]:
study.optimize(objective, n_trials=10)

[I 2025-10-21 19:46:40,476] Trial 0 finished with value: 0.8823333333333333 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 120, 'epochs': 30, 'learning_rate': 0.04013010531887342, 'dropout_rate': 0.4, 'batch_size': 128, 'optimizer': 'SGD', 'weight_decay': 0.00022921858570072268}. Best is trial 0 with value: 0.8823333333333333.
[I 2025-10-21 19:49:59,338] Trial 1 finished with value: 0.88825 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 120, 'epochs': 30, 'learning_rate': 0.06781883468145464, 'dropout_rate': 0.2, 'batch_size': 64, 'optimizer': 'Adam', 'weight_decay': 6.592847783306235e-05}. Best is trial 1 with value: 0.88825.
[I 2025-10-21 19:52:47,110] Trial 2 finished with value: 0.87775 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 104, 'epochs': 20, 'learning_rate': 1.5717704440517354e-05, 'dropout_rate': 0.4, 'batch_size': 16, 'optimizer': 'SGD', 'weight_decay': 6.40583474767133e-05}. Best is trial 1 with value: 0.88825.
[I 2025-10-21

In [None]:
study.best_value

0.88825

In [None]:
study.best_params

{'num_hidden_layers': 5,
 'neurons_per_layer': 120,
 'epochs': 30,
 'learning_rate': 0.06781883468145464,
 'dropout_rate': 0.2,
 'batch_size': 64,
 'optimizer': 'Adam',
 'weight_decay': 6.592847783306235e-05}