In [1]:
import torch #main frame work we used to perform high dimension tensor calculation
import torch.nn as nn #this class we used to define Custom Neural Network Architecture
from torch.utils.data import Dataset,DataLoader #this Dataset class we used split the entire data into chunks based on Batch_sizes
#where as DataLoader class we used to load data and perform action over it.

from sklearn.model_selection import train_test_split #this class we used to split the data into train or test sets
from sklearn.preprocessing import StandardScaler,MinMaxScaler 
#above this class used to do input scaling so that variance of all field would be same for models

import pandas as pd
from cloudpickle import pickle #this class we used to convert datastructure into bytes stream
import seaborn as sb #to see the visualization.
import optuna

In [2]:
#if i want to show gpu is avilable or not in system.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Now Loading Dataset and doing EDA over it

In [3]:
data = pd.read_csv(filepath_or_buffer="insurance.csv",encoding="utf-8")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1338 non-null   int64  
 1   sex             1338 non-null   int64  
 2   bmi             1338 non-null   float64
 3   children        1338 non-null   int64  
 4   smoker          1338 non-null   int64  
 5   region          1338 non-null   int64  
 6   charges         1338 non-null   float64
 7   insuranceclaim  1338 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 83.8 KB


### step:1 checking for Null value

In [4]:
100 * data.isnull().sum()/data.shape[0] 

age               0.0
sex               0.0
bmi               0.0
children          0.0
smoker            0.0
region            0.0
charges           0.0
insuranceclaim    0.0
dtype: float64

### step:2 checking for duplicated record

In [5]:
data.duplicated().sum()

1

In [6]:
#to see which records showing duplicacy
data[data.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
581,19,1,30.59,0,0,1,1639.5631,1


In [7]:
#dropping those duplocate records from data.
data.drop_duplicates(keep="first",inplace=True,ignore_index=True)

In [8]:
#again checking duplicated record deleted or not.
data.duplicated().sum()

0

### step:3 checking for target variable

In [9]:
data.insuranceclaim.value_counts(normalize=True,ascending=False,dropna=False) #both the classes are balanced in data.

insuranceclaim
1    0.584892
0    0.415108
Name: proportion, dtype: float64

### step:4 splitting the data into training set or testing set

In [10]:
#selecting input and output variable
x = data.drop(labels="insuranceclaim",axis=1)
y = data["insuranceclaim"]

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y) 
#stratify ensure both class distribution goes same to both sets(train or test)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((1069, 7), (268, 7), (1069,), (268,))

In [12]:
#to show the dtype and dimension of train test split data
print(f"Training set dimension: {x_train.ndim} and Testing set dimension: {x_test.ndim}")
print(f"Training set dtype:   {x_train.dtypes} and Testing set dtype: {x_test.dtypes}")
print(f"Training set type:   {type(x_train)} and Testing set type: {type(x_test)}")

Training set dimension: 2 and Testing set dimension: 2
Training set dtype:   age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region        int64
charges     float64
dtype: object and Testing set dtype: age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region        int64
charges     float64
dtype: object
Training set type:   <class 'pandas.core.frame.DataFrame'> and Testing set type: <class 'pandas.core.frame.DataFrame'>


```
| Task Type                                | Input dtype     | Target dtype    | Loss Function          |
| ---------------------------------------- | --------------- | --------------- | ---------------------- |
| **Binary Classification (Sigmoid)**      | `torch.float32` | `torch.float32` | `nn.BCEWithLogitsLoss` |
| **Multi-class Classification (Softmax)** | `torch.float32` | `torch.long`    | `nn.CrossEntropyLoss`  |


```

In [13]:
x_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1290,19,1,34.9,0,1,3,34828.654
359,18,0,20.79,0,0,2,1607.5101
579,25,0,23.465,0,0,0,3206.49135
662,18,1,33.66,0,0,2,1136.3994
1167,32,1,35.2,2,0,3,4670.64


In [14]:
y_train.head()

1290    1
359     0
579     0
662     1
1167    0
Name: insuranceclaim, dtype: int64

### step:5 now doing input scaling converting into numpy object

In [15]:
#creating an object of standardscaler class.
ss = StandardScaler()

#now applying scaling on input data.
x_train_ss = ss.fit_transform(x_train.astype("float32")) #converting dataframe 2d object to 2d numpy object
x_test_ss  = ss.transform(x_test.astype("float32"))      #Transforming testing dataframe 2d object to 2d numpy object
print(f"Training set dimension: {x_train_ss.ndim}  and Testing set dimension: {x_test_ss.ndim}")
print(f"Training set dtype:     {x_train_ss.dtype} and Testing set dtype: {x_test_ss.dtype}")
print(f"Training set type:      {type(x_train_ss)} and Testing set type: {type(x_test_ss)}")

Training set dimension: 2  and Testing set dimension: 2
Training set dtype:     float32 and Testing set dtype: float32
Training set type:      <class 'numpy.ndarray'> and Testing set type: <class 'numpy.ndarray'>


In [16]:
#converting target variable set to numpy object.
import numpy as np
y_train_np = np.array(object=y_train,dtype=np.float32)
y_test_np = np.array(object=y_test,dtype=np.float32)
print(f"Training set dimension: {y_train_np.ndim} and Testing set dimension: {y_test_np.ndim}")
print(f"Training set type:   {type(y_train_np)} and Testing set type: {type(y_test_np)}")

Training set dimension: 1 and Testing set dimension: 1
Training set type:   <class 'numpy.ndarray'> and Testing set type: <class 'numpy.ndarray'>


### step:6 converting numpy object to tensor object

In [17]:
x_train_tensor = torch.from_numpy(x_train_ss)
x_test_tensor = torch.from_numpy(x_test_ss)
y_train_tensor = torch.from_numpy(y_train_np)
y_test_tensor = torch.from_numpy(y_test_np)
print(f"Training set tensor: {type(x_train_tensor)} and Testing set tensor: {type(x_test_tensor)}")

Training set tensor: <class 'torch.Tensor'> and Testing set tensor: <class 'torch.Tensor'>


In [18]:
x_train_tensor,y_train_tensor

(tensor([[-1.4381,  0.9879,  0.7013,  ...,  2.0106,  1.3720,  1.8369],
         [-1.5094, -1.0122, -1.6309,  ..., -0.4974,  0.4717, -0.9687],
         [-1.0101, -1.0122, -1.1888,  ..., -0.4974, -1.3291, -0.8337],
         ...,
         [-0.8675,  0.9879, -0.2491,  ...,  2.0106,  0.4717,  0.4365],
         [ 1.0582, -1.0122,  0.3343,  ..., -0.4974, -1.3291, -0.1819],
         [ 0.3450, -1.0122,  0.2781,  ..., -0.4974,  0.4717, -0.4598]]),
 tensor([1., 0., 0.,  ..., 1., 1., 1.]))

### step:7) using Custom Dataset class to split the dataset into chunks based on Batch_sizes

In [19]:
class CustomDataset(Dataset):
    #using constructor class to initalize the instance variable init
    def __init__(self,input,output):
        self.input  = input
        self.output = output
        
    #using another magical method to get shape of input data.
    def __len__(self):
        return len(self.input)
    
    #using another magical method to split data into chunks.
    def __getitem__(self, index):
        return self.input[index],self.output[index]

In [20]:
#creating an object of CustomDataset class.
train_dataset = CustomDataset(input=x_train_tensor,output=y_train_tensor)
test_dataset  = CustomDataset(input=x_test_tensor,output=y_test_tensor)

In [21]:
len(train_dataset),len(test_dataset)

(1069, 268)

### dataloader class we used to load the data based on batch sizes

### step:8 Optuna Hyperparameter at Backened smartly they will used bayessian search technique to find value

In [22]:
import torch.nn.init as init
#now defining deep neural network Architecture
class SimpleNeuralArchitecture(nn.Module):
    
    #using constructor method to define neural network architecture variable init.
    def __init__(self,input_dim,output_dim,num_hidden_layer,neurons_per_hidden_layer,dropout_rate):
        
        #inheriting parent class constructor.
        super().__init__()
    
        
        layers = [] #whole neural architecture layer we are storing Here
        
        for i in range(num_hidden_layer):
            # adding a fully connected (Linear) layer
            layers.append(nn.Linear(in_features=input_dim,out_features=neurons_per_hidden_layer))
            
            # BatchNorm
            layers.append(nn.BatchNorm1d(neurons_per_hidden_layer))
            
            # Activation function using Relu 
            layers.append(nn.ReLU())
            
            # Dropout
            layers.append(nn.Dropout(p=dropout_rate))
            
            # updating input dimension for the next layer
            input_dim = neurons_per_hidden_layer
        
        
        # Finally, adding the output layer
        layers.append(nn.Linear(in_features=input_dim, out_features=output_dim))
        
        
        # Combine all layers into a Sequential block
        self.model = nn.Sequential(*layers)
        
        #calling the weight initialization method.
        self._init_weights()
        
        
    # Weight initialization
    def _init_weights(self):
        for m in self.modules():
            #Yaani, self.modules() ek PyTorch built-in method hai 
            #jo tumhare nn.Module (yani neural network class) ke andar ke saare submodules return karta hai.
            if isinstance(m, nn.Linear):
                # ReLU ke liye He initialization
                init.kaiming_normal_(m.weight, nonlinearity='relu')
                init.zeros_(m.bias)
        
        
    def forward(self,input_data):
        x = self.model(input_data)
        # output ke liye sigmoid lagana hai agar binary classification ho
        x = torch.sigmoid(x)
        return x

In [23]:
#Define the objective function
def objective(trial):
    #Define the Search Space in ANN(like nos of hidden layer,nos of neurons per layer...so on)
    num_hidden_layer = trial.suggest_int("num_hidden_layer", 1, 6)
    neurons_per_hidden_layer = trial.suggest_int("neurons_per_hidden_layer", 16, 256, step=16)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    epochs = trial.suggest_int("epochs", 10, 60,step=10)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    
    #defining the model and its all parameter
    input_dim = x_train_tensor.shape[1]
    output_dim = 1
    
    
    #creating an object of SimpleNeuralArchitecture class
    model = SimpleNeuralArchitecture(input_dim=input_dim,output_dim=output_dim,
                                     num_hidden_layer=num_hidden_layer,
                                     neurons_per_hidden_layer=neurons_per_hidden_layer,
                                     dropout_rate = dropout_rate
                                     
                                     
                                     
                                     
                                     
                                     ).to(device=device)

    
    #now initializing the remaining model parameter like epoch optimizer ...so on
    
    #target are binary classifier so loss function binary cross entropy.
    loss_fxn = nn.BCELoss() #calculate loss value

    
    
    if optimizer_name == "Adam":
        #optimizers i am using Adam they will updates the trainable parameter effectively in each layer.
        optimizer = torch.optim.Adam(params=model.parameters(),lr=lr,weight_decay=weight_decay) 
    
    elif optimizer_name == "RMSprop":
        #optimizers i am using Adam they will updates the trainable parameter effectively in each layer.
        optimizer = torch.optim.RMSprop(params=model.parameters(),lr=lr,weight_decay=weight_decay) 
      
    
    elif optimizer_name == "SGD":
        #optimizers i am using Adam they will updates the trainable parameter effectively in each layer.
        optimizer = torch.optim.SGD(params=model.parameters(),lr=lr,weight_decay=weight_decay) 
    
    # EarlyStopping variables
    best_loss = np.inf         # abhi tak ka sabse best (minimum) test loss
    patience_counter = 0       # kitne epochs ho gaye bina improvement ke
    patience = 5               # agar 5 epoch tak test loss improve nahi hota to stop
    
    
    
    training_dataloader   = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,pin_memory=True)
    testing_dataloader = DataLoader(dataset=test_dataset,batch_size=batch_size,shuffle=True,pin_memory=True)
    
    
    #Now training and evaluating the performance of model
    training_loss_lst = []
    testing_loss_lst = []
    for i in range(epochs):
        #performing model training
        model.train()
        training_running_loss_count = 0
        
        #based on batchsize data record we are passing to neural network architecture.
        for feature,label in training_dataloader:
            feature = feature.to(device)
            label   = label.to(device)
            
            #whenever we start training the model first algorithm work is forward propogation
            pred_output = model.forward(feature)
            #print(f"actual output shape: {label.shape}")
            #print(f"predicted output shape: {pred_output.shape}")
            
            #calculating the loss value by using Loss fxns or evaluating the performance of model at time of training.
            loss_value = loss_fxn(pred_output.squeeze(1),label)
            
            #now updating the training_running_loss_count variable
            training_running_loss_count = training_running_loss_count+loss_value.item() 
            #this item method we used to get value from shape of tensor
            
            #before applying backpropogation to the loss value first we have to clear the gradient inside optimizers
            optimizer.zero_grad()
            
            #reducing the loss value by using backpropogation algorithm and calcualting gradient of loss wrt to trainable parameter
            loss_value.backward()
            
            #using optimizers updating the tainable parameters
            optimizer.step()
            
        avg_training_loss = (training_running_loss_count)/(len(training_dataloader))
        training_loss_lst.append(avg_training_loss)

        
        #performing model testing(on testing we dont apply any backpropogation,overfitting technique,early stopping)
        model.eval()
        testing_running_loss_count = 0
        
        with torch.no_grad(): #at time of testing or inferencing gradient tracking is off not applying overfitting technique too
            #based on batchsize data record we are passing to neural network architecture.
            for feature,label in testing_dataloader:
                feature = feature.to(device)
                label   = label.to(device)
                
                #whenever we start training the model first algorithm work is forward propogation
                pred_output = model.forward(feature)
                
                #calculating the loss value by using Loss fxns or evaluating the performance of model at time of training.
                loss_value = loss_fxn(pred_output.squeeze(1),label)
                
                #updating the count of loss in testing_running_loss_count
                testing_running_loss_count = testing_running_loss_count+loss_value.item()
                
            avg_testing_loss = (testing_running_loss_count)/(len(testing_dataloader))
            testing_loss_lst.append(avg_testing_loss)
                
        print(f"Epoch [{i+1}/{epochs}], "
            f"Train Loss: {avg_training_loss:.4f}, "
            f"Test Loss: {avg_testing_loss:.4f}, "
            )
        
        
        # ---------------- EARLY STOPPING + CHECKPOINT ----------------
        if avg_testing_loss < best_loss:
            # Agar test loss improve ho gaya hai
            best_loss = avg_testing_loss
            patience_counter = 0  # reset patience
            torch.save(model.state_dict(), "best_model.pth")  # ModelCheckpoint
            print(f"✅ Model improved. Saved at epoch {i+1} with Test Loss {best_loss:.4f}")
            
        else:
            # Agar test loss improve nahi hua
            patience_counter += 1
            print(f"⚠️ No improvement. Patience counter = {patience_counter}/{patience}")

            if patience_counter >= patience:
                print(f"⏹ Early stopping at epoch {i+1} (best loss = {best_loss:.4f})")
                break
            
    return best_loss  # <-- Very important for Optuna
    print("Training Finished ✅")   
        
        
        
        

# creating study object

In [24]:
study = optuna.create_study(direction='minimize')
study

[I 2025-10-10 23:43:32,832] A new study created in memory with name: no-name-7949942d-8ca4-48d5-a0a7-5841a12edc61


<optuna.study.study.Study at 0x1fe3afce770>

In [25]:
#now we will run the study object.
study.optimize(func=objective,n_trials=10)

Epoch [1/20], Train Loss: 0.7049, Test Loss: 0.5366, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.5366
Epoch [2/20], Train Loss: 0.5850, Test Loss: 0.4657, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.4657
Epoch [3/20], Train Loss: 0.5112, Test Loss: 0.4077, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.4077
Epoch [4/20], Train Loss: 0.4610, Test Loss: 0.3755, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.3755
Epoch [5/20], Train Loss: 0.4415, Test Loss: 0.3623, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.3623
Epoch [6/20], Train Loss: 0.4271, Test Loss: 0.3623, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.3623
Epoch [7/20], Train Loss: 0.4143, Test Loss: 0.3506, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.3506
Epoch [8/20], Train Loss: 0.3795, Test Loss: 0.3569, 
⚠️ No improvement. Patience counter = 1/5
Epoch [9/20], Train Loss: 0.3887, Test Loss: 0.3271, 
✅ Model improved. Saved at epoch 9 with Test Loss 0.3271
Epoch [10/20], T

[I 2025-10-10 23:43:44,095] Trial 0 finished with value: 0.3105880320072174 and parameters: {'num_hidden_layer': 4, 'neurons_per_hidden_layer': 16, 'lr': 0.0039235629605612376, 'weight_decay': 6.80998035096758e-05, 'dropout_rate': 0.2164409529939766, 'batch_size': 16, 'epochs': 20, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.3105880320072174.


Epoch [20/20], Train Loss: 0.3965, Test Loss: 0.3164, 
⚠️ No improvement. Patience counter = 1/5
Epoch [1/10], Train Loss: 0.8449, Test Loss: 0.7696, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.7696
Epoch [2/10], Train Loss: 0.8533, Test Loss: 0.7506, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.7506
Epoch [3/10], Train Loss: 0.8153, Test Loss: 0.7382, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.7382
Epoch [4/10], Train Loss: 0.8134, Test Loss: 0.7288, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.7288
Epoch [5/10], Train Loss: 0.7957, Test Loss: 0.6983, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.6983
Epoch [6/10], Train Loss: 0.7812, Test Loss: 0.7188, 
⚠️ No improvement. Patience counter = 1/5
Epoch [7/10], Train Loss: 0.7542, Test Loss: 0.7018, 
⚠️ No improvement. Patience counter = 2/5
Epoch [8/10], Train Loss: 0.7532, Test Loss: 0.6757, 
✅ Model improved. Saved at epoch 8 with Test Loss 0.6757


[I 2025-10-10 23:43:45,149] Trial 1 finished with value: 0.6756561928325229 and parameters: {'num_hidden_layer': 1, 'neurons_per_hidden_layer': 48, 'lr': 0.00062580887205395, 'weight_decay': 0.0013627574583458893, 'dropout_rate': 0.30466844459148124, 'batch_size': 32, 'epochs': 10, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.3105880320072174.


Epoch [9/10], Train Loss: 0.7267, Test Loss: 0.6769, 
⚠️ No improvement. Patience counter = 1/5
Epoch [10/10], Train Loss: 0.7273, Test Loss: 0.6882, 
⚠️ No improvement. Patience counter = 2/5
Epoch [1/50], Train Loss: 0.8963, Test Loss: 0.7136, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.7136
Epoch [2/50], Train Loss: 0.8229, Test Loss: 0.6765, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.6765
Epoch [3/50], Train Loss: 0.7777, Test Loss: 0.6276, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.6276
Epoch [4/50], Train Loss: 0.7497, Test Loss: 0.6111, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.6111
Epoch [5/50], Train Loss: 0.7084, Test Loss: 0.5902, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.5902
Epoch [6/50], Train Loss: 0.6797, Test Loss: 0.5855, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.5855
Epoch [7/50], Train Loss: 0.6650, Test Loss: 0.5555, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.5555
Epoch [8/50], Train Loss: 0.62

[I 2025-10-10 23:44:00,551] Trial 2 finished with value: 0.3918066366630442 and parameters: {'num_hidden_layer': 4, 'neurons_per_hidden_layer': 192, 'lr': 3.947574760885219e-05, 'weight_decay': 1.8370594664823353e-05, 'dropout_rate': 0.3652866248086337, 'batch_size': 16, 'epochs': 50, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.3105880320072174.


Epoch [42/50], Train Loss: 0.4242, Test Loss: 0.4021, 
⚠️ No improvement. Patience counter = 5/5
⏹ Early stopping at epoch 42 (best loss = 0.3918)
Epoch [1/50], Train Loss: 1.0169, Test Loss: 0.8061, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.8061
Epoch [2/50], Train Loss: 0.9418, Test Loss: 0.7649, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.7649
Epoch [3/50], Train Loss: 0.9375, Test Loss: 0.7670, 
⚠️ No improvement. Patience counter = 1/5
Epoch [4/50], Train Loss: 0.8425, Test Loss: 0.7510, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.7510
Epoch [5/50], Train Loss: 0.8504, Test Loss: 0.7355, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.7355
Epoch [6/50], Train Loss: 0.8192, Test Loss: 0.7184, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.7184
Epoch [7/50], Train Loss: 0.8135, Test Loss: 0.7109, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.7109
Epoch [8/50], Train Loss: 0.7970, Test Loss: 0.6898, 
✅ Model improved. Saved at epoch 8 wi

[I 2025-10-10 23:44:12,506] Trial 3 finished with value: 0.6218139108489541 and parameters: {'num_hidden_layer': 5, 'neurons_per_hidden_layer': 192, 'lr': 0.00048365982856225613, 'weight_decay': 0.0013281448882618058, 'dropout_rate': 0.3544370386760839, 'batch_size': 16, 'epochs': 50, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.3105880320072174.


Epoch [32/50], Train Loss: 0.6876, Test Loss: 0.6347, 
⚠️ No improvement. Patience counter = 5/5
⏹ Early stopping at epoch 32 (best loss = 0.6218)
Epoch [1/20], Train Loss: 1.0065, Test Loss: 0.6868, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.6868
Epoch [2/20], Train Loss: 0.9932, Test Loss: 0.6797, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.6797
Epoch [3/20], Train Loss: 0.9887, Test Loss: 0.6852, 
⚠️ No improvement. Patience counter = 1/5
Epoch [4/20], Train Loss: 0.9463, Test Loss: 0.6863, 
⚠️ No improvement. Patience counter = 2/5
Epoch [5/20], Train Loss: 0.9634, Test Loss: 0.6712, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.6712
Epoch [6/20], Train Loss: 0.9759, Test Loss: 0.6784, 
⚠️ No improvement. Patience counter = 1/5
Epoch [7/20], Train Loss: 0.9514, Test Loss: 0.6800, 
⚠️ No improvement. Patience counter = 2/5
Epoch [8/20], Train Loss: 0.9304, Test Loss: 0.6783, 
⚠️ No improvement. Patience counter = 3/5
Epoch [9/20], Train Loss: 0.9287, Test L

[I 2025-10-10 23:44:18,807] Trial 4 finished with value: 0.6685849042499766 and parameters: {'num_hidden_layer': 5, 'neurons_per_hidden_layer': 64, 'lr': 0.00024427761505925276, 'weight_decay': 0.0039226875922196225, 'dropout_rate': 0.441269846129134, 'batch_size': 16, 'epochs': 20, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.3105880320072174.


Epoch [17/20], Train Loss: 0.8936, Test Loss: 0.6759, 
⚠️ No improvement. Patience counter = 5/5
⏹ Early stopping at epoch 17 (best loss = 0.6686)
Epoch [1/20], Train Loss: 0.5505, Test Loss: 0.3665, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.3665
Epoch [2/20], Train Loss: 0.4364, Test Loss: 0.3445, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.3445
Epoch [3/20], Train Loss: 0.3807, Test Loss: 0.3313, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.3313
Epoch [4/20], Train Loss: 0.3770, Test Loss: 0.3067, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.3067
Epoch [5/20], Train Loss: 0.3647, Test Loss: 0.2958, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.2958
Epoch [6/20], Train Loss: 0.3501, Test Loss: 0.3024, 
⚠️ No improvement. Patience counter = 1/5
Epoch [7/20], Train Loss: 0.3644, Test Loss: 0.2944, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.2944
Epoch [8/20], Train Loss: 0.3342, Test Loss: 0.3107, 
⚠️ No improvement. Patience counter =

[I 2025-10-10 23:44:25,855] Trial 5 finished with value: 0.2616649962523404 and parameters: {'num_hidden_layer': 4, 'neurons_per_hidden_layer': 112, 'lr': 0.0016882815273173066, 'weight_decay': 2.861148895806235e-05, 'dropout_rate': 0.24655773427524139, 'batch_size': 16, 'epochs': 20, 'optimizer': 'Adam'}. Best is trial 5 with value: 0.2616649962523404.


Epoch [20/20], Train Loss: 0.3249, Test Loss: 0.2716, 
⚠️ No improvement. Patience counter = 3/5
Epoch [1/50], Train Loss: 0.7918, Test Loss: 0.6919, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.6919
Epoch [2/50], Train Loss: 0.7553, Test Loss: 0.6423, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.6423
Epoch [3/50], Train Loss: 0.7373, Test Loss: 0.6439, 
⚠️ No improvement. Patience counter = 1/5
Epoch [4/50], Train Loss: 0.7284, Test Loss: 0.6307, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.6307
Epoch [5/50], Train Loss: 0.6732, Test Loss: 0.6304, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.6304
Epoch [6/50], Train Loss: 0.6637, Test Loss: 0.5878, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.5878
Epoch [7/50], Train Loss: 0.6653, Test Loss: 0.5810, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.5810
Epoch [8/50], Train Loss: 0.6617, Test Loss: 0.5635, 
✅ Model improved. Saved at epoch 8 with Test Loss 0.5635
Epoch [9/50], Train Loss: 0.61

[I 2025-10-10 23:44:28,573] Trial 6 finished with value: 0.40782698392868044 and parameters: {'num_hidden_layer': 2, 'neurons_per_hidden_layer': 16, 'lr': 0.0004219081870824761, 'weight_decay': 0.0010042579177842793, 'dropout_rate': 0.34692902158422056, 'batch_size': 64, 'epochs': 50, 'optimizer': 'Adam'}. Best is trial 5 with value: 0.2616649962523404.


Epoch [36/50], Train Loss: 0.4895, Test Loss: 0.4230, 
⚠️ No improvement. Patience counter = 4/5
Epoch [37/50], Train Loss: 0.4841, Test Loss: 0.4207, 
⚠️ No improvement. Patience counter = 5/5
⏹ Early stopping at epoch 37 (best loss = 0.4078)
Epoch [1/20], Train Loss: 0.7611, Test Loss: 0.6923, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.6923
Epoch [2/20], Train Loss: 0.7557, Test Loss: 0.6669, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.6669
Epoch [3/20], Train Loss: 0.7443, Test Loss: 0.6757, 
⚠️ No improvement. Patience counter = 1/5
Epoch [4/20], Train Loss: 0.7488, Test Loss: 0.6509, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.6509
Epoch [5/20], Train Loss: 0.7271, Test Loss: 0.6603, 
⚠️ No improvement. Patience counter = 1/5
Epoch [6/20], Train Loss: 0.7424, Test Loss: 0.6361, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.6361
Epoch [7/20], Train Loss: 0.7056, Test Loss: 0.6287, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.6287
Epoch [8/

[I 2025-10-10 23:44:30,674] Trial 7 finished with value: 0.5405612349510193 and parameters: {'num_hidden_layer': 4, 'neurons_per_hidden_layer': 144, 'lr': 3.760239587290328e-05, 'weight_decay': 0.0026092464883826355, 'dropout_rate': 0.36957578263888824, 'batch_size': 64, 'epochs': 20, 'optimizer': 'Adam'}. Best is trial 5 with value: 0.2616649962523404.


Epoch [20/20], Train Loss: 0.6215, Test Loss: 0.5406, 
✅ Model improved. Saved at epoch 20 with Test Loss 0.5406
Epoch [1/30], Train Loss: 0.8222, Test Loss: 0.6448, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.6448
Epoch [2/30], Train Loss: 0.6417, Test Loss: 0.5520, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.5520
Epoch [3/30], Train Loss: 0.5892, Test Loss: 0.5024, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.5024
Epoch [4/30], Train Loss: 0.5571, Test Loss: 0.4819, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.4819
Epoch [5/30], Train Loss: 0.5050, Test Loss: 0.4674, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.4674
Epoch [6/30], Train Loss: 0.4855, Test Loss: 0.4612, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.4612
Epoch [7/30], Train Loss: 0.4885, Test Loss: 0.4243, 
✅ Model improved. Saved at epoch 7 with Test Loss 0.4243
Epoch [8/30], Train Loss: 0.4527, Test Loss: 0.4252, 
⚠️ No improvement. Patience counter = 1/5
Epoch [9/30], 

[I 2025-10-10 23:44:34,495] Trial 8 finished with value: 0.3578377382622825 and parameters: {'num_hidden_layer': 2, 'neurons_per_hidden_layer': 128, 'lr': 7.067034723785334e-05, 'weight_decay': 0.0006234385237695559, 'dropout_rate': 0.23659996996402177, 'batch_size': 32, 'epochs': 30, 'optimizer': 'RMSprop'}. Best is trial 5 with value: 0.2616649962523404.


Epoch [26/30], Train Loss: 0.3681, Test Loss: 0.3655, 
⚠️ No improvement. Patience counter = 4/5
Epoch [27/30], Train Loss: 0.3711, Test Loss: 0.3619, 
⚠️ No improvement. Patience counter = 5/5
⏹ Early stopping at epoch 27 (best loss = 0.3578)
Epoch [1/10], Train Loss: 0.6777, Test Loss: 0.5454, 
✅ Model improved. Saved at epoch 1 with Test Loss 0.5454
Epoch [2/10], Train Loss: 0.5690, Test Loss: 0.4926, 
✅ Model improved. Saved at epoch 2 with Test Loss 0.4926
Epoch [3/10], Train Loss: 0.5398, Test Loss: 0.4575, 
✅ Model improved. Saved at epoch 3 with Test Loss 0.4575
Epoch [4/10], Train Loss: 0.5038, Test Loss: 0.4504, 
✅ Model improved. Saved at epoch 4 with Test Loss 0.4504
Epoch [5/10], Train Loss: 0.4830, Test Loss: 0.4412, 
✅ Model improved. Saved at epoch 5 with Test Loss 0.4412
Epoch [6/10], Train Loss: 0.4584, Test Loss: 0.4144, 
✅ Model improved. Saved at epoch 6 with Test Loss 0.4144
Epoch [7/10], Train Loss: 0.4396, Test Loss: 0.4238, 
⚠️ No improvement. Patience counter 

[I 2025-10-10 23:44:37,068] Trial 9 finished with value: 0.38795893946114707 and parameters: {'num_hidden_layer': 2, 'neurons_per_hidden_layer': 224, 'lr': 2.610701454467616e-05, 'weight_decay': 0.0002930514024460913, 'dropout_rate': 0.192554600593433, 'batch_size': 16, 'epochs': 10, 'optimizer': 'RMSprop'}. Best is trial 5 with value: 0.2616649962523404.


Epoch [10/10], Train Loss: 0.4373, Test Loss: 0.3880, 
✅ Model improved. Saved at epoch 10 with Test Loss 0.3880


In [27]:
print("Best trial:")
trial = study.best_trial   # single best trial
print("  Best Loss:", trial.value)
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  Best Loss: 0.2616649962523404
  Params:
    num_hidden_layer: 4
    neurons_per_hidden_layer: 112
    lr: 0.0016882815273173066
    weight_decay: 2.861148895806235e-05
    dropout_rate: 0.24655773427524139
    batch_size: 16
    epochs: 20
    optimizer: Adam
