In [1]:
import os
# Get the current working directory (where the notebook is located)
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Change the working directory to the parent directory
os.chdir(parent_dir)

In [2]:
import torch
from p2ch13.training import LunaTrainingApp
args = [
    '--num-workers', '1',  # Example: Set number of workers to 1
    '--batch-size', '2',  # Example: Set batch size to 2
    '--epochs', '1',       # Example: Set number of epochs to 1
]
app = LunaTrainingApp(args)
app

2024-11-17 17:22:02,793 INFO     pid:14076 p2ch13.training:086:initModel Using CUDA; 1 devices.


<p2ch13.training.LunaTrainingApp at 0x2e8a27018d0>

In [3]:
train_dl = app.initTrainDl()
val_dl = app.initValDl()
data_sample = next(iter(train_dl))
print(data_sample[0].shape)
print(data_sample[1:])

2024-11-17 17:22:08,255 INFO     pid:14076 p2ch13.dsets:182:__init__ <p2ch13.dsets.LunaDataset object at 0x000002E8A2703130>: 495958 training samples
2024-11-17 17:22:08,297 INFO     pid:14076 p2ch13.dsets:182:__init__ <p2ch13.dsets.LunaDataset object at 0x000002E8D4B94640>: 55107 validation samples


torch.Size([2, 1, 32, 48, 48])
[tensor([[1, 0],
        [1, 0]]), ['1.3.6.1.4.1.14519.5.2.1.6279.6001.272348349298439120568330857680', '1.3.6.1.4.1.14519.5.2.1.6279.6001.340158437895922179455019686521'], tensor([[ 97, 268, 201],
        [ 88, 157, 392]])]


In [4]:
import math
logits = [1, -2, 3]
exp = [math.exp(x) for x in logits]
print([round(x, 3) for x in exp])
softmax = [x / sum(exp) for x in exp]
print([round(x, 3) for x in softmax])

[2.718, 0.135, 20.086]
[0.118, 0.006, 0.876]


In [5]:
model = app.model
model

LunaModel(
  (tail_batchnorm): BatchNorm3d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (block1): LunaBlock(
    (conv1): Conv3d(1, 8, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (relu1): ReLU(inplace=True)
    (conv2): Conv3d(8, 8, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (relu2): ReLU(inplace=True)
    (maxpool): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block2): LunaBlock(
    (conv1): Conv3d(8, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (relu1): ReLU(inplace=True)
    (conv2): Conv3d(16, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (relu2): ReLU(inplace=True)
    (maxpool): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block3): LunaBlock(
    (conv1): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (relu1): ReLU(inplace=True)
    (conv2): Conv3d(32, 32, kernel_siz

In [None]:
from tqdm import tqdm
device = app.device
train_dl_subset = app.get_dl_subset(train_dl, num_samples=20)
trnMetrics_g = torch.zeros(
    3,
    len(train_dl_subset.dataset),
    device=device,
)

batch_iter = tqdm(train_dl_subset, desc="E{} Training".format(epoch_ndx), total=len(train_dl_subset))
for batch_ndx, batch_tup in enumerate(batch_iter):
    loss_var = app.computeBatchLoss(
        batch_ndx, batch_tup, train_dl_subset.batch_size, trnMetrics_g
    )


E1 Training: 100%|██████████| 10/10 [00:21<00:00,  2.12s/it]


In [28]:
trnMetrics_g

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [-0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.]],
       device='cuda:0')

In [6]:
num_epochs = 1
train_dl_subset = app.get_dl_subset(train_dl, num_samples=20)
for epoch_ndx in range(1, num_epochs + 1):
    trnMetrics_t = app.doTraining(epoch_ndx, train_dl_subset)

E1 Training: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


In [7]:
trnMetrics_t

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0000e+00],
        [1.0000e+00, 1.0000e+00, 3.6156e-22, 1.9186e-22, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 4.2039e-45, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.6581e+01, 1.5637e+01, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, 4.4291e+03]])

In [8]:
num_epochs = 1
val_dl_subset = app.get_dl_subset(val_dl, num_samples=20)
for epoch_ndx in range(1, num_epochs + 1):
    valMetrics_t = app.doValidation(epoch_ndx, val_dl_subset)
app.logMetrics(epoch_ndx, "val", valMetrics_t)

E1 Validation: 100%|██████████| 10/10 [00:14<00:00,  1.47s/it]
2024-11-17 17:24:13,491 INFO     pid:14076 p2ch13.training:263:logMetrics E1 LunaTrainingApp
  metrics_dict["correct/pos"] = pos_correct / np.float32(pos_count) * 100
2024-11-17 17:24:13,508 INFO     pid:14076 p2ch13.training:293:logMetrics E1 val      149.4884 loss,   0.0% correct
2024-11-17 17:24:13,509 INFO     pid:14076 p2ch13.training:297:logMetrics E1 val_neg  149.4884 loss,   0.0% correct (0 of 20)
2024-11-17 17:24:13,509 INFO     pid:14076 p2ch13.training:301:logMetrics E1 val_pos  nan loss,   nan% correct (0 of 0)


In [9]:
valMetrics_t

tensor([[  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
        [  1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,
           1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,
           1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000],
        [209.0222, 338.8909, 120.5889,  67.1825,  75.7169,  77.1178,  93.8583,
         107.2692,  58.5267, 296.5546, 194.6401,  93.6103, 106.4330,  70.8644,
         171.8813, 210.4819, 200.1293,  72.8488, 281.8973, 142.2537]])

In [None]:
from tqdm import tqdm
optimizer = app.optimizer
device = app.device
model.train()
epoch_ndx = 0
trnMetrics_g = torch.zeros(
    3,
    len(train_dl.dataset),
    device=device,
)
print(device)
batch_iter = tqdm(train_dl, desc="E{} Training".format(epoch_ndx), total=len(train_dl))
for batch_ndx, batch_tup in enumerate(batch_iter):
    optimizer.zero_grad()

    loss_var = app.computeBatchLoss(
        batch_ndx, batch_tup, train_dl.batch_size, trnMetrics_g
    )
    loss_var.backward()
    optimizer.step()
    break

    

cuda


E0 Training:   0%|          | 7/123990 [00:16<80:34:35,  2.34s/it] 


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm
import time
import random
for _ in tqdm(range(234), desc="Sleeping"):
    time.sleep(random.random())