In [1]:
#library installations
%pip install pydicom SimpleITK numpy
%pip install pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg


Note: you may need to restart the kernel to use updated packages.
Collecting pylibjpeg-libjpeg
  Using cached pylibjpeg_libjpeg-2.3.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (4.8 kB)
Collecting pylibjpeg-openjpeg
  Using cached pylibjpeg_openjpeg-2.4.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (5.7 kB)
Collecting numpy (from pylibjpeg)
  Using cached numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl.metadata (62 kB)
Using cached pylibjpeg_libjpeg-2.3.0-cp310-cp310-macosx_10_9_x86_64.whl (677 kB)
Using cached numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl (21.2 MB)
Using cached pylibjpeg_openjpeg-2.4.0-cp310-cp310-macosx_10_9_x86_64.whl (305 kB)
Installing collected packages: numpy, pylibjpeg-openjpeg, pylibjpeg-libjpeg
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 1.25.2
[2K    Uninstalling numpy-1.25.2:
[2K      Successfully uninstalled numpy-1.25.2
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pylibjpeg-libjpeg

In [2]:
%pip install python-gdcm

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install scikit-image==0.22.0

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install --force-reinstall numpy==1.25.2 scikit-image==0.22.0

Collecting numpy==1.25.2
  Using cached numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Collecting scikit-image==0.22.0
  Using cached scikit_image-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.8 (from scikit-image==0.22.0)
  Using cached scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl.metadata (61 kB)
Collecting networkx>=2.8 (from scikit-image==0.22.0)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting pillow>=9.0.1 (from scikit-image==0.22.0)
  Using cached pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl.metadata (9.0 kB)
Collecting imageio>=2.27 (from scikit-image==0.22.0)
  Using cached imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tifffile>=2022.8.12 (from scikit-image==0.22.0)
  Using cached tifffile-2025.5.10-py3-none-any.whl.metadata (31 kB)
Collecting packaging>=21 (from scikit-image==0.22.0)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lazy_loader>=0.3

In [6]:
import os
import pydicom
import numpy as np
import SimpleITK as sitk

def load_dicom_volume(folder_path):
    # Load all DICOM files in the folder
    dicoms = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.dcm'):
            dicom = pydicom.dcmread(os.path.join(folder_path, filename))
            dicoms.append(dicom)

    # Sort slices by ImagePositionPatient or InstanceNumber
    dicoms.sort(key=lambda x: float(x.ImagePositionPatient[2]) if 'ImagePositionPatient' in x else int(x.InstanceNumber))

    # Stack slices into 3D array
    image_stack = np.stack([d.pixel_array for d in dicoms])

    # Get spacing info
    try:
        spacing = list(map(float, dicoms[0].PixelSpacing))  # in-plane spacing
        slice_thickness = float(dicoms[0].SliceThickness)
        spacing.append(slice_thickness)
    except:
        spacing = [1.0, 1.0, 1.0]  # fallback if tags missing

    return image_stack, spacing

In [7]:
def resample_volume(volume, original_spacing, new_spacing=[1.0, 1.0, 1.0]):
    original_spacing = np.array(original_spacing[::-1])  # DICOM order: z, y, x
    new_spacing = np.array(new_spacing)
    
    resize_factor = original_spacing / new_spacing
    new_shape = np.round(np.array(volume.shape) * resize_factor).astype(int)

    volume_sitk = sitk.GetImageFromArray(volume)
    volume_sitk.SetSpacing(original_spacing.tolist())

    resampler = sitk.ResampleImageFilter()
    resampler.SetOutputSpacing(new_spacing.tolist())
    resampler.SetSize([int(s) for s in new_shape[::-1]])
    resampler.SetInterpolator(sitk.sitkLinear)

    resampled = resampler.Execute(volume_sitk)
    return sitk.GetArrayFromImage(resampled)

In [8]:
def normalize_ct(volume, clip_min=-1000, clip_max=400):
    volume = np.clip(volume, clip_min, clip_max)
    volume = (volume - clip_min) / (clip_max - clip_min)  # normalize to [0, 1]
    return volume.astype(np.float32)

In [9]:
def load_and_process_dicom(folder_path):
    volume, spacing = load_dicom_volume(folder_path)
    resampled = resample_volume(volume, spacing, [1.0, 1.0, 1.0])
    normalized = normalize_ct(resampled)
    return normalized  # shape: (D, H, W)

In [10]:
import numpy as np

def get_10_montage_slices(volume):
    """Divide the volume into 10 sections and sample the center slice from each (sequentially)"""
    depth = volume.shape[0]  # z-dimension (axial slices)
    section_size = depth // 10
    slices = []

    for i in range(10):
        start = i * section_size
        end = (i + 1) * section_size if i < 9 else depth
        center_idx = (start + end) // 2
        slices.append(volume[center_idx])

    montage = np.stack(slices, axis=0)  # shape: (10, H, W), ordered top→bottom
    return montage

In [11]:
from skimage.transform import resize

def preprocess_slice(slice_2d):
    slice_2d = np.clip(slice_2d, -1000, 400)
    slice_2d = (slice_2d + 1000) / 1400
    slice_2d = resize(slice_2d, (224, 224), mode='reflect', anti_aliasing=True)
    return slice_2d.astype(np.float32)

In [12]:
def create_montage_tensor(volume):
    slices = get_10_montage_slices(volume)
    slices = [preprocess_slice(s) for s in slices]
    montage = np.stack(slices)  # shape: (10, H, W)
    montage = montage[:, np.newaxis, :, :]  # (10, 1, H, W)
    montage = np.transpose(montage, (1, 0, 2, 3))  # (1, 10, H, W)
    tensor = torch.tensor(montage, dtype=torch.float32)  # (1, 10, 224, 224)
    tensor = tensor.unsqueeze(0)  # add batch dim: (B=1, C=1, D=10, H, W)
    return tensor

In [14]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.3.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp310-cp310-macosx_10_9_x86_64.whl (11.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]━━━━━━━━━━━[0m [32m2/3[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.0 pytz-2025.2 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.


In [15]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class DicomMontageDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.labels_df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        patient_id = self.labels_df.iloc[idx]['patient_id']
        label = self.labels_df.iloc[idx]['label']
        dicom_folder = os.path.join(self.root_dir, patient_id)
        
        # Process volume
        volume = load_and_process_dicom(dicom_folder)
        tensor = create_montage_tensor(volume)

        if self.transform:
            tensor = self.transform(tensor)
            
        return tensor.squeeze(0), torch.tensor(label, dtype=torch.long)

In [17]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl (11.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m31m17.3 MB/s[0m eta [36m0:00:01[0m
Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn]━━━━━[0m [32m2/3[0m [scikit-learn]
Successfully installed joblib-1.5.1 scikit-learn-1.7.0 threadpoolctl-3.6.0
Note: you may need to restart the 

In [21]:
%pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [22]:
#partition the dataset into train:validate:test 75%:12.5%:12.5%

from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

# Wrap dataset access in try-except to catch loading errors
def safe_get_label(dataset, idx):
    try:
        _, label = dataset[idx]
        return label
    except Exception as e:
        # Attempt to get patient ID if available; adjust attribute as per your dataset
        patient_id = getattr(dataset, 'patient_ids', None)
        pid = patient_id[idx] if patient_id is not None else f"index {idx}"
        print(f"Warning: Skipping patient {pid} due to error: {e}")
        return None

# full dataset and loader (keep loader for convenience, may skip corrupted later)
dataset = DicomMontageDataset(csv_file='dataset/labels.csv', root_dir='dataset')
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

# Collect valid indices and labels with error handling
valid_indices = []
valid_labels = []
print("Extracting labels and filtering corrupted samples:")
for idx in tqdm(range(len(dataset))):
    label = safe_get_label(dataset, idx)
    if label is not None:
        valid_indices.append(idx)
        valid_labels.append(label)

# Now do stratified splits on filtered valid data only
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_rel_idx, temp_rel_idx in sss1.split(X=valid_labels, y=valid_labels):
    pass

# Get absolute indices from relative
train_idx = [valid_indices[i] for i in train_rel_idx]
temp_idx = [valid_indices[i] for i in temp_rel_idx]

# Split temp into val and test (50% each)
temp_labels = [valid_labels[i] for i in temp_rel_idx]
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for val_rel_idx, test_rel_idx in sss2.split(X=temp_labels, y=temp_labels):
    pass

val_idx = [temp_idx[i] for i in val_rel_idx]
test_idx = [temp_idx[i] for i in test_rel_idx]

# Create subsets using filtered and split indices
train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)
test_dataset = Subset(dataset, test_idx)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


Extracting labels and filtering corrupted samples:


Corrupt JPEG data: bad Huffman code█████████████████████████████████████████████████████████████████████████████████                      | 147/176 [05:06<00:54,  1.89s/it]
Unsupported marker type 0xfa
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 148/176 [05:08<00:58,  2.08s/it]

  gdcm: 'NoneType' object has no attribute 'encode'
  pylibjpeg: libjpeg error code '-1038' returned from Decode(): A misplaced marker segment was found - invalid stream, found invalid huffman code in entropy coded segment


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 176/176 [06:11<00:00,  2.11s/it]


In [24]:
%pip install monai

Collecting monai
  Downloading monai-1.5.0-py3-none-any.whl.metadata (13 kB)
INFO: pip is looking at multiple versions of monai to determine which version is compatible with other requirements. This could take a while.
  Downloading monai-1.4.0-py3-none-any.whl.metadata (11 kB)
Downloading monai-1.4.0-py3-none-any.whl (1.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: monai
Successfully installed monai-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
#Create the model

from monai.networks.nets import SEResNet50
import torch.nn as nn
import torch

# Load pretrained SE-ResNet50 3D
# Create SE-ResNet50 3D model for binary classification
model = SEResNet50(
    spatial_dims=3,
    in_channels=1,      # Use this instead of n_input_channels
    num_classes=2       # For binary classification
)

# Optional: move to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

## Optionally freeze early layers to prevent fine tuning the whole network
# for param in model.layer1.parameters():
#     param.requires_grad = False

num_epochs = 10

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)  # (B, 1, 10, 224, 224)
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)

    train_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {running_loss:.4f}, Train Accuracy: {train_acc:.2f}%")

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_inputs = val_inputs.to(device)
            val_labels = val_labels.to(device)

            val_outputs = model(val_inputs)
            loss = criterion(val_outputs, val_labels)

            val_loss += loss.item()
            _, val_predicted = val_outputs.max(1)
            val_correct += val_predicted.eq(val_labels).sum().item()
            val_total += val_labels.size(0)

    val_acc = 100 * val_correct / val_total
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\n")


Epoch 1/10, Train Loss: 1.4797, Train Accuracy: 93.89%
Validation Loss: 0.2852, Validation Accuracy: 100.00%

Epoch 2/10, Train Loss: 0.0149, Train Accuracy: 100.00%
Validation Loss: 0.1153, Validation Accuracy: 100.00%

Epoch 3/10, Train Loss: 0.0069, Train Accuracy: 100.00%
Validation Loss: 0.0123, Validation Accuracy: 100.00%

Epoch 4/10, Train Loss: 0.0049, Train Accuracy: 100.00%
Validation Loss: 0.0012, Validation Accuracy: 100.00%

Epoch 5/10, Train Loss: 0.0040, Train Accuracy: 100.00%
Validation Loss: 0.0007, Validation Accuracy: 100.00%

Epoch 6/10, Train Loss: 0.0034, Train Accuracy: 100.00%
Validation Loss: 0.0006, Validation Accuracy: 100.00%

Epoch 7/10, Train Loss: 0.0030, Train Accuracy: 100.00%
Validation Loss: 0.0005, Validation Accuracy: 100.00%

Epoch 8/10, Train Loss: 0.0027, Train Accuracy: 100.00%
Validation Loss: 0.0005, Validation Accuracy: 100.00%

Epoch 9/10, Train Loss: 0.0024, Train Accuracy: 100.00%
Validation Loss: 0.0004, Validation Accuracy: 100.00%

Ep

In [28]:
#Test the model 
def test_model(model, test_loader, criterion, device):
    model.eval()  # Set model to evaluation mode
    test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for speed
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

    avg_loss = test_loss / len(test_loader)
    accuracy = 100 * correct / total
    print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.2f}%")
    return avg_loss, accuracy

test_model(model, test_loader, criterion, device)


Test Loss: 0.0001, Test Accuracy: 100.00%


(0.00013061173376627266, 100.0)