In [None]:
!pip install -q SimpleITK
!pip install -q scikit-learn
!pip install -q pandas numpy matplotlib
!pip install -q tqdm
!pip install tensorflow



In [None]:
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.19.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
from google.colab import drive

# Mount to a fresh folder
drive.mount('/content/my_drive')

Drive already mounted at /content/my_drive; to attempt to forcibly remount, call drive.mount("/content/my_drive", force_remount=True).


In [None]:
import os
DATA_DIR = '/content/drive/MyDrive/LUNA16_full'  # Replace with your folder path
os.makedirs(DATA_DIR, exist_ok=True)

**DATA DOWNLOAD**

In [15]:
from google.colab import drive
import os
import zipfile
from tqdm import tqdm
import requests
import concurrent.futures


# Folder to store LUNA16 dataset
DATA_DIR = '/content/drive/MyDrive/LUNA16_full'
os.makedirs(DATA_DIR, exist_ok=True)

# Helper function to download & extract
def download_and_extract(url, filename, extract_to):
    filepath = os.path.join(extract_to, filename)
    if not os.path.exists(filepath):
        print(f"Downloading {filename}...")
        response = requests.get(url, stream=True)
        total = int(response.headers.get('content-length', 0))
        with open(filepath, 'wb') as f, tqdm(total=total, unit='iB', unit_scale=True) as bar:
            for data in response.iter_content(chunk_size=1024):
                size = f.write(data)
                bar.update(size)
    else:
        print(f"{filename} already exists, skipping download.")

    if filename.endswith('.zip'):
        extract_folder = os.path.join(extract_to, filename.replace('.zip',''))
        if not os.path.exists(extract_folder):
            print(f"Extracting {filename}...")
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
            print(f"✓ {filename} extracted.")
        else:
            print(f"{filename} already extracted.")

# LUNA16 base URL
BASE_URL = "https://zenodo.org/records/3723295/files/"

# Download annotations and candidates sequentially
download_and_extract(BASE_URL + "annotations.csv?download=1", "annotations.csv", DATA_DIR)
download_and_extract(BASE_URL + "candidates_V2.zip?download=1", "candidates_V2.zip", DATA_DIR)

# -------- Parallel download subsets 0-4 --------
def download_subset(i):
    subset_url = BASE_URL + f"subset{i}.zip?download=1"
    download_and_extract(subset_url, f"subset{i}.zip", DATA_DIR)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(download_subset, range(5))

print("\n✓ Subsets 0-4 of LUNA16 dataset ready in your Drive folder!")

Downloading annotations.csv...


137kiB [00:00, 753kiB/s] 


Downloading candidates_V2.zip...


100%|██████████| 11.4M/11.4M [00:01<00:00, 8.71MiB/s]


Extracting candidates_V2.zip...
✓ candidates_V2.zip extracted.
Downloading subset0.zip...
Downloading subset1.zip...
Downloading subset2.zip...
Downloading subset3.zip...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

 76%|███████▌  | 5.23G/6.90G [05:22<01:33, 17.8MiB/s][A[A[A

 72%|███████▏  | 5.23G/7.26G [05:22<02:02, 16.5MiB/s][A[A
 78%|███████▊  | 4.95G/6.33G [05:22<01:24, 16.3MiB/s]

 72%|███████▏  | 5.23G/7.26G [05:22<01:53, 17.8MiB/s][A[A
 76%|███████▌  | 5.17G/6.81G [05:23<01:56, 14.0MiB/s][A


 78%|███████▊  | 4.95G/6.33G [05:23<01:26, 16.0MiB/s]

 78%|███████▊  | 4.95G/6.33G [05:23<01:22, 16.7MiB/s]
 76%|███████▌  | 5.18G/6.81G [05:23<02:06, 12.9MiB/s][A


 76%|███████▌  | 5.24G/6.90G [05:23<02:02, 13.5MiB/s][A[A[A

 78%|███████▊  | 4.95G/6.33G [05:23<01:22, 16.6MiB/s]
 76%|███████▌  | 5.18G/6.81G [05:23<02:05, 13.0MiB/s][A


 78%|███████▊  | 4.96G/6.33G [05:23<01:21, 16.9MiB/s]

 72%|███████▏  | 5.24G/7.26G [05:23<01:54, 17.6MiB/s][A[A
 76%|███████▌  | 5.18G/6.81G [05:23<01:51, 14.7MiB/s][A


 78%|███████▊  | 4.96G/6.33G [05:23<01:25, 16.1MiB/s]
 76%|███████▌  | 5.18G/6.81G [05:23<01:54, 14.3MiB/s][A

 72%|

Extracting subset1.zip...



 96%|█████████▋| 6.57G/6.81G [06:55<00:14, 16.6MiB/s][A


 96%|█████████▌| 6.63G/6.90G [06:55<00:13, 19.1MiB/s][A[A[A

 92%|█████████▏| 6.65G/7.26G [06:55<00:36, 16.8MiB/s][A[A
 96%|█████████▋| 6.57G/6.81G [06:55<00:14, 17.1MiB/s][A


 96%|█████████▌| 6.64G/6.90G [06:55<00:13, 19.2MiB/s][A[A[A

 92%|█████████▏| 6.65G/7.26G [06:55<00:35, 17.3MiB/s][A[A
 97%|█████████▋| 6.57G/6.81G [06:55<00:12, 18.7MiB/s][A


 96%|█████████▋| 6.64G/6.90G [06:55<00:12, 20.7MiB/s][A[A[A

 92%|█████████▏| 6.65G/7.26G [06:56<00:31, 19.1MiB/s][A[A
 97%|█████████▋| 6.58G/6.81G [06:56<00:12, 19.5MiB/s][A


 96%|█████████▋| 6.64G/6.90G [06:56<00:13, 19.3MiB/s][A[A[A

 92%|█████████▏| 6.66G/7.26G [06:56<00:29, 20.3MiB/s][A[A
 97%|█████████▋| 6.58G/6.81G [06:56<00:11, 19.9MiB/s][A


 96%|█████████▋| 6.64G/6.90G [06:56<00:13, 19.0MiB/s][A[A[A

 92%|█████████▏| 6.66G/7.26G [06:56<00:31, 18.9MiB/s][A[A
 97%|█████████▋| 6.58G/6.81G [06:56<00:11, 20.5MiB/s][A


 96%|█████████▋| 6.64G/6.

Extracting subset0.zip...




 95%|█████████▌| 6.91G/7.26G [07:09<00:14, 23.7MiB/s][A[A


100%|█████████▉| 6.89G/6.90G [07:09<00:00, 24.9MiB/s][A[A[A

 95%|█████████▌| 6.91G/7.26G [07:09<00:15, 21.9MiB/s][A[A


100%|█████████▉| 6.89G/6.90G [07:09<00:00, 24.5MiB/s][A[A[A

 95%|█████████▌| 6.91G/7.26G [07:09<00:14, 23.2MiB/s][A[A


100%|█████████▉| 6.89G/6.90G [07:09<00:00, 27.3MiB/s][A[A[A

 95%|█████████▌| 6.92G/7.26G [07:09<00:15, 22.5MiB/s][A[A


100%|█████████▉| 6.89G/6.90G [07:09<00:00, 24.7MiB/s][A[A[A

100%|██████████| 6.90G/6.90G [07:09<00:00, 16.1MiB/s]


 95%|█████████▌| 6.92G/7.26G [07:09<00:14, 23.0MiB/s][A[A

Extracting subset3.zip...




 95%|█████████▌| 6.93G/7.26G [07:09<00:14, 23.4MiB/s][A[A

 95%|█████████▌| 6.93G/7.26G [07:09<00:13, 23.8MiB/s][A[A

 95%|█████████▌| 6.93G/7.26G [07:10<00:13, 24.2MiB/s][A[A

 96%|█████████▌| 6.93G/7.26G [07:10<00:13, 24.8MiB/s][A[A

 96%|█████████▌| 6.94G/7.26G [07:10<00:13, 24.6MiB/s][A[A

 96%|█████████▌| 6.94G/7.26G [07:10<00:13, 24.5MiB/s][A[A

 96%|█████████▌| 6.94G/7.26G [07:10<00:12, 24.5MiB/s][A[A

 96%|█████████▌| 6.94G/7.26G [07:10<00:12, 24.2MiB/s][A[A

 96%|█████████▌| 6.95G/7.26G [07:10<00:13, 23.8MiB/s][A[A

 96%|█████████▌| 6.95G/7.26G [07:10<00:12, 24.1MiB/s][A[A

 96%|█████████▌| 6.95G/7.26G [07:10<00:12, 23.9MiB/s][A[A

 96%|█████████▌| 6.95G/7.26G [07:11<00:12, 24.5MiB/s][A[A

 96%|█████████▌| 6.96G/7.26G [07:11<00:12, 24.2MiB/s][A[A

 96%|█████████▌| 6.96G/7.26G [07:11<00:11, 25.3MiB/s][A[A

 96%|█████████▌| 6.96G/7.26G [07:11<00:12, 24.3MiB/s][A[A

 96%|█████████▌| 6.96G/7.26G [07:11<00:12, 23.7MiB/s][A[A

 96%|█████████▌| 6.97G

Extracting subset2.zip...
✓ subset1.zip extracted.
Downloading subset4.zip...


  0%|          | 20.9M/6.86G [00:17<2:13:47, 852kiB/s]

✓ subset0.zip extracted.


  0%|          | 24.3M/6.86G [00:20<1:42:07, 1.11MiB/s]

✓ subset3.zip extracted.


  1%|          | 43.3M/6.86G [00:39<2:27:06, 772kiB/s]

✓ subset2.zip extracted.


100%|██████████| 6.86G/6.86G [1:07:11<00:00, 1.70MiB/s]


Extracting subset4.zip...
✓ subset4.zip extracted.

✓ Subsets 0-4 of LUNA16 dataset ready in your Drive folder!


In [16]:
# Install nibabel for NIfTI support
!pip install -q nibabel

# Import nibabel
import nibabel as nib

In [17]:
# NIfTI Conversion Utilities (Optional - for converting MHD to NIfTI)
def convert_mhd_to_nifti(mhd_path, nii_path):
    """Convert MHD file to NIfTI format"""
    try:
        # Read MHD file
        img = sitk.ReadImage(mhd_path)
        array = sitk.GetArrayFromImage(img)

        # Create NIfTI image
        nii_img = nib.Nifti1Image(array, np.eye(4))  # Identity affine matrix
        nib.save(nii_img, nii_path)
        print(f"✓ Converted {mhd_path} to {nii_path}")
        return True
    except Exception as e:
        print(f"✗ Failed to convert {mhd_path}: {e}")
        return False

def batch_convert_to_nifti(data_dir, output_dir=None):
    """Convert all MHD files in LUNA16 dataset to NIfTI"""
    if output_dir is None:
        output_dir = os.path.join(data_dir, "nifti")

    os.makedirs(output_dir, exist_ok=True)

    converted = 0
    total = 0

    # Find all subset directories
    subsets = [d for d in os.listdir(data_dir) if d.startswith("subset")]

    for subset in subsets:
        subset_dir = os.path.join(data_dir, subset)
        nii_subset_dir = os.path.join(output_dir, subset)
        os.makedirs(nii_subset_dir, exist_ok=True)

        # Find all MHD files
        mhd_files = [f for f in os.listdir(subset_dir) if f.endswith('.mhd')]

        for mhd_file in mhd_files:
            total += 1
            mhd_path = os.path.join(subset_dir, mhd_file)
            nii_path = os.path.join(nii_subset_dir, mhd_file.replace('.mhd', '.nii.gz'))

            if convert_mhd_to_nifti(mhd_path, nii_path):
                converted += 1

    print(f"\nConversion complete: {converted}/{total} files converted")
    return output_dir

# Uncomment to convert dataset to NIfTI:
# NII_DATA_DIR = batch_convert_to_nifti(DATA_DIR)

In [18]:
# Install nibabel for NIfTI support
!pip install -q nibabel

# Import nibabel
import nibabel as nib

In [19]:
# NIfTI Conversion Utilities (Optional - for converting MHD to NIfTI)
def convert_mhd_to_nifti(mhd_path, nii_path):
    """Convert MHD file to NIfTI format"""
    try:
        # Read MHD file
        img = sitk.ReadImage(mhd_path)
        array = sitk.GetArrayFromImage(img)

        # Create NIfTI image
        nii_img = nib.Nifti1Image(array, np.eye(4))  # Identity affine matrix
        nib.save(nii_img, nii_path)
        print(f"✓ Converted {mhd_path} to {nii_path}")
        return True
    except Exception as e:
        print(f"✗ Failed to convert {mhd_path}: {e}")
        return False

def batch_convert_to_nifti(data_dir, output_dir=None):
    """Convert all MHD files in LUNA16 dataset to NIfTI"""
    if output_dir is None:
        output_dir = os.path.join(data_dir, "nifti")

    os.makedirs(output_dir, exist_ok=True)

    converted = 0
    total = 0

    # Find all subset directories
    subsets = [d for d in os.listdir(data_dir) if d.startswith("subset")]

    for subset in subsets:
        subset_dir = os.path.join(data_dir, subset)
        nii_subset_dir = os.path.join(output_dir, subset)
        os.makedirs(nii_subset_dir, exist_ok=True)

        # Find all MHD files
        mhd_files = [f for f in os.listdir(subset_dir) if f.endswith('.mhd')]

        for mhd_file in mhd_files:
            total += 1
            mhd_path = os.path.join(subset_dir, mhd_file)
            nii_path = os.path.join(nii_subset_dir, mhd_file.replace('.mhd', '.nii.gz'))

            if convert_mhd_to_nifti(mhd_path, nii_path):
                converted += 1

    print(f"\nConversion complete: {converted}/{total} files converted")
    return output_dir

# Uncomment to convert dataset to NIfTI:
# NII_DATA_DIR = batch_convert_to_nifti(DATA_DIR)

**DATA PREPROCESSING (NIfTI Support)**

In [36]:
import SimpleITK as sitk
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import nibabel as nib

class LUNANIfTIDataPreprocessor:
    """Robust LUNA16 preprocessor supporting both MHD and NIfTI files."""

    def __init__(self, data_dir):
        self.data_dir = data_dir

        # Load CSVs
        self.annotations = pd.read_csv(os.path.join(data_dir, "annotations.csv"))
        self.candidates = pd.read_csv(os.path.join(data_dir, "candidates_V2.csv"))

        # Detect subset directories
        self.subsets = [
            d for d in os.listdir(self.data_dir)
            if d.startswith("subset") and os.path.isdir(os.path.join(self.data_dir, d))
        ]

        if not self.subsets:
            raise RuntimeError("No subset directories found")

        # Index all scans (MHD and NIfTI)
        self.uid_to_path = {}
        for subset in self.subsets:
            subset_dir = os.path.join(self.data_dir, subset)
            for fname in os.listdir(subset_dir):
                if fname.endswith((".nii", ".nii.gz", ".mhd")):
                    for uid in self.candidates["seriesuid"].unique():
                        if uid in fname:
                            self.uid_to_path[uid] = os.path.join(subset_dir, fname)

        print(f"Detected subsets: {self.subsets}")
        print(f"Indexed {len(self.uid_to_path)} scans")
        print(f"Loaded {len(self.annotations)} nodules")
        print(f"Loaded {len(self.candidates)} candidates")

        if len(self.uid_to_path) == 0:
            raise RuntimeError("No seriesuid matched any scan file (MHD or NIfTI)")

    # ---------------- IO ----------------
    def load_scan(self, series_uid):
        path = self.uid_to_path.get(series_uid)
        if path is None:
            return None

        if path.endswith(".nii") or path.endswith(".nii.gz"):
            return nib.load(path)
        elif path.endswith(".mhd"):
            return sitk.ReadImage(path)
        else:
            return None

    # ---------------- Geometry ----------------
    def world_to_voxel(self, coords, scan):
        if isinstance(scan, nib.Nifti1Image):
            inv_affine = np.linalg.inv(scan.affine)
            voxel = nib.affines.apply_affine(inv_affine, coords)
            return np.round(voxel).astype(int)
        else:
            origin = np.array(scan.GetOrigin())
            spacing = np.array(scan.GetSpacing())
            return np.round(np.abs(coords - origin) / spacing).astype(int)

    # ---------------- Patch extraction ----------------
    def extract_nodule_patch(self, scan, world_coords, patch_size=64):
        if isinstance(scan, nib.Nifti1Image):
            vol = scan.get_fdata()
            vx, vy, vz = self.world_to_voxel(world_coords, scan)
            z, y, x = vz, vy, vx
        else:
            vol = sitk.GetArrayFromImage(scan)
            vx, vy, vz = self.world_to_voxel(world_coords, scan)
            z, y, x = vz, vy, vx

        half = patch_size // 2

        zmin, zmax = max(0, z-half), min(vol.shape[0], z+half)
        ymin, ymax = max(0, y-half), min(vol.shape[1], y+half)
        xmin, xmax = max(0, x-half), min(vol.shape[2], x+half)

        patch = vol[zmin:zmax, ymin:ymax, xmin:xmax]

        padded = np.zeros((patch_size, patch_size, patch_size), dtype=np.float32)
        dz = (patch_size - patch.shape[0]) // 2
        dy = (patch_size - patch.shape[1]) // 2
        dx = (patch_size - patch.shape[2]) // 2

        padded[
            dz:dz+patch.shape[0],
            dy:dy+patch.shape[1],
            dx:dx+patch.shape[2]
        ] = patch

        return padded

    # ---------------- Normalization ----------------
    def normalize_hu(self, patch):
        patch = np.clip(patch, -1000, 400)
        return ((patch + 1000) / 1400).astype(np.float32)

    # ---------------- Dataset ----------------
    def prepare_dataset(self, max_samples=None, balance=True):
        X, y = [], []

        if balance:
            pos = self.candidates[self.candidates["class"] == 1]
            neg = self.candidates[self.candidates["class"] == 0].sample(
                n=len(pos), random_state=42
            )
            candidates = pd.concat([pos, neg]).sample(frac=1, random_state=42)
        else:
            candidates = self.candidates

        if max_samples:
            candidates = candidates.head(max_samples)

        for _, row in tqdm(candidates.iterrows(), total=len(candidates)):
            scan = self.load_scan(row["seriesuid"])
            if scan is None:
                continue

            coords = np.array([row["coordX"], row["coordY"], row["coordZ"]])
            patch = self.normalize_hu(self.extract_nodule_patch(scan, coords))

            X.append(patch)
            y.append(row["class"])

        X = np.array(X)[..., np.newaxis]
        y = np.array(y)

        print(f"\n✓ Dataset: {X.shape}, Pos: {y.sum()}, Neg: {len(y)-y.sum()}")
        return X, y

In [37]:
import os

subset0_dir = "/content/drive/MyDrive/LUNA16_full/subset0"
for f in os.listdir(subset0_dir)[:10]:
    print(f)

1.3.6.1.4.1.14519.5.2.1.6279.6001.138080888843357047811238713686.mhd
1.3.6.1.4.1.14519.5.2.1.6279.6001.128023902651233986592378348912.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.905371958588660410240398317235.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.269689294231892620436462818860.mhd
1.3.6.1.4.1.14519.5.2.1.6279.6001.975254950136384517744116790879.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.333145094436144085379032922488.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.269689294231892620436462818860.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.293757615532132808762625441831.raw
1.3.6.1.4.1.14519.5.2.1.6279.6001.139713436241461669335487719526.mhd
1.3.6.1.4.1.14519.5.2.1.6279.6001.395623571499047043765181005112.raw


In [38]:
DATA_DIR = "/content/drive/MyDrive/LUNA16_full"

preprocessor = LUNANIfTIDataPreprocessor(DATA_DIR)

X, y = preprocessor.prepare_dataset(max_samples=40000, balance=True)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Detected subsets: ['subset0', 'subset1', 'subset3', 'subset4', 'subset2']
Indexed 445 scans
Loaded 1186 nodules
Loaded 754975 candidates


100%|██████████| 3114/3114 [07:46<00:00,  6.68it/s]



✓ Dataset: (1611, 64, 64, 64, 1), Pos: 817, Neg: 794
Train: (1127, 64, 64, 64, 1), Val: (242, 64, 64, 64, 1), Test: (242, 64, 64, 64, 1)


In [39]:
print("Train distribution:", np.bincount(y_train))
print("Val distribution:", np.bincount(y_val))


Train distribution: [555 572]
Val distribution: [119 123]


**MODEL A**

In [None]:
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import tensorflow as tf

# --- Simplified Residual Block without Attention ---
def residual_block_simple(x, filters):
    shortcut = x
    x = layers.Conv3D(filters, 3, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Conv3D(filters, 3, padding='same')(x)
    x = layers.BatchNormalization()(x)
    if shortcut.shape[-1] != filters:
        shortcut = layers.Conv3D(filters, 1, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
    x = layers.Add()([x, shortcut])
    x = layers.Activation('relu')(x)
    return x

# --- Simplified Light 3D ResNet ---
def build_light_resnet_simple(input_shape=(64,64,64,1)):
    inputs = layers.Input(shape=input_shape)
    x = layers.Conv3D(8, 3, padding='same')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)

    x = residual_block_simple(x, 8)
    x = layers.MaxPooling3D(2)(x)

    x = residual_block_simple(x, 16)
    x = layers.MaxPooling3D(2)(x)

    x = residual_block_simple(x, 32)
    x = layers.GlobalAveragePooling3D()(x)

    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    return Model(inputs, outputs)

# --- Focal Loss (optional, keep it or replace with BCE) ---
def focal_loss(gamma=2.0, alpha=0.25):
    def loss(y_true, y_pred):
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1-1e-7)
        ce = -y_true * tf.math.log(y_pred)
        weight = alpha * y_true * tf.pow(1 - y_pred, gamma)
        return tf.reduce_mean(tf.reduce_sum(weight * ce, axis=1))
    return loss

# --- Build and Compile ---
model = build_light_resnet_simple()
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),  # smaller LR for small dataset
    loss=focal_loss(),                           # or 'binary_crossentropy'
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc"),
             tf.keras.metrics.Precision(name="precision"),
             tf.keras.metrics.Recall(name="recall")]
)

# --- Callbacks ---
callbacks = [
    ModelCheckpoint('best_light_resnet.h5', monitor='val_auc', mode='max', save_best_only=True, verbose=1),
    EarlyStopping(monitor='val_auc', mode='max', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, min_lr=1e-6, verbose=1)
]

# --- Training ---
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=2,   # keep small for memory + stability
    callbacks=callbacks,
    verbose=1
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt


model = build_light_resnet_attention()
model.load_weights('best_resnet_attention_model.h5')
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=focal_loss(),
              metrics=["accuracy", tf.keras.metrics.AUC(name="auc"),
                       tf.keras.metrics.Precision(name="precision"),
                       tf.keras.metrics.Recall(name="recall")])

test_results = model.evaluate(X_test, y_test, verbose=0)
print("\nTEST RESULTS:")
print(f"Loss      : {test_results[0]:.4f}")
print(f"Accuracy  : {test_results[1]*100:.2f}%")
print(f"AUC       : {test_results[2]*100:.2f}%")
print(f"Precision : {test_results[3]*100:.2f}%")
print(f"Recall    : {test_results[4]*100:.2f}%")

y_pred_proba = model.predict(X_test, verbose=0).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)
print("\nCLASSIFICATION REPORT:\n", classification_report(y_test, y_pred, target_names=["Benign", "Malignant"]))
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1], [0,1], "--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – LUNA16")
plt.legend()
plt.grid(True)
plt.show()

**Model Performance Summary**

Test Results:

Loss: 0.0000 (indicates the model’s training objective is minimized, but this may be misleading due to the imbalanced prediction behavior)

Accuracy: 50.41% (the model is only slightly better than random guessing for two classes)

AUC (Area Under ROC Curve): 50.42% (essentially random performance; the model is unable to discriminate between the classes)

Precision: 50.41%

Recall: 100.00% (the model predicts all malignant cases correctly but fails to identify benign cases)

Classification Report:

Class	Precision	Recall	F1-score	Support
Benign	0.00	0.00	0.00	120
Malignant	0.50	1.00	0.67	122
Accuracy			0.50	242
Macro Avg	0.25	0.50	0.34	242
Weighted Avg	0.25	0.50	0.34	242

Observation: The model predicts all samples as Malignant, resulting in perfect recall for the Malignant class but zero recall and precision for the Benign class. This indicates a severe class imbalance in predictions, even if the dataset is balanced.

All 120 Benign samples were incorrectly predicted as Malignant. All 122 Malignant samples were correctly predicted.

Warnings and Notes:

UndefinedMetricWarning: Precision for the Benign class is ill-defined because the model never predicted that class. This is why it is set to 0.0.
tf.function retracing warning: The TensorFlow warning indicates that a function was repeatedly retraced, which can impact performance but does not affect model predictions.

Interpretation:

The model fails to learn to distinguish Benign samples and is biased toward predicting Malignant.
High recall for Malignant comes at the cost of completely missing Benign cases.
Accuracy and AUC are effectively random due to this biased behavior.

**HYPERPARAMETER TUNING USING MLFLOW**

In [None]:
!pip install -q mlflow

In [None]:
import mlflow
import mlflow.tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

def focal_loss(gamma=2.0, alpha=0.25):
    def loss(y_true, y_pred):
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)

        pos_loss = -alpha * y_true * tf.pow(1 - y_pred, gamma) * tf.math.log(y_pred)
        neg_loss = -(1 - alpha) * (1 - y_true) * tf.pow(y_pred, gamma) * tf.math.log(1 - y_pred)

        return tf.reduce_mean(pos_loss + neg_loss)
    return loss

# Simple 3D CNN (Baseline Model)
def build_3d_cnn(input_shape=(64, 64, 64, 1), dropout=0.3):
    inputs = layers.Input(shape=input_shape)

    x = layers.Conv3D(16, 3, padding='same', activation='relu')(inputs)
    x = layers.MaxPooling3D(2)(x)

    x = layers.Conv3D(32, 3, padding='same', activation='relu')(x)
    x = layers.MaxPooling3D(2)(x)

    x = layers.Conv3D(64, 3, padding='same', activation='relu')(x)
    x = layers.MaxPooling3D(2)(x)

    x = layers.GlobalAveragePooling3D()(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(dropout)(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)

    return Model(inputs, outputs)

model = build_3d_cnn()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss=focal_loss(gamma=2.0, alpha=0.25),
    metrics=[
        "accuracy",
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall")
    ]
)

model.summary()


callbacks = [
    ModelCheckpoint(
        "best_3d_cnn.h5",
        monitor="val_auc",
        mode="max",
        save_best_only=True,
        verbose=1
    ),
    EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=8,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=4,
        min_lr=1e-6,
        verbose=1
    )
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=8,
    callbacks=callbacks,
    verbose=1
)

**HYPERPARAMETER TUNING ON MODEL A**

In [None]:
mlflow.set_experiment("LUNA16_3D_CNN_Tuning")

def train_run(lr, loss_name, loss_fn, dropout):
    # Start MLflow run
    with mlflow.start_run():
        mlflow.log_param("learning_rate", lr)
        mlflow.log_param("loss", loss_name)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("batch_size", 2)
        mlflow.log_param("epochs", 50)

        model = build_light_resnet_attention(dropout=dropout)

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
            loss=loss_fn,
            metrics=[tf.keras.metrics.AUC(name="auc")]
        )

        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=25,
            batch_size=2,
            verbose=0
        )

        best_auc = max(history.history["val_auc"])
        mlflow.log_metric("best_val_auc", best_auc)

        return best_auc

In [None]:
losses = {
    "bce": "binary_crossentropy",
    "focal": focal_loss(gamma=2.0, alpha=0.25),
    "focal_soft": focal_loss(gamma=1.5, alpha=0.5)
}

for lr in [1e-3, 3e-4, 1e-4]:
    for loss_name, loss_fn in losses.items():
        for dr in [0.2, 0.3, 0.5]:
            auc = train_run(lr, loss_name, loss_fn, dr)
            print(lr, loss_name, dr, auc)

In [None]:
mlflow.search_runs(
    experiment_names=["LUNA16_3D_CNN_Tuning"],
    order_by=["metrics.best_val_auc DESC"]
)[["params.learning_rate", "params.loss", "params.dropout", "metrics.best_val_auc"]]


In [None]:
with mlflow.start_run(run_name="best_model"):
    mlflow.tensorflow.log_model(model, "model")

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

y_prob = model.predict(X_val, verbose=0).ravel()

print("Threshold | F1-score | Precision | Recall")
print("-" * 40)

for t in [0.3, 0.4, 0.5, 0.6, 0.7]:
    y_pred = (y_prob > t).astype(int)

    f1 = f1_score(y_val, y_pred, zero_division=0)
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall = recall_score(y_val, y_pred, zero_division=0)

    print(f"{t:9.2f} | {f1:8.4f} | {precision:9.4f} | {recall:6.4f}")

**Results after hyperparameter tuning:**

This shows that for Dropout 0.2 with Binary Cross Entropy Loss function at learning rate of 0.0003. The model gets better performance. And the threshold has no effect on the performance.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt


model = build_light_resnet_attention(dropout=0.2)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc"),
             tf.keras.metrics.Precision(name="precision"),
             tf.keras.metrics.Recall(name="recall")]
)

callbacks = [
    ModelCheckpoint('best_light_resnet_attention_model.h5', monitor='val_auc', mode='max', save_best_only=True, verbose=1),
    EarlyStopping(monitor='val_auc', mode='max', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, min_lr=1e-6, verbose=1)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=2,
    callbacks=callbacks,
    verbose=1
)

threshold = 0.5
y_prob = model.predict(X_val).ravel()
y_pred = (y_prob > threshold).astype(int)

f1 = f1_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)

print(f"\nValidation Results at Threshold={threshold}")
print(f"F1-score  : {f1:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"\nConfusion Matrix:\n{cm}")
print("\nClassification Report:\n", classification_report(y_val, y_pred, target_names=["Benign", "Malignant"]))

**Evaluation on Testing Set**

In [None]:
import tensorflow as tf
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Load your trained model
model = build_light_resnet_attention(dropout=0.2)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
    loss=focal_loss(),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc"),
             tf.keras.metrics.Precision(name="precision"),
             tf.keras.metrics.Recall(name="recall")]
)

# Load the best weights
model.load_weights('best_light_resnet_attention_model.h5')

# Predict probabilities on the test set
y_prob = model.predict(X_test).ravel()

# Choose a threshold for classification
threshold = 0.5
y_pred = (y_prob > threshold).astype(int)

# Compute metrics
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Print results
print(f"\nTest Results at Threshold={threshold}")
print(f"F1-score  : {f1:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"\nConfusion Matrix:\n{cm}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Benign", "Malignant"]))

**EXPORTING .pkl AND .h5 file for backend usage**

In [None]:
import pickle

print("\n" + "="*60)
print("EXPORTING MODEL")
print("="*60)

# Save complete model
model.save('nodule_classifier_resnet_attention.h5')
print("✓ Saved as nodule_classifier_resnet_attention.h5")

# Save preprocessing parameters
preprocessing_params = {
    'patch_size': 64,
    'min_bound': -1000.0,
    'max_bound': 400.0,
    'input_shape': (64, 64, 64, 1),
    'architecture': '3D ResNet + CBAM Attention'
}

with open('preprocessing_params.pkl', 'wb') as f:
    pickle.dump(preprocessing_params, f)
print("✓ Saved preprocessing parameters")

# Save metrics
metrics = {
    'accuracy': float(test_results[1]),
    'auc': float(test_results[2]),
    'precision': float(test_results[3]),
    'recall': float(test_results[4]),
    'architecture': 'Light ResNet + Attention'
}

with open('model_metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)
print("✓ Saved model metrics")

print("\n" + "="*60)
print("FILES READY FOR DOWNLOAD:")
print("="*60)
print("1. nodule_classifier_resnet_attention.h5 (~80 MB)")
print("2. preprocessing_params.pkl")
print("3. model_metrics.pkl")
print("4. training_results_resnet_attention.png")

In [None]:
from google.colab import files

print("\nDownloading files to your computer...")

try:
    files.download('nodule_classifier_resnet_attention.h5')
    print("✓ Downloaded model")
except:
    print("✗ Error downloading model")

try:
    files.download('preprocessing_params.pkl')
    print("✓ Downloaded preprocessing params")
except:
    print("✗ Error downloading params")

try:
    files.download('model_metrics.pkl')
    print("✓ Downloaded metrics")
except:
    print("✗ Error downloading metrics")

print("MODEL TRAINING COMPLETE! ")