## Loading the Data

- Import the data from Kaggle
- Use PyTorch data loaders for efficient and flexible data loading

- Plot basic visualisations

In [None]:
# Upload kaggle.json and configure Kaggle API
from google.colab import files
import os, shutil, glob

# Keep the return value in a variable and never display it: the returned dict
# maps the file name to the raw bytes of kaggle.json, i.e. the API key itself.
uploaded = files.upload()  # Upload kaggle.json
if "kaggle.json" not in uploaded:
    raise FileNotFoundError("Expected an uploaded file named 'kaggle.json'")
del uploaded  # do not keep the credential bytes around in the kernel namespace

os.makedirs("/root/.kaggle", exist_ok=True)
shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
# The mode must be an octal literal: decimal 600 equals 0o1130, which is not
# the owner-only read/write permission (rw-------) intended here.
os.chmod("/root/.kaggle/kaggle.json", 0o600)

# Download BreaKHis dataset
import kagglehub
path = kagglehub.dataset_download("ambarish/breakhis")

# Copy the image tree to a fixed working location; skipped when the
# destination already exists so re-running the cell is cheap.
dataset_images_path = os.path.join(path, "BreaKHis_v1", "BreaKHis_v1", "histology_slides", "breast")
destination = "/content/breakhis_dataset"
if not os.path.exists(destination):
    shutil.copytree(dataset_images_path, destination)

# Check dataset structure
print("Dataset available at:", destination)
print("Top-level folders:", os.listdir(destination))

# Count images by class (recursive search below each class folder)
benign_images = glob.glob(os.path.join(destination, "benign", "**", "*.png"), recursive=True)
malignant_images = glob.glob(os.path.join(destination, "malignant", "**", "*.png"), recursive=True)

# Count images by magnification (folders named 40X/100X/200X/400X at any depth)
magnifications = ["40X", "100X", "200X", "400X"]
image_counts = {mag: len(glob.glob(os.path.join(destination, "**", mag, "*.png"), recursive=True)) for mag in magnifications}

print(f"Benign images: {len(benign_images)}")
print(f"Malignant images: {len(malignant_images)}")
print(f"Total images: {len(benign_images) + len(malignant_images)}")
print("Image counts by magnification:", image_counts)

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Visualisations

# Bar chart: how many images each class contains
plt.figure(figsize=(6, 4))
class_totals = [len(benign_images), len(malignant_images)]
sns.barplot(x=["Benign", "Malignant"], y=class_totals)
plt.title("Class Distribution")
plt.ylabel("Number of Images")
plt.show()

# Bar chart: how many images exist per magnification level
plt.figure(figsize=(6, 4))
mag_names = list(image_counts.keys())
mag_totals = list(image_counts.values())
sns.barplot(x=mag_names, y=mag_totals)
plt.title("Image Count by Magnification")
plt.ylabel("Number of Images")
plt.xlabel("Magnification")
plt.show()

# Grid of example images: one row per class, one column per magnification
fig, axes = plt.subplots(2, 4, figsize=(14, 6))
sample_images = []

for col, mag in enumerate(magnifications):
    # First match returned by glob for each class at this magnification
    pair = tuple(
        glob.glob(os.path.join(destination, class_dir, "**", mag, "*.png"), recursive=True)[0]
        for class_dir in ("benign", "malignant")
    )
    sample_images.append(pair)

    for row, (class_title, img_file) in enumerate(zip(("Benign", "Malignant"), pair)):
        ax = axes[row, col]
        ax.imshow(Image.open(img_file))
        ax.set_title(f"{class_title} - {mag}")
        ax.axis("off")

plt.suptitle("Sample Images by Class and Magnification", fontsize=16)
plt.tight_layout()
plt.show()

### Preprocessing

In [None]:
import os
import glob
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Root folder that holds the copied BreaKHis image tree
dataset_path = "/content/breakhis_dataset"

# Preprocessing / augmentation pipeline applied to each PIL image
IMG_SIZE = 224
_target_size = (IMG_SIZE, IMG_SIZE)
_pipeline_steps = [
    transforms.Resize(_target_size),          # fixed square input size
    transforms.RandomHorizontalFlip(),        # random augmentation
    transforms.RandomRotation(10),            # random augmentation
    transforms.ToTensor(),                    # PIL image -> float tensor
    # (x - 0.5) / 0.5 per channel
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = transforms.Compose(_pipeline_steps)


# Create dataset DataFrame
#
# Expected layout below dataset_path:
#   <class>/SOB/<subtype>/<sample>/<magnification>/<image>.png
# One row is produced per image. Paths are sorted so the row order (and thus
# any seeded split done on `df` later) does not depend on the arbitrary order
# in which the filesystem lists directory entries.
data = []

for class_name in ["benign", "malignant"]:
    pattern = os.path.join(dataset_path, class_name, "SOB", "*", "*", "*", "*.png")

    for img_path in sorted(glob.glob(pattern)):
        # The magnification is the name of the folder that directly holds the image.
        mag = os.path.basename(os.path.dirname(img_path))

        if mag in ("40X", "100X", "200X", "400X"):
            data.append([img_path, class_name, mag])

df = pd.DataFrame(data, columns=["image_path", "label", "magnification"])

# Fail early with a clear message instead of an obscure DataLoader error later.
if df.empty:
    raise FileNotFoundError(f"No BreaKHis images found below {dataset_path!r}")

# Convert labels to numeric (0 = benign, 1 = malignant)
df["label"] = df["label"].map({"benign": 0, "malignant": 1})

# Define PyTorch Dataset
class BreakHisDataset(Dataset):
    """Map-style dataset that reads images listed in a DataFrame.

    Args:
        dataframe: frame with at least an ``image_path`` column (file path
            passed to ``Image.open``) and a ``label`` column.
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        Rows are addressed positionally (``iloc``), so the frame's index does
        not need to be contiguous.
        """
        # Single positional lookup instead of one per column.
        row = self.dataframe.iloc[idx]
        img_path = row["image_path"]
        label = row["label"]

        # convert() returns a new, fully loaded image, so the file handle can
        # be closed right away instead of being left open until garbage
        # collection.
        with Image.open(img_path) as source_image:
            image = source_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Wrap the full DataFrame in a Dataset and serve it in shuffled batches of 32
dataset = BreakHisDataset(df, transform=transform)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

# Smoke test: draw one batch and report the tensor shapes
first_batch = next(iter(dataloader))
sample_images, sample_labels = first_batch
print("Batch Image Shape:", sample_images.shape)  # expected: (32, 3, 224, 224)
print("Batch Label Shape:", sample_labels.shape)  # expected: (32,)


Batch Image Shape: torch.Size([32, 3, 224, 224])
Batch Label Shape: torch.Size([32])


### Train-test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset (80% train, 20% test, stratified)
# NOTE(review): the split is per image row. Images from the same sample folder
# can therefore end up in both train and test; a grouped split on the sample
# folder would avoid that kind of leakage - confirm whether that is acceptable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

# Deterministic preprocessing for evaluation: same resize / tensor conversion /
# normalisation as `transform`, but without the random flip and rotation, so
# the test set is identical on every pass and metrics are repeatable.
import torchvision.transforms as transforms
eval_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Create PyTorch datasets for train (augmented) and test (not augmented)
train_dataset = BreakHisDataset(train_df, transform=transform)
test_dataset = BreakHisDataset(test_df, transform=eval_transform)

# Create DataLoaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Check data shapes
train_images, train_labels = next(iter(train_loader))
print("Train Batch Image Shape:", train_images.shape)  # (32, 3, 224, 224)
print("Train Batch Label Shape:", train_labels.shape)  # (32,)

test_images, test_labels = next(iter(test_loader))
print("Test Batch Image Shape:", test_images.shape)  # (32, 3, 224, 224)
print("Test Batch Label Shape:", test_labels.shape)  # (32,)


Train Batch Image Shape: torch.Size([32, 3, 224, 224])
Train Batch Label Shape: torch.Size([32])
Test Batch Image Shape: torch.Size([32, 3, 224, 224])
Test Batch Label Shape: torch.Size([32])


### ResNet18 with LR Scheduler

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torchvision.models as models
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

# Load model: ImageNet-pretrained ResNet-18 whose final layer is replaced by a
# single-logit head for the binary benign/malignant task.
# `weights=` replaces the deprecated `pretrained=True` flag.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(model)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# The deprecated `verbose` flag is dropped; the current learning rate is
# printed explicitly after each scheduler step instead.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

# Training loop
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device, dtype=torch.float32)
        optimizer.zero_grad()
        # squeeze(1) drops only the logit dimension: (B, 1) -> (B,). A bare
        # squeeze() would also drop the batch dimension when B == 1 and the
        # output would no longer match the label shape.
        outputs = model(images).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_train_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}")

    # Validation phase
    # NOTE(review): test_loader doubles as the validation set here, so the
    # scheduler is steered by the same data the final accuracy is reported on.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(test_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    scheduler.step(avg_val_loss)
    print(f"Learning rate: {optimizer.param_groups[0]['lr']:.2e}")

# Evaluate model
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float32)
        outputs = model(images).squeeze(1)
        # sigmoid(logit) > 0.5 -> predicted class 1 (malignant)
        preds = torch.sigmoid(outputs) > 0.5
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 183MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

100%|██████████| 198/198 [02:14<00:00,  1.48it/s]


Epoch [1/15], Loss: 0.3469
Validation Loss: 0.3758


100%|██████████| 198/198 [02:02<00:00,  1.62it/s]


Epoch [2/15], Loss: 0.2474
Validation Loss: 0.2687


 39%|███▉      | 77/198 [00:45<01:11,  1.68it/s]


KeyboardInterrupt: 

### Save Model

In [None]:
# Save model (weights only, as a state_dict)
torch.save(model.state_dict(), "breakhis_resnet18.pth")
print("Model saved successfully!")

# Load model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can also be restored on a CPU-only one.
model.load_state_dict(torch.load("breakhis_resnet18.pth", map_location=device))
model = model.to(device)
model.eval()  # inference mode for any evaluation that follows
print("Model loaded successfully!")

### ResNet18 with Bayesian Hyperparameter Optimization (Optuna)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import optuna
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bayesian optimization using Optuna
def objective(trial):
    """Train a fresh ResNet-18 for a few epochs and return its accuracy.

    Args:
        trial: Optuna trial used to sample the learning rate (log-uniform in
            [1e-4, 1e-2]) and the optimizer type ("adam" or "sgd").

    Returns:
        Accuracy on ``test_loader`` after 3 training epochs.

    NOTE(review): the score is computed on ``test_loader``, so the selected
    hyperparameters are tuned on the data later used for reporting.
    """
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "sgd"])

    # Define model (ResNet-18) with a single-logit head; the weights enum is
    # used instead of its string spelling.
    model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1)
    model = model.to(device)

    if optimizer_name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    # Training loop (reduced for optimization)
    for epoch in range(3):  # Only a few epochs for faster tuning
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            # squeeze(1): (B, 1) -> (B,). A bare squeeze() would also drop
            # the batch dimension for a batch of size 1.
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            outputs = model(images).squeeze(1)
            preds = torch.sigmoid(outputs) > 0.5
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, predictions)
    return acc

# Run Bayesian Optimization
# A seeded TPE sampler makes the sequence of suggested hyperparameters
# repeatable across runs (the training inside each trial is still stochastic).
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)


[I 2025-03-28 11:18:01,844] A new study created in memory with name: no-name-53ebc210-52eb-4ca1-911e-69350c31df0a
[I 2025-03-28 11:23:19,033] Trial 0 finished with value: 0.7831858407079646 and parameters: {'lr': 0.007065931575857947, 'optimizer': 'adam'}. Best is trial 0 with value: 0.7831858407079646.
[I 2025-03-28 11:28:35,427] Trial 1 finished with value: 0.9443742098609356 and parameters: {'lr': 0.0038920039566723505, 'optimizer': 'sgd'}. Best is trial 1 with value: 0.9443742098609356.
[I 2025-03-28 11:33:49,010] Trial 2 finished with value: 0.9399494310998736 and parameters: {'lr': 0.008476718402402647, 'optimizer': 'sgd'}. Best is trial 1 with value: 0.9443742098609356.
[I 2025-03-28 11:39:03,742] Trial 3 finished with value: 0.9279393173198482 and parameters: {'lr': 0.0002675562986650459, 'optimizer': 'adam'}. Best is trial 1 with value: 0.9443742098609356.
[I 2025-03-28 11:44:23,244] Trial 4 finished with value: 0.961441213653603 and parameters: {'lr': 0.006751435887532834, 'o

Best parameters: {'lr': 0.006751435887532834, 'optimizer': 'sgd'}
Best accuracy: 0.961441213653603


### Resnet18 with grid search hyperparameter tuning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torchvision.models as models
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

# Load model
def create_model():
    """Return a pretrained ResNet-18 with a single-logit head, moved to `device`.

    Uses the `weights=` argument instead of the deprecated `pretrained=True`
    flag. `device` is read from the notebook's global scope at call time.
    """
    model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1)
    return model.to(device)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define loss function
criterion = nn.BCEWithLogitsLoss()

# Define parameter grid
param_grid = {
    "lr": [1e-4, 1e-3, 1e-2],
    "optimizer": ["adam", "sgd"],
}

best_acc = 0.0
best_params = None

# Perform grid search: every (lr, optimizer) pair trains a fresh model.
# NOTE(review): candidates are scored on test_loader, so the winner is
# selected on the same data the final metrics are reported on.
for lr, optimizer_name in itertools.product(param_grid["lr"], param_grid["optimizer"]):
    model = create_model()
    if optimizer_name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # No LR scheduler here: the previous ReduceLROnPlateau instance was never
    # stepped, so it had no effect on training.

    # Training loop
    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            # squeeze(1): (B, 1) -> (B,). A bare squeeze() would also drop
            # the batch dimension for a batch of size 1.
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Validation phase
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            outputs = model(images).squeeze(1)
            preds = torch.sigmoid(outputs) > 0.5
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, predictions)
    print(f"LR: {lr}, Optimizer: {optimizer_name}, Accuracy: {acc:.4f}")

    # Always record the first candidate so best_params can never stay None,
    # even if every configuration scores 0.0.
    if best_params is None or acc > best_acc:
        best_acc = acc
        best_params = {"lr": lr, "optimizer": optimizer_name}

print("Best parameters:", best_params)
print("Best accuracy:", best_acc)

# Retrain a fresh model from the pretrained weights using the best
# hyperparameters found by the grid search.
model = create_model()
if best_params["optimizer"] == "adam":
    optimizer = optim.Adam(model.parameters(), lr=best_params["lr"])
else:
    optimizer = optim.SGD(model.parameters(), lr=best_params["lr"], momentum=0.9)

# Final training with best parameters
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device, dtype=torch.float32)
        optimizer.zero_grad()
        # squeeze(1): (B, 1) -> (B,). A bare squeeze() would also drop the
        # batch dimension for a batch of size 1 and break the loss call.
        outputs = model(images).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    avg_train_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}")

# Final evaluation
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for images, labels in test_loader:
        # Labels are only compared on the CPU, so only the images are moved.
        images = images.to(device)
        # squeeze(1): (B, 1) -> (B,). A bare squeeze() would yield a 0-dim
        # tensor for a batch of size 1, which extend() cannot iterate.
        outputs = model(images).squeeze(1)
        # Integer 0/1 predictions so both label lists share one dtype.
        preds = (torch.sigmoid(outputs) > 0.5).long()
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.long().cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
# Fix the label order so rows/columns always line up with the
# ["Benign", "Malignant"] tick labels used in the heatmap below.
conf_matrix = confusion_matrix(true_labels, predictions, labels=[0, 1])

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(true_labels, predictions))

# Plot confusion matrix (rows: true label, columns: predicted label)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Benign", "Malignant"], yticklabels=["Benign", "Malignant"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()
