# Setup

In [6]:
# Open Colab's file-upload prompt; the commands below expect a file named
# kaggle.json to have been uploaded into the current working directory.
from google.colab import files
files.upload()

# Move the uploaded kaggle.json into ~/.kaggle/
# (presumably where the kaggle CLI looks for its API token -- confirm against the Kaggle docs).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Restrict the token file to owner read/write only, since it holds credentials.
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset archive and extract it quietly into ./leukemia_data.
!kaggle datasets download -d andrewmvd/leukemia-classification
!unzip -q leukemia-classification.zip -d leukemia_data

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/andrewmvd/leukemia-classification
License(s): other


# Import Libraries
Import all necessary libraries, including TensorFlow, NumPy, Matplotlib, and Scikit-learn.

In [7]:
# Import Libraries
import os
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Load and Preprocess Dataset
Define a function to load and preprocess the dataset, including resizing images and normalizing pixel values.

In [12]:
# Load and Preprocess Dataset
# Size every image is resized to when it is loaded.
IMAGE_SIZE = (64, 64)

# Directory that holds the fold_0 / fold_1 / fold_2 training folders after unzipping.
DATASET_PATH = "/content/leukemia_data/C-NMC_Leukemia/training_data"

# Class sub-folder name -> integer label used as the classification target.
LABELS = dict(hem=0, all=1)

def load_dataset(dataset_path):
    """Load the grayscale training images and their integer labels.

    Walks the ``fold_0``, ``fold_1`` and ``fold_2`` sub-directories of
    ``dataset_path``. Inside each fold, only sub-folders whose name is a key
    of ``LABELS`` are read, and only files ending in ``.bmp``
    (case-insensitive) are loaded. Each image is resized to ``IMAGE_SIZE``,
    loaded as a single grayscale channel and divided by 255.0.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order. Without a stable sample order the seeded
    ``train_test_split`` calls later in the notebook would not yield the same
    split from one run or machine to the next.

    Args:
        dataset_path: Directory that contains the fold sub-directories.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked image arrays and the
        matching labels taken from ``LABELS``. Both arrays are empty when no
        fold directory or ``.bmp`` file is found (missing folds are skipped
        silently).
    """
    X, y = [], []
    for fold in ['fold_0', 'fold_1', 'fold_2']:
        fold_path = os.path.join(dataset_path, fold)
        # Skip folds that are absent (or are not directories) instead of failing.
        if not os.path.isdir(fold_path):
            continue
        # sorted(): fixed class-folder order regardless of the filesystem.
        for label_folder in sorted(os.listdir(fold_path)):
            if label_folder not in LABELS:
                continue
            folder_path = os.path.join(fold_path, label_folder)
            # sorted(): fixed file order so the resulting arrays are reproducible.
            for file in tqdm(sorted(os.listdir(folder_path)), desc=f"{fold}/{label_folder}"):
                if not file.lower().endswith(".bmp"):
                    continue
                img = load_img(os.path.join(folder_path, file),
                               target_size=IMAGE_SIZE,
                               color_mode='grayscale')
                img_array = img_to_array(img) / 255.0
                X.append(img_array)
                y.append(LABELS[label_folder])
    return np.array(X), np.array(y)

# Read every fold into memory as (images, labels).
X, y = load_dataset(DATASET_PATH)

# load_dataset skips missing folds silently, so a wrong DATASET_PATH or a failed
# download/unzip would otherwise only show up later as an obscure error.
if len(X) == 0:
    raise FileNotFoundError(
        f"No .bmp images found under {DATASET_PATH!r}; check the download/unzip step."
    )

print(f"Loaded {len(X)} images. Shape: {X.shape}")

fold_0/all: 100%|██████████| 2397/2397 [00:11<00:00, 205.56it/s]
fold_0/hem: 100%|██████████| 1130/1130 [00:05<00:00, 216.06it/s]
fold_1/all: 100%|██████████| 2418/2418 [00:11<00:00, 209.65it/s]
fold_1/hem: 100%|██████████| 1163/1163 [00:05<00:00, 196.82it/s]
fold_2/all: 100%|██████████| 2457/2457 [00:11<00:00, 209.12it/s]
fold_2/hem: 100%|██████████| 1096/1096 [00:05<00:00, 214.79it/s]


Loaded 10661 images. Shape: (10661, 64, 64, 1)


# Split Data into Train, Validation, and Test Sets
Use train_test_split to divide the dataset into training, validation, and test sets.

In [13]:
# Split Data into Train, Validation, and Test Sets
# Hold out 20% of all images for testing, then 20% of the remainder for
# validation (64% / 16% / 20% of the data overall).
# The loader output shows roughly twice as many 'all' images as 'hem' images,
# so both splits are stratified to keep that class ratio in every subset.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Split from the explicitly named intermediate arrays rather than rebinding
# X_train / y_train from themselves, so the lineage of each subset stays clear.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

# Data Augmentation
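Set up an `ImageDataGenerator` that applies random augmentations to the training data, plus a plain pass-through `ImageDataGenerator` (no augmentation) for the validation data.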

In [14]:
# Data Augmentation
# Training generator: small random rotations, shifts, zooms and horizontal
# flips; pixels exposed by a transform are filled from the nearest pixel.
datagen_train = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation generator: no augmentation, it only batches the images.
datagen_val = ImageDataGenerator()

# No .fit() call is needed on either generator: fit() only computes statistics
# for featurewise_center, featurewise_std_normalization and zca_whitening, none
# of which are enabled here, and calling it would just copy the image arrays.

# Build CNN Model
Define the CNN architecture using Keras, including convolutional, pooling, dropout, and dense layers, and compile it with the Adam optimizer and binary cross-entropy loss.

In [15]:
# Build CNN Model
# (filters, dropout rate) for each Conv -> MaxPool -> Dropout stage, in order.
conv_stages = ((32, 0.2), (64, 0.2), (64, 0.3))

inputs = Input(shape=(64, 64, 1))

# Convolutional feature extractor: every stage uses 3x3 kernels, ReLU,
# L2 weight regularisation and 2x2 max pooling.
x = inputs
for n_filters, drop_rate in conv_stages:
    x = Conv2D(n_filters, (3, 3), activation='relu', kernel_regularizer=l2(0.001))(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(drop_rate)(x)

# Classifier head. The 256-unit dense layer is named so that a later cell can
# look it up with model.get_layer('feature_layer') and reuse its activations.
x = Flatten()(x)
x = Dense(256, activation='relu', kernel_regularizer=l2(0.001), name='feature_layer')(x)
x = Dropout(0.4)(x)
outputs = Dense(1, activation='sigmoid')(x)

# Single sigmoid output trained with binary cross-entropy.
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN Model
Train the compiled CNN model on the augmented training data, monitoring the validation set and stopping early when the validation loss stops improving.

In [None]:
# Train CNN Model
# Stop once val_loss has not improved for 7 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Batch iterators: augmented batches for training, unmodified batches for validation.
train_batches = datagen_train.flow(X_train, y_train, batch_size=32)
val_batches = datagen_val.flow(X_val, y_val, batch_size=32)

history = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=50,
    callbacks=[early_stop],
    verbose=1,
)

Epoch 1/50


  self._warn_if_super_not_called()


214/214 ━━━━━━━━━━━━━━━━━━━━ 50s 220ms/step - accuracy: 0.7125 - loss: 0.7997 - val_accuracy: 0.7954 - val_loss: 0.5538
Epoch 2/50


# Feature Extraction for PCA and SVM
Extract features from the trained CNN model's intermediate layer for use in PCA and SVM.

In [None]:
# Feature Extraction for PCA and SVM
# Sub-model that shares the trained CNN's input but stops at the layer
# named 'feature_layer', so predict() returns that layer's activations.
feature_layer = model.get_layer('feature_layer')
feature_model = Model(inputs=model.input, outputs=feature_layer.output)

# Same extraction for both splits, evaluated in order: train first, then test.
features_train, features_test = (
    feature_model.predict(images) for images in (X_train, X_test)
)

# Apply PCA
Perform PCA on the extracted features to reduce dimensionality.

In [None]:
# Apply PCA
# Reduce the CNN features to 50 principal components.
# random_state pins the result for the case where scikit-learn picks its
# randomized SVD solver (the default 'auto' policy may do so depending on the
# data shape and library version); without it the projection, and therefore
# the SVM scores computed from it, can vary between runs.
pca = PCA(n_components=50, random_state=42)

# Fit the projection on the training features only, then apply the same
# projection to the test features so no test information leaks into it.
X_train_pca = pca.fit_transform(features_train)
X_test_pca = pca.transform(features_test)

# Train and Evaluate SVM Classifier
Train an SVM classifier on the PCA-transformed features and evaluate its performance on the test set.

In [None]:
# Train and Evaluate SVM Classifier
# RBF-kernel SVM trained on the PCA-reduced training features
# (fit() returns the estimator itself, so construction and fitting are chained).
svm = SVC(kernel='rbf', C=1.0).fit(X_train_pca, y_train)
y_pred = svm.predict(X_test_pca)

# Compute the reports first, then print them under their headings.
# target_names follows the label encoding: 0 -> 'hem', 1 -> 'all'.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['hem', 'all'])

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(report)

print(f"Test Accuracy (SVM on PCA):  {accuracy_score(y_test, y_pred) * 100:.2f}%")

# Plot Training and Validation Accuracy
Plot the training and validation accuracy over epochs to visualize model performance.

In [None]:
# Plot Training and Validation Accuracy
# One axes holding both per-epoch accuracy curves recorded by model.fit().
fig, ax = plt.subplots()
for metric_key, curve_label in (('accuracy', 'Train'), ('val_accuracy', 'Val')):
    ax.plot(history.history[metric_key], label=curve_label)

ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.set_title('CNN Training vs Validation Accuracy')
ax.legend()
ax.grid()
plt.show()