# Lymphoma Diagnosis in Histopathology Images

## Import Kaggle Data

In [None]:
# Fetch the 'obulisainaren/multi-cancer' dataset through kagglehub and keep
# whatever dataset_download returns (presumably the local dataset folder —
# confirm against the kagglehub documentation).
import kagglehub
obulisainaren_multi_cancer_path = kagglehub.dataset_download('obulisainaren/multi-cancer')

# Progress marker for the notebook output.
print('Data source import complete.')

## Path Setup

In [None]:
# Build the three class folders from the location reported by
# kagglehub.dataset_download instead of always assuming the Kaggle mount
# point, so the notebook can also run outside Kaggle.
import os

# NOTE(review): assumes the download root contains the 'Multi Cancer' folder,
# which is the layout implied by the original hard-coded paths. When that
# folder is not found there, fall back to the original Kaggle prefix so the
# previous behaviour is preserved.
_dataset_root = obulisainaren_multi_cancer_path
if not os.path.isdir(os.path.join(_dataset_root, "Multi Cancer")):
    _dataset_root = "/kaggle/input/multi-cancer"

_lymphoma_root = os.path.join(_dataset_root, "Multi Cancer", "Lymphoma")

data_path_cll = os.path.join(_lymphoma_root, "lymph_cll")
data_path_fl = os.path.join(_lymphoma_root, "lymph_fl")
data_path_mcl = os.path.join(_lymphoma_root, "lymph_mcl")

## Loading Data

In [None]:
import os
import cv2
import numpy as np

def load_images_with_labels(folder_path: str, label: int, img_size: tuple = (128, 128)) -> tuple[np.ndarray, np.ndarray]:
    """Load every readable image in a folder and tag each one with ``label``.

    Entries are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the array order (and therefore any
    seeded split performed on it later) could differ between runs or machines.

    Args:
        folder_path: Directory whose entries are handed to ``cv2.imread``.
        label: Class label recorded once per successfully loaded image.
        img_size: Target size passed to ``cv2.resize``; OpenCV reads it as
            ``(width, height)``.

    Returns:
        A pair ``(images, labels)``. ``images`` holds the loaded pictures
        converted from BGR to RGB, resized and divided by 255.0; ``labels``
        holds ``label`` repeated once per loaded image. Entries that
        ``cv2.imread`` cannot decode are reported and skipped.
    """
    images = []

    for filename in sorted(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)
        print(f"Loading image: {img_path}")

        img = cv2.imread(img_path)

        # cv2.imread signals an unreadable entry by returning None.
        if img is None:
            print(f"Error loading image: {img_path}")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(cv2.resize(img, img_size))

    # One label per image that actually loaded; np.full keeps the label dtype
    # even when no image was loaded at all.
    labels = np.full(len(images), label)

    return np.array(images) / 255.0, labels

# Display name for each integer class label used throughout the notebook.
labels_dict = {
    0: 'Chronic Lymphocytic Leukemia',
    1: 'Follicular Lymphoma',
    2: 'Mantle Cell Lymphoma',
}

# Each call yields an (images, labels) pair for one class folder.
lymph_cll = load_images_with_labels(data_path_cll, 0)
lymph_fl = load_images_with_labels(data_path_fl, 1)
lymph_mcl = load_images_with_labels(data_path_mcl, 2)

# Split every pair into its image array and its label array.
lymph_cll_images, lymph_cll_labels = lymph_cll
lymph_fl_images, lymph_fl_labels = lymph_fl
lymph_mcl_images, lymph_mcl_labels = lymph_mcl

## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of every class as its test portion. Splitting class by class
# gives each class the same train/test ratio.
(X_train_validate_cll, X_test_cll,
 y_train_validate_cll, y_test_cll) = train_test_split(
    lymph_cll_images, lymph_cll_labels, test_size=0.2, random_state=42
)
(X_train_validate_fl, X_test_fl,
 y_train_validate_fl, y_test_fl) = train_test_split(
    lymph_fl_images, lymph_fl_labels, test_size=0.2, random_state=42
)
(X_train_validate_mcl, X_test_mcl,
 y_train_validate_mcl, y_test_mcl) = train_test_split(
    lymph_mcl_images, lymph_mcl_labels, test_size=0.2, random_state=42
)

## Train and Validate Split

In [None]:
# Carve a validation portion (20% of what is left after the test hold-out)
# out of every class, again class by class.
(X_train_cll, X_val_cll,
 y_train_cll, y_val_cll) = train_test_split(
    X_train_validate_cll, y_train_validate_cll, test_size=0.2, random_state=42
)
(X_train_fl, X_val_fl,
 y_train_fl, y_val_fl) = train_test_split(
    X_train_validate_fl, y_train_validate_fl, test_size=0.2, random_state=42
)
(X_train_mcl, X_val_mcl,
 y_train_mcl, y_val_mcl) = train_test_split(
    X_train_validate_mcl, y_train_validate_mcl, test_size=0.2, random_state=42
)

## Concatenate Data

In [None]:
# Stack the per-class pieces (CLL, FL, MCL — in that order) along the first
# axis to obtain one array per split.
X_train = np.concatenate([X_train_cll, X_train_fl, X_train_mcl])
X_test = np.concatenate([X_test_cll, X_test_fl, X_test_mcl])
X_val = np.concatenate([X_val_cll, X_val_fl, X_val_mcl])

# Labels are stacked in the same class order so they stay aligned with the images.
y_train = np.concatenate([y_train_cll, y_train_fl, y_train_mcl])
y_test = np.concatenate([y_test_cll, y_test_fl, y_test_mcl])
y_val = np.concatenate([y_val_cll, y_val_fl, y_val_mcl])

## Shapes Check

In [None]:
# Sanity check: print the image-array and label-array shapes of every split
# so mismatched lengths are visible before training.
print(f'X_train : {X_train.shape} ,  y_train :  {y_train.shape}')
print(f'X_val   : {X_val.shape} ,  y_val   :  {y_val.shape}  ')
print(f'X_test  : {X_test.shape} ,  y_test  :  {y_test.shape} ')

## Shuffle

In [None]:
# The concatenated splits are ordered class by class, so each one is shuffled
# before use. A seeded generator makes the order reproducible, consistent
# with the random_state=42 used for the train/validation/test splits.
shuffle_rng = np.random.default_rng(42)

# One random ordering of row indices per split.
indices_train = shuffle_rng.permutation(X_train.shape[0])
indices_val = shuffle_rng.permutation(X_val.shape[0])
indices_test = shuffle_rng.permutation(X_test.shape[0])

# Apply the same ordering to images and labels so the pairs stay aligned.
X_train_shuffled = X_train[indices_train]
y_train_shuffled = y_train[indices_train]

X_val_shuffled = X_val[indices_val]
y_val_shuffled = y_val[indices_val]

X_test_shuffled = X_test[indices_test]
y_test_shuffled = y_test[indices_test]

# Indexing with an index array produced copies; drop the names of all
# intermediate arrays so their memory can be reclaimed.
del X_train, X_val, X_test, y_train, y_test, y_val, X_train_cll, X_val_cll, y_train_cll, y_val_cll, X_train_fl,  X_val_fl,  y_train_fl,  y_val_fl, X_train_mcl, X_val_mcl, y_train_mcl, y_val_mcl, X_train_validate_mcl, X_test_mcl, y_train_validate_mcl, y_test_mcl, X_train_validate_fl,  X_test_fl,  y_train_validate_fl,  y_test_fl,X_train_validate_cll, X_test_cll, y_train_validate_cll, y_test_cll
del lymph_cll_images, lymph_cll_labels, lymph_fl_images,  lymph_fl_labels, lymph_mcl_images, lymph_mcl_labels, lymph_cll, lymph_fl, lymph_mcl

## Visualize Class Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Replace each integer training label with its display name so the count
# plot shows readable categories.
y_mapped = [labels_dict[class_id] for class_id in y_train_shuffled]

plt.figure(figsize=(10, 6))
# countplot draws on the current axes and returns them, so the title can be
# set directly on the result.
sns.countplot(x=y_mapped).set_title('Class Distribution')
plt.show()

## Display Sample Images

In [None]:
def visualize_images(images, labels, class_names=None, num_samples=4):
    """Show the first ``num_samples`` images in one row, each titled with its class.

    Args:
        images: Indexable collection of images accepted by ``plt.imshow``.
        labels: Indexable collection of labels; each must be a key of the
            module-level ``labels_dict``.
        class_names: Optional override for the titles. A dict keyed by the
            display name from ``labels_dict`` is honoured as before; a dict
            keyed by label, or a sequence indexed by integer label, is also
            accepted.
        num_samples: Number of images requested. Fewer are shown when less
            data is supplied; nothing is drawn when there is none.
    """
    # Never index past the data that was actually provided.
    num_samples = min(num_samples, len(images), len(labels))
    if num_samples <= 0:
        return

    plt.figure(figsize=(16, 16))

    for i in range(num_samples):
        label = labels[i]
        display_name = labels_dict[label]

        if not class_names:
            title = f"Label: {display_name}"
        elif isinstance(class_names, dict) and display_name in class_names:
            # Original lookup: class_names keyed by the display name.
            title = class_names[display_name]
        else:
            # class_names keyed/indexed by the label itself (e.g. a list).
            title = class_names[label]

        plt.subplot(1, num_samples, i + 1)
        plt.imshow(images[i])
        plt.title(title)
        plt.axis('off')

    plt.show()

visualize_images(X_train_shuffled, y_train_shuffled)

## Explore Color Channels

In [None]:
def plot_color_channels(img):
    """Show an image next to a grayscale rendering of each of its three channels.

    Args:
        img: Image array indexed as ``img[:, :, channel]`` with channels in
            Red, Green, Blue order.
    """
    # (pixels, title, colormap) for each of the four panels, left to right.
    panels = [(img, 'Original Image', None)]
    for idx, name in enumerate(['Red', 'Green', 'Blue']):
        panels.append((img[:, :, idx], f'{name} Channel', 'gray'))

    plt.figure(figsize=(12, 4))
    for position, (pixels, title, cmap) in enumerate(panels, start=1):
        plt.subplot(1, 4, position)
        plt.imshow(pixels, cmap=cmap)
        plt.title(title)
        plt.axis('off')

    plt.show()

# Pick one training image at random; later cells reuse random_image.
random_index = np.random.randint(len(X_train_shuffled))
random_image = X_train_shuffled[random_index]
plot_color_channels(random_image)

## Pixel Intensity Distribution

In [None]:
def plot_pixel_intensity_distribution(img):
    """Show an image beside a 256-bin histogram of all of its pixel values.

    Args:
        img: Image array; every value (all channels) goes into the histogram.
    """
    fig = plt.figure(figsize=(12, 4))

    # Left panel: the image itself.
    image_ax = fig.add_subplot(1, 2, 1)
    image_ax.imshow(img)
    image_ax.set_title('Original Image')
    image_ax.axis('off')

    # Right panel: histogram over the flattened pixel values.
    flat_pixels = img.ravel()
    hist_ax = fig.add_subplot(1, 2, 2)
    hist_ax.hist(flat_pixels, bins=256, color='gray', histtype='step')
    hist_ax.set_title('Pixel Intensity Distribution')
    hist_ax.set_xlabel('Pixel Intensity')
    hist_ax.set_ylabel('Frequency')

    plt.show()

plot_pixel_intensity_distribution(random_image)

## Average Pixel Intensity per Class

In [None]:
def average_pixel_intensity_per_class(X, y):
    """Draw a bar chart of the mean pixel value of the images of each class.

    Args:
        X: Image array indexed by sample along the first axis.
        y: Label array aligned with ``X``; its distinct values define the bars.
    """
    class_ids = np.unique(y)

    # For each class, average over every pixel of every image carrying that label.
    mean_by_class = [
        np.mean(X[np.nonzero(y == class_id)[0]])
        for class_id in class_ids
    ]

    plt.bar(class_ids, mean_by_class)
    plt.title('Average Pixel Intensity per Class')
    plt.xlabel('Class')
    plt.ylabel('Average Pixel Intensity')
    plt.show()

average_pixel_intensity_per_class(X_train_shuffled, y_train_shuffled)

## Correlation Between Channels

In [None]:
def plot_channel_correlation(img):
    """Show an image and scatter plots of each colour channel against the next.

    The three scatter panels pair the channels cyclically: Red vs Green,
    Green vs Blue and Blue vs Red. Titles and axis labels are derived from
    the channels actually plotted, so they cannot drift from the data.

    Args:
        img: Image array indexed as ``img[:, :, channel]`` with channels in
            Red, Green, Blue order.
    """
    channel_names = ('Red', 'Green', 'Blue')

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 4, 1)
    plt.imshow(img)
    plt.title('Original Image')
    plt.axis('off')

    for first in range(3):
        # Wrap around so the last channel (Blue) is paired with the first (Red).
        second = (first + 1) % 3
        x_name, y_name = channel_names[first], channel_names[second]

        plt.subplot(1, 4, first + 2)
        plt.scatter(img[:, :, first].ravel(), img[:, :, second].ravel(), s=2, alpha=0.5)
        plt.title(f'Correlation: {x_name} vs {y_name}')
        plt.xlabel(f'{x_name} Channel')
        plt.ylabel(f'{y_name} Channel')

    plt.show()

plot_channel_correlation(random_image)

## Load Pre-Trained ResNet-50

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import to_categorical

# ResNet-50 with ImageNet weights and without its classification head,
# configured for 128x128 three-channel inputs.
base_resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(128, 128, 3),
)

# Freeze every layer of the backbone so only layers added on top of it are trained.
for layer in base_resnet.layers:
    layer.trainable = False

## Build Model

In [None]:
# Classifier = input preprocessing + frozen ResNet-50 backbone + small dense head.
#
# The images fed to this model were scaled to [0, 1] when they were loaded,
# while the ImageNet ResNet-50 weights expect inputs passed through
# resnet50.preprocess_input applied to 0-255 RGB pixels. The first two layers
# restore that expectation inside the model, so training, validation and
# prediction all receive identical preprocessing without touching the arrays.
#
# NOTE: this variable reuses the name of the imported ResNet50 class, so
# ResNet50(...) can no longer be called to build a backbone once this cell has
# run. The name is kept because the compile/fit/predict cells refer to it.
ResNet50 = models.Sequential([
    layers.Input(shape=(128, 128, 3)),
    layers.Rescaling(255.0),  # [0, 1] -> [0, 255]
    layers.Lambda(tf.keras.applications.resnet50.preprocess_input),
    base_resnet,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax'),  # one output per lymphoma class
])

## One Hot Encoding

In [None]:
# Turn the integer labels of every split into 3-way one-hot vectors, the
# target format required by the categorical cross-entropy loss below.
y_train_one_hot = to_categorical(y_train_shuffled, num_classes=3)
y_val_one_hot = to_categorical(y_val_shuffled, num_classes=3)
y_test_one_hot = to_categorical(y_test_shuffled, num_classes=3)

ResNet50.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Train

In [None]:
# Train for 10 epochs in batches of 32, evaluating on the validation split
# after each epoch; the returned history feeds the learning-curve plot.
res_net_history = ResNet50.fit(
    X_train_shuffled,
    y_train_one_hot,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_shuffled, y_val_one_hot),
)

## Plot Learning Curve

In [None]:
def plot_learning_curves(history):
    """Plot training and validation curves for accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values.
    """
    curves = history.history

    # (training key, validation key, panel title, y-axis label) per panel.
    panels = (
        ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model loss', 'Loss'),
    )

    plt.figure(figsize=(12, 6))
    for position, (train_key, val_key, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend(['Train', 'Validate'], loc='upper left')

    plt.tight_layout()
    plt.show()

plot_learning_curves(res_net_history)

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class = index of the highest softmax output; true class = index
# of the 1 in the one-hot test label.
y_pred_pretrained = ResNet50.predict(X_test_shuffled)
y_pred_classes_pretrained = np.argmax(y_pred_pretrained, axis=1)
y_test_classes_pretrained = np.argmax(y_test_one_hot, axis=1)

# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns
# are predicted classes, matching the axis labels used in the plot below.
# labels=[0, 1, 2] keeps the matrix 3x3 even if a class never occurs, so it
# always lines up with the three tick labels.
confusion_mtx_resnet = confusion_matrix(
    y_test_classes_pretrained,
    y_pred_classes_pretrained,
    labels=[0, 1, 2],
)
print("Confusion Matrix:")
print(confusion_mtx_resnet)

class_tick_labels = [labels_dict[0], labels_dict[1], labels_dict[2]]

plt.figure(figsize=(10, 10))
plt.imshow(confusion_mtx_resnet, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Using Pretrained ResNet-50')
plt.colorbar()
tick_marks = np.arange(3)
plt.xticks(tick_marks, class_tick_labels)
plt.yticks(tick_marks, class_tick_labels)
plt.xlabel('Predicted')  # matrix columns
plt.ylabel('Actual')     # matrix rows
plt.show()

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split (true labels first,
# predictions second).
class_report_resnet = classification_report(
    y_true=y_test_classes_pretrained,
    y_pred=y_pred_classes_pretrained,
)
print("Classification Report Using Pretrained ResNet-50:")
print(class_report_resnet)