#### Baseline Model 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
def extract_features(img, size=(27, 27)):
    """Turn an image into a flat vector of grayscale pixel values.

    Args:
        img: Image object exposing ``convert`` and ``resize`` (the loop that
            calls this passes the PIL images stored in the 'Image' column).
        size: Target size handed to ``resize``; defaults to (27, 27).

    Returns:
        1-D NumPy array holding every pixel of the resized grayscale image.
    """
    grayscale = img.convert("L")       # "L" = single-channel luminance
    thumbnail = grayscale.resize(size)
    pixels = np.array(thumbnail)
    return pixels.flatten()

# Prepare features (flattened images) and labels.
# The columns are read directly instead of walking the frame with iterrows():
# iterrows() builds a Series per row (slow) and does not preserve column dtypes.
X = np.array([extract_features(img) for img in labels_main['Image']])  # one row per image
y_cancerous = labels_main["isCancerous"].to_numpy()   # Target: isCancerous
y_celltype = labels_main["cellTypeName"].to_numpy()   # Target: cellTypeName

# Train-Test Split (80% training, 20% test data).
# A single call keeps features and both label vectors aligned; the split is
# stratified on the binary isCancerous label only.
X_train, X_test, y_c_train, y_c_test, y_t_train, y_t_test = train_test_split(
    X, y_cancerous, y_celltype, test_size=0.2, random_state=42, stratify=y_cancerous
)

In [None]:
# Binary baseline: 100-tree Random Forest predicting isCancerous from raw pixels
rf_cancerous = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cancerous.fit(X_train, y_c_train)

# Score the held-out test split
y_c_pred = rf_cancerous.predict(X_test)
cancerous_accuracy = accuracy_score(y_c_test, y_c_pred)
cancerous_report = classification_report(y_c_test, y_c_pred)

print("Cancerous Classification (isCancerous) Results:")
print("Accuracy: ", cancerous_accuracy)
print("Classification Report:")
print(cancerous_report)

# Kept for the confusion-matrix plot further down
cm_cancerous = confusion_matrix(y_c_test, y_c_pred)

In [None]:
# Encode the cell type labels to numeric values.
# The encoder is fitted on the training labels only and reused for the test labels.
le = LabelEncoder()
y_t_train_encoded = le.fit_transform(y_t_train)
y_t_test_encoded = le.transform(y_t_test)

# Train Random Forest for cell-type classification
rf_celltype = RandomForestClassifier(n_estimators=100, random_state=42)
rf_celltype.fit(X_train, y_t_train_encoded)

# Predict on test data for cell type
y_t_pred = rf_celltype.predict(X_test)

# Passing the full list of encoded ids keeps the report and the confusion matrix
# aligned with le.classes_ even if some class never occurs in the test split or
# in the predictions; target_names prints the readable class names instead of
# the encoded integers.
celltype_ids = np.arange(len(le.classes_))
celltype_names = [str(name) for name in le.classes_]

# Evaluate the cell-type classification model
print("\nCell-Type Classification Results:")
print("Accuracy: ", accuracy_score(y_t_test_encoded, y_t_pred))
print("Classification Report:")
print(classification_report(
    y_t_test_encoded,
    y_t_pred,
    labels=celltype_ids,
    target_names=celltype_names,
))

# Confusion Matrix for cell type classification (always len(le.classes_) square)
cm_celltype = confusion_matrix(y_t_test_encoded, y_t_pred, labels=celltype_ids)

In [None]:
import matplotlib.pyplot as plt

# One entry per confusion matrix:
# (matrix, figure size, title, tick positions, tick labels, extra x-tick kwargs)
confusion_plots = [
    (
        cm_cancerous,
        (6, 6),
        'Cancerous Classification Confusion Matrix',
        np.arange(2),
        ['Non-Cancerous', 'Cancerous'],
        {},
    ),
    (
        cm_celltype,
        (8, 8),
        'Cell-Type Classification Confusion Matrix',
        np.arange(len(le.classes_)),
        le.classes_,
        {'rotation': 45},  # slant the cell-type names on the x axis
    ),
]

# Draw each matrix as a blue heat map in its own figure
for matrix, fig_size, title, tick_marks, tick_labels, xtick_kwargs in confusion_plots:
    plt.figure(figsize=fig_size)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, tick_labels, **xtick_kwargs)
    plt.yticks(tick_marks, tick_labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

#### Baseline Model 3: custom CNN with residual blocks

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import layers, models, callbacks, metrics
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array
import tensorflow as tf
from tensorflow.keras import backend as K

# Size every image is resized to by preprocess_images (passed to img.resize).
# It matches the default 224x224 spatial input of build_resnet101_model, which,
# despite its name, is a small custom CNN with residual blocks rather than ResNet101.
IMG_SIZE = (224, 224)

# -------------------
# Image Preprocessing
# -------------------
def preprocess_images(df, image_column, target_size=IMG_SIZE):
    """Resize and rescale every image in one DataFrame column.

    Args:
        df: DataFrame holding the images.
        image_column: Name of the column whose entries are image objects
            supporting ``resize``.
        target_size: Size passed to ``resize``; defaults to ``IMG_SIZE``.

    Returns:
        NumPy array stacking one pixel array per image, each divided by 255.
    """
    scaled_images = [
        img_to_array(picture.resize(target_size)) / 255.0  # Normalize to [0, 1]
        for picture in df[image_column]
    ]
    return np.array(scaled_images)

# Image tensor for the CNN, followed by the two label columns as NumPy arrays
X = preprocess_images(labels_main, 'Image', target_size=IMG_SIZE)
y_cancerous, y_celltype = (
    labels_main[column].values for column in ('isCancerous', 'cellTypeName')
)

In [None]:
# -------------------
# Train-Test Split
# -------------------
# 80/20 hold-out; images and both label vectors are split together so they stay
# aligned, with stratification on the binary isCancerous label.
(
    X_train, X_test,
    y_c_train, y_c_test,
    y_t_train, y_t_test,
) = train_test_split(
    X,
    y_cancerous,
    y_celltype,
    stratify=y_cancerous,
    test_size=0.2,
    random_state=42,
)

In [None]:
# -------------------
# Residual Block
# -------------------
def residual_block(x, filters, kernel_size=(3, 3), strides=(1, 1)):
    """Apply a two-convolution residual unit to ``x``.

    Args:
        x: Input feature-map tensor.
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel size of the two main-path convolutions.
        strides: Strides of the first convolution (and of the projection
            shortcut, when one is needed).

    Returns:
        Tensor produced by ReLU(main_path(x) + shortcut(x)).
    """
    identity = x

    # Main path: Conv -> BN -> ReLU -> Conv -> BN (only the first conv may stride)
    out = layers.Conv2D(filters, kernel_size, strides=strides, padding="same")(x)
    out = layers.BatchNormalization()(out)
    out = layers.Activation("relu")(out)
    out = layers.Conv2D(filters, kernel_size, strides=(1, 1), padding="same")(out)
    out = layers.BatchNormalization()(out)

    # The identity can be added as-is only when channels and resolution are
    # unchanged; otherwise project it with a strided 1x1 convolution + BN.
    same_shape = identity.shape[-1] == filters and strides == (1, 1)
    if not same_shape:
        identity = layers.Conv2D(filters, (1, 1), strides=strides, padding="same")(identity)
        identity = layers.BatchNormalization()(identity)

    merged = layers.add([out, identity])
    return layers.Activation("relu")(merged)

In [None]:
# -------------------
# Build ResNet-like Model
# -------------------
def build_resnet101_model(input_shape=(224, 224, 3)):
    """Build the binary-classification CNN made of residual blocks.

    Despite the name, this is a compact ResNet-style network (a convolutional
    stem followed by six residual blocks), not the 101-layer ResNet.

    Args:
        input_shape: Shape of one input image, (height, width, channels).

    Returns:
        An uncompiled Keras ``Model`` with a single sigmoid output unit.
    """
    # (filters, strides) of each residual block, in the order they are stacked
    block_plan = (
        (64, (1, 1)),
        (64, (1, 1)),
        (128, (2, 2)),
        (128, (1, 1)),
        (256, (2, 2)),
        (256, (1, 1)),
    )

    inputs = layers.Input(shape=input_shape)

    # Stem: strided 7x7 convolution followed by strided max pooling
    features = layers.Conv2D(64, (7, 7), strides=(2, 2), padding="same")(inputs)
    features = layers.BatchNormalization()(features)
    features = layers.Activation("relu")(features)
    features = layers.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(features)

    for n_filters, block_strides in block_plan:
        features = residual_block(features, n_filters, strides=block_strides)

    # Classification head
    features = layers.GlobalAveragePooling2D()(features)
    features = layers.Dense(128, activation="relu")(features)
    features = layers.Dropout(0.4)(features)
    prediction = layers.Dense(1, activation="sigmoid")(features)

    return models.Model(inputs=inputs, outputs=prediction)

model_cancerous = build_resnet101_model()

In [None]:
# -------------------
# Focal Loss
# -------------------
def focal_loss(gamma=2., alpha=0.25):
    """Create a binary focal loss: FL = -alpha_t * (1 - p_t)^gamma * log(p_t).

    ``p_t`` is the predicted probability of the true class. ``alpha_t`` is
    ``alpha`` for positive samples (label 1) and ``1 - alpha`` for negative
    ones, so ``alpha`` actually balances the two classes. Applying one constant
    factor to every sample would only rescale the loss.

    Args:
        gamma: Focusing exponent; larger values down-weight easy examples more.
        alpha: Weight of the positive class; negatives get ``1 - alpha``.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning the mean focal loss,
        suitable for ``model.compile(loss=...)``.
    """
    def loss(y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        # Align labels with the predictions so a (batch,) label vector cannot
        # broadcast against a (batch, 1) output into a (batch, batch) matrix.
        y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

        # Keep probabilities away from 0 and 1 so log() stays finite
        epsilon = K.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

        is_positive = tf.equal(y_true, 1.)
        pt = tf.where(is_positive, y_pred, 1. - y_pred)
        ones = tf.ones_like(y_pred)
        alpha_t = tf.where(is_positive, alpha * ones, (1. - alpha) * ones)

        return -tf.reduce_mean(alpha_t * tf.pow(1. - pt, gamma) * tf.math.log(pt))
    return loss

# -------------------
# Compile Model
# -------------------
# Adam + focal loss; track accuracy, precision and recall for the binary output
cancerous_metrics = [
    metrics.BinaryAccuracy(),
    metrics.Precision(),
    metrics.Recall(),
]
model_cancerous.compile(optimizer="adam", loss=focal_loss(), metrics=cancerous_metrics)

In [None]:
# -------------------
# Class Weights
# -------------------
# 'balanced' weights computed from the training labels, then keyed by their
# position in the sorted unique-label array (0, 1, ...) for model.fit
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_c_train),
    y=y_c_train,
)
class_weights = {position: weight for position, weight in enumerate(balanced_weights)}

# -------------------
# Data Augmentation
# -------------------
# Random rotations, shifts, zooms and horizontal flips on the training images
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
)
datagen = ImageDataGenerator(**augmentation_options)
train_generator = datagen.flow(X_train, y_c_train, batch_size=32)

In [None]:
# -------------------
# Callbacks
# -------------------
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# -------------------
# Train Model
# -------------------
# Early stopping with restore_best_weights selects the final weights on the
# validation data. Validating on X_test would therefore leak the test set into
# model selection and bias the test metrics, so a stratified 10% slice of the
# training data is held out for validation instead.
# NOTE: indexing with the index arrays copies the selected images.
fit_idx, val_idx = train_test_split(
    np.arange(len(X_train)),
    test_size=0.1,
    random_state=42,
    stratify=y_c_train,
)
X_val, y_c_val = X_train[val_idx], y_c_train[val_idx]

# Augmentation is applied to the fitting subset only; validation images stay untouched
fit_generator = datagen.flow(X_train[fit_idx], y_c_train[fit_idx], batch_size=32)

history_cancerous = model_cancerous.fit(
    fit_generator,
    validation_data=(X_val, y_c_val),
    epochs=30,
    class_weight=class_weights,
    callbacks=[early_stop]
)

# -------------------
# Model Summary (Optional)
# -------------------
model_cancerous.summary()

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Sigmoid outputs -> hard 0/1 decisions at the 0.5 threshold
y_pred_prob = model_cancerous.predict(X_test)
above_threshold = y_pred_prob > 0.5
y_pred = above_threshold.astype(int)

test_accuracy = accuracy_score(y_c_test, y_pred)
print("Accuracy: ", test_accuracy)
test_report = classification_report(y_c_test, y_pred)
print("Classification Report:\n", test_report)