In [1]:
import os
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from tensorflow.keras import models, layers
from tensorflow.keras.utils import to_categorical

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd





In [2]:
data_dir = "../../../DATA_PROCESSED/64x64/"
categories = ["Benign", "Malignant", "Normal"]
image_size = (64, 64)

# Load images and labels with filenames
data = []
labels = []
file_names = []  # List to store image file names

# %%
for category in categories:
    category_path = os.path.join(data_dir, category)
    for img_name in os.listdir(category_path):
        img_path = os.path.join(category_path, img_name)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, image_size)
        data.append(img)
        labels.append(categories.index(category))
        file_names.append(img_name)  # Store the filename

In [3]:
data = np.array(data).reshape(-1, 64, 64, 1).astype('float32') / 255.0
labels = np.array(labels)

# Split data into training and testing sets
train_data, test_data, train_labels, test_labels, train_file_names, test_file_names = train_test_split(
    data, labels, file_names, test_size=0.2, stratify=labels, random_state=42)

# %%
train_labels = to_categorical(train_labels, num_classes=len(categories))
test_labels = to_categorical(test_labels, num_classes=len(categories))

In [4]:
def create_model(input_shape, num_classes):
    model = models.Sequential()

    # Convolutional layers
    model.add(layers.Conv2D(
        32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))

    # Flatten the output from the convolutional layers
    model.add(layers.Flatten())

    # Dense layers
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(128, activation='relu'))

    # Output layer (3 classes)
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

In [5]:
input_shape = (64, 64, 1)  # Example input shape for grayscale images
num_classes = len(categories)  # Number of classes
cnn_model = create_model(input_shape, num_classes)
cnn_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 62, 62, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 31, 31, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 29, 29, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 14, 14, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 12, 12, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 6, 6, 128)        

In [6]:
cnn_model.compile(optimizer='adam',
                  loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = cnn_model.fit(train_data, train_labels, epochs=10,
                        batch_size=32, validation_split=0.2)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:

# %%
predictions = cnn_model.predict(test_data)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(test_labels, axis=1)

# Calculate metrics
accuracy = accuracy_score(true_classes, predicted_classes)
precision = precision_score(
    true_classes, predicted_classes, average='weighted')
recall = recall_score(true_classes, predicted_classes, average='weighted')
roc_auc = roc_auc_score(test_labels, predictions, multi_class='ovr')
conf_matrix = confusion_matrix(true_classes, predicted_classes)

# Calculate specificity for each class
tn = conf_matrix.diagonal()
fp = conf_matrix.sum(axis=0) - tn
specificity = tn / (tn + fp)

# %%
report = classification_report(
    true_classes, predicted_classes, target_names=categories)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'ROC-AUC: {roc_auc:.2f}')
print(f'Specificity: {specificity}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(report)

# Create DataFrames for training and testing data
train_data_df = pd.DataFrame(
    {'File Name': train_file_names, 'Class': np.argmax(train_labels, axis=1)})
test_data_df = pd.DataFrame(
    {'File Name': test_file_names, 'Class': np.argmax(test_labels, axis=1)})

Accuracy: 0.99
Precision: 0.99
Recall: 0.99
ROC-AUC: 1.00
Specificity: [0.97096774 0.99299363 0.98414496]
Confusion Matrix:
[[ 301    0    7]
 [   2 1559    7]
 [   7   11  869]]
Classification Report:
              precision    recall  f1-score   support

      Benign       0.97      0.98      0.97       308
   Malignant       0.99      0.99      0.99      1568
      Normal       0.98      0.98      0.98       887

    accuracy                           0.99      2763
   macro avg       0.98      0.98      0.98      2763
weighted avg       0.99      0.99      0.99      2763



In [8]:
with pd.ExcelWriter('image_data.xlsx') as writer:
    train_data_df.to_excel(writer, sheet_name='Train Data', index=False)
    test_data_df.to_excel(writer, sheet_name='Test Data', index=False)

    # Store metrics in a separate sheet
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'ROC-AUC'],
        'Value': [accuracy, precision, recall, roc_auc]
    })
    metrics_df.to_excel(writer, sheet_name='Metrics', index=False)

    # Store confusion matrix in a separate sheet
    confusion_df = pd.DataFrame(
        conf_matrix, index=categories, columns=categories)
    confusion_df.to_excel(writer, sheet_name='Confusion Matrix')

    # Store specificity in a separate sheet
    specificity_df = pd.DataFrame({
        'Class': categories,
        'Specificity': specificity
    })
    specificity_df.to_excel(writer, sheet_name='Specificity', index=False)

    # Store classification report as text
    report_df = pd.DataFrame(report.split('\n'), columns=[
                             'Classification Report'])
    report_df.to_excel(writer, sheet_name='Classification Report', index=False)