# Data analysis for mammography images

### Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten , Dropout , BatchNormalization, MaxPooling2D, GlobalAveragePooling2D
from keras.callbacks import Callback, ModelCheckpoint, CSVLogger

import tensorflow as tf

import pickle

from sklearn.metrics import classification_report,confusion_matrix

##### Import data
The image data are split and saved separately in folders for training, validation, and test.

In [None]:
# Directories holding the pre-split image folders (train / validation / test).
train_dir, val_dir, test_dir = (
    "C:/Users/Sreerag/Documents/ML_chellange/Brest-Cancer-classifier/data/raw/model_2/train",
    "C:/Users/Sreerag/Documents/ML_chellange/Brest-Cancer-classifier/data/raw/model_2/val",
    "C:/Users/Sreerag/Documents/ML_chellange/Brest-Cancer-classifier/data/raw/model_2/test",
)

#### Load the pre-split datasets

In [None]:
# Load the three image datasets from their directories.
# Every image is resized to 224x224 and delivered in batches of 64; with
# label_mode="int" the labels are integer class indices (expected here:
# 0 = benign, 1 = malignant -- confirm with `train_ds.class_names`).
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=(224, 224),  # resize images
    batch_size=64,
    label_mode="int",
    color_mode="rgb",
)

# The evaluation splits are loaded with shuffle=False. By default the loader
# reshuffles on every pass over the dataset, so predictions gathered in one
# pass and labels gathered in another would not line up sample-for-sample.
val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    image_size=(224, 224),
    batch_size=64,
    label_mode="int",
    color_mode="rgb",
    shuffle=False,
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(224, 224),
    batch_size=64,
    label_mode="int",
    color_mode="rgb",
    shuffle=False,
)

#### Initial Analysis

In [None]:
# Show the class names in label-index order (expected: ['benign', 'malignant']).
class_names = train_ds.class_names
print(class_names)

In [None]:
# Pull a single batch of images and labels from the training set for inspection.
batch_iter = iter(train_ds)
imgs, labels = next(batch_iter)

In [None]:
# Display the shapes of the image batch and of its label vector.
tuple(batch.shape for batch in (imgs, labels))

In [None]:
# Show one randomly chosen image from the inspected batch together with its label.
ind = np.random.randint(0, imgs.shape[0])
sample_img = imgs[ind]
sample_label = int(labels[ind])
print(f"Image shape: {sample_img.shape}, Label: {sample_label}")

# Plot the same sample that was just described (not always image 0).
# The loader yields float pixel values on a 0-255 scale, while imshow expects
# floats in [0, 1]; casting to uint8 avoids a clipped, washed-out picture.
plt.imshow(sample_img.numpy().astype("uint8"))
plt.ylabel(str(sample_label))
plt.show()

### Model selection
As a starting point, a simple convolutional neural network (CNN) was implemented to establish a baseline for the classification task. The model uses multiple convolution and pooling layers for feature extraction, followed by fully connected layers with dropout to reduce overfitting, and a sigmoid activation in the output layer for binary prediction. This serves as an initial benchmark before exploring more complex models and transfer learning techniques.

In [None]:
# Baseline CNN: three Conv2D + MaxPooling2D stages, followed by a flattened
# dense head with dropout and a single sigmoid unit for the binary output.
# NOTE(review): no input rescaling layer is present; confirm whether the
# 0-255 pixel range produced by the loader is intended as model input.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(224, 224, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:

# Print the layer-by-layer summary of the model defined above.
model.summary()

### Model training

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model on the training set, validating after every epoch.
# `history.history` stores the per-epoch loss and metric values.
# No `batch_size` is passed: the datasets are already batched (64 per batch),
# and Keras documents that `batch_size` must not be specified for dataset
# inputs -- the previous value of 32 did not match the data.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=25,           # change epochs later
)


### Evaluation

In [None]:
# Plot training and validation accuracy per epoch from the fit history.
accuracy_curves = (
    ('accuracy', 'train_accuracy'),
    ('val_accuracy', 'val_accuracy'),
)
for history_key, legend_text in accuracy_curves:
    plt.plot(history.history[history_key], label=legend_text)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

The baseline CNN achieved a training accuracy of 97.5% and a loss of 0.068, with a validation accuracy of 86.3% and a loss of 0.522. The results show the model learns the training data well, though there is some overfitting, suggesting potential improvements with data augmentation or more advanced architectures.

### Model testing

In [None]:
# Compute loss and accuracy on the held-out test set and report both.
test_loss, test_acc = model.evaluate(test_ds)
print(
    f"Test Loss: {test_loss:.4f}",
    f"Test Accuracy: {test_acc:.4f}",
    sep="\n",
)

##### Prediction

In [None]:
# Collect predicted probabilities and true labels in ONE pass over the test
# set. If the dataset shuffles, each new iteration yields a different order,
# so gathering predictions and labels in two separate passes would pair each
# prediction with the wrong label.
batch_probs = []
batch_labels = []
for batch_images, batch_targets in test_ds:
    batch_probs.append(model.predict_on_batch(batch_images))
    batch_labels.append(batch_targets.numpy())

# Predicted probabilities, one row per test image (same shape as model.predict).
y_pred_prob = np.concatenate(batch_probs, axis=0)
# Threshold the sigmoid output at 0.5 to obtain hard class predictions.
y_pred_classes = np.where(y_pred_prob > 0.5, 1, 0)

# True labels, in the same order as the predictions above.
y_true = np.concatenate(batch_labels, axis=0)

In [None]:
# Scatter the predicted probabilities and the actual labels against the
# position of each test instance.
point_series = (
    (y_pred_prob, 'red', 'Predicted Probabilty'),
    (y_true, 'navy', 'Actual Labels'),
)
for values, colour, legend_text in point_series:
    plt.plot(values, '.', color=colour, label=legend_text)

plt.xlabel('Instance Number')
plt.ylabel('Probability')
plt.legend()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels versus thresholded predictions.
cm = confusion_matrix(y_true, y_pred_classes)

# Tick positions sit at the centres of the two heatmap cells on each axis.
tick_positions = [0.5, 1.5]
tick_labels = ['Benign', 'Malignant']

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', annot_kws={"size": 24}, cmap=plt.cm.Blues)

plt.xticks(tick_positions, tick_labels, fontsize=16)
plt.yticks(tick_positions, tick_labels, fontsize=16, rotation=0)

plt.title('Confusion Matrix for Breast Cancer', fontsize=16)
plt.xlabel('Predicted label', fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.show()


In [None]:
# Per-class precision, recall and F1 for the thresholded predictions.
report = classification_report(
    y_true,
    y_pred_classes,
    target_names=['benign', 'malignant'],
)
print(report)

The higher recall and precision for the malignant class compared to benign indicate that the model is biased towards predicting malignant cases. While it captures malignant cases reasonably well, it struggles with benign classification. This suggests the need for improvements such as data balancing, augmentation, or more advanced architectures to achieve better generalization.