<a href="https://colab.research.google.com/github/samthomaz/skincancer_detector/blob/main/skin_cancer_detection_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Google Colab Setup and Loading Dataset**

In [None]:
from google.colab import files

# files.upload() returns a dict of {filename: file bytes}. Leaving the call as
# the last expression of the cell would echo the contents of kaggle.json (the
# API key) into the saved notebook output, so capture it and show names only.
uploaded = files.upload()
print("Uploaded:", list(uploaded.keys()))

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

#Download HAM10000 Dataset
!kaggle datasets download kmader/skin-cancer-mnist-ham10000
!ls
!unzip skin-cancer-mnist-ham10000.zip
!ls

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

!ls

df = pd.read_csv('HAM10000_metadata.csv')
df.head()

print("Total images: ", len(df))

sns.countplot(x='dx', data=df)
plt.title('Class Distribution')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Binary target: 1 where the diagnosis code is 'mel', 0 for every other code
is_melanoma = df['dx'].eq('mel')
df['label'] = is_melanoma.astype('int64')
print(df['label'].value_counts())

In [None]:
import matplotlib.pyplot as plt

def show_images(df, label, n=5, random_state=None):
  """Display up to `n` randomly sampled images whose `label` column equals `label`.

  Args:
    df: DataFrame with `label` and `image_id` columns.
    label: Value of the `label` column to filter on. Panels are titled
      'Melanoma' when the row's label is 1 and 'Benign' otherwise.
    n: Maximum number of images to show; capped at the number of matching rows.
    random_state: Optional seed forwarded to `DataFrame.sample` so the same
      images can be shown again on a re-run.
  """
  matches = df[df['label'] == label]
  # sample() raises ValueError when asked for more rows than are available
  n = min(n, len(matches))
  if n <= 0:
    print(f'No images to show for label {label}')
    return
  subset = matches.sample(n, random_state=random_state)
  plt.figure(figsize=(15, 5))
  for i, row in enumerate(subset.itertuples()):
    # Images are spread over two folders; try part_1 first, then part_2
    image_path = os.path.join('HAM10000_images_part_1', row.image_id + '.jpg')
    if not os.path.exists(image_path):
      image_path = os.path.join('HAM10000_images_part_2', row.image_id + '.jpg')
    plt.subplot(1, n, i+1)
    # Draw while the file is open, then let the context manager close the handle
    with Image.open(image_path) as image:
      plt.imshow(image)
    plt.title('Melanoma' if row.label == 1 else 'Benign')
    plt.axis('off')
  plt.tight_layout()
  plt.show()

show_images(df, label=1) # Melanoma
show_images(df, label=0) # Benign

# **Preprocessing & Dataset Preparation**

In [None]:
import os

def get_image_path(image_id):
  """Return the relative .jpg path for `image_id`.

  The part_1 folder is checked first; if the file is not there, the part_2
  path is returned without checking that it exists.
  """
  filename = image_id + '.jpg'
  primary = os.path.join('HAM10000_images_part_1', filename)
  if os.path.exists(primary):
    return primary
  return os.path.join('HAM10000_images_part_2', filename)

df['image_path'] = df['image_id'].apply(get_image_path)

# get_image_path returns the part_2 path without checking it, so a missing
# image would otherwise go unnoticed. Report it here without stopping the run.
missing_mask = ~df['image_path'].apply(os.path.exists)
if missing_mask.any():
  print(f"Warning: {int(missing_mask.sum())} image files were not found on disk")

df[['image_id', 'image_path', 'label']].head()

In [None]:
# Undersample the majority class: keep every melanoma row and draw an equally
# sized random sample of non-melanoma rows.
mel_df = df[df['label'] == 1]
non_mel_df = df[df['label'] == 0].sample(len(mel_df), random_state=42)

# Seed the shuffle too: the train/validation split taken later from this frame
# depends on its row order, so an unseeded shuffle would give a different split
# (and different results) on every run.
balanced_df = pd.concat([mel_df, non_mel_df]).sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_df['label'].value_counts())

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

IMG_SIZE = (224, 224)
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2 #rsvs 20% of data for validation

# Training generator: MobileNetV2 preprocessing plus augmentation.
# preprocess_input scales pixels to [-1, 1], the range the ImageNet-pretrained
# MobileNetV2 weights expect (a plain 1/255 rescale gives [0, 1] instead).
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=VALIDATION_SPLIT,
    horizontal_flip=True, #adds data augmentation to make model more robust
    rotation_range=15,
    zoom_range=0.1
)

# Validation generator: same preprocessing and same split fraction, but no
# augmentation, so validation metrics are measured on unmodified images.
val_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=VALIDATION_SPLIT
)

#creates batches of images and labels for training and validation sets
train_gen = datagen.flow_from_dataframe(
    balanced_df,
    x_col='image_path',
    y_col='label',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='raw',
    subset='training',
    shuffle=True
)

# Same dataframe and split fraction as train_gen, so the two subsets are
# complementary. shuffle=False keeps a fixed sample order so that predictions
# can be matched row-for-row with the generator's labels.
validation_gen = val_datagen.flow_from_dataframe(
    balanced_df,
    x_col='image_path',
    y_col='label',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='raw',
    subset='validation',
    shuffle=False
)

# **Building and Training CNN model with Transfer Learning**

Transfer learning reuses the "early layers" of a pre-trained model (which detect edges, shapes and colours) and adds a user-defined custom output layer for the task at hand (in this instance, melanoma detection).

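MobileNetV2 will be used since it is a lightweight architecture with ImageNet pre-trained weights available, which keeps training fast on Colab.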

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam

# MobileNetV2 backbone with ImageNet weights; include_top=False leaves out the
# ImageNet classification head so a custom head can be attached instead
img_height, img_width = IMG_SIZE[:2]
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(img_height, img_width, 3),
)

# Keep the backbone weights fixed so that only the new top layers are trained
base_model.trainable = False

In [None]:
# Custom classification head stacked on the backbone output:
# global average pooling -> dropout -> 128-unit ReLU layer -> dropout
head_layers = [
    GlobalAveragePooling2D(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
]

x = base_model.output
for layer in head_layers:
  x = layer(x)

# Single sigmoid unit for the binary output
predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the training generator and evaluate on the validation generator
# after each epoch; the returned History object holds the per-epoch metrics
EPOCHS = 10
history = model.fit(train_gen, epochs=EPOCHS, validation_data=validation_gen)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch
curves = (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, legend_label in curves:
  plt.plot(history.history[metric_key], label=legend_label)

plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# **Evaluate and Interpret the Model**

In [None]:
import numpy as np

#Reset validation_gen, so that it will start from the beginning
validation_gen.reset()


#Predict images in validation set
prediction_probs = model.predict(validation_gen, verbose=1)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D array of 0/1 labels
prediction_labels = (prediction_probs > 0.5).astype(int).flatten()

# True Labels
# With class_mode='raw' the generator exposes its targets through `.labels`;
# `.classes` is only populated for the class-index based modes.
# NOTE: these match the predictions row-for-row only if validation_gen yields
# samples in a fixed order (i.e. it was created with shuffle=False).
true_labels = validation_gen.labels

In [None]:
# Create a Confusion Matrix

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix: rows are the actual labels, columns the predicted labels
cm = confusion_matrix(true_labels, prediction_labels)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()