### Dataset::https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000 

In [3]:
# Required Libraries

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,recall_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('/kaggle/input/skin-cancer-mnist-ham10000/hmnist_28_28_RGB.csv')
data.head()

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
0,192,153,193,195,155,192,197,154,185,202,...,173,124,138,183,147,166,185,154,177,2
1,25,14,30,68,48,75,123,93,126,158,...,60,39,55,25,14,28,25,14,27,2
2,192,138,153,200,145,163,201,142,160,206,...,167,129,143,159,124,142,136,104,117,2
3,38,19,30,95,59,72,143,103,119,171,...,44,26,36,25,12,17,25,12,15,2
4,158,113,139,194,144,174,215,162,191,225,...,209,166,185,172,135,149,109,78,92,2


In [5]:
# Assigning labels to encoded classes (for informational purposes only)

classes = {4: ('nv', ' melanocytic nevi'), 6: ('mel', 'melanoma'), 2 :('bkl', 'benign keratosis-like lesions'),
           1:('bcc' , ' basal cell carcinoma'), 5: ('vasc', ' pyogenic granulomas and hemorrhage'), 
           0: ('akiec', 'Actinic keratoses and intraepithelial carcinomae'),  3: ('df', 'dermatofibroma')}

In [6]:
#Checking the dataset: Is the dataset biased or not?

first = [classes[i][1] for i in list(data['label'].value_counts().index)]
sec = list(data['label'].value_counts().values)
last = dict(zip(first,sec))
last

{' melanocytic nevi': 6705,
 'melanoma': 1113,
 'benign keratosis-like lesions': 1099,
 ' basal cell carcinoma': 514,
 'Actinic keratoses and intraepithelial carcinomae': 327,
 ' pyogenic granulomas and hemorrhage': 142,
 'dermatofibroma': 115}

In [26]:
# Divide the dataset into dependent and independent variables

x = data.drop(['label'],axis = 1)
y = data['label']

In [27]:
# Scale the data.

scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [28]:
# Split the dataset into train and test sets

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [29]:
# The data is biased, with a larger number of observations for the class "melanocytic nevi" (class 4)
# To mitigate this bias, we follow the steps outlined below

# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Apply RandomUnderSampler to undersample the majority class
rus = RandomUnderSampler(random_state=42)
x_train_resampled, y_train_resampled = rus.fit_resample(x_train_resampled, y_train_resampled)

In [30]:
# Extract the validation sets from the training data

x_train1, x_val, y_train1, y_val = train_test_split(x_train_resampled, y_train_resampled, test_size=0.2, random_state=42)

In [31]:
y_train1.value_counts()

3    4328
2    4302
6    4298
1    4297
4    4284
0    4274
5    4272
Name: label, dtype: int64

In [32]:
# Reshape the datasets to fit the CNN model

x_train1 = np.array(x_train1).reshape(-1,28,28,3)
x_val = np.array(x_val).reshape(-1,28,28,3)
x_test = np.array(x_test).reshape(-1,28,28,3)

In [33]:
# Creating the CNN Model

model = Sequential()
model.add(Conv2D(256, kernel_size=(3, 3), input_shape=(28, 28, 3), activation='relu', padding='same'))
model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
# model.add(Dropout(0.6))  # Added dropout layer

model.add(Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'))
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2), padding='same'))

# # model.add(Dropout(0.6))  # Added dropout layer
# model.add(Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'))
# model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
# model.add(MaxPool2D(pool_size=(2, 2), padding='same'))

# model.add(Dropout(0.6))  # Added dropout layer
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'))
# model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2), padding='same'))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.6))  # Added dropout layer
model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.6))  # Added dropout layer
model.add(Dense(7, activation='softmax'))

In [34]:
# Compiling the Model and Implementing Early Stopping

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model using the separate training and validation sets
history = model.fit(
    x_train1,
    y_train1,
    validation_data=(x_val, y_val),
    batch_size=16,
    epochs=10,
    callbacks=[early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Prediction

y_pred = model.predict(x_test)

# Convert predictions to class labels
y_pred_labels = np.argmax(y_pred, axis=1)

# Print the predicted labels
print("Train Predictions:", y_pred_labels)

Train Predictions: [2 6 6 ... 0 6 4]


In [36]:
#Accuracy Assesment

# Calculate overall accuracy
overall_accuracy = accuracy_score(y_test, y_pred_labels)
print("Overall Accuracy:", overall_accuracy)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred_labels)
print("Confusion Matrix:")
print(cm)

# Calculate overall recall
overall_recall = recall_score(y_test, y_pred_labels, average='macro')
print("Overall Recall:", overall_recall)

# Generate classification report
report = classification_report(y_test, y_pred_labels)
print("Classification Report:")
print(report)

Overall Accuracy: 0.5801298052920619
Confusion Matrix:
[[ 24   7  15   1   6   0  16]
 [ 19  32  17   0  10   2  13]
 [ 26   4  80   1  30   2  85]
 [  0   4   7   4   3   1   9]
 [ 29  32 107   4 848   5 313]
 [  0   1   1   0   4  14   1]
 [ 10   6  21   0  28   1 160]]
Overall Recall: 0.45629421093781175
Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.35      0.27        69
           1       0.37      0.34      0.36        93
           2       0.32      0.35      0.34       228
           3       0.40      0.14      0.21        28
           4       0.91      0.63      0.75      1338
           5       0.56      0.67      0.61        21
           6       0.27      0.71      0.39       226

    accuracy                           0.58      2003
   macro avg       0.44      0.46      0.42      2003
weighted avg       0.71      0.58      0.62      2003

