In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define dataset paths
dataset_path = "/kaggle/input/skin-cancer-mnist-ham10000"  # Replace with the exact dataset folder name
images_dir = f"{dataset_path}/HAM10000_images_part_1"
meta_data_path = f"{dataset_path}/HAM10000_metadata.csv"

# Check if paths exist
print("Images Directory:", os.listdir(images_dir)[:5])  # Print the first few image names
print("Metadata File Exists:", os.path.exists(meta_data_path))

Images Directory: ['ISIC_0028933.jpg', 'ISIC_0028394.jpg', 'ISIC_0027799.jpg', 'ISIC_0028100.jpg', 'ISIC_0027960.jpg']
Metadata File Exists: True


In [2]:
# Load metadata
meta_data = pd.read_csv(meta_data_path)

# Display the first few rows
print(meta_data.head())

# Check label distribution
print(meta_data['dx'].value_counts())

     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear
dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [3]:
# Extract image file names and labels
image_ids = meta_data['image_id'].values
labels = meta_data['dx'].values

# Create train-test split
train_ids, val_ids, train_labels, val_labels = train_test_split(
    image_ids, labels, test_size=0.2, stratify=labels, random_state=42
)

print(f"Training Samples: {len(train_ids)}")
print(f"Validation Samples: {len(val_ids)}")

Training Samples: 8012
Validation Samples: 2003


In [4]:
# Create directories for training and validation
train_dir = "/kaggle/working/train"
val_dir = "/kaggle/working/validation"

# Helper function to organize images
def organize_images(image_ids, labels, src_dir, dest_dir):
    for img_id, label in zip(image_ids, labels):
        label_dir = os.path.join(dest_dir, label)
        os.makedirs(label_dir, exist_ok=True)
        
        src_path = os.path.join(src_dir, f"{img_id}.jpg")
        dest_path = os.path.join(label_dir, f"{img_id}.jpg")
        
        if os.path.exists(src_path):
            shutil.copy(src_path, dest_path)

# Organize images
organize_images(train_ids, train_labels, images_dir, train_dir)
organize_images(val_ids, val_labels, images_dir, val_dir)

print("Train and Validation directories created!")

Train and Validation directories created!


In [None]:
import os
import shutil
import cv2  # Ensure OpenCV is imported
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation setup
augmentation_generator = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Augment minority classes
def augment_all_minority_classes(train_dir, target_count):
    """
    Augments all classes in the training directory to match the target_count.
    
    Args:
    - train_dir: Path to the training directory.
    - target_count: Desired number of images for each class.
    """
    classes = os.listdir(train_dir)
    
    for class_name in classes:
        target_dir = os.path.join(train_dir, class_name)
        current_images = os.listdir(target_dir)
        current_count = len(current_images)
        
        print(f"Processing class '{class_name}' with {current_count} images...")
        
        # Augment only if the current count is less than the target count
        if current_count < target_count:
            while current_count < target_count:
                for img_name in current_images:
                    img_path = os.path.join(target_dir, img_name)
                    img = cv2.imread(img_path)  # Read image
                    if img is None:
                        print(f"Failed to read {img_path}. Skipping...")
                        continue
                    img = cv2.resize(img, (128, 128))  # Resize to 128x128
                    img = np.expand_dims(img, axis=0)  # Add batch dimension

                    # Perform augmentation and save images
                    for batch in augmentation_generator.flow(
                        img, batch_size=1, save_to_dir=target_dir, save_prefix='aug', save_format='jpg'
                    ):
                        current_count += 1
                        if current_count >= target_count:
                            break
            print(f"Class '{class_name}' augmented to {current_count} images.")
        else:
            print(f"Class '{class_name}' already has sufficient images.")

# Example: Augment all classes to 6705 images
augment_all_minority_classes(train_dir, target_count=6705)


Processing class 'bkl' with 452 images...


In [None]:
# Get class distribution and labels
from sklearn.utils.class_weight import compute_class_weight

class_labels = os.listdir(train_dir)
label_to_index = {label: idx for idx, label in enumerate(class_labels)}

# Prepare train_labels from augmented dataset
train_labels = []
for label in class_labels:
    label_dir = os.path.join(train_dir, label)
    train_labels.extend([label] * len(os.listdir(label_dir)))

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(class_labels),
    y=train_labels
)

# Map weights to class indices
class_weights_dict = {label_to_index[label]: weight for label, weight in zip(class_labels, class_weights)}
print("Class Weights:", class_weights_dict)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define CNN Model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D((2, 2)),
    
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(class_labels), activation='softmax')  # Output layer
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Data generators
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

# Training generator
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'
)

# Validation generator
val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'
)

In [None]:
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=21,
    class_weight=class_weights_dict  # Apply class weights
)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Initialize the ImageDataGenerator for the test dataset
test_datagen = ImageDataGenerator(rescale=1.0 / 255)  # Normalize pixel values to [0, 1]

# Define the directory where the test dataset is stored
test_dir = val_dir  # Replace with your test dataset directory

# Create the test data generator
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(128, 128),  # Image size should match the input size of your model
    batch_size=32,  # Number of images to process in a batch
    class_mode='categorical',  # The labels are one-hot encoded
    shuffle=False  # Ensure images are not shuffled to maintain correct order for evaluation
)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Step 2: Predict on the test data
y_true = test_generator.classes  # True labels from the test set
y_pred = model.predict(test_generator)  # Model predictions

# Convert predictions from probabilities to class indices
y_pred_classes = np.argmax(y_pred, axis=1)

# Step 3: Generate the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred_classes)

# Visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=test_generator.class_indices.keys(),
            yticklabels=test_generator.class_indices.keys())
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Step 4: Generate a classification report
class_report = classification_report(y_true, y_pred_classes, target_names=test_generator.class_indices.keys())
print("Classification Report:")
print(class_report)


In [None]:
import matplotlib.pyplot as plt

# Assume 'history' is the object returned from model.fit or model.fit_generator
# Example: history = model.fit(train_generator, validation_data=val_generator, epochs=20)

# Extract metrics
accuracy = history.history['accuracy']  # Training accuracy
val_accuracy = history.history['val_accuracy']  # Validation accuracy
loss = history.history['loss']  # Training loss
val_loss = history.history['val_loss']  # Validation loss
epochs = range(1, len(accuracy) + 1)  # Epochs

# Plot Training and Validation Accuracy
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, label='Training Accuracy', marker='o')
plt.plot(epochs, val_accuracy, label='Validation Accuracy', marker='o')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot Training and Validation Loss
plt.subplot(1, 2, 2)
plt.plot(epochs, loss, label='Training Loss', marker='o')
plt.plot(epochs, val_loss, label='Validation Loss', marker='o')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
history2 = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=51,
    class_weight=class_weights_dict  # Apply class weights
)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Step 2: Predict on the test data
y_true = test_generator.classes  # True labels from the test set
y_pred = model.predict(test_generator)  # Model predictions

# Convert predictions from probabilities to class indices
y_pred_classes = np.argmax(y_pred, axis=1)

# Step 3: Generate the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred_classes)

# Visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=test_generator.class_indices.keys(),
            yticklabels=test_generator.class_indices.keys())
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Step 4: Generate a classification report
class_report = classification_report(y_true, y_pred_classes, target_names=test_generator.class_indices.keys())
print("Classification Report:")
print(class_report)


In [None]:
import matplotlib.pyplot as plt

# Assume 'history' is the object returned from model.fit or model.fit_generator
# Example: history = model.fit(train_generator, validation_data=val_generator, epochs=20)

# Extract metrics
accuracy = history2.history['accuracy']  # Training accuracy
val_accuracy = history2.history['val_accuracy']  # Validation accuracy
loss = history2.history['loss']  # Training loss
val_loss = history2.history['val_loss']  # Validation loss
epochs = range(1, len(accuracy) + 1)  # Epochs

# Plot Training and Validation Accuracy
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, label='Training Accuracy', marker='o')
plt.plot(epochs, val_accuracy, label='Validation Accuracy', marker='o')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot Training and Validation Loss
plt.subplot(1, 2, 2)
plt.plot(epochs, loss, label='Training Loss', marker='o')
plt.plot(epochs, val_loss, label='Validation Loss', marker='o')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Define the path to save the model in the Kaggle output directory
model_save_path = '/kaggle/working/saved_model.h5'

# Save the model
model.save(model_save_path)

print(f"Model saved to: {model_save_path}")
