In [None]:
import numpy as np
import pandas as pd

# from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Input, Dense, Flatten, concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import random

from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# show every column when a frame is rendered (the row limit is not touched here)
pd.options.display.max_columns = None
# read the metadata table from the working directory
data = pd.read_csv('train-metadata.csv')
# peek at three random rows
data.sample(3)

In [None]:
# (rows, columns) of the metadata frame as loaded
data.shape
#  data.info(verbose=True, show_counts=True)

In [None]:
# columns that are not used by the rest of the notebook
unused_columns = [
    'lesion_id',
    'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
    'mel_mitotic_index', 'mel_thick_mm',
    'tbp_lv_dnn_lesion_confidence',
    'attribution', 'copyright_license',
    'patient_id', 'image_type', 'tbp_tile_type', 'tbp_lv_location_simple',
]
data = data.drop(columns=unused_columns)
# shape after the drop
data.shape

In [None]:
# lift the row display limit so the per-column listing below is not truncated
pd.options.display.max_rows = None
# number of missing values in each column
data.isna().sum()

In [None]:
# Impute missing values.
# The filled column is assigned back instead of calling
# `data[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series, emits a chained-assignment warning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active.

# missing ages -> mean of the known ages
mean_age = data['age_approx'].mean()
data['age_approx'] = data['age_approx'].fillna(mean_age)
# missing sex -> explicit 'Unknown' category
data['sex'] = data['sex'].fillna('Unknown')
# missing anatomical site -> explicit 'Missing' category
data['anatom_site_general'] = data['anatom_site_general'].fillna('Missing')

In [None]:
# split the rows by the `target` column: 0 -> benign, 1 -> malignant
benign = data.loc[data['target'].eq(0)]
malignant = data.loc[data['target'].eq(1)]

# plt.bar(['benign','malignant'], [len(benign), len(malignant)])
# plt.xlabel('Diagnose')
# plt.ylabel('Number')
# plt.title('Target Bar Chart')

In [None]:
# The benign and malignant classes are heavily imbalanced, so the benign rows
# are down-sampled here to create a more balanced dataset.
# number of malignant cases
print(len(malignant))
# draw 607 benign cases (fixed seed) to create a 1000 row dataset
benign_sample = benign.sample(random_state=42, n=607)
# benign_sample.head()

In [None]:
# stack the benign sample on top of the malignant rows, shuffle all rows with a
# fixed seed, then renumber the index from 0
merged_data = (
    pd.concat([benign_sample, malignant], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

merged_data.head()

In [None]:
# (rows, columns) of the combined, shuffled frame
merged_data.shape

In [None]:
# image ids (the `isic_id` column) of the sampled benign and of the malignant rows
benign_images = benign_sample['isic_id'].tolist()
malignant_images = malignant['isic_id'].tolist()

### Display the images

In [None]:
# function to display images
def display_images(image_paths, category):
    """Show the given images side by side on one row, each titled `category`.

    Parameters
    ----------
    image_paths : iterable of str
        Image ids, i.e. file names without the '.jpg' extension; each file is
        read from 'train-image/image/<id>.jpg'.
    category : str
        Title drawn above every image.

    The grid has one column per image, so any number of images can be passed
    (the previous fixed 3-column grid raised on a fourth image).
    """
    image_paths = list(image_paths)
    n_columns = len(image_paths)
    plt.figure(figsize=(24, 4))
    for i, img_path in enumerate(image_paths):
        plt.subplot(1, n_columns, i + 1)
        img = mpimg.imread(os.path.join('train-image/image/', img_path + '.jpg'))
        plt.imshow(img)
        plt.title(category)
        plt.axis('off')
    plt.show()

# pick three random benign images
selected_benign_images = random.sample(benign_images, k=3)

# pick three random malignant images
selected_malignant_images = random.sample(malignant_images, k=3)

# draw one row of images per class
for picked_ids, class_title in (
    (selected_benign_images, 'Benign'),
    (selected_malignant_images, 'Malignant'),
):
    display_images(picked_ids, class_title)

In [None]:
# one-hot encode the categorical columns, then append two helper columns:
#   image - file name of the picture (`isic_id` + '.jpg')
#   label - text form of `target` (0 -> 'benign', 1 -> 'malignant')
categorical_columns = ['sex', 'anatom_site_general', 'tbp_lv_location']
merged_data = pd.get_dummies(merged_data, columns=categorical_columns).assign(
    image=lambda frame: frame['isic_id'] + '.jpg',
    label=lambda frame: frame['target'].map({0: 'benign', 1: 'malignant'}),
)
merged_data.head()
# X = merged_data.drop(columns=['target'])
# y = merged_data['target']

# convert categorical columns to numerical
# X_encoded = pd.get_dummies(X, columns=['sex', 'anatom_site_general', 'tbp_lv_location'])
# X_encoded.head(3)


In [None]:
# (rows, columns) after one-hot encoding and adding the image/label columns
merged_data.shape

In [None]:
# merged_data.head(5).to_csv('sampled_data.csv', index=False)
# columns fed to the metadata branch, in the order they appear in the matrix
metadata_features = [
    # numeric columns
    'age_approx', 'clin_size_long_diam_mm',
    'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext',
    'tbp_lv_L', 'tbp_lv_Lext',
    'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean',
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
    'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence',
    'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
    # one-hot columns derived from `sex`
    'sex_Unknown', 'sex_female', 'sex_male',
    # one-hot columns derived from `anatom_site_general`
    'anatom_site_general_Missing',
    'anatom_site_general_anterior torso',
    'anatom_site_general_head/neck',
    'anatom_site_general_lower extremity',
    'anatom_site_general_posterior torso',
    'anatom_site_general_upper extremity',
    # one-hot columns derived from `tbp_lv_location`
    'tbp_lv_location_Head & Neck',
    'tbp_lv_location_Left Arm',
    'tbp_lv_location_Left Arm - Lower',
    'tbp_lv_location_Left Arm - Upper',
    'tbp_lv_location_Left Leg',
    'tbp_lv_location_Left Leg - Lower',
    'tbp_lv_location_Left Leg - Upper',
    'tbp_lv_location_Right Arm',
    'tbp_lv_location_Right Arm - Lower',
    'tbp_lv_location_Right Arm - Upper',
    'tbp_lv_location_Right Leg',
    'tbp_lv_location_Right Leg - Lower',
    'tbp_lv_location_Right Leg - Upper',
    'tbp_lv_location_Torso Back Bottom Third',
    'tbp_lv_location_Torso Back Middle Third',
    'tbp_lv_location_Torso Back Top Third',
    'tbp_lv_location_Torso Front Bottom Half',
    'tbp_lv_location_Torso Front Top Half',
    'tbp_lv_location_Unknown',
]
# one row per row of merged_data, one column per listed feature
X_metadata = merged_data[metadata_features].to_numpy()

# standardize every feature column (the scaler is fitted on all rows)
scaler = StandardScaler()
X_metadata = scaler.fit_transform(X_metadata)
X_metadata.shape

In [None]:
# image size
# image_width, image_height = 224, 224
# batch_size = 32

# image processing: multiply pixel values by 1/255 and reserve 20% of the rows
# for the 'validation' subset
# NOTE(review): the ImageNet VGG16 weights are normally paired with
# `vgg16.preprocess_input` rather than a plain 1/255 rescale - confirm this is intended.
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# settings shared by the training and the validation iterator
flow_kwargs = dict(
    dataframe=merged_data,
    directory='train-image/image/',
    x_col='image',
    y_col='label',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
)

train_image_generator = datagen.flow_from_dataframe(subset='training', **flow_kwargs)

validation_image_generator = datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

In [None]:
# VGG16 convolutional base with ImageNet weights and without its classifier head
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# keep the pretrained weights fixed: mark every layer of the base as non-trainable
for layer in base_model.layers:
    layer.trainable = False

In [None]:
# image branch: 224x224 RGB input passed through the frozen VGG16 base
image_input = Input(name='image_input', shape=(224, 224, 3))
vgg16_features = base_model(image_input)
# collapse the VGG16 feature maps into one vector per image
flatten_layer = Flatten(name='flatten')
x = flatten_layer(vgg16_features)

In [None]:
# metadata branch: one input per metadata column, followed by two dense layers
n_metadata_features = X_metadata.shape[1]
metadata_input = Input(name='metadata_input', shape=(n_metadata_features,))
y = Dense(units=64, activation='relu')(metadata_input)
y = Dense(units=32, activation='relu')(y)

# join the flattened image features with the metadata branch
combined = concatenate([x, y])
z = Dense(units=64, activation='relu')(combined)
# single sigmoid unit: output layer for binary classification
z = Dense(units=1, activation='sigmoid')(z)

# two-input model: image + metadata -> one probability
model = Model(outputs=z, inputs=[image_input, metadata_input])

# binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# print the layer-by-layer overview of the combined model
model.summary()

In [None]:
# define custom data generator to provide image and metadata together
def custom_generator(image_gen, metadata):
    """Endlessly yield ``({'image_input', 'metadata_input'}, labels)`` batches.

    Parameters
    ----------
    image_gen
        Batch iterator that provides ``batch_size``, ``index_array``,
        ``on_epoch_end()``, ``len()`` and ``image_gen[i] -> (images, labels)``.
    metadata
        Array indexed by the sample positions stored in
        ``image_gen.index_array``; row ``i`` must describe sample ``i`` of
        ``image_gen``.

    Yields
    ------
    tuple
        ``({'image_input': images, 'metadata_input': metadata_rows}, labels)``
        where all three parts cover the same samples in the same order.

    Notes
    -----
    ``image_gen.index_array`` holds the sample order of the whole pass, not of
    a single batch, so indexing ``metadata`` with it directly returns one row
    per sample of the pass. Each batch therefore takes its own slice of
    ``index_array`` - the same slice ``image_gen[batch_idx]`` loads images for.
    """
    batch_size = image_gen.batch_size
    while True:
        # draw (or redraw) the sample order once per pass over the data
        image_gen.on_epoch_end()
        for batch_idx in range(len(image_gen)):
            start = batch_idx * batch_size
            indices = image_gen.index_array[start:start + batch_size]
            image_batch, label_batch = image_gen[batch_idx]
            metadata_batch = metadata[indices]
            yield {'image_input': image_batch, 'metadata_input': metadata_batch}, label_batch

# training the model
# Each image generator covers only its own subset of merged_data (the
# validation split slices the frame, and rows may be filtered out), so sample i
# of a generator is generally NOT row i of X_metadata. Look every generator
# sample up by file name and hand each generator a metadata array arranged in
# that generator's own sample order.
assert merged_data['image'].is_unique, 'image file names must be unique to align metadata'
row_of_image = pd.Series(np.arange(len(merged_data)), index=merged_data['image'])
train_rows = row_of_image.loc[train_image_generator.filenames].to_numpy()
validation_rows = row_of_image.loc[validation_image_generator.filenames].to_numpy()

history = model.fit(
    custom_generator(train_image_generator, X_metadata[train_rows]),
    steps_per_epoch=len(train_image_generator),
    epochs=10,
    validation_data=custom_generator(validation_image_generator, X_metadata[validation_rows]),
    validation_steps=len(validation_image_generator)
)

In [None]:
# Plot training and validation accuracy per epoch from the fit history.
# The y-axis spans the full 0-1 accuracy range; the previous [0.8, 1] window
# hid any epoch whose accuracy was below 0.8.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='accuracy')
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_ylim([0, 1])
ax.legend(loc='lower right')
plt.show()

###
###
###





