In [1]:
import pandas as pd
import os

In [21]:
# Directories where images are stored
lesion_data = '/content/drive/MyDrive/lesion_data/images/all_images'

# Get a list of all image_ids from the lesion_data directories
available_image_ids= [f for f in os.listdir(lesion_data) if f.endswith('.png')]
print(len(available_image_ids))
clinical_data = '/content/drive/MyDrive/lesion_data/metadata.csv'

df = pd.read_csv(clinical_data)
df = df.dropna()
# Specify columns to keep
columns_to_keep = ["smoke", "drink", "age", "gender", "skin_cancer_history", "img_id", "diagnostic"]

# Subset the DataFrame
df = df[columns_to_keep]
print(df.head())


2298
   smoke  drink  age  gender skin_cancer_history                img_id  \
1  False  False   55  FEMALE                True    PAT_46_881_939.png   
4  False   True   79    MALE                True  PAT_684_1302_588.png   
6  False   True   52  FEMALE               False  PAT_778_1471_835.png   
7  False  False   74  FEMALE               False   PAT_117_179_983.png   
9  False   True   58  FEMALE                True  PAT_705_4015_413.png   

  diagnostic  
1        BCC  
4        BCC  
6        BCC  
7        BCC  
9        ACK  


In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Get a list of all image_ids from the lesion_data_1 directory
# available_image_ids = [f for f in os.listdir(lesion_data_1) if f.endswith('.png')]
print(len(available_image_ids))
df = df[df['img_id'].isin(available_image_ids)]

# 1. Encoding Labels
encoder = LabelEncoder()
df.loc[:, 'diagnostic'] = encoder.fit_transform(df['diagnostic'])

# 2. One-Hot Encoding & Type Conversion
for col in df.select_dtypes(include=[bool]).columns:
    df[col] = df[col].astype(int)

print(df.columns)
df = pd.get_dummies(df, columns=['smoke', 'drink', 'gender', 'skin_cancer_history'], drop_first=True)
feature_cols = df.columns.difference(['img_id', 'diagnostic'])
df[feature_cols] = df[feature_cols].astype('float32')
df['diagnostic'] = df['diagnostic'].astype('int')


# 3. Normalization
scaler = StandardScaler().fit(df[feature_cols])
df[feature_cols] = scaler.transform(df[feature_cols])


print(df.head())

2298
Index(['smoke', 'drink', 'age', 'gender', 'skin_cancer_history', 'img_id',
       'diagnostic'],
      dtype='object')
        age                img_id  diagnostic  smoke_True  drink_True  \
1 -0.556305    PAT_46_881_939.png           1   -0.398503   -0.568471   
4  1.103322  PAT_684_1302_588.png           1   -0.398503    1.759103   
6 -0.763759  PAT_778_1471_835.png           1   -0.398503    1.759103   
7  0.757566   PAT_117_179_983.png           1   -0.398503   -0.568471   
9 -0.348852  PAT_705_4015_413.png           0   -0.398503    1.759103   

   gender_MALE  skin_cancer_history_True  
1    -0.987862                  1.087981  
4     1.012287                  1.087981  
6    -0.987862                 -0.919133  
7    -0.987862                 -0.919133  
9    -0.987862                  1.087981  


  df.loc[:, 'diagnostic'] = encoder.fit_transform(df['diagnostic'])


In [4]:
# 4. Train-Test-Validation Split
train_df, test_df = train_test_split(df, test_size=0.3, stratify=df['diagnostic'], random_state=42)
valid_df, val_test_df = train_test_split(test_df, test_size=0.5, stratify=test_df['diagnostic'], random_state=42)

# 5. Convert to TensorFlow Dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    dataframe.drop(columns=['img_id'])
    print(dataframe.head())
    labels = dataframe.pop('diagnostic')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds

In [5]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

batch_size = 32
img_size = (299, 299)

train_image_data_gen = ImageDataGenerator(rescale=1./255)  # add any augmentation options you like
valid_image_data_gen = ImageDataGenerator(rescale=1./255)  # typically no augmentation for validation

train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(valid_df)


           age                img_id  diagnostic  smoke_True  drink_True  \
1057  0.135206  PAT_605_1152_536.png           4    2.509388   -0.568471   
700  -0.694608   PAT_86_1109_306.png           1   -0.398503   -0.568471   
315   0.619264  PAT_831_1570_991.png           0   -0.398503   -0.568471   
76   -0.418003   PAT_288_441_312.png           1   -0.398503   -0.568471   
272   0.273508    PAT_587_3431_6.png           0   -0.398503    1.759103   

      gender_MALE  skin_cancer_history_True  
1057    -0.987862                 -0.919133  
700     -0.987862                  1.087981  
315     -0.987862                 -0.919133  
76      -0.987862                  1.087981  
272      1.012287                 -0.919133  
           age                img_id  diagnostic  smoke_True  drink_True  \
1859 -0.418003  PAT_860_1641_998.png           4   -0.398503    1.759103   
1849 -0.694608  PAT_837_1583_124.png           1   -0.398503   -0.568471   
1168  1.034171   PAT_504_953_569.png   

In [8]:
# Function to get full path of an image
def get_image_path(image_id):
    for folder in [lesion_data]:
        if image_id in os.listdir(folder):
            return os.path.join(folder, image_id)
    return None

# Get lists of image file paths
train_images = [get_image_path(filename) for filename in train_df['img_id'].values]
val_images = [get_image_path(filename) for filename in valid_df['img_id'].values]
test_images = [get_image_path(filename) for filename in val_test_df['img_id'].values]


# Extracting structured data and labels
train_struct_data = train_df.drop(columns=['img_id', 'diagnostic']).copy()
val_struct_data = valid_df.drop(columns=['img_id', 'diagnostic']).copy()

train_labels = train_df['diagnostic'].values
val_labels = valid_df['diagnostic'].values

In [6]:
from tensorflow.keras.utils import Sequence
import cv2
import numpy as np

class MultiInputDataGenerator(Sequence):
    def __init__(self, img_data, structured_data, labels, batch_size, img_size):
        self.img_data = img_data
        self.structured_data = structured_data
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size

    def __len__(self):
        return int(np.ceil(len(self.img_data) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x_img = self.img_data[idx * self.batch_size: (idx + 1) * self.batch_size]
        batch_x_struct = self.structured_data[idx * self.batch_size: (idx + 1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size: (idx + 1) * self.batch_size]

        # Ensure batch sizes are consistent across image and structured data
        min_batch_size = min(len(batch_x_img), len(batch_x_struct), len(batch_y))
        batch_x_img = batch_x_img[:min_batch_size]
        batch_x_struct = batch_x_struct[:min_batch_size]
        batch_y = batch_y[:min_batch_size]

        img_batch = np.array([cv2.resize(cv2.imread(file_name), self.img_size) for file_name in batch_x_img])
        struct_batch = np.array(batch_x_struct)

        print(f"Image batch shape: {img_batch.shape}, Struct batch shape: {struct_batch.shape}")

        return [img_batch, struct_batch], np.array(batch_y)


In [27]:
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam

# Load the ResNet50 model pre-trained on ImageNet data
base_model = ResNet50(input_shape=(299, 299, 3),
                      include_top=False,
                      weights='imagenet')

# Freeze the convolutional layers from the original model (optional)
for layer in base_model.layers:
    layer.trainable = False

# Define the customized layers for image input
image_x = layers.Flatten()(base_model.output)
image_x = layers.Dense(1024, activation='relu')(image_x)
image_x = layers.Dropout(0.2)(image_x)

# Assume that the clinical data has n_features attributes
n_features = 5  # You should adjust this according to your actual data
clinical_input = layers.Input(shape=(n_features,))
clinical_x = layers.Dense(256, activation='relu')(clinical_input)
clinical_x = layers.Dropout(0.2)(clinical_x)

# Concatenate the output of the two branches
combined_x = layers.concatenate([image_x, clinical_x])

# Add output layer for a 6-class classification problem
output = layers.Dense(6, activation='softmax', name='output')(combined_x)

# Creating the model
model = Model(inputs=[base_model.input, clinical_input], outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Summary of the model architecture
model.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, 299, 299, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 305, 305, 3)          0         ['input_19[0][0]']            
                                                                                                  
 conv1_conv (Conv2D)         (None, 150, 150, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 150, 150, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                        

In [9]:
batch_size = 256
img_size = (299, 299)



# Assuming your data is prepared in train_images, train_struct_data, and train_labels
train_gen = MultiInputDataGenerator(
    img_data=train_images,
    structured_data=train_struct_data,
    labels=train_labels,
    batch_size=batch_size,
    img_size=img_size
)

val_gen = MultiInputDataGenerator(
    img_data=val_images,
    structured_data=val_struct_data,
    labels=val_labels,
    batch_size=batch_size,
    img_size=img_size
)

from tensorflow.keras import mixed_precision

policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [28]:
# Train the model
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=11,
    steps_per_epoch=len(train_gen),
    validation_steps=len(val_gen),
    verbose=1
)

Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
Epoch 1/11
Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
1/5 [=====>........................] - ETA: 1:02 - loss: 2.7461 - accuracy: 0.1719Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
Image batch shape: (221, 299, 299, 3), Struct batch shape: (221, 5)
Epoch 2/11
Image batch shape: (7, 299, 299, 3), Struct batch shape: (7, 5)
1/5 [=====>........................] - ETA: 1s - loss: 17.5938 - accuracy: 0.0000e+00Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
Epoch 3/11
Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
1/5 [=====>........................] - ETA: 49s - loss: 10.0859 - accuracy: 0.4453Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
Epoch 4/11
Image batch shape: (256, 299, 299, 3), Struct batch shape: (256, 5)
1/5 [=====>........................] - ETA: 35s - loss: 6.4766 - accuracy: 0.5547Image batch shape:

In [None]:
# from tensorflow.keras.models import load_model
# model.save('/content/drive/MyDrive/lesion_data/my_model.h5')
# Save the entire model as a SavedModel.
# model.save('/content/drive/MyDrive/lesion_data/my_model_keras.keras')


In [None]:
# from tensorflow.keras.models import load_model
# model = load_model('/content/drive/MyDrive/lesion_data/my_model.h5')

In [29]:
# 1. Prepare the test dataset

# Get lists of image file paths for test set

test_images = [os.path.join(lesion_data, filename) for filename in val_test_df['img_id'].values]
for file_name in test_images:
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist.")

# Extracting structured data and labels from the test dataset
test_struct_data = test_df.drop(columns=['img_id', 'diagnostic']).copy()
test_labels = test_df['diagnostic'].values

# Create a test data generator
test_gen = MultiInputDataGenerator(
    img_data=test_images,
    structured_data=test_struct_data,
    labels=test_labels,
    batch_size=batch_size,
    img_size=img_size
)

# 2. Evaluate the model on the test dataset
loss, accuracy = model.evaluate(test_gen, steps=len(test_gen), verbose=1)

print(f"Test accuracy: {accuracy * 100:.2f}%")


Image batch shape: (222, 299, 299, 3), Struct batch shape: (222, 5)
Image batch shape: (222, 299, 299, 3), Struct batch shape: (222, 5)
Test accuracy: 45.05%
