In [None]:
import numpy as np
import pandas as pd
import os
import joblib
import h5py
import io

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate
from tensorflow.keras.applications import ResNet50

from PIL import Image
import warnings

# Silence every warning for the rest of the run to keep the output readable.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above, so API removals only show up as hard errors.
warnings.filterwarnings('ignore')
class DataPreprocessor:
    """Standardise numerical columns and one-hot encode categorical columns.

    The column lists are fixed at construction time.  The transformers fitted
    by the most recent ``fit_transform`` call are kept on ``scaler_`` and
    ``encoder_`` (``None`` until the corresponding step has run).
    """

    def __init__(self, numerical_features, categorical_features):
        """Store the column names to scale and to encode.

        Args:
            numerical_features: Iterable of column names to standardise.
            categorical_features: Iterable of column names to one-hot encode.
        """
        # Copied into lists so emptiness checks work for any iterable of names.
        self.numerical_features = list(numerical_features)
        self.categorical_features = list(categorical_features)
        self.scaler_ = None
        self.encoder_ = None

    @staticmethod
    def _make_encoder():
        """Return a dense-output OneHotEncoder for old and new scikit-learn."""
        try:
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only accept the ``sparse`` keyword.
            return OneHotEncoder(sparse=False)

    def custom_preprocessing(self, data):
        """Drop incomplete rows, scale numerical and encode categorical columns.

        Args:
            data: DataFrame containing the configured feature columns.

        Returns:
            A new DataFrame; ``data`` itself is left untouched.  Rows with any
            missing value are removed and the surviving rows keep their
            original index labels.
        """
        # Work on a copy so the caller's frame is not modified in place.
        data = data.dropna().copy()

        if self.numerical_features:
            scaler = StandardScaler()
            data[self.numerical_features] = scaler.fit_transform(data[self.numerical_features])
            self.scaler_ = scaler

        if self.categorical_features:
            encoder = self._make_encoder()
            encoded_categories = encoder.fit_transform(data[self.categorical_features])
            # Reuse data's index: after dropna the labels are no longer
            # 0..n-1, and a default RangeIndex would make concat misalign the
            # rows and pad them with NaN.
            encoded_df = pd.DataFrame(
                encoded_categories,
                columns=encoder.get_feature_names_out(self.categorical_features),
                index=data.index,
            )
            data = pd.concat([data.drop(columns=self.categorical_features), encoded_df], axis=1)
            self.encoder_ = encoder

        return data

    def fit_transform(self, data):
        """Fit the transformers on ``data`` and return the processed frame."""
        return self.custom_preprocessing(data)
class ImageDataGenerator(Sequence):
    """Keras ``Sequence`` yielding ``((images, metadata), targets)`` batches.

    Images are read from an HDF5 file keyed by ``isic_id``.  Every column of
    ``metadata_df`` other than ``isic_id`` and ``target`` is used as the
    tabular input; ``target`` supplies the labels.
    """

    def __init__(self, hdf5_file, metadata_df, batch_size=32, image_size=(128, 128)):
        """Open the image store and remember the batching parameters.

        Args:
            hdf5_file: Path of the HDF5 file holding the encoded images.
            metadata_df: DataFrame with ``isic_id``, ``target`` and the
                feature columns.
            batch_size: Number of rows per batch.
            image_size: Spatial size of the produced images, read as
                (height, width) to match how the model builds its input.
        """
        super().__init__()
        self.hdf5_file = h5py.File(hdf5_file, 'r')
        self.metadata_df = metadata_df
        self.batch_size = batch_size
        self.image_size = image_size
        self.indexes = np.arange(len(self.metadata_df))

    def __len__(self):
        """Return the number of batches, counting a short final batch."""
        # ceil rather than floor: __getitem__ already clamps the last batch,
        # and flooring would silently drop the trailing rows.
        return int(np.ceil(len(self.metadata_df) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``((images, metadata), targets)``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"Batch index {index} out of range")
        batch_start = index * self.batch_size
        batch_end = min(batch_start + self.batch_size, len(self.metadata_df))
        batch_indexes = self.indexes[batch_start:batch_end]
        batch_metadata = self.metadata_df.iloc[batch_indexes]
        images, metadata, kept = self._load_batch(batch_metadata)

        # Keep only the labels of rows whose image loaded, so inputs and
        # targets always have the same length.
        targets = batch_metadata['target'].values[kept]
        return (images, metadata), targets

    def close(self):
        """Close the underlying HDF5 file handle."""
        self.hdf5_file.close()

    def _load_batch(self, batch_metadata):
        """Load images for a batch and select the matching metadata rows.

        Returns:
            ``(images, metadata, kept)`` where ``kept`` holds the positions
            (within the batch) of the rows whose image could be loaded.
        """
        isic_ids = batch_metadata['isic_id'].values
        feature_rows = batch_metadata.drop(columns=['isic_id', 'target']).values
        # PIL expects (width, height) while image_size is (height, width).
        pil_size = (self.image_size[1], self.image_size[0])

        images = []
        kept = []
        for position, isic_id in enumerate(isic_ids):
            try:
                # Load and preprocess the image
                image_data = self.hdf5_file[isic_id][()]
                image = Image.open(io.BytesIO(image_data)).convert('RGB').resize(pil_size)
                image = preprocess_input(img_to_array(image))
            except Exception as e:
                # Best effort: report and skip rows whose image is unusable.
                print(f"Error loading image for isic_id {isic_id}: {e}")
                continue
            images.append(image)
            kept.append(position)

        kept = np.asarray(kept, dtype=int)
        return np.array(images), feature_rows[kept], kept

    def __data_generation(self, batch_metadata):
        """Return ``(images, metadata)`` for a batch (kept for compatibility)."""
        images, metadata, _ = self._load_batch(batch_metadata)
        return images, metadata

# The model graph is built once, when __init__ calls build_model().
class ModelTrainer:
    """Builds and fits a two-input (image + metadata) binary classifier."""

    def __init__(self, image_size, num_features):
        """Record the input dimensions, then build the compiled model."""
        self.image_size = image_size
        self.num_metadata_features = num_features
        self.model = self.build_model()

    def build_model(self):
        """Assemble and compile the network.

        A ResNet50 without its top and without pretrained weights processes
        the image; two dense layers process the metadata; both are
        concatenated and fed to a dense head ending in one sigmoid unit.
        """
        rows, cols = self.image_size[0], self.image_size[1]

        # Image branch
        image_input = Input(shape=(rows, cols, 3))
        backbone = ResNet50(weights=None, include_top=False, input_tensor=image_input)
        image_branch = Flatten()(backbone.output)

        # Metadata branch
        metadata_input = Input(shape=(self.num_metadata_features,))
        metadata_branch = metadata_input
        for units in (128, 64):
            metadata_branch = Dense(units, activation='relu')(metadata_branch)

        # Joint head
        head = Concatenate()([image_branch, metadata_branch])
        for units in (128, 64):
            head = Dense(units, activation='relu')(head)
        prediction = Dense(1, activation='sigmoid')(head)

        network = Model(inputs=[image_input, metadata_input], outputs=prediction)
        network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return network

    def train(self, training_generator, epochs=10):
        """Fit the model, print the recorded history, and return it."""
        history = self.model.fit(training_generator, epochs=epochs)

        if not (history and history.history):
            print("Training logs are None or empty!")
            return history

        print("Training history logs:")
        print(history.history)
        return history
# --- Training metadata: load, prune columns, preprocess, build generator ---
train_metadata = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv')

# Columns removed from the training features by name.
columns_not_in_test = {
    'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
    'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence',
}
# Columns where more than half of the rows are missing.
columns_with_missing_values = train_metadata.columns[
    train_metadata.isnull().sum() > len(train_metadata) * 0.5
]

# Keep the identifier and label aside before they are dropped from the features.
train_ids = train_metadata['isic_id']
train_targets = train_metadata['target']

train_metadata = (
    train_metadata
    .drop(columns=['isic_id', 'target'], errors='ignore')
    .drop(columns=columns_with_missing_values, errors='ignore')
    .drop(columns=columns_not_in_test, errors='ignore')
)

numerical_features = list(train_metadata.select_dtypes(include=['number']).columns)
categorical_features = list(train_metadata.select_dtypes(include=['object', 'category']).columns)

preprocessor = DataPreprocessor(numerical_features, categorical_features)
processed_train_df = preprocessor.fit_transform(train_metadata)
processed_train_df = processed_train_df.fillna(processed_train_df.mean())
processed_train_df['target'] = train_targets
processed_train_df['isic_id'] = train_ids

# Every column except 'isic_id' and 'target' is a model feature.
num_features = len(processed_train_df.columns) - 2
generator = ImageDataGenerator(
    '/kaggle/input/isic-2024-challenge/train-image.hdf5',
    processed_train_df,
)
trainer = ModelTrainer(image_size=(128, 128), num_features=num_features)
import tensorflow as tf
def create_tf_dataset(generator, num_metadata_features, for_prediction=False):
    """Wrap a batch generator in a ``tf.data.Dataset``.

    Args:
        generator: Indexable object with ``__len__`` whose items are
            ``((images, metadata), targets)`` batches.  If it has an
            ``image_size`` attribute, that (height, width) is used for the
            image spec; otherwise 128x128 is assumed.
        num_metadata_features: Width of the metadata input.
        for_prediction: When true, targets are dropped and each element is
            the 1-tuple ``((images, metadata),)``.

    Returns:
        A dataset yielding ``((images, metadata), targets)`` elements, or
        ``((images, metadata),)`` elements in prediction mode.
    """
    height, width = getattr(generator, 'image_size', (128, 128))
    inputs_spec = (
        tf.TensorSpec(shape=(None, height, width, 3), dtype=tf.float32),  # Images
        tf.TensorSpec(shape=(None, num_metadata_features), dtype=tf.float32),  # Metadata
    )

    if for_prediction:
        # A bare (images, metadata) pair would be read by Keras as (x, y);
        # the extra tuple level marks the pair as the model's two inputs.
        output_signature = (inputs_spec,)
    else:
        output_signature = (
            inputs_spec,
            tf.TensorSpec(shape=(None,), dtype=tf.float32),  # Targets
        )

    def batches():
        # Index explicitly instead of relying on the generator being iterable.
        for batch_index in range(len(generator)):
            (images, metadata), targets = generator[batch_index]
            if len(images) == 0:
                # An empty batch cannot satisfy the declared image shape.
                continue
            if for_prediction:
                yield ((images, metadata),)
            else:
                yield (images, metadata), targets

    return tf.data.Dataset.from_generator(batches, output_signature=output_signature)

# Stream the training batches through tf.data and fit for three epochs.
train_dataset = create_tf_dataset(generator=generator, num_metadata_features=num_features)
trainer.train(training_generator=train_dataset, epochs=3)
# Load the test metadata once and prune the same sparse columns as in training.
test_metadata = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv')
test_ids = test_metadata['isic_id']
test_metadata = test_metadata.drop(columns=['isic_id'], errors='ignore')
test_metadata = test_metadata.drop(columns=columns_with_missing_values, errors='ignore')

# NOTE(review): this fits a fresh scaler and encoder on the test split, so the
# test features are not scaled with the training statistics.
preprocessor = DataPreprocessor(numerical_features, categorical_features)
processed_test_df = preprocessor.fit_transform(test_metadata)
# Mirror the NaN handling applied to the training frame.
processed_test_df = processed_test_df.fillna(processed_test_df.mean(numeric_only=True))

# Match the training layout exactly: drop columns unseen in training, add the
# missing ones filled with 0, and use the same column order.
processed_test_df = processed_test_df.reindex(columns=processed_train_df.columns, fill_value=0)
processed_test_df['isic_id'] = test_ids
processed_test_df['target'] = 0
import numpy as np

# Take ids and features from the same frame so images, metadata and the
# submission rows stay aligned even if preprocessing removed rows.
prediction_ids = processed_test_df['isic_id']

# Convert the metadata to a NumPy array (feature columns only).
metadata_input_data = processed_test_df.drop(columns=['isic_id', 'target']).to_numpy(dtype=np.float32)

# Open the image store once and close it when done, instead of reopening the
# file for every image.
image_input_data = []
with h5py.File('/kaggle/input/isic-2024-challenge/test-image.hdf5', 'r') as test_images:
    for isic_id in prediction_ids:
        image_data = test_images[isic_id][()]
        image = Image.open(io.BytesIO(image_data)).convert('RGB').resize((128, 128))
        image = img_to_array(image)
        image = preprocess_input(image)
        image_input_data.append(image)

image_input_data = np.array(image_input_data)

# Predict using the model
predictions = trainer.model.predict([image_input_data, metadata_input_data])

# Output predictions
print("Predictions:", predictions)

threshold = 0.5
predicted_labels = (predictions > threshold).astype(int)

submission_df = pd.DataFrame({
    'isic_id': prediction_ids.to_numpy(),
    'target': predicted_labels.flatten()
})

submission_df.to_csv('submission.csv', index=False)