In [None]:
# Notebook-wide switches.
# NOTE(review): presumably toggles (re)building the base training data — confirm where it is read.
CRIA_BASE_TRAIN = True
# NOTE(review): presumably the number of cross-validation folds — confirm where it is read.
FOLDS = 5

#### imports

In [None]:
!pip install --no-index --find-links=/kaggle/input/downloads-isic/ /kaggle/input/downloads-isic/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import os
import gc
import copy
import random
import pickle
import warnings
import io
import typing as tp
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import distance
from tqdm import tqdm
import pywt
import librosa
import cv2
import h5py
from PIL import Image
from io import BytesIO

import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
import efficientnet.tfkeras as efn

from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, train_test_split

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping

# Configurações
pd.set_option('display.max_rows', 10)
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)  # Suprimir todos os PerformanceWarning globalmente

# Limpeza de memória
gc.collect()


#### config gpu

In [None]:
# Select which GPUs are visible to this process
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# List the visible GPUs and enable memory growth on each of them
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the failure and keep going with the default memory behaviour
    print(e)

# Choose the distribution strategy from the number of GPUs found
if len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy()
    print(f'Using {len(gpus)} GPUs')
else:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    print(f'Using {len(gpus)} GPU')

# Enable automatic mixed precision when requested
MIX = True
if not MIX:
    print('Using full precision')
else:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    print('Mixed precision enabled')

# funções

set_random_seed

In [None]:
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Seed the Python, NumPy and TensorFlow random generators.

    Args:
        seed: Value passed to ``random.seed``, ``np.random.seed`` and
            ``tf.random.set_seed``; also written (as a string) to the
            ``PYTHONHASHSEED`` environment variable.
        deterministic: When true, sets the ``TF_DETERMINISTIC_OPS`` environment
            variable to ``'1'``; otherwise that variable is removed if present.
    """
    seed_text = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = seed_text
    tf.random.set_seed(seed)

    if not deterministic:
        os.environ.pop('TF_DETERMINISTIC_OPS', None)
        return
    os.environ['TF_DETERMINISTIC_OPS'] = '1'



DISTANCIAS EMBEDS
* df_vetores_controle
* calcular_distancias_treino_teste
* calcular_distancias_treino_teste_idx_2_especial

In [None]:
def df_vetores_controle(df_, embeds_data, col_saudavel, q_inf=0.1, col_group='patient_id', name_='b0'):
    """Add per-row distances to the group's "healthy" reference embedding.

    For every group in ``col_group`` the rows whose ``col_saudavel`` value is at
    or below the group's ``q_inf`` quantile are selected, and their embeddings
    are averaged into one reference vector. Each row then receives the cosine
    and Euclidean distance between its own embedding and its group's reference.

    Rows of ``df_`` and rows of ``embeds_data`` are matched by position (row
    ``i`` of ``df_`` goes with row ``i`` of ``embeds_data``), so the index of
    ``df_`` does not have to be a default ``RangeIndex``.

    Args:
        df_: DataFrame containing ``col_group`` and ``col_saudavel``. It is
            modified in place (two columns are added).
        embeds_data: 2-D array-like of embeddings, one row per row of ``df_``.
        col_saudavel: Column used to pick the low-quantile reference rows.
        q_inf: Quantile (per group) at or below which a row is a reference row.
        col_group: Grouping column.
        name_: Suffix used in the names of the created columns.

    Returns:
        ``(df_, [cosine_column_name, euclidean_column_name])``. Rows whose group
        has no reference row (e.g. every ``col_saudavel`` value of the group is
        missing) get NaN in both columns.
    """
    col_cos = f'distancia_sauldavel_cos_{name_}'
    col_eucli = f'distancia_sauldavel_eucli_{name_}'

    # Work positionally: drop whatever index the inputs carry.
    embeds_df = pd.DataFrame(embeds_data).reset_index(drop=True)
    embeds_np = embeds_df.to_numpy()
    grupos = df_[col_group].reset_index(drop=True)
    valores = df_[col_saudavel].reset_index(drop=True)

    # Step 1: per-group quantile threshold, broadcast back to every row.
    limiar = grupos.map(valores.groupby(grupos).quantile(q_inf))
    idx_sadios = valores.index[(valores <= limiar).to_numpy()]

    # Step 2: mean embedding of the reference rows of each group (single pass).
    referencias = embeds_df.loc[idx_sadios].groupby(grupos.loc[idx_sadios], sort=False).mean()
    ref_por_grupo = {grupo: vetor.to_numpy() for grupo, vetor in referencias.iterrows()}

    # Step 3: distance of every row to its group's reference embedding.
    n_rows = len(df_)
    dist_cos = np.full(n_rows, np.nan)
    dist_eucli = np.full(n_rows, np.nan)

    print('calculando df_vetores_controle ------------------')
    for pos, grupo in enumerate(grupos.to_numpy()):
        if pos % 10_000 == 0:
            print(f'--{pos}', end=' ')
        referencia = ref_por_grupo.get(grupo)
        if referencia is None:
            # No reference rows for this group: leave the distances as NaN.
            continue
        atual = embeds_np[pos]
        dist_cos[pos] = distance.cosine(referencia, atual)
        dist_eucli[pos] = np.linalg.norm(referencia - atual)

    df_[col_cos] = dist_cos
    df_[col_eucli] = dist_eucli

    # Step 4: return the frame together with the names of the new columns.
    return df_, [col_cos, col_eucli]

def calcular_distancias_treino_teste(df_train_, df_test_, embeds_data_train, embeds_data_test, is_train=True, name_='b0'):
    """Add distances from each row's embedding to the mean positive embedding.

    The reference vector is the mean training embedding over rows with
    ``target == 1``. In training mode, a positive row is compared against the
    mean recomputed without that row's own embedding (leave-one-out).

    Args:
        df_train_: Training frame with a ``target`` column; its index labels are
            used to look rows up in the training embeddings.
        df_test_: Test frame; only touched when ``is_train`` is false.
        embeds_data_train: Training embeddings (wrapped in a DataFrame).
        embeds_data_test: Test embeddings; only used when ``is_train`` is false.
        is_train: Selects which frame receives the two new columns.
        name_: Suffix for the created column names.

    Returns:
        ``(df_train_, df_test_, [cosine_column_name, euclidean_column_name])``.
    """
    col_cos = f'dist_pos_cos_{name_}'
    col_eucli = f'dist_pos_eucli_{name_}'

    # Embeddings as a DataFrame so rows can be fetched by index label
    train_embeds = pd.DataFrame(embeds_data_train)

    positives = df_train_[df_train_['target'] == 1]
    n_pos = len(positives)
    pos_centroid = train_embeds.loc[positives.index].mean(axis=0)

    def _row_distances(row, embeds, leave_one_out):
        if row.name % 10_000 == 0:
            print(f'--{row.name}', end=' ')
        if leave_one_out and row['target'] == 1:
            # Remove this row's own contribution from the positive mean
            reference = (pos_centroid * n_pos - train_embeds.loc[row.name]) / (n_pos - 1)
        else:
            reference = pos_centroid
        current = embeds.loc[row.name]
        return distance.cosine(reference, current), np.linalg.norm(reference - current)

    print('calculando distâncias ------------------')
    if is_train:
        frame, embeds = df_train_, train_embeds
    else:
        frame, embeds = df_test_, pd.DataFrame(embeds_data_test)

    pairs = frame.apply(lambda row: _row_distances(row, embeds, is_train), axis=1)
    frame[col_cos], frame[col_eucli] = zip(*pairs)

    return df_train_, df_test_, [col_cos, col_eucli]

def calcular_distancias_treino_teste_idx_2_especial(df_train_, df_test_, embeds_data_train, embeds_data_test, is_train=True, name_='b0'):
    """Add distances from each row's embedding to per-``iddx_2`` positive centroids.

    For each diagnosis label in a fixed list, the mean training embedding over
    rows with ``target == 1`` and ``iddx_2 == label`` is computed (a zero vector
    when there are no such rows). Every row of the selected frame then gets the
    cosine and Euclidean distance to each of those centroids.

    In training mode, a positive row is compared against its own label's
    centroid recomputed without that row (leave-one-out), using the number of
    positive rows of the label. When that row is the label's only positive, no
    other sample remains and the zero vector is used, matching the fallback for
    labels with no positives.

    Args:
        df_train_: Training frame with ``target`` and ``iddx_2`` columns; index
            labels are used to look rows up in the training embeddings.
        df_test_: Test frame; only used when ``is_train`` is false.
        embeds_data_train: Training embeddings (wrapped in a DataFrame).
        embeds_data_test: Test embeddings (wrapped in a DataFrame).
        is_train: Selects which frame receives the new columns.
        name_: Tag inserted in the created column names.

    Returns:
        ``(df_train_, df_test_, new_cols)``; the frame selected by ``is_train``
        is a new concatenated DataFrame containing the distance columns.
    """
    # Embeddings as DataFrames so rows can be fetched by index label
    embeds_data_df_train = pd.DataFrame(embeds_data_train)
    embeds_data_df_test = pd.DataFrame(embeds_data_test)
    n_dims = embeds_data_df_train.shape[1]

    # Diagnosis labels for which a positive centroid is built
    nomes_condicoes = (
        'Malignant epidermal proliferations',
        'Malignant adnexal epithelial proliferations - Follicular',
        'Malignant melanocytic proliferations (Melanoma)',
        'Indeterminate epidermal proliferations',
        'Indeterminate melanocytic proliferations',
        'Benign melanocytic proliferations',
        'Benign epidermal proliferations',
        'Benign soft tissue proliferations - Fibro-histiocytic',
        'Flat melanotic pigmentations - not melanocytic nevus',
        'Inflammatory or infectious diseases',
    )

    # Mean embedding and number of positive rows for each label
    medias = {}
    contagens = {}
    for condicao in nomes_condicoes:
        df_positivos = df_train_[(df_train_['target'] == 1) & (df_train_['iddx_2'] == condicao)]
        print(df_positivos.shape)
        contagens[condicao] = len(df_positivos)
        if not df_positivos.empty:
            medias[condicao] = embeds_data_df_train.loc[df_positivos.index].mean(axis=0)
        else:
            medias[condicao] = np.zeros(n_dims)

    def calcular_media_sem_o_atual(media_geral_, index, n):
        # Leave-one-out mean over the n positive rows of the label.
        if n <= 1:
            # Nothing is left once the only positive row is removed.
            return np.zeros(n_dims)
        vetor_atual = embeds_data_df_train.loc[index]
        return (media_geral_ * n - vetor_atual) / (n - 1)

    def calcular_diferenca(row, embeds_totais, is_train=True):
        embed_data_atual = embeds_totais.loc[row.name]
        distancias = {}
        for condicao in nomes_condicoes:
            if is_train and row['target'] == 1 and row['iddx_2'] == condicao:
                embed_data_condicao = calcular_media_sem_o_atual(medias[condicao], row.name, contagens[condicao])
            else:
                embed_data_condicao = medias[condicao]
            distancias[f'dist_idx2_cos_{name_}_{condicao}'] = distance.cosine(embed_data_condicao, embed_data_atual)
            distancias[f'dist_idx2_eucli_{name_}_{condicao}'] = np.linalg.norm(embed_data_condicao - embed_data_atual)
        return pd.Series(distancias)

    if is_train:
        tqdm.pandas(desc="Calculando distâncias para dados de treino")
        resultado_train = df_train_.progress_apply(lambda row: calcular_diferenca(row, embeds_data_df_train, is_train=True), axis=1)
        df_train_ = pd.concat([df_train_, resultado_train], axis=1)
    else:
        tqdm.pandas(desc="Calculando distâncias para dados de teste")
        resultado_test = df_test_.progress_apply(lambda row: calcular_diferenca(row, embeds_data_df_test, is_train=False), axis=1)
        df_test_ = pd.concat([df_test_, resultado_test], axis=1)

    new_cols = []
    for condicao in nomes_condicoes:
        new_cols.append(f'dist_idx2_cos_{name_}_{condicao}')
        new_cols.append(f'dist_idx2_eucli_{name_}_{condicao}')

    return df_train_, df_test_, new_cols

CALCULO MEDIA DE FEATURES
* calculate_ratio_to_patient_mean
* calculate_ratio_to_patient_mean_capped
* calculate_ratio_to_patient_mean_quartil
* calculate_ratio_to_patient_mean_capped2


In [None]:
def calculate_ratio_to_patient_mean(df, groupby_key, feature_list):
    """Add, for each feature, the ratio of the value to its group mean.

    Args:
        df: DataFrame holding ``groupby_key`` and every column in
            ``feature_list``; it is modified in place.
        groupby_key: Column (or key) passed to ``df.groupby``.
        feature_list: Feature column names to process.

    Returns:
        ``(df, new_columns)`` where ``new_columns`` lists the created
        ``"<feature>_ratio_to_patient_mean"`` columns in processing order.
    """
    created = []
    for feat in tqdm(feature_list, desc="Processing features"):
        ratio_col = f"{feat}_ratio_to_patient_mean"
        # Group mean broadcast back to the original row positions
        group_mean = df.groupby(groupby_key)[feat].transform('mean')
        df[ratio_col] = df[feat] / group_mean
        created.append(ratio_col)

    return df, created


def calculate_ratio_to_patient_mean_capped(df, groupby_key, feature_list, q_inf=0.1, q_sup=0.9):
    """Add, for each feature, the ratio of the value to its group's capped mean.

    For every feature the per-group ``q_inf`` and ``q_sup`` quantiles are
    computed, the feature is clipped to that interval, and the group mean of the
    *clipped* values is used as the denominator of the ratio.

    Note: an earlier version grouped the unclipped column (it re-selected the
    column from ``df`` by name), so the result equalled the plain group-mean
    ratio. Models fitted on those features need retraining.

    Args:
        df: DataFrame holding ``groupby_key`` and every column in ``feature_list``.
        groupby_key: Name of the grouping column.
        feature_list: Feature column names to process.
        q_inf: Lower quantile used for clipping.
        q_sup: Upper quantile used for clipping.

    Returns:
        ``(df, new_columns)`` where ``df`` is a new DataFrame (the input
        concatenated with the created ``"<feature>_ratio_to_capped_patient_mean"``
        columns) and ``new_columns`` lists those names in processing order.
    """
    new_columns = []
    ratios = {}
    group_ids = df[groupby_key]

    for feature in tqdm(feature_list, desc="Processing features"):
        # Per-group clipping bounds, broadcast back to every row
        bounds = df.groupby(groupby_key)[feature].quantile([q_inf, q_sup]).unstack(level=-1)
        lower_percentile = group_ids.map(bounds[q_inf])
        upper_percentile = group_ids.map(bounds[q_sup])

        # Clip, then average the clipped values (not the raw column) per group
        capped_values = df[feature].clip(lower=lower_percentile, upper=upper_percentile)
        patient_mean_capped = capped_values.groupby(group_ids).transform('mean')

        new_col_name = f"{feature}_ratio_to_capped_patient_mean"
        ratios[new_col_name] = df[feature] / patient_mean_capped
        new_columns.append(new_col_name)

    # Attach all new columns at once to avoid fragmenting the frame
    new_df = pd.DataFrame(ratios, index=df.index)
    df = pd.concat([df, new_df], axis=1)

    return df, new_columns

def calculate_ratio_to_patient_mean_quartil(df, col_group, feature_list, q_inf=0.1):
    """Add ratios of each feature to the mean over the group's low-``media_pred_oof`` rows.

    Rows whose ``media_pred_oof`` is at or below the group's ``q_inf`` quantile
    form the reference subset; each feature is divided by the reference-subset
    mean of its group.

    Args:
        df: DataFrame with ``col_group``, ``media_pred_oof`` and every column in
            ``feature_list``. It is modified in place; a helper column
            ``quartil_01`` holding the per-group quantile is also left on it.
        col_group: Name of the grouping column.
        feature_list: Feature column names to process.
        q_inf: Quantile defining the reference subset.

    Returns:
        ``(df, new_columns)`` with the created
        ``"<feature>_ratio_to_quartil_patient_mean"`` column names.
    """
    # Step 1: per-group quantile of the prediction column, then the reference rows
    df['quartil_01'] = df.groupby(col_group)['media_pred_oof'].transform(lambda x: x.quantile(q_inf))
    reference_rows = df.loc[df['media_pred_oof'] <= df['quartil_01']]

    # Step 2: per-group feature means over the reference rows only
    means = reference_rows.groupby(col_group)[feature_list].mean()

    # Step 3: ratio of every value to its group's reference mean
    new_columns = []
    for feature in tqdm(feature_list, desc="Processing features"):
        ratio_name = f"{feature}_ratio_to_quartil_patient_mean"
        reference_mean = df[col_group].map(means[feature])
        df[ratio_name] = df[feature] / reference_mean
        new_columns.append(ratio_name)

    return df, new_columns

def calculate_ratio_to_patient_mean_capped2(df, groupby_key, feature_list, q_inf=0.1, q_sup=0.9):
    """Add capped z-score, capped min-max scaling and capped skewness per feature.

    For every feature the per-group ``q_inf`` / ``q_sup`` quantiles are computed
    and the feature is clipped to that interval. From the *clipped* values the
    group mean, standard deviation and skewness are derived, producing:

    * ``"<feature>_z_score_capped"``: (raw value - capped mean) / capped std
    * ``"<feature>_min_max_scaled_capped"``: (clipped - lower) / (upper - lower)
    * ``"<feature>_skewness_capped"``: group skewness of the clipped values

    Note: an earlier version computed mean/std/skew on the unclipped column (it
    re-selected the column from ``df`` by name). Kurtosis is not computed; it
    was disabled in that earlier version as well.

    Args:
        df: DataFrame holding ``groupby_key`` and every column in ``feature_list``.
        groupby_key: Name of the grouping column.
        feature_list: Feature column names to process.
        q_inf: Lower quantile used for clipping.
        q_sup: Upper quantile used for clipping.

    Returns:
        ``(df, new_columns)`` where ``df`` is a new DataFrame (input plus the
        created columns) and ``new_columns`` lists the created names.
    """
    new_columns = []
    created = {}
    group_ids = df[groupby_key]

    for feature in tqdm(feature_list, desc="Processing features"):
        # Per-group clipping bounds, broadcast back to every row
        bounds = df.groupby(groupby_key)[feature].quantile([q_inf, q_sup]).unstack(level=-1)
        lower_percentile = group_ids.map(bounds[q_inf])
        upper_percentile = group_ids.map(bounds[q_sup])
        capped_values = df[feature].clip(lower=lower_percentile, upper=upper_percentile)

        # Group statistics of the clipped values (not of the raw column)
        grouped_capped = capped_values.groupby(group_ids)
        patient_mean_capped = grouped_capped.transform('mean')
        patient_std_capped = grouped_capped.transform('std')
        skewness_capped = grouped_capped.transform('skew')

        z_name = f"{feature}_z_score_capped"
        minmax_name = f"{feature}_min_max_scaled_capped"
        skew_name = f"{feature}_skewness_capped"

        created[z_name] = (df[feature] - patient_mean_capped) / patient_std_capped
        created[minmax_name] = (capped_values - lower_percentile) / (upper_percentile - lower_percentile)
        created[skew_name] = skewness_capped

        new_columns.extend([z_name, minmax_name, skew_name])

    # Attach all new columns at once to avoid fragmenting the frame
    new_df = pd.DataFrame(created, index=df.index)
    df = pd.concat([df, new_df], axis=1)
    return df, new_columns

import pandas as pd
import numpy as np
from tqdm import tqdm

# def calculate_ratio_to_patient_mean_capped2(df, groupby_key, feature_list, q_inf=0.1, q_sup=0.9):
#     new_columns = []
#     percentiles = {}

#     # Pré-calcular os percentis e curtose
#     for feature in feature_list:
#         percentiles[feature] = df.groupby(groupby_key)[feature].quantile([q_inf, q_sup]).unstack(level=-1)
#         df[f"{feature}_capped"] = df[feature].clip(lower=df[groupby_key].map(percentiles[feature][q_inf]),
#                                                    upper=df[groupby_key].map(percentiles[feature][q_sup]))
    
#     # Cálculo de curtose uma vez para cada feature capada
#     kurtosis = df.groupby(groupby_key).apply(lambda x: x[[f"{f}_capped" for f in feature_list]].kurt())

#     new_df = pd.DataFrame(index=df.index)
    
#     for feature in tqdm(feature_list, desc="Processing features"):
#         capped_values = df[f"{feature}_capped"]
#         patient_mean_capped = df.groupby(groupby_key)[capped_values.name].transform('mean')
#         skewness_capped = df.groupby(groupby_key)[capped_values.name].transform('skew')
        
#         new_df[f"{feature}_z_score_capped"] = (df[feature] - patient_mean_capped) / df.groupby(groupby_key)[capped_values.name].transform('std')
#         new_df[f"{feature}_min_max_scaled_capped"] = (capped_values - df[groupby_key].map(percentiles[feature][q_inf])) / (df[groupby_key].map(percentiles[feature][q_sup]) - df[groupby_key].map(percentiles[feature][q_inf]))
#         new_df[f"{feature}_skewness_capped"] = skewness_capped
#         new_df[f"{feature}_kurtosis_capped"] = df[groupby_key].map(kurtosis[f"{feature}_capped"])
        
#         new_columns.extend([
#             f"{feature}_z_score_capped",
#             f"{feature}_min_max_scaled_capped",
#             f"{feature}_skewness_capped",
#             f"{feature}_kurtosis_capped"
#         ])
    
#     df = pd.concat([df, new_df], axis=1)
#     return df, new_columns
# def calculate_ratio_to_patient_mean_capped2(df, groupby_key, feature_list, q_inf=0.1, q_sup=0.9):
#     new_columns = []
#     percentiles = {}
    
#     # Pré-calcular os percentis 0.10 e 0.90 para cada grupo e cada feature
#     for feature in feature_list:
#         percentiles[feature] = df.groupby(groupby_key)[feature].quantile([q_inf, q_sup]).unstack(level=-1)
    
#     new_df = pd.DataFrame(index=df.index)
    
#     for feature in tqdm(feature_list, desc="Processing features"):
#         lower_percentile = df[groupby_key].map(percentiles[feature][q_inf])
#         upper_percentile = df[groupby_key].map(percentiles[feature][q_sup])
#         capped_values = df[feature].clip(lower=lower_percentile, upper=upper_percentile)
#         patient_mean_capped = df.groupby(groupby_key)[capped_values.name].transform('mean')
#         skewness_capped = df.groupby(groupby_key)[capped_values.name].transform('skew')
        
#         # Usar apply para calcular curtose
#         kurtosis_capped = df.groupby(groupby_key)[capped_values.name].apply(lambda x: x.kurt())
        
#         new_df[f"{feature}_z_score_capped"] = (df[feature] - patient_mean_capped) / df.groupby(groupby_key)[capped_values.name].transform('std')
#         new_df[f"{feature}_min_max_scaled_capped"] = (capped_values - lower_percentile) / (upper_percentile - lower_percentile)
#         new_df[f"{feature}_skewness_capped"] = skewness_capped
#         new_df[f"{feature}_kurtosis_capped"] = df[groupby_key].map(kurtosis_capped)
        
#         new_columns.extend([
#             f"{feature}_z_score_capped",
#             f"{feature}_min_max_scaled_capped",
#             f"{feature}_skewness_capped",
#             f"{feature}_kurtosis_capped"
#         ])
    
#     df = pd.concat([df, new_df], axis=1)
#     return df, new_columns

SCORE
* comp_score
* comp_scorel

In [None]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    """Compute the partial AUC above a minimum true-positive rate.

    Labels and predictions are both flipped (``|y - 1|`` and ``1 - p``) so that
    the TPR constraint becomes a ``max_fpr`` constraint for ``roc_auc_score``;
    the standardised value it returns is then mapped back to the un-normalised
    range ``[0.5 * max_fpr**2, max_fpr]``.

    Args:
        solution: Ground-truth values; ``solution.values`` is used as-is.
        submission: Predicted scores; ``submission.values`` is used as-is.
        row_id_column_name: Accepted but not used by this function.
        min_tpr: Minimum true-positive rate of interest.

    Returns:
        The rescaled partial AUC as a float.
    """
    max_fpr = abs(1-min_tpr)
    flipped_truth = abs(np.asarray(solution.values)-1)
    flipped_scores = np.array([1.0 - score for score in submission.values])

    scaled_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Map from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    return 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (scaled_auc - 0.5)

def comp_scorel(y_true, y_pred):
    """Partial AUC above 80% TPR, returned as a ``(name, value, flag)`` triple.

    Uses the same flipping and rescaling as the competition-style score: labels
    become ``|y - 1|``, predictions ``1 - p``, and the standardised
    ``roc_auc_score(..., max_fpr=0.2)`` is mapped to ``[0.5 * max_fpr**2, max_fpr]``.

    Args:
        y_true: Ground-truth labels (array-like supporting arithmetic).
        y_pred: Predicted scores (array-like supporting arithmetic).

    Returns:
        ``('custom_auc', partial_auc, True)``.
        NOTE(review): the triple looks like a LightGBM custom-metric result
        (name, value, is_higher_better) — confirm against the caller.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    flipped_truth = abs(y_true - 1)
    flipped_scores = 1.0 - y_pred
    scaled_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (scaled_auc - 0.5)
    return 'custom_auc', partial_auc, True


FEA FUNC
* carregar_oof_embeddings_np
* carregar_e_preparar_dados_de_treino_old
* fit_encode_categorical_columns
* transform_categorical_columns_test

In [None]:
def carregar_oof_embeddings_np(path_embs='/kaggle/input/modelos-down/oof0_embedding.npy', 
                               n_comp=32, verbose=True, model_n=3):
    """Load an embedding array from disk and reduce it with a freshly fitted PCA.

    Args:
        path_embs: Path of the ``.npy`` file read with ``np.load``.
        n_comp: Number of principal components to keep.
        verbose: When true, prints the shape of the reduced frame and displays
            its first two rows.
        model_n: Tag used in the generated column names.

    Returns:
        ``(embed_data, pca, pca_df, columns)``: the raw array, the fitted PCA
        object, the reduced DataFrame and its column names
        (``pred_embed_<i>_b<model_n>``).
    """
    embed_data = np.load(path_embs)

    # Fit the PCA on the loaded embeddings and project them in one step
    pca = PCA(n_components=n_comp)
    principal_components = pca.fit_transform(embed_data)

    columns = [f'pred_embed_{i}_b{model_n}' for i in range(n_comp)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    if verbose:
        print(pca_df.shape)
        display(pca_df.head(2))

    return embed_data, pca, pca_df, columns

In [None]:
def carregar_e_preparar_dados_de_treino_old(df_meta, oof_paths, pca_df=None):
    """Attach out-of-fold prediction columns (and optional PCA frames) to the metadata.

    Each CSV in ``oof_paths`` must contain ``isic_id``, ``pred_oof`` and
    ``fold`` columns with rows in the same order as ``df_meta``. Its
    ``pred_oof`` column is copied into ``df_meta`` under the dictionary key.
    The ``fold`` column is taken from the ``'pred_oof0'`` file (or from the
    first file when ``df_meta`` has no ``fold`` column yet); every other file
    must agree with it on every row.

    Args:
        df_meta: Metadata frame with an ``isic_id`` column; modified in place.
        oof_paths: Mapping ``{new_column_name: csv_path}``.
        pca_df: Optional iterable of DataFrames (a single DataFrame is also
            accepted) concatenated column-wise to the result.

    Returns:
        ``(df_meta, new_cols)`` with the names of all added feature columns.

    Raises:
        ValueError: If a file's ``isic_id`` column differs from ``df_meta`` or
            if any row's fold differs from the fold already stored.
    """
    new_cols = []

    # Process each OOF file
    for key, value in oof_paths.items():
        df_temp = pd.read_csv(value)
        # The ids must match exactly, in the same order
        if not df_meta['isic_id'].equals(df_temp['isic_id']):
            raise ValueError(f'Discrepância na ordem dos isic_id entre df_meta e {value}')
        df_temp = df_temp.rename(columns={"pred_oof": key})  # name the prediction column after the key

        df_meta[key] = df_temp[key]
        if key == 'pred_oof0' or 'fold' not in df_meta.columns:
            df_meta['fold'] = df_temp['fold']
        elif (df_meta['fold'] != df_temp['fold']).any():
            # A single differing row already means the folds are inconsistent
            raise ValueError(f'Discrepância nos folds entre df_meta e {value}')
        new_cols.append(key)

    # Append the PCA components when provided
    if pca_df is not None:
        if isinstance(pca_df, pd.DataFrame):
            pca_df = [pca_df]
        for pca__ in pca_df:
            df_meta = pd.concat([df_meta, pca__], axis=1)
            new_cols = new_cols + pca__.columns.to_list()

    gc.collect()
    return df_meta, new_cols

In [None]:
def fit_encode_categorical_columns(df, cat_cols):
    """Fit an ``OrdinalEncoder`` on the categorical columns and encode them in place.

    Unknown categories are encoded as ``-2`` and missing values as ``-1``.

    Args:
        df: DataFrame containing ``cat_cols``; the columns are overwritten.
        cat_cols: Names of the categorical columns.

    Returns:
        ``(df, encoder)`` with the fitted encoder for later reuse.
    """
    encoder_options = dict(
        categories='auto',
        dtype=int,
        handle_unknown='use_encoded_value',
        unknown_value=-2,          # code for categories not seen during fit
        encoded_missing_value=-1,  # code for missing values
    )
    encoder = OrdinalEncoder(**encoder_options)

    # Fit and transform in one step
    encoded = encoder.fit_transform(df[cat_cols])

    # Overwrite the original columns with their integer codes
    for position, name in enumerate(cat_cols):
        df[name] = encoded[:, position]

    return df, encoder

def transform_categorical_columns_test(df, cat_cols, encoder):
    """Encode the categorical columns of ``df`` in place with a fitted encoder.

    Args:
        df: DataFrame containing ``cat_cols``; the columns are overwritten.
        cat_cols: Names of the categorical columns, in the encoder's order.
        encoder: Object whose ``transform`` returns a 2-D array with one column
            per entry of ``cat_cols``.

    Returns:
        The same ``df`` object, with the encoded columns.
    """
    encoded = encoder.transform(df[cat_cols])

    # Overwrite the original columns with their encoded values
    for position, name in enumerate(cat_cols):
        df[name] = encoded[:, position]

    return df


FEATURE ENGINEERING

In [None]:
# Numeric input columns
num_cols = [
    'age_approx',
    'clin_size_long_diam_mm',
    'tbp_lv_A', 'tbp_lv_Aext',
    'tbp_lv_B', 'tbp_lv_Bext',
    'tbp_lv_C', 'tbp_lv_Cext',
    'tbp_lv_H', 'tbp_lv_Hext',
    'tbp_lv_L', 'tbp_lv_Lext',
    'tbp_lv_areaMM2',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL',
    'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
    'tbp_lv_eccentricity',
    'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence',
    'tbp_lv_norm_border', 'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL', 'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
]

# Categorical input columns
cat_cols = [
    "sex",
    "tbp_tile_type",
    "tbp_lv_location",
    "tbp_lv_location_simple",
]

# Changed: the line that explicitly referenced df_train now uses df
def feature_engineering(df):
    """Add hand-crafted lesion features to ``df`` in place.

    The features are only computed when ``df`` has no ``num_rows_per_patient``
    column yet, so calling the function twice on the same frame does not
    recompute them. Features are created in a fixed order, and later ones may
    read earlier ones (e.g. ``shape_complexity_index`` uses ``border_complexity``).

    Args:
        df: DataFrame with the ``tbp_lv_*`` measurements, ``age_approx``,
            ``clin_size_long_diam_mm``, ``anatom_site_general`` and ``patient_id``.

    Returns:
        ``(df, new_num_cols, new_cat_cols)``: the same frame plus the names of
        the numeric and categorical feature columns.
    """
    new_cat_cols = ["combined_anatomical_site"]

    # Feature name -> expression; insertion order is the column creation order
    first_part = {
        "lesion_size_ratio": lambda d: d["tbp_lv_minorAxisMM"] / d["clin_size_long_diam_mm"],
        "lesion_shape_index": lambda d: d["tbp_lv_areaMM2"] / (d["tbp_lv_perimeterMM"] ** 2),
        "hue_contrast": lambda d: (d["tbp_lv_H"] - d["tbp_lv_Hext"]).abs(),
        "luminance_contrast": lambda d: (d["tbp_lv_L"] - d["tbp_lv_Lext"]).abs(),
        "lesion_color_difference": lambda d: np.sqrt(d["tbp_lv_deltaA"] ** 2 + d["tbp_lv_deltaB"] ** 2 + d["tbp_lv_deltaL"] ** 2),
        "border_complexity": lambda d: d["tbp_lv_norm_border"] + d["tbp_lv_symm_2axis"],
        "color_uniformity": lambda d: d["tbp_lv_color_std_mean"] / d["tbp_lv_radial_color_std_max"],
        "3d_position_distance": lambda d: np.sqrt(d["tbp_lv_x"] ** 2 + d["tbp_lv_y"] ** 2 + d["tbp_lv_z"] ** 2),
        "perimeter_to_area_ratio": lambda d: d["tbp_lv_perimeterMM"] / d["tbp_lv_areaMM2"],
        "lesion_visibility_score": lambda d: d["tbp_lv_deltaLBnorm"] + d["tbp_lv_norm_color"],
        "combined_anatomical_site": lambda d: d["anatom_site_general"] + "_" + d["tbp_lv_location"],
        "symmetry_border_consistency": lambda d: d["tbp_lv_symm_2axis"] * d["tbp_lv_norm_border"],
        "color_consistency": lambda d: d["tbp_lv_stdL"] / d["tbp_lv_Lext"],
    }
    second_part = {
        "size_age_interaction": lambda d: d["clin_size_long_diam_mm"] * d["age_approx"],
        "hue_color_std_interaction": lambda d: d["tbp_lv_H"] * d["tbp_lv_color_std_mean"],
        "lesion_severity_index": lambda d: (d["tbp_lv_norm_border"] + d["tbp_lv_norm_color"] + d["tbp_lv_eccentricity"]) / 3,
        "shape_complexity_index": lambda d: d["border_complexity"] + d["lesion_shape_index"],
        "color_contrast_index": lambda d: d["tbp_lv_deltaA"] + d["tbp_lv_deltaB"] + d["tbp_lv_deltaL"] + d["tbp_lv_deltaLBnorm"],
        "log_lesion_area": lambda d: np.log(d["tbp_lv_areaMM2"] + 1),
        "normalized_lesion_size": lambda d: d["clin_size_long_diam_mm"] / d["age_approx"],
        "mean_hue_difference": lambda d: (d["tbp_lv_H"] + d["tbp_lv_Hext"]) / 2,
        "std_dev_contrast": lambda d: np.sqrt((d["tbp_lv_deltaA"] ** 2 + d["tbp_lv_deltaB"] ** 2 + d["tbp_lv_deltaL"] ** 2) / 3),
        "color_shape_composite_index": lambda d: (d["tbp_lv_color_std_mean"] + d["tbp_lv_area_perim_ratio"] + d["tbp_lv_symm_2axis"]) / 3,
        "3d_lesion_orientation": lambda d: np.arctan2(d["tbp_lv_y"], d["tbp_lv_x"]),
        "overall_color_difference": lambda d: (d["tbp_lv_deltaA"] + d["tbp_lv_deltaB"] + d["tbp_lv_deltaL"]) / 3,
        "symmetry_perimeter_interaction": lambda d: d["tbp_lv_symm_2axis"] * d["tbp_lv_perimeterMM"],
        "comprehensive_lesion_index": lambda d: (d["tbp_lv_area_perim_ratio"] + d["tbp_lv_eccentricity"] + d["tbp_lv_norm_color"] + d["tbp_lv_symm_2axis"]) / 4,
    }

    if "num_rows_per_patient" not in df.columns:
        print('Primeira parte das features')
        for name, compute in first_part.items():
            df[name] = compute(df)

        print('Segunda parte das features')
        for name, compute in second_part.items():
            df[name] = compute(df)

        print('Terceira parte das features')
        df['num_rows_per_patient'] = df.groupby('patient_id')['patient_id'].transform('count')

    # Numeric features: every created column except the categorical one
    new_num_cols = [name for name in first_part if name not in new_cat_cols]
    new_num_cols += list(second_part)
    new_num_cols.append("num_rows_per_patient")

    return df, new_num_cols, new_cat_cols

In [None]:
def cria_features_agregadas(df, color_columns):
    """Add per-patient aggregate statistics of the given columns to every row.

    For each column in ``color_columns`` the mean, median, standard deviation,
    minimum and maximum are computed per ``patient_id`` and merged back onto
    ``df`` as ``"<column>_<stat>"`` columns.

    The statistics are requested by name (``'mean'``, ``'median'``, ``'std'``,
    ``'min'``, ``'max'``) rather than through NumPy callables, so the column
    suffixes do not depend on the NumPy version and the standard deviation is
    the pandas groupby ``std``.

    Args:
        df: DataFrame with a ``patient_id`` column and the columns to aggregate.
        color_columns: Column name or list of column names to aggregate.

    Returns:
        ``(df_with_features, new_columns)``: a new merged DataFrame (left join
        on ``patient_id``) and the names of the added columns.
    """
    # A bare string would yield a flat column index; always work with a list
    if isinstance(color_columns, str):
        color_columns = [color_columns]

    estatisticas = ['mean', 'median', 'std', 'min', 'max']
    features = df.groupby('patient_id')[color_columns].agg(estatisticas)

    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>"
    features.columns = ['_'.join(col).strip() for col in features.columns.values]
    new_columns = features.columns.tolist()

    # Bring patient_id back as a column and merge the aggregates onto every row
    features = features.reset_index()
    df_with_features = pd.merge(df, features, on='patient_id', how='left')

    return df_with_features, new_columns

TEST FUNCTIONS


In [None]:
class DataGenerator_old(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of images read from an HDF5 file.

    Each row of ``data`` provides an ``isic_id`` used as the key of the encoded
    image bytes inside the HDF5 file and, unless ``mode == 'test'``, a
    ``target`` value used as the label.
    """

    def __init__(self, data, batch_size=8, shuffle=False, fp_hdf_=None, mode='train', size_image=300):
        self.data = data              # DataFrame with one row per image
        self.batch_size = batch_size
        self.shuffle = shuffle        # reshuffle row order after every epoch
        self.mode = mode              # 'test' leaves the labels at zero
        self.fp_hdf = fp_hdf_         # path handed to h5py.File
        self.size_image = size_image  # images are resized to size_image x size_image
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        n_batches = np.ceil(len(self.data) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        return self.__data_generation(self.indexes[start:stop])

    def on_epoch_end(self):
        """Rebuild the row order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Load, resize and scale the images for the given row positions."""
        side = self.size_image
        n_items = len(indexes)
        # Pre-allocated output buffers
        X = np.zeros((n_items, side, side, 3), dtype='float32')
        y = np.zeros((n_items, 1), dtype='float32')
        with_labels = self.mode != 'test'

        with h5py.File(self.fp_hdf, 'r') as hdf:
            for slot, row_pos in enumerate(indexes):
                row = self.data.iloc[row_pos]
                encoded_bytes = hdf[row['isic_id']][()]
                decoded = np.array(Image.open(BytesIO(encoded_bytes)))
                resized = cv2.resize(decoded, (side, side))
                # Scale pixel values to the [0, 1] range
                X[slot] = resized / 255.0
                if with_labels:
                    y[slot] = row["target"]

        return X, y

def lrfn(epoch):
    """Return the learning rate for the given (0-based) epoch.

    The schedule holds 1e-3 for three epochs, 1e-4 for three epochs and 1e-5
    for three epochs. Epochs past the end of the schedule keep the last value
    instead of raising ``IndexError``.
    """
    schedule = (1e-3, 1e-3, 1e-3, 1e-4, 1e-4, 1e-4, 1e-5, 1e-5, 1e-5)
    return schedule[min(epoch, len(schedule) - 1)]
    
class pAUC2(tf.keras.metrics.Metric):
    """Keras metric that buffers labels/predictions and computes a partial AUC.

    Every ``update_state`` call appends the batch tensors to Python lists;
    ``result`` concatenates them and evaluates ``calculate_pauc`` through
    ``tf.py_function``.
    """

    def __init__(self, name='pAUC', **kwargs):
        super(pAUC2, self).__init__(name=name, **kwargs)
        self.predictions = []
        self.labels = []

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Buffer one batch; ``sample_weight`` is accepted but not used."""
        self.labels.append(y_true)
        self.predictions.append(y_pred)

    def result(self):
        """Partial AUC over everything buffered since the last reset."""
        if not self.labels:
            # Nothing buffered yet: tf.concat cannot take an empty list
            return tf.constant(0.0, dtype=tf.float32)

        labels = tf.concat(self.labels, axis=0)
        predictions = tf.concat(self.predictions, axis=0)

        # Use tf.py_function to wrap the NumPy / scikit-learn computation
        return tf.py_function(self.calculate_pauc, [labels, predictions], Tout=tf.float32)

    def calculate_pauc(self, labels, predictions):
        """Trapezoidal area of ``tpr - 0.8`` over ``fpr`` where ``tpr > 0.8``."""
        labels = labels.numpy()
        predictions = predictions.numpy()
        fpr, tpr, thresholds = roc_curve(labels, predictions)
        idxs = tpr > 0.8
        if np.any(idxs):
            return np.float32(np.trapz(tpr[idxs] - 0.8, fpr[idxs]))
        else:
            return np.float32(0.0)

    def reset_state(self):
        """Drop the buffered batches (current Keras reset hook)."""
        self.labels = []
        self.predictions = []

    def reset_states(self):
        """Legacy name of ``reset_state``, kept for older Keras versions."""
        self.reset_state()
        
# Learning-rate callback driven by `lrfn`, with verbose output enabled
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

# Module-level AUC metric instance
aucMETRIC = tf.keras.metrics.AUC(name="auc")
        
def build_EfficientNet(input_shape=(300, 300, 3), num_classes=1, model_weights=None, weight_decay=0.001, dropout_rate=0.2, model_n = 3):
    """Build and compile an image-only EfficientNet classifier.

    Args:
        input_shape: Shape of the image input passed to ``tf.keras.Input``.
        num_classes: Number of units of the final sigmoid layer.
        model_weights: Unused; kept for interface compatibility.
        weight_decay: L2 factor assigned to the backbone's Conv2D layers.
        dropout_rate: Dropout rate applied before the output layer.
        model_n: EfficientNet variant to use; one of 0, 3 or 5.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-3, binary cross-entropy,
        module-level ``aucMETRIC`` as metric).

    Raises:
        ValueError: If ``model_n`` is not a supported variant (previously this
            surfaced as an UnboundLocalError on ``base_model``).
    """
    backbones = {
        0: (efn.EfficientNetB0, 'efficientnet-b0'),
        3: (efn.EfficientNetB3, 'efficientnet-b3'),
        5: (efn.EfficientNetB5, 'efficientnet-b5'),
    }
    if model_n not in backbones:
        raise ValueError(f"Unsupported model_n={model_n!r}; expected one of {sorted(backbones)}")
    backbone_builder, weights_stem = backbones[model_n]

    inp = tf.keras.Input(shape=input_shape)

    # Backbone without the classification head, initialised from local weights.
    base_model = backbone_builder(include_top=False, weights=None, input_tensor=None)
    base_model.load_weights(f'/kaggle/input/modelos-isic/{weights_stem}_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5')

    # Attach an L2 regularizer to every Conv2D layer of the backbone.
    # NOTE(review): the attribute is set after the layers were built, so it
    # presumably does not add any regularization loss -- verify before relying
    # on weight_decay having an effect.
    for layer in base_model.layers:
        if isinstance(layer, tf.keras.layers.Conv2D):
            layer.kernel_regularizer = tf.keras.regularizers.l2(weight_decay)

    # Classification head: pooled backbone features -> dropout -> sigmoid.
    x = base_model(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    # The output stays float32 even when mixed precision is enabled.
    x = tf.keras.layers.Dense(num_classes,activation='sigmoid', dtype='float32')(x)

    # Compile the model.
    model = tf.keras.Model(inputs=inp, outputs=x)
    opt = tf.keras.optimizers.Adam(learning_rate = 1e-3)
    loss = BinaryCrossentropy()

    model.compile(loss=loss, optimizer = opt, metrics=[aucMETRIC])

    return model



In [None]:
def load_and_predict_old(models_paths, data_generator, image_size, model_n, oof_n):
    """Average the predictions of the saved fold models over ``data_generator``.

    Args:
        models_paths: One list of weight files per fold; ``models_paths[fold][i]``
            is the i-th model of that fold.
        data_generator: Input passed to ``model.predict``.
        image_size: Side length used to rebuild the network input.
        model_n: EfficientNet variant forwarded to ``build_EfficientNet``.
        oof_n: Suffix of the output column ``pred_oof{oof_n}``.

    Returns:
        ``(df_predictions, new_cols)``. Every model index writes the same
        column ``pred_oof{oof_n}``, so with several models per fold only the
        last one is kept while ``new_cols`` repeats the name once per model.
    """
    column_name = f'pred_oof{oof_n}'
    df_predictions = pd.DataFrame()
    new_cols = []

    for i in range(len(models_paths[0])):
        pred = 0
        for fold_paths in models_paths:
            with strategy.scope():
                model_path = fold_paths[i]
                print(f"Loading model from: {model_path}")
                model = build_EfficientNet(
                    input_shape=(image_size, image_size, 3),
                    num_classes=1,
                    model_weights=None,
                    weight_decay=0.001,
                    dropout_rate=0.2,
                    model_n=model_n,
                )
                model.load_weights(model_path)
                pred += model.predict(data_generator, verbose=1)
                print(f"Current prediction sum: {pred}")

            # Release the graph/session memory held by this fold's model.
            tf.keras.backend.clear_session()
            gc.collect()

        # Mean over folds.
        final_pred = pred / len(models_paths)
        print(f"Final prediction for model {i}: {final_pred}")
        df_predictions[column_name] = np.squeeze(final_pred)
        new_cols.append(column_name)

    return df_predictions, new_cols

In [None]:
def pred_embeds_old(models_path, df_test, valid_gen, pca_, n_comp=32, image_size=300, model_n=3):
    """Compute PCA-reduced image embeddings averaged over several saved models.

    For each weight file the image-only network is rebuilt, the output of its
    global-average-pooling layer is predicted for ``valid_gen``, and the
    per-model embeddings are averaged before being projected with ``pca_``.

    Args:
        models_path: Non-empty sequence of weight files to load.
        df_test: DataFrame that receives the PCA columns.
        valid_gen: Input passed to ``predict``.
        pca_: Fitted transformer exposing ``transform``; expected to output
            ``n_comp`` components.
        n_comp: Number of PCA columns to name.
        image_size: Side length of the network input.
        model_n: EfficientNet variant; also used in the column names.

    Returns:
        ``(df_test, final_p, columns_to_save)``: the frame with the PCA columns
        appended, the averaged raw embeddings and the new column names.

    Raises:
        ValueError: If ``models_path`` is empty or the pooling layer is missing.
    """
    def find_layer_by_prefix(model, prefix):
        for layer in model.layers:
            if layer.name.startswith(prefix):
                return layer
        return None

    if len(models_path) == 0:
        raise ValueError("models_path must contain at least one weight file")

    pred = 0

    for path in models_path:
        with strategy.scope():
            print(f"Loading model from: {path}")
            model = build_EfficientNet(input_shape=(image_size, image_size, 3), num_classes=1, model_weights=None, weight_decay=0.001, dropout_rate=0.2, model_n = model_n)
            model.load_weights(path)
            layer = find_layer_by_prefix(model, 'global_average_pooling2d')
            if layer is None:
                raise ValueError("No layer starting with 'global_average_pooling2d' found in the model")
            model_embedding = tf.keras.Model(inputs=model.input,
                                             outputs=layer.output)
            # Accumulate: assigning here would keep only the last model's
            # embeddings, which the division below would then shrink by 1/N.
            pred = pred + model_embedding.predict(valid_gen, verbose=1)

        # Free memory before loading the next model.
        del model
        tf.keras.backend.clear_session()
        gc.collect()

    # Mean embedding over all models.
    final_p = pred / len(models_path)
    print(f"Final prediction: {final_p.shape}")

    principal_components_test = pca_.transform(final_p)

    # One column per principal component (n_comp columns in total).
    columns_to_save = [f'pred_embed_{i}_b{model_n}' for i in range(n_comp)]
    principal_df_test = pd.DataFrame(data=principal_components_test, columns=columns_to_save)
    # NOTE(review): concat(axis=1) aligns on the index and principal_df_test has
    # a fresh RangeIndex, so df_test is presumably expected to have one too --
    # verify against the caller.
    df_test = pd.concat([df_test, principal_df_test], axis=1)
    print(df_test.shape)
    return df_test, final_p,columns_to_save

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from tqdm import tqdm


def calculate_ratio_to_patient_mean_capped(df, groupby_key, feature_list, q_inf=0.1, q_sup=0.9):
    """Add, per feature, the ratio of each value to its group's capped mean.

    Within every group of ``groupby_key`` the feature is clipped to the
    group's ``[q_inf, q_sup]`` quantile range, the clipped values are averaged,
    and the *original* value is divided by that mean.

    Args:
        df: Input frame; it is not modified.
        groupby_key: Column identifying the groups (e.g. the patient id).
        feature_list: Numeric columns to process.
        q_inf: Lower quantile used for capping.
        q_sup: Upper quantile used for capping.

    Returns:
        ``(df, new_columns)``: a new frame with one
        ``{feature}_ratio_to_capped_patient_mean`` column per feature appended,
        and the list of those column names.
    """
    new_columns = []
    group_keys = df[groupby_key]

    # The new columns are collected separately and concatenated once at the end.
    new_df = pd.DataFrame(index=df.index)

    for feature in tqdm(feature_list, desc="Processing features"):
        # Per-group quantile bounds, one column per quantile.
        bounds = df.groupby(groupby_key)[feature].quantile([q_inf, q_sup]).unstack(level=-1)
        lower_percentile = group_keys.map(bounds[q_inf])
        upper_percentile = group_keys.map(bounds[q_sup])

        capped_values = df[feature].clip(lower=lower_percentile, upper=upper_percentile)
        # Group the capped series itself: looking the column up on df by name
        # would average the uncapped values and make the capping a no-op.
        patient_mean_capped = capped_values.groupby(group_keys).transform('mean')

        new_col_name = f"{feature}_ratio_to_capped_patient_mean"
        new_df[new_col_name] = df[feature] / patient_mean_capped
        new_columns.append(new_col_name)

    df = pd.concat([df, new_df], axis=1)

    return df, new_columns

# Numeric metadata columns; processar_dados extends this list with the
# engineered features and the per-patient ratio features.
num_cols = [
    'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
    'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean',
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
    'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
]
# Categorical metadata columns; ordinal-encoded in processar_dados.
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"]

def feature_engineering(df):
    """Add hand-crafted lesion features to ``df`` in place.

    The columns are only computed when ``num_rows_per_patient`` is not yet
    present, so calling the function again on its own output adds nothing.

    Args:
        df: Metadata frame holding the ``tbp_lv_*`` measurements,
            ``clin_size_long_diam_mm``, ``age_approx``, ``anatom_site_general``
            and ``patient_id``.

    Returns:
        ``(df, new_num_cols, new_cat_cols)``: the same frame object, the names
        of the numeric features and the names of the categorical features.
    """
    def tbp(suffix):
        # Shorthand for the "tbp_lv_"-prefixed measurement columns.
        return df[f"tbp_lv_{suffix}"]

    first_part = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",
    ]
    second_part = [
        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index",
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",
    ]
    third_part = ["num_rows_per_patient", 'skin_color_signal_ratio']

    if "num_rows_per_patient" not in df.columns:
        print('Primeira parte das features')
        df["lesion_size_ratio"] = tbp("minorAxisMM") / df["clin_size_long_diam_mm"]
        df["lesion_shape_index"] = tbp("areaMM2") / (tbp("perimeterMM") ** 2)
        df["hue_contrast"] = (tbp("H") - tbp("Hext")).abs()
        df["luminance_contrast"] = (tbp("L") - tbp("Lext")).abs()
        # Sum of squared colour deltas, shared by two features below.
        delta_sq_sum = tbp("deltaA") ** 2 + tbp("deltaB") ** 2 + tbp("deltaL") ** 2
        df["lesion_color_difference"] = np.sqrt(delta_sq_sum)
        df["border_complexity"] = tbp("norm_border") + tbp("symm_2axis")
        df["color_uniformity"] = tbp("color_std_mean") / tbp("radial_color_std_max")
        df["3d_position_distance"] = np.sqrt(tbp("x") ** 2 + tbp("y") ** 2 + tbp("z") ** 2)
        df["perimeter_to_area_ratio"] = tbp("perimeterMM") / tbp("areaMM2")
        df["lesion_visibility_score"] = tbp("deltaLBnorm") + tbp("norm_color")
        df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + tbp("location")
        df["symmetry_border_consistency"] = tbp("symm_2axis") * tbp("norm_border")
        df["color_consistency"] = tbp("stdL") / tbp("Lext")

        print('Segunda parte das features')
        # Plain sum of the colour deltas, shared by two features below.
        delta_sum = tbp("deltaA") + tbp("deltaB") + tbp("deltaL")
        df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
        df["hue_color_std_interaction"] = tbp("H") * tbp("color_std_mean")
        df["lesion_severity_index"] = (tbp("norm_border") + tbp("norm_color") + tbp("eccentricity")) / 3
        df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
        df["color_contrast_index"] = delta_sum + tbp("deltaLBnorm")
        df["log_lesion_area"] = np.log(tbp("areaMM2") + 1)
        df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
        df["mean_hue_difference"] = (tbp("H") + tbp("Hext")) / 2
        df["std_dev_contrast"] = np.sqrt(delta_sq_sum / 3)
        df["color_shape_composite_index"] = (tbp("color_std_mean") + tbp("area_perim_ratio") + tbp("symm_2axis")) / 3
        df["3d_lesion_orientation"] = np.arctan2(tbp("y"), tbp("x"))
        df["overall_color_difference"] = delta_sum / 3
        df["symmetry_perimeter_interaction"] = tbp("symm_2axis") * tbp("perimeterMM")
        df["comprehensive_lesion_index"] = (tbp("area_perim_ratio") + tbp("eccentricity") + tbp("norm_color") + tbp("symm_2axis")) / 4

        print('Terceira parte das features')
        df['num_rows_per_patient'] = df.groupby('patient_id')['patient_id'].transform('count')
        # Ratio of the hue signal to the normalised lesion/skin contrast.
        df['skin_color_signal_ratio'] = tbp("H") / tbp("deltaLBnorm")

    new_num_cols = first_part + second_part + third_part
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols


def carregar_e_preparar_dados_de_treino(path_metadata):
    """Read the metadata CSV at ``path_metadata`` and return it as a DataFrame."""
    return pd.read_csv(path_metadata)
def handle_infinities_and_nans(df, columns):
    """Replace +/-inf with NaN and impute NaN with the column mean, in place.

    Args:
        df: Frame to clean; the listed columns are overwritten.
        columns: List of numeric column names to process.

    Returns:
        The same ``df`` object. A column holding no finite value has a NaN
        mean and therefore stays NaN.
    """
    # Infinities become NaN first so that they do not distort the mean.
    df[columns] = df[columns].replace([np.inf, -np.inf], np.nan)
    for column in columns:
        # Assign the result back: fillna(inplace=True) on df[column] is chained
        # assignment, which warns on recent pandas and does not update df at
        # all under Copy-on-Write.
        df[column] = df[column].fillna(df[column].mean())
    return df

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
import pandas as pd

# Loader shared by the train and test metadata files
def carregar_e_preparar_dados(path_metadata, train=True):
    """Read a metadata CSV into a DataFrame.

    ``train`` is accepted for interface symmetry but currently unused; any
    split-specific preparation would go here.
    """
    return pd.read_csv(path_metadata)

# Main feature-engineering and transformation routine
def processar_dados(df, encoder=None, scaler=None, train=True, onehot_columns=None, onehot_column_names=None):
    """Engineer, encode, impute and scale the metadata features.

    Steps: ``feature_engineering`` -> per-patient capped-mean ratios ->
    ordinal encoding of the categorical columns -> one-hot encoding of
    ``onehot_columns`` -> inf/NaN handling -> min-max scaling.

    Args:
        df: Raw metadata frame.
        encoder: Fitted ``OrdinalEncoder``; required when ``train`` is False.
        scaler: Fitted ``MinMaxScaler``; required when ``train`` is False.
        train: When True the encoder, scaler and one-hot column list are
            fitted on ``df``; otherwise the supplied ones are applied.
        onehot_columns: Columns to one-hot encode (dropped afterwards).
        onehot_column_names: Dummy column names from training; the test
            dummies are reindexed to them, missing ones filled with 0.

    Returns:
        ``(df, meta_features, n_meta_features, encoder, scaler,
        onehot_column_names)``.

    Note:
        NaN imputation uses the column means of the frame being processed, so
        with ``train=False`` the means come from ``df`` itself.
    """
    df, engineered_num, engineered_cat = feature_engineering(df)
    cat_cols_ = cat_cols + engineered_cat
    num_cols_ = num_cols + engineered_num
    df, ratio_cols = calculate_ratio_to_patient_mean_capped(df, 'patient_id', num_cols_, q_inf=0.01, q_sup=0.99)
    num_cols_ = num_cols_ + ratio_cols

    # Ordinal encoding of the categorical variables.
    if train:
        encoder = OrdinalEncoder(
            categories='auto',
            dtype=int,
            handle_unknown='use_encoded_value',
            unknown_value=-2,  # code for categories unseen during fit
            encoded_missing_value=-1  # code for missing values
        )
        encoded = encoder.fit_transform(df[cat_cols_])
    else:
        encoded = encoder.transform(df[cat_cols_])
    df[cat_cols_] = encoded

    # One-hot encoding; in test mode the dummies are aligned to the training
    # columns, with absent categories filled with zeros.
    dummies = pd.get_dummies(df[onehot_columns], columns=onehot_columns)
    if train:
        onehot_column_names = dummies.columns.tolist()
    else:
        dummies = dummies.reindex(columns=onehot_column_names, fill_value=0)
    df = pd.concat([df.drop(columns=onehot_columns), dummies], axis=1)

    # Clean and min-max scale every feature column.
    all_features = num_cols_ + cat_cols_ + onehot_column_names
    df = handle_infinities_and_nans(df, all_features)

    if train:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(df[all_features])
    else:
        scaled = scaler.transform(df[all_features])
    df[all_features] = scaled

    return df, all_features, len(all_features), encoder, scaler, onehot_column_names

# Build the processed metadata for the train and test sets
def get_meta_data(train_path, test_path):
    """Load both metadata CSVs and run them through ``processar_dados``.

    The encoder, scaler and one-hot column list are fitted on the training
    file and then re-used for the test file.

    Returns:
        ``(df_train, df_test, meta_features, n_meta_features, encoder, scaler,
        onehot_column_names)``.
    """
    onehot_columns = ['anatom_site_general', 'attribution']

    # Training data: fits the transformers.
    train_result = processar_dados(
        carregar_e_preparar_dados(train_path),
        train=True,
        onehot_columns=list(onehot_columns),
    )
    df_train, meta_features, n_meta_features, encoder, scaler, onehot_column_names = train_result

    # Test data: applies the fitted transformers; only the frame is kept.
    test_result = processar_dados(
        carregar_e_preparar_dados(test_path, train=False),
        encoder=encoder,
        scaler=scaler,
        train=False,
        onehot_columns=list(onehot_columns),
        onehot_column_names=onehot_column_names,
    )
    df_test = test_result[0]

    return df_train, df_test, meta_features, n_meta_features, encoder, scaler,onehot_column_names

# Example usage: build the processed train/test metadata frames together with
# the fitted encoder, scaler and one-hot column names.
train_path = '/kaggle/input/isic-2024-challenge/train-metadata.csv'
test_path = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
df_train_meta, df_test_meta, meta_features, n_meta_features, encoder, scaler, onehot_column_names = get_meta_data(train_path, test_path)


#EXP1
# meta_features = [
#     "age_approx",
#     "sex",
#     # "anatom_site_general",
#     "tbp_lv_location",
#     "tbp_tile_type",
#     "tbp_lv_location_simple",
#     # 'attribution'
# ]
# # +onehot_column_names
# n_meta_features=len(meta_features)

# #EXP2
# meta_features = [
#     "age_approx",
#     "sex",
#     # "anatom_site_general",
#     # "tbp_lv_location",
#     # "tbp_tile_type",
#     # "tbp_lv_location_simple",
#     # 'attribution'
# ]+onehot_column_names
# n_meta_features=len(meta_features)

# EXP3 (active): overrides the meta_features returned by get_meta_data with a
# small hand-picked subset plus the one-hot columns.
meta_features = [
    "age_approx",
    "sex",
    # "anatom_site_general",
    "tbp_lv_location",
    "tbp_tile_type",
    "tbp_lv_location_simple",
    # 'attribution'
]+onehot_column_names
n_meta_features=len(meta_features)

# Show the selected features of the training set
display(df_train_meta[meta_features])
# Show the selected features of the test set
display(df_test_meta[meta_features])

print("Colunas de treino:", df_train_meta[meta_features].shape)
print("Colunas de teste:", df_test_meta[meta_features].shape)


In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image
import cv2
import albumentations
# Build the albumentations pipelines used for training and validation
def get_transforms(image_size):
    """Return ``(transforms_train, transforms_val)`` for square images.

    The training pipeline applies random flips, brightness/contrast changes,
    one blur-or-noise step, one geometric distortion, CLAHE, hue/saturation
    shifts and shift/scale/rotate before resizing and normalising. The
    validation pipeline only resizes and normalises.

    Args:
        image_size: Side length passed to ``Resize``.
    """
    A = albumentations  # local shorthand

    blur_or_noise = A.OneOf([
        A.MotionBlur(blur_limit=5),
        A.MedianBlur(blur_limit=5),
        A.GaussianBlur(blur_limit=5),
        A.GaussNoise(var_limit=(5.0, 30.0)),
    ], p=0.7)

    geometric_distortion = A.OneOf([
        A.OpticalDistortion(distort_limit=1.0),
        A.GridDistortion(num_steps=5, distort_limit=1.),
        A.ElasticTransform(alpha=3),
    ], p=0.7)

    train_steps = [
        A.Transpose(p=0.5),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.75),
        # Disabled: A.RandomContrast(limit=0.2, p=0.75)
        blur_or_noise,
        geometric_distortion,
        A.CLAHE(clip_limit=4.0, p=0.7),
        A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=0, p=0.85),
        A.Resize(image_size, image_size),
        # Disabled: A.Cutout(max_h_size=int(image_size * 0.375), max_w_size=int(image_size * 0.375), num_holes=1, p=0.7)
        A.Normalize(),
    ]
    val_steps = [
        A.Resize(image_size, image_size),
        A.Normalize(),
    ]

    return A.Compose(train_steps), A.Compose(val_steps)

class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``((images, meta), targets)`` batches.

    Images are read from an HDF5 file keyed by ``isic_id``, resized,
    transformed with the training pipeline (``mode == 'train'``) or the
    validation pipeline (any other mode) and paired with the metadata
    features of the same ``isic_id``.

    NOTE(review): both pipelines returned by ``get_transforms`` end with
    ``albumentations.Normalize()`` and the result is additionally divided by
    255 here. That looks like a double normalisation, but it is kept as is
    because it defines the inputs that already-trained weights expect --
    confirm before changing.
    """

    def __init__(self, data, df_, meta_features, n_meta_features,batch_size=8, shuffle=False, fp_hdf_=None, mode='train', image_size=385):
        self.data = data
        self.df_ = df_
        self.meta_features = meta_features
        self.n_meta_features = n_meta_features
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.fp_hdf_ = fp_hdf_
        self.mode = mode
        self.image_size = image_size
        self.transforms_train, self.transforms_val = get_transforms(self.image_size)
        # isic_id -> row of metadata features, built once for O(1) lookup per image.
        self.isic_to_meta = dict(zip(self.df_['isic_id'], self.df_[self.meta_features].values))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        n_batches = np.ceil(len(self.data) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Return batch ``index`` as ``((X, meta), y)``."""
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        X, y, meta = self.__data_generation(self.indexes[start:stop])
        return (X, meta), y

    def on_epoch_end(self):
        """Rebuild the row order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Load the images, targets and metadata for the given row positions."""
        n_rows = len(indexes)
        side = self.image_size
        # Pre-allocated float32 buffers for the batch.
        X = np.zeros((n_rows, side, side, 3), dtype='float32')
        y = np.zeros((n_rows, 1), dtype='float32')
        meta = np.zeros((n_rows, self.n_meta_features), dtype='float32')

        transform = self.transforms_train if self.mode == 'train' else self.transforms_val
        is_test = self.mode == 'test'

        with h5py.File(self.fp_hdf_, 'r') as hdf:
            for slot, row_pos in enumerate(indexes):
                row = self.data.iloc[row_pos]
                isic_id = row['isic_id']
                # The HDF5 dataset holds the encoded image bytes.
                image = np.array(Image.open(BytesIO(hdf[isic_id][()])))
                image = cv2.resize(image, (side, side))
                image = transform(image=image)['image']
                X[slot] = image / 255.0
                # Test rows carry no target; a zero placeholder is used.
                y[slot] = 0 if is_test else row["target"]
                meta[slot] = self.isic_to_meta[isic_id].flatten()

        return X, y, meta

import tensorflow as tf
import numpy as np
from PIL import Image
import cv2
import h5py
from PIL import Image
from io import BytesIO

def lrfn(epoch):
    """Return the learning rate to use for the given training epoch.

    The schedule is a fixed step decay: 1e-3 for epochs 0-2, 1e-4 for
    epochs 3-5 and 1e-5 for epochs 6-8.

    Args:
        epoch: Epoch index as passed by ``LearningRateScheduler``.

    Returns:
        The learning rate (float) for that epoch.
    """
    schedule = [1e-3, 1e-3, 1e-3, 1e-4, 1e-4, 1e-4, 1e-5, 1e-5, 1e-5]
    # Clamp to the last entry so that training for more epochs than the table
    # covers keeps the final rate instead of raising IndexError.
    return schedule[min(epoch, len(schedule) - 1)]

        
# Keras callback that sets the learning rate each epoch from lrfn (verbose logging on).
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)
# Create an instance of the AUC metric
aucMETRIC = tf.keras.metrics.AUC(name='auc')

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
import efficientnet.tfkeras as efn
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_curve
import warnings

def pAUC(truth, preds):
    """Partial area of the ROC curve above a true-positive rate of 0.8.

    Args:
        truth: Binary ground-truth labels.
        preds: Predicted scores, same length as ``truth``.

    Returns:
        The integral of ``tpr - 0.8`` over ``fpr`` on the ROC points where
        ``tpr > 0.8``, or 0.0 when the ROC curve is undefined because
        ``truth`` does not contain both classes.
    """
    labels = np.asarray(truth)
    # A batch holding a single class (all negatives OR all positives) has no
    # defined ROC curve; the original all-negative convention of returning 0.0
    # is extended to the all-positive case, which used to yield NaN.
    if np.sum(labels) == 0 or np.unique(labels).size < 2:
        return 0.0
    fpr, tpr, threshold = roc_curve(truth, preds)
    idxs = tpr > 0.8
    return np.trapz(tpr[idxs] - 0.8, fpr[idxs])

def pAUC_metric(y_true, y_pred):
    """Keras-compatible wrapper that evaluates ``pAUC`` on one batch.

    Both tensors are flattened to 1-D and handed to the NumPy implementation
    through ``tf.py_function``; the result is a float32 tensor.
    """
    flat_true, flat_pred = (tf.keras.backend.flatten(t) for t in (y_true, y_pred))
    return tf.py_function(func=pAUC, inp=[flat_true, flat_pred], Tout=tf.float32)

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
import efficientnet.tfkeras as efn

def build_EfficientNetB0(input_shape=(385, 385, 3), num_classes=1, model_weights=None, weight_decay=0.001, dropout_rate=0.2, model_n = 3):
    """Build and compile an EfficientNet classifier with a metadata branch.

    The image goes through an EfficientNet backbone and global average
    pooling; when the module-level ``n_meta_features`` is positive, the
    metadata vector goes through a two-layer MLP and is concatenated with the
    image features before the sigmoid output.

    Args:
        input_shape: Shape of the image input.
        num_classes: Number of units of the final sigmoid layer.
        model_weights: Unused; kept for interface compatibility.
        weight_decay: L2 factor assigned to the backbone's Conv2D layers.
        dropout_rate: Dropout rate used in the metadata branch and the head.
        model_n: EfficientNet variant to use; one of 0, 3 or 5.

    Returns:
        A compiled Keras model taking ``[image, meta]`` inputs (Adam at 1e-3,
        binary cross-entropy, ``pAUC_metric`` as metric).

    Raises:
        ValueError: If ``model_n`` is not a supported variant (previously this
            surfaced as an UnboundLocalError on ``base_model``).
    """
    backbones = {
        0: (efn.EfficientNetB0, 'efficientnet-b0'),
        3: (efn.EfficientNetB3, 'efficientnet-b3'),
        5: (efn.EfficientNetB5, 'efficientnet-b5'),
    }
    if model_n not in backbones:
        raise ValueError(f"Unsupported model_n={model_n!r}; expected one of {sorted(backbones)}")
    backbone_builder, weights_stem = backbones[model_n]

    n_meta_dim=[512, 128]
    # Input layers for image and metadata
    inp = layers.Input(shape=input_shape)
    inp_meta = layers.Input(shape=(n_meta_features,))

    # Backbone without the classification head, initialised from local weights.
    base_model = backbone_builder(include_top=False, weights=None, input_tensor=None)
    base_model.load_weights(f'/kaggle/input/modelos-isic/{weights_stem}_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5')

    # Attach an L2 regularizer to every Conv2D layer of the backbone.
    # NOTE(review): the attribute is set after the layers were built, so it
    # presumably does not add any regularization loss -- verify before relying
    # on weight_decay having an effect.
    for layer in base_model.layers:
        if isinstance(layer, layers.Conv2D):
            layer.kernel_regularizer = regularizers.l2(weight_decay)

    # Image branch.
    x = base_model(inp)
    x = layers.GlobalAveragePooling2D()(x)

    # Metadata branch, only when there are metadata features.
    if n_meta_features > 0:
        x_meta = layers.Dense(n_meta_dim[0], activation='swish')(inp_meta)
        x_meta = layers.BatchNormalization()(x_meta)
        x_meta = layers.Dropout(dropout_rate)(x_meta)
        x_meta = layers.Dense(n_meta_dim[1], activation='swish')(x_meta)
        x_meta = layers.BatchNormalization()(x_meta)

        # Concatenate image features and metadata features
        x = layers.concatenate([x, x_meta])

    # Final classification layer; the output stays float32 under mixed precision.
    x = layers.Dropout(dropout_rate)(x)
    output = layers.Dense(num_classes, activation='sigmoid', dtype='float32')(x)

    # Model definition
    model = models.Model(inputs=[inp, inp_meta], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = tf.keras.losses.BinaryCrossentropy()

    model.compile(loss=loss, optimizer = optimizer, metrics=[pAUC_metric])

    return model

def pred_embeds_new(models_path, df_test, valid_gen, pca_, n_comp=32, image_size=300, model_n=3):
    """Compute PCA-reduced image embeddings averaged over several saved models.

    For each weight file the image+metadata network is rebuilt, the output of
    its global-average-pooling layer is predicted for ``valid_gen``, and the
    per-model embeddings are averaged before being projected with ``pca_``.

    Args:
        models_path: Non-empty sequence of weight files to load.
        df_test: DataFrame that receives the PCA columns.
        valid_gen: Input passed to ``predict``.
        pca_: Fitted transformer exposing ``transform``; expected to output
            ``n_comp`` components.
        n_comp: Number of PCA columns to name.
        image_size: Side length of the network input.
        model_n: EfficientNet variant; also used in the column names.

    Returns:
        ``(df_test, final_p, columns_to_save)``: the frame with the PCA columns
        appended, the averaged raw embeddings and the new column names.

    Raises:
        ValueError: If ``models_path`` is empty or the pooling layer is missing.
    """
    def find_layer_by_prefix(model, prefix):
        for layer in model.layers:
            if layer.name.startswith(prefix):
                return layer
        return None

    if len(models_path) == 0:
        raise ValueError("models_path must contain at least one weight file")

    pred = 0

    for path in models_path:
        with strategy.scope():
            print(f"Loading model from: {path}")
            model = build_EfficientNetB0(input_shape=(image_size, image_size, 3), num_classes=1, model_weights=None, weight_decay=0.001, dropout_rate=0.2, model_n = model_n)
            model.load_weights(path)
            layer = find_layer_by_prefix(model, 'global_average_pooling2d')
            if layer is None:
                raise ValueError("No layer starting with 'global_average_pooling2d' found in the model")
            model_embedding = tf.keras.Model(inputs=model.input,
                                             outputs=layer.output)
            # Accumulate: assigning here would keep only the last model's
            # embeddings, which the division below would then shrink by 1/N.
            pred = pred + model_embedding.predict(valid_gen, verbose=1)

        # Free memory before loading the next model.
        del model
        tf.keras.backend.clear_session()
        gc.collect()

    # Mean embedding over all models.
    final_p = pred / len(models_path)
    print(f"Final prediction: {final_p.shape}")

    principal_components_test = pca_.transform(final_p)

    # One column per principal component (n_comp columns in total).
    columns_to_save = [f'pred_embed_{i}_b{model_n}' for i in range(n_comp)]
    principal_df_test = pd.DataFrame(data=principal_components_test, columns=columns_to_save)
    # NOTE(review): concat(axis=1) aligns on the index and principal_df_test has
    # a fresh RangeIndex, so df_test is presumably expected to have one too --
    # verify against the caller.
    df_test = pd.concat([df_test, principal_df_test], axis=1)
    print(df_test.shape)
    return df_test, final_p,columns_to_save

In [None]:
# path_pre = '/kaggle/input/prepros-isic/preprocessamtnto isic/'
# df_train = pd.read_parquet(f'{path_pre}df_train_new_features.parquet')
# def find_layer_by_prefix(model, prefix):
#     for layer in model.layers:
#         if layer.name.startswith(prefix):
#             return layer
#     return None

# for i in range(3):
#     df_valid_index = df_train["fold"] == i
#     df_valid= df_train[df_valid_index]
#     TEST_HDF = f'/kaggle/input/isic-2024-challenge/train-image.hdf5'

#     set_random_seed(42, deterministic=True)
#     valid_gen = DataGenerator(df_valid, 
#                           shuffle=False, 
#                           batch_size=256, 
#                           mode='test', 
#                           fp_hdf_=TEST_HDF,
#                               df_=df_train_meta, 
#                               meta_features=meta_features, 
#                               n_meta_features=n_meta_features)

#     pred = 0
#     with strategy.scope():
#         print(f"Loading model from: {f'/kaggle/input/modelos-down/modelossalvos_auc_down9_novo_b5_aug_meta/MLP_fold_{i}_model_0.h5'}")
#         model = build_EfficientNetB0()
#         model.load_weights(f'/kaggle/input/modelos-down/modelossalvos_auc_down9_novo_b5_aug_meta/MLP_fold_{i}_model_0.h5')
#         layer = find_layer_by_prefix(model, 'global_average_pooling2d')
#         model_embedding = tf.keras.Model(inputs=model.input, 
#                                          outputs=model.get_layer(layer.name).output)
#         pred = model_embedding.predict(valid_gen, verbose=1)
#         df_train = df_train.copy()
#         for j in range(pred.shape[1]):
#             df_train.loc[df_valid_index, f'pred_embed_{j}'] = pred[:, j]
#         print(f"Current prediction sum: {pred}")
        

#     # Liberar memória
#     del model
#     tf.keras.backend.clear_session()
#     gc.collect()

#     print(pred.shape)
# columns_to_save = [f'pred_embed_{i}' for i in range(pred.shape[1])]
# print(f"Salvando en formato para quê para otimizar.{df_train[columns_to_save].shape}")
# data_array = df_train[columns_to_save].to_numpy()
# np.save('oof0_embedding_B5_M0_DOWN9_aug_meta.npy', data_array)
# prunt()

# agrupamentos

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from tqdm import tqdm
from copy import deepcopy

# def preprocess_data(embeds):
#     # Substitui infinitos por NaN em arrays NumPy
#     embeds = np.where(np.isinf(embeds), np.nan, embeds)
    
#     # Imputação de média
#     imputer = SimpleImputer(strategy='mean')
#     embeds_filled = imputer.fit_transform(embeds)
    
#     # Escalamento dos dados
#     scaler = StandardScaler()
#     embeds_scaled = scaler.fit_transform(embeds_filled)
    
#     return embeds_scaled

# from sklearn.ensemble import IsolationForest

# def apply_isolation_forest(group, embeds):
#     # Inicializa scores de anomalia
#     group['scores'] =0.5 # Predefine todos os scores como 0.5

#     if len(group) >= 20:
#         # Processa com Isolation Forest usando embeddings
#         emb = embeds[group.index]
#         iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
#         iso_forest.fit(emb)
#         group['scores'] = iso_forest.decision_function(emb)  # Atribui scores do modelo
    
# #     print(group)
# #     print(group.shape)
#     return group['scores']

# def process_patients(df, embeds):
#     df_copy = deepcopy(df)
#     embeds_copy = deepcopy(embeds)
#     # Preprocessamento de embeddings
#     embeds_copy = preprocess_data(embeds_copy)
    
#     # Aplicação do Isolation Forest com barra de progresso
#     tqdm.pandas(desc="Calculando distâncias para dados de treino")
#     results = df_copy.groupby('patient_id').progress_apply(lambda group: apply_isolation_forest(group, embeds_copy))
# #     print(results)
#     print(results.shape)
#     df_copy['score'] = results.values
#     return df_copy['score'] 

# # from sklearn.cluster import KMeans
# # from sklearn.preprocessing import StandardScaler
# # import pandas as pd
# # from tqdm import tqdm

# # def apply_kmeans(group, embeds, n_clusters=5):
# #     if len(group) < 30:
# #         group['cluster_labels'] = 0  # Considera não-anômalo
# #     else:
# #         # Aplicação do K-Means
# #         kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# #         group['cluster_labels'] = kmeans.fit_predict(emb_scaled)
# #     return group

# # def process_patients_kmeans(df, embeds, n_clusters=10):
# #     embeds = preprocess_data(embeds)
# #     # Processando os dados de cada paciente individualmente usando uma barra de progresso
# #     tqdm.pandas(desc="Aplicando K-Means nos dados de cada paciente")
# #     results = df.groupby('patient_id').progress_apply(lambda group: apply_kmeans(group, embeds, n_clusters))
# #     results.reset_index(drop=True, inplace=True)
# #     return results


In [None]:
# Opt in to pandas' future behaviour of not silently downcasting dtypes
# (affects calls such as replace/fillna used below).
pd.set_option('future.no_silent_downcasting', True)
def paciente_ultima_tentativa_iso_forest(df, cols):
    """Score every row with an Isolation Forest fitted on its own patient.

    Works on a copy of ``df``. Patients with fewer than 20 rows keep the
    default score of 0.5; for the others a forest is fitted on ``cols``
    (infinities and NaN replaced by 0) and its ``decision_function`` output is
    stored.

    Args:
        df: Frame containing ``patient_id`` and the feature columns.
        cols: Feature columns fed to the forest.

    Returns:
        ``(df_copy, ['scores_iso'])``.
    """
    df = df.copy()
    df['scores_iso'] = 0.5

    for i, (paciente_id, df_pac) in enumerate(df.groupby('patient_id')):
        # Lightweight progress trace, one marker every 20 patients.
        if i % 20 == 0:
            print(f'--{i}', end=' ')

        if len(df_pac) < 20:
            continue

        features = df_pac.replace([np.inf, -np.inf], np.nan)[cols].fillna(0).values
        iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
        iso_forest.fit(features)
        df.loc[df_pac.index, 'scores_iso'] = iso_forest.decision_function(features)

    return df, ['scores_iso']
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM

# Same option as set earlier in this cell: opt in to pandas' future behaviour
# of not silently downcasting dtypes.
pd.set_option('future.no_silent_downcasting', True)

def paciente_ultima_tentativa_svm(df, cols):
    """Score each lesion with a One-Class SVM fitted on its own patient.

    Args:
        df: frame with a 'patient_id' column plus every column in ``cols``.
            The input is not modified; a copy is scored and returned.
        cols: feature columns fed to the SVM.

    Returns:
        ``(scored_df, ['scores_svm'])``. Rows of patients with fewer than 20
        rows keep the neutral default of 0.5.
    """
    print('----- calculando OneClassSVM')
    score_col = 'scores_svm'
    min_rows_per_patient = 20

    df = df.copy()
    df[score_col] = 0.5  # neutral default for patients that are never fitted

    for step, (_, lesions) in enumerate(df.groupby('patient_id')):
        # Lightweight progress trace: one marker every 20 patients.
        if step % 20 == 0:
            print(f'--{step}', end=' ')

        if len(lesions) < min_rows_per_patient:
            continue

        # +/-inf and NaN are both mapped to 0 before fitting.
        features = lesions.replace([np.inf, -np.inf], np.nan)[cols].fillna(0).values
        detector = OneClassSVM(nu=0.01, kernel='rbf', gamma='auto')
        detector.fit(features)

        # decision_function: the lower the value, the more anomalous the point.
        df.loc[lesions.index, score_col] = detector.decision_function(features)

    return df, [score_col]

# df_train2 = paciente_ultima_tentativa_svm(df_train_meta, num_cols)
# df_train2

# PRÉ-PROCESSAMENTO

BASE DE TREINO

In [None]:
def oof_preds_treino(df_, path_, oof_name='pred_oof0', model_n=3, n_comp=32, verbose=True, mode='save'):
    """Attach the image-model OOF features to the training metadata.

    Args:
        df_: training metadata frame.
        path_: folder prefix (with trailing slash) that holds
            'embedding_array.npy' and 'oof_fold_0.csv'.
        oof_name: name used for the OOF prediction column.
        model_n: forwarded to the embedding loader and used to build the
            ``b{model_n}`` suffix passed to the distance helpers.
        n_comp: forwarded to the embedding loader (presumably the number of
            PCA components - confirm against ``carregar_oof_embeddings_np``).
        verbose: when True, display the head and shape of the result.
        mode: 'save' computes everything and pickles the result to the working
            directory; any other value reloads the pickles from the
            '/kaggle/input/prepros-isic/' dataset.

    Returns:
        ``(df_, novas_colunas)``: the enriched frame and the list of added
        column names.
    """
    if mode != 'save':
        # Cached branch. pickle can execute code on load, so only point this at
        # a dataset you produced yourself.
        with open('/kaggle/input/prepros-isic/df_.pkl', 'rb') as fh:
            df_ = pickle.load(fh)
        with open('/kaggle/input/prepros-isic/novas_colunas.pkl', 'rb') as fh:
            novas_colunas = pickle.load(fh)
        return df_, novas_colunas

    suffix = f'b{model_n}'
    embeddings, _pca, pca_frame, _pca_columns = carregar_oof_embeddings_np(
        path_embs=path_ + 'embedding_array.npy', n_comp=n_comp, verbose=verbose, model_n=model_n)

    df_, oof_cols = carregar_e_preparar_dados_de_treino_old(
        df_, {oof_name: path_ + 'oof_fold_0.csv'}, [pca_frame])

    # Embedding-vector distance features.
    df_, control_cols = df_vetores_controle(
        df_, embeddings, oof_name, q_inf=0.1, col_group='patient_id', name_=suffix)
    df_, _, distance_cols = calcular_distancias_treino_teste(
        df_, None, embeddings, None, is_train=True, name_=suffix)

    novas_colunas = oof_cols + control_cols + distance_cols
    if verbose:
        print('Base de dados de treino OOF PRED')
        display(df_.head(2))
        display(df_.shape)

    # Persist both artefacts so a later run can use the cached branch.
    for file_name, payload in (('df_.pkl', df_), ('novas_colunas.pkl', novas_colunas)):
        with open(file_name, 'wb') as fh:
            pickle.dump(payload, fh)

    return df_, novas_colunas

In [None]:
%%time
# Build (or reload) the training feature table. Later cells read the globals
# df_train, train_cols, category_encoder, num_cols_ and cat_cols_ set here.
pd.set_option('display.max_columns', None)
set_random_seed(42)



# TRAIN ---------------------------------------------------------
if CRIA_BASE_TRAIN:
    df_train = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv')
    df_train, novas_colunas= oof_preds_treino(df_train, 
            path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b3_aug_sem_meta_EXP5_5FOLDS/', 
            oof_name='pred_oof0', 
            model_n=3, n_comp=32, verbose=True, mode='save')
#     df_train, novas_colunas2= oof_preds_treino(df_train, 
#             path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b3_aug_sem_meta/', 
#             oof_name='pred_oof1', 
#             model_n=33, n_comp=32, verbose=True)
#     df_train, novas_colunas3= oof_preds_treino(df_train, 
#             path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b5_aug_sem_meta/', 
#             oof_name='pred_oof2', 
#             model_n=5, n_comp=32, verbose=True)
#     novas_colunas = novas_colunas1+novas_colunas2+novas_colunas3
    # Feature engineering; the helper also returns the names of the columns it adds.
    df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
    num_cols_ = num_cols+new_num_cols+novas_colunas
    cat_cols_ = cat_cols+new_cat_cols

    # Per-patient ratio features for the numeric columns (helper called with q_inf=0.01, q_sup=0.99).
    df_train, new_feature_names1= calculate_ratio_to_patient_mean_capped2(df_train, 'patient_id', num_cols_, q_inf=0.01, q_sup=0.99)
#     df_train, new_feature_names2 = cria_features_agregadas(df_train, num_cols_)
    num_cols_ = num_cols_+new_feature_names1

    # Encode the categorical columns; the fitted encoder is kept so it can be pickled below.
    df_train, category_encoder = fit_encode_categorical_columns(df_train, cat_cols_)

    # Initial feature list: every numeric column followed by every categorical column.
    train_cols = num_cols_ + cat_cols_

#     df_train, new_col_scores_svm=paciente_ultima_tentativa_svm(df_train, train_cols)
#     df_train, new_col_scores_iso=paciente_ultima_tentativa_iso_forest(df_train, train_cols)
#     cluster_cols = train_cols
#     train_cols = train_cols+new_col_scores_svm+new_col_scores_iso
    
    print('Base de dados de treino COM FEATURE SELECTION INICIAL')
    display(df_train[train_cols].head(2))
    display(df_train[train_cols].shape)
    
    # Persist everything the cached branch below reloads.
    # The frame is written as Parquet with 'snappy' compression.
    df_train.to_parquet('df_train_new_features.parquet', engine='pyarrow', compression='snappy')
    # Save train_cols
    with open('train_cols.pkl', 'wb') as f:
        pickle.dump(train_cols, f)
    # Save category_encoder
    with open('category_encoder.pkl', 'wb') as f:
        pickle.dump(category_encoder, f)
    # Save num_cols_
    with open('num_cols_.pkl', 'wb') as f:
        pickle.dump(num_cols_, f)
    # Save cat_cols_
    with open('cat_cols_.pkl', 'wb') as f:
        pickle.dump(cat_cols_, f)
else:
    # Cached branch: reload the artefacts written by the branch above from an
    # attached dataset. pickle can execute code on load - only use trusted files.
    path_pre = '/kaggle/input/prepros-isic/results (6)/'
    df_train = pd.read_parquet(f'{path_pre}df_train_new_features.parquet')
    with open(f'{path_pre}train_cols.pkl', 'rb') as f:
        train_cols = pickle.load(f)
    with open(f'{path_pre}category_encoder.pkl', 'rb') as f:
        category_encoder = pickle.load(f)
    with open(f'{path_pre}num_cols_.pkl', 'rb') as f:
        num_cols_ = pickle.load(f)
    with open(f'{path_pre}cat_cols_.pkl', 'rb') as f:
        cat_cols_ = pickle.load(f)
    

BASE DE TESTE

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
import efficientnet.tfkeras as efn
def build_EfficientNetB0(input_shape=(300, 300, 3), num_classes=1, model_weights=None, weight_decay=0.001, dropout_rate=0.2, model_n = 3):
    """Build and compile an EfficientNet (B0, B3 or B5) classifier that also emits embeddings.

    Despite the name, the backbone is chosen by ``model_n``.

    Args:
        input_shape: shape of the image input.
        num_classes: number of units in the sigmoid output layer.
        model_weights: currently unused; kept for call compatibility.
        weight_decay: L2 factor assigned to the backbone's Conv2D layers.
        dropout_rate: dropout applied before the classification layer.
        model_n: backbone variant; one of 0, 3 or 5.

    Returns:
        A compiled Keras model with inputs ``[image, metadata]`` and outputs
        ``[c_out, embeddings]``. The metadata input is declared but not
        connected to any layer (the metadata branch is commented out).

    Raises:
        ValueError: if ``model_n`` is not a supported variant.
    """
    # Validate first so an unsupported variant fails with a clear message
    # instead of an unbound `base_model` further down.
    variants = {0: 'b0', 3: 'b3', 5: 'b5'}
    if model_n not in variants:
        raise ValueError(f"Unsupported model_n={model_n!r}; expected one of {sorted(variants)}")
    variant = variants[model_n]

    n_meta_dim=[512, 128]  # only referenced by the disabled metadata branch below

    # Input layers for image and metadata
    inp = layers.Input(shape=input_shape)
    inp_meta = layers.Input(shape=(n_meta_features,))

    # Base model without the top layer; ImageNet weights come from a local file.
    backbone_cls = getattr(efn, f'EfficientNet{variant.upper()}')
    base_model = backbone_cls(include_top=False, weights=None, input_tensor=None)
    base_model.load_weights(
        f'/kaggle/input/modelos-isic/efficientnet-{variant}_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5')

    # Adding L2 regularization to each convolutional layer
    # NOTE(review): assigning `kernel_regularizer` on an already-built layer may
    # not register the penalty in the model losses - confirm it has an effect.
    for layer in base_model.layers:
        if isinstance(layer, layers.Conv2D):
            layer.kernel_regularizer = regularizers.l2(weight_decay)

    # Processing image features
    x = base_model(inp)
    x = layers.GlobalAveragePooling2D()(x)
    # Identity layer that exposes the pooled features under a stable output name.
    embeddings = layers.Lambda(lambda x: x, name='embeddings')(x)

#     # Processing metadata features, if provided
#     if n_meta_features > 0:
#         x_meta = layers.BatchNormalization()(inp_meta)
#         x_meta = layers.Dense(n_meta_dim[0], activation='relu')(x_meta)
#         x_meta = layers.BatchNormalization()(x_meta)
#         x_meta = layers.Dropout(dropout_rate)(x_meta)
#         x_meta = layers.Dense(n_meta_dim[1], activation='relu')(x_meta)
#         x_meta = layers.BatchNormalization()(x_meta)

#         # Concatenate image features and metadata features
#         x = layers.concatenate([x, x_meta])

#     # Adding BatchNormalization after concatenation
#     x = layers.BatchNormalization()(x)

    # Final classification layer (kept in float32 regardless of mixed precision)
    x = layers.Dropout(dropout_rate)(x)
    classification_output = layers.Dense(num_classes, activation='sigmoid', name='c_out', dtype='float32')(x)

    # Model definition. Resolved through `tf.keras` so the function does not
    # depend on the short global name `models`, which notebook cells can rebind.
    model = tf.keras.models.Model(inputs=[inp, inp_meta], outputs=[classification_output, embeddings])
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    # Only the classification head is trained; the embeddings output has no loss.
    loss = {'c_out': tf.keras.losses.BinaryCrossentropy(), 'embeddings': None}

    model.compile(loss=loss, optimizer=optimizer, metrics={'c_out': [pAUC_metric]})

    return model

def pred_embeds_new2(models_path, df_test, valid_gen, pca_, n_comp=32, image_size=300, model_n=3, oof_col_name='pred_oof0',tag_=3):
    """Predict with every fold model and attach the fold-averaged outputs to ``df_test``.

    For each weights file the network is rebuilt, loaded and run on
    ``valid_gen``. Both model outputs (classification score and embeddings)
    are averaged over the folds; the averaged embeddings are then projected
    with the already-fitted ``pca_``.

    Args:
        models_path: non-empty list of weight files, one per fold model.
        df_test: test metadata frame; one row per sample yielded by ``valid_gen``.
        valid_gen: data generator passed to ``model.predict``.
        pca_: fitted transformer exposing ``transform``.
        n_comp: number of PCA columns produced by ``pca_.transform``.
        image_size: side of the square image input.
        model_n: backbone variant forwarded to ``build_EfficientNetB0``.
        oof_col_name: name of the column receiving the averaged score.
        tag_: suffix used in the PCA column names (``pred_embed_{i}_b{tag_}``).

    Returns:
        ``(df_test, final_p, columns_to_save, principal_df_test)``: the enriched
        frame, the averaged raw embeddings, the PCA column names and the PCA frame.

    Raises:
        ValueError: if ``models_path`` is empty.
    """
    if len(models_path) == 0:
        raise ValueError("models_path must contain at least one weights file")

    pred_oof = 0
    pred_embeds = 0
    for path in models_path:
        with strategy.scope():
            print(f"Loading model from: {path}")
            model = build_EfficientNetB0(input_shape=(image_size, image_size, 3), model_n = model_n)
            model.load_weights(path)
            pred_ = model.predict(valid_gen, verbose=1)
            # Accumulate (not overwrite) so that the division below is a true mean.
            pred_oof = pred_oof + pred_[0]
            pred_embeds = pred_embeds + pred_[1]
        # Free memory before building the next fold model
        del model
        tf.keras.backend.clear_session()
        gc.collect()

    # Mean embedding over the fold models
    final_p = pred_embeds / len(models_path)
    print(f"Final prediction: {final_p.shape}")
    principal_components_test = pca_.transform(final_p)
    # One column per principal component; the frame shares df_test's index so
    # the concat below aligns row by row even when that index is not 0..n-1.
    columns_to_save = [f'pred_embed_{i}_b{tag_}' for i in range(n_comp)]
    principal_df_test = pd.DataFrame(data=principal_components_test, columns=columns_to_save, index=df_test.index)
    df_test = pd.concat([df_test, principal_df_test], axis=1)
    print(df_test.shape)

    # Mean classification score over the fold models
    final_pred = pred_oof / len(models_path)
    final_pred = np.squeeze(final_pred)
    df_test[oof_col_name] = final_pred

    return df_test, final_p,columns_to_save,principal_df_test

In [None]:
def oof_preds_test(df_test, df_train, path_, oof_name='pred_oof0', model_n=3, n_comp=32, verbose=True, image_size=300, tag_=3):
    """Compute the image-model features for the test set.

    Runs the five fold models stored under ``path_`` on the test images, then
    adds the PCA embedding columns, the averaged score and the embedding
    distance features.

    Args:
        df_test: test metadata frame.
        df_train: training frame, forwarded to the train/test distance helper.
        path_: folder prefix (with trailing slash) holding 'embedding_array.npy'
            and the 'MLP_fold_{k}_model_0.h5' weight files.
        oof_name: name of the score column added to ``df_test``.
        model_n: backbone variant used to rebuild the network.
        n_comp: forwarded to the embedding loader and the prediction helper.
        verbose: when True, display the head and shape of the result.
        image_size: side of the square image fed to the network.
        tag_: suffix used for the generated column names (``b{tag_}``); also
            passed to the embedding loader as its ``model_n``.

    Returns:
        ``(df_test, novas_colunas)``: the enriched frame and the list of added
        column names.
    """
    suffix = f'b{tag_}'

    train_embeds, pca, _pca_df, _pca_columns = carregar_oof_embeddings_np(
        path_embs=path_ + 'embedding_array.npy', n_comp=n_comp, verbose=verbose, model_n=tag_)

    fold_weights = [f'{path_}MLP_fold_{k}_model_0.h5' for k in range(5)]

    test_hdf = '/kaggle/input/isic-2024-challenge/test-image.hdf5'
    # df_test_meta, meta_features and n_meta_features are notebook-level globals.
    valid_gen = DataGenerator(
        df_test, shuffle=False, batch_size=256, mode='test',
        fp_hdf_=test_hdf, image_size=image_size,
        df_=df_test_meta, meta_features=meta_features, n_meta_features=n_meta_features)

    df_test, embeds_test, pca_cols, _pca_df_test = pred_embeds_new2(
        fold_weights, df_test, valid_gen, pca, n_comp=n_comp, image_size=image_size,
        model_n=model_n, oof_col_name=oof_name, tag_=tag_)

    # Embedding-vector distance features.
    df_test, control_cols = df_vetores_controle(
        df_test, embeds_test, oof_name, q_inf=0.1, col_group='patient_id', name_=suffix)
    _, df_test, distance_cols = calcular_distancias_treino_teste(
        df_train, df_test, train_embeds, embeds_test, is_train=False, name_=suffix)

    novas_colunas = pca_cols + [oof_name] + control_cols + distance_cols
    if verbose:
        print('Base de dados de treino OOF PRED')
        display(df_test.head(2))
        display(df_test.shape)

    return df_test, novas_colunas



In [None]:
%%time
    
# TEST ------------------------------------------------------------
# Build the test feature table with the same steps used for the training table.
# NOTE(review): this cell overwrites the globals num_cols_, cat_cols_ and
# category_encoder that the training cell defined.
df_test = pd.read_csv(f'/kaggle/input/isic-2024-challenge/test-metadata.csv')
test_hdf = '/kaggle/input/isic-2024-challenge/test-image.hdf5'  # not used in this cell
set_random_seed(42)

df_test, novas_colunas= oof_preds_test(df_test, df_train,
        path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b3_aug_sem_meta_EXP5_5FOLDS/', 
        oof_name='pred_oof0', 
        model_n=3, n_comp=32, verbose=True, image_size=300, tag_=3)
# df_test, novas_colunas2= oof_preds_test(df_test, df_train,
#         path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b3_aug_sem_meta/', 
#         oof_name='pred_oof1', 
#         model_n=3, n_comp=32, verbose=True, image_size=300, tag_=33)
# df_test, novas_colunas3= oof_preds_test(df_test, df_train,
#         path_='/kaggle/input/modelos-down/modelossalvos_auc_down_b5_aug_sem_meta/', 
#         oof_name='pred_oof2', 
#         model_n=5, n_comp=32, verbose=True, image_size=384, tag_=5)
# novas_colunas = novas_colunas1+novas_colunas2+novas_colunas3
# Feature engineering; the helper also returns the names of the columns it adds.
df_test, new_num_cols, new_cat_cols = feature_engineering(df_test.copy())
num_cols_ = num_cols+new_num_cols+novas_colunas
cat_cols_ = cat_cols+new_cat_cols

# Per-patient ratio features for the numeric columns (helper called with q_inf=0.01, q_sup=0.99).
df_test, new_feature_names1= calculate_ratio_to_patient_mean_capped2(df_test, 'patient_id', num_cols_, q_inf=0.01, q_sup=0.99)
# df_test, new_feature_names2 = cria_features_agregadas(df_test, num_cols_)
num_cols_ = num_cols_+new_feature_names1

# Encode the categorical columns.
# NOTE(review): judging by its name the helper fits a new encoder on the test
# frame, and the result replaces the `category_encoder` obtained from the
# training data - confirm test categories get the same codes as in training.
df_test, category_encoder = fit_encode_categorical_columns(df_test, cat_cols_)

# df_test, new_col_scores_svm=paciente_ultima_tentativa_svm(df_test, cluster_cols)
# df_test, new_col_scores_iso=paciente_ultima_tentativa_iso_forest(df_test, cluster_cols)

# Training frame restricted to the model features, shown for comparison.
print('Base de dados de treino COM FEATURE SELECTION INICIAL')
display(df_train[train_cols].head(2))
display(df_train[train_cols].shape)

# Test frame restricted to the same features. The printed label still says
# "treino" (training) although the frame displayed below is the test one.
print('Base de dados de treino COM FEATURE SELECTION INICIAL')
display(df_test[train_cols].head(2))
display(df_train[train_cols].shape, df_test[train_cols].shape)


# Pré-processamento

# TREINO

In [None]:
# df_train=df_train[df_train['copyright_license']=='CC-BY']
# df_train

#### lgbm

In [None]:
   
# Cross-validated LightGBM training, once per random seed.
# `result` maps seed -> mean score, parameters, fitted fold models and per-fold scores.
result = {}

for i in range(42,43,1):
    set_random_seed(i)

    lgb_params = {
        'objective': 'binary',
        'metrics': 'None',
        "random_state": i,
        "n_estimators":1000,
        'learning_rate':0.003,
        'num_leaves':20,
        'min_data_in_leaf':40,
        'bagging_freq': 1,
        'pos_bagging_fraction':0.75,
        'neg_bagging_fraction':0.05,
        'feature_fraction':0.57,
        'lambda_l1':0.27,
        'lambda_l2':1.0,
        "verbosity": -1,
#         "class_weight":'balanced',
#         "extra_trees": True
    }

    scores = []
    # Named `fold_models` so the cell does not rebind the `models` module
    # imported from tensorflow.keras.
    fold_models = []
    for fold in range(FOLDS):
        _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
        _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)

        # Training callbacks, rebuilt for every fit: some LightGBM releases keep
        # the early-stopping state inside the callback object, so reusing one
        # instance would carry the previous fold's best score into the next fold.
        callbacks = [
            log_evaluation(period=100),
            early_stopping(stopping_rounds=200, first_metric_only=True, verbose=True)
        ]

        model = lgb.LGBMRegressor(
#             **top_10_paramns[i]
            **lgb_params
        )
        model.fit(
            _df_train[train_cols], _df_train["target"],
            eval_set=[(_df_valid[train_cols], _df_valid["target"]), (_df_train[train_cols], _df_train["target"])], 
            eval_metric=comp_scorel,
            callbacks=callbacks
        )
        preds = model.predict(_df_valid[train_cols])
        score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
        print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
        scores.append(score)
        fold_models.append(copy.deepcopy(model))

    result[i] ={
        "scores_":np.mean(scores),
#         "lgb_params": top_10_paramns[i],
        "lgb_params": lgb_params,
        # Each model was already deep-copied when appended and the list is
        # rebuilt for every seed, so it can be stored as is.
        "models": fold_models,
        # fold1Score ... fold{FOLDS}Score, generated so that any FOLDS value works.
        **{f"fold{k + 1}Score": fold_score for k, fold_score in enumerate(scores)},
        "scores_total":scores,
    }
    print(result[i])
# Save the results for the ranking cell
with open('result_data.pkl', 'wb') as file:
    pickle.dump(result, file)

In [None]:
# File written by the LightGBM training cell.
filename = 'result_data.pkl'

# Reload the stored results (our own file; pickle must never be used on untrusted data).
with open(filename, 'rb') as file:
    loaded_result = pickle.load(file)

# Rank the runs by mean CV score, best first, and print the two best.
sorted_results = sorted(loaded_result.items(), key=lambda entry: entry[1]['scores_'], reverse=True)
banner = 50*'*'
for i, (key, value) in enumerate(sorted_results[:2]):
    print(banner)
    print(banner)
    print(f"Rank {i+1}:")
    print(f"ID: {key}")
    print(f"Score: {value['scores_']}")
    print(f"Parameters: {value['lgb_params']}")
    print(f"Models: {value['models']}")
    print(f"fold1score: {value['fold1Score']}")
    print(f"fold2score: {value['fold2Score']}")
    print(f"fold3score: {value['fold3Score']}")
    print(banner)
    print(banner)
    print()


# For every fold keep the model of the run that scored best on that fold.
# (The list is called top_3_models but holds `limit` models per fold.)
top_3_models = []
valor_fold = []
limit = 1
for i in range(FOLDS):
    ii = i+1
    fold_key = f'fold{ii}Score'
    sorted_results = sorted(loaded_result.items(), key=lambda entry: entry[1][fold_key], reverse=True)
    for key, value in sorted_results[:limit]:
        valor_fold.append(value[fold_key])
        top_3_models.append(value['models'][i])
        print()

display(len(top_3_models))
display(valor_fold)
display(np.mean(valor_fold))

In [None]:
# Average the feature importances of the selected LightGBM models and store
# them in ascending order of importance.
importances = np.mean([model.feature_importances_ for model in top_3_models], axis=0)
feature_names = df_train[train_cols].columns
df_imp = pd.DataFrame({"feature": feature_names, "importance": importances}).sort_values("importance").reset_index(drop=True)

In [None]:
# How do the new feature(s) perform?
# Author's note: "I would go with the > 20 in the index."
# The row limit is raised so the ranking (most important first) is shown in full.
pd.set_option('display.max_rows', 500)
df_imp.sort_values("importance", ascending=False)

In [None]:
# LightGBM test prediction: mean over the selected per-fold models.
preds_lgbm = np.mean([model.predict(df_test[train_cols]) for model in top_3_models], axis=0)

#### catboost

In [None]:
from sklearn.ensemble import VotingClassifier

import optuna
import catboost as cb
import lightgbm as lgb
import xgboost as xgb

OPTIMIZE_OPTUNA = False
SUBSAMPLE = False
SUBSAMPLE_RATIO = 0.5 # only effective if SUBSAMPLE=True
DISPLAY_FEATURE_IMPORTANCE = False

def objective(trial):
    """Optuna objective: mean cross-validated competition score of a CatBoost classifier.

    Args:
        trial: Optuna trial used to sample the CatBoost hyper-parameters.

    Returns:
        Mean of ``comp_score`` over the ``FOLDS`` validation folds (higher is
        better, so the study must be created with ``direction="maximize"``).
    """
    param = {
        "objective":         trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth":             trial.suggest_int("depth", 1, 12),
        "boosting_type":     trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type":    trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        # "task_type":       "GPU",
        # "used_ram_limit":  "3gb",
    }

    # Parameters that only exist for a specific bootstrap type.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    scores = []

    # Same fold count constant as the training loops of this notebook.
    for fold in range(FOLDS):
        _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
        _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
        gbm = cb.CatBoostClassifier(**param)
        gbm.fit(_df_train[train_cols], _df_train["target"], eval_set=[(_df_valid[train_cols], _df_valid["target"])], verbose=0, early_stopping_rounds=100)
        # The score is AUC-based, so rank by the positive-class probability
        # rather than by hard 0/1 labels (same call as the CatBoost training loop).
        preds = gbm.predict_proba(_df_valid[train_cols])[:, 1]
        score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
        scores.append(score)

    return np.mean(scores)

In [None]:

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=21, timeout=500)
# print("Number of finished trials: {}".format(len(study.trials)))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

In [None]:
import numpy as np
from catboost import MetricVisualizer
from sklearn.metrics import roc_auc_score

class CustomAUCMetric:
    """CatBoost custom eval metric: partial AUC above a true-positive rate of 0.80."""

    def is_max_optimal(self):
        """Tell CatBoost that larger metric values are better."""
        return True

    def get_final_error(self, error, weight):
        """Return the accumulated value unchanged; the weight is not used."""
        return error

    def evaluate(self, approxes, target, weight):
        """Compute the metric for one evaluation step.

        Args:
            approxes: list of prediction lists; only the first one is used.
            target: true labels.
            weight: accepted for the CatBoost interface but ignored.

        Returns:
            Tuple ``(partial_auc, 1)``: the metric value and a constant weight.
        """
        scores = np.array(approxes[0])
        labels = np.array(target)

        min_tpr = 0.80  # minimum true-positive rate of interest
        max_fpr = 1 - min_tpr

        # Flip labels and scores so the high-TPR region of the original problem
        # becomes the low-FPR region that roc_auc_score can restrict to.
        flipped_labels = abs(labels - 1)
        flipped_scores = 1.0 - scores

        # Standardised partial AUC as returned by scikit-learn.
        scaled = roc_auc_score(flipped_labels, flipped_scores, max_fpr=max_fpr)

        # Undo the standardisation to obtain the raw partial area.
        floor = 0.5 * max_fpr**2
        span = max_fpr - 0.5 * max_fpr**2
        partial_auc = floor + span / (1.0 - 0.5) * (scaled - 0.5)

        return partial_auc, 1

In [None]:
# Cross-validated CatBoost training: one classifier per fold, scored on the
# held-out fold with the competition metric.
cb_params = {
    'objective': 'Logloss',
    "random_state": 42,
    # "colsample_bylevel": 0.3, # 0.01, 0.1
    "iterations": 700,
    "learning_rate": 0.05,
    # NOTE(review): this is `cat_cols`, whereas the feature list was built from
    # the extended, encoded `cat_cols_` - confirm the shorter list is intended.
    "cat_features": cat_cols,
    "max_depth": 12,
    "l2_leaf_reg": 3,
    # NOTE(review): GPU training combined with a Python-side eval metric -
    # confirm the installed CatBoost version supports this.
    "task_type": "GPU",
    # "scale_pos_weight": 2,
    "verbose": 50,
    "eval_metric": CustomAUCMetric()
}
cb_scores = []
cb_models = []
for fold in range(FOLDS):
    # Rows whose "fold" value equals the current fold are held out for validation.
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(_df_train[train_cols], _df_train["target"],
          eval_set=(_df_valid[train_cols], _df_valid["target"]),
          early_stopping_rounds=200,
          plot=True)
    # Positive-class probability is what the AUC-based score ranks.
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    cb_scores.append(score)
    cb_models.append(model)

In [None]:
# Mean validation score over the CatBoost folds.
cb_score = np.mean(cb_scores)
print(f"CatBoost Score: {cb_score:.5f}")

In [None]:
# Average the feature importances of the CatBoost fold models and display the
# ranking, most important first. This rebinds `importances`, `feature_names`
# and `df_imp` from the LightGBM section.
importances = np.mean([model.feature_importances_ for model in cb_models], axis=0)
feature_names = df_train[train_cols].columns
df_imp = pd.DataFrame({"feature": feature_names, "importance": importances}).sort_values("importance").reset_index(drop=True)
pd.set_option('display.max_rows', 700)
df_imp.sort_values("importance", ascending=False)

In [None]:
# Persist the fold models, then predict the test set as the mean
# positive-class probability over all of them.
with open('cb_models.pkl', 'wb') as file:
    pickle.dump(cb_models, file)
cb_preds  = np.mean([model.predict_proba(df_test[train_cols])[:, 1] for model in cb_models],  0)
# cb_preds = np.mean([model.predict_proba(df_test[train_cols])[:, 1] for model in cb_models[:2]], 0)


# Fazer previsões

In [None]:
# Final blend: equal-weight average of the LightGBM and CatBoost test predictions.
preds = preds_lgbm * 0.5 + cb_preds * 0.5

In [None]:
# Fill the sample submission with the blended predictions.
# NOTE(review): the values are assigned by position, which assumes the sample
# submission lists the rows in the same order as df_test - confirm.
df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
df_sub["target"] = preds
df_sub

In [None]:
# Write the submission file to the working directory, without the index column.
df_sub.to_csv("submission.csv", index=False)

In [None]:
# Final visual check of the submission frame.
df_sub