In [1]:
import os
import gc
import time
import copy
import optuna

from pathlib import Path

import numpy as np, pandas as pd, polars as pl

from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingClassifier

from sklearn.utils import resample

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

import lightgbm as lgb, catboost as cb, xgboost as xgb

from optuna.samplers import TPESampler

from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
# from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import NearMiss, TomekLinks
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, VarianceThreshold
from sklearn.base import BaseEstimator, TransformerMixin

# IMAGE PREDICTIONS

In [2]:
# EfficientNetV1-B0 image model: run the inference script, passing the checkpoint path as its argument.
!python /kaggle/input/isic-script-inference-effnetv1b0-f313ae/main.py /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
# Rename submission.csv (presumably written by the script above) so the following inference
# cells, which move their own submission.csv, do not clobber it.
!mv submission.csv submission_effnetv1b0.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]


In [3]:
# "target_3" image model: run its stand-alone submission script (takes no arguments here).
!python /kaggle/input/isic-2024-pl-submission-script-and-preds/pl_submission.py
# Keep the result under a model-specific name; preprocess() reads submission_image3.csv.
!mv submission.csv submission_image3.csv

  df_train_meta = pd.read_csv(BASE_DATA_DIR + "train-metadata.csv")


In [4]:
# EfficientNet-B3 image model: run the inference script, passing the saved state dict as its argument.
!python /kaggle/input/lets-go-neural-8-image-only-inference-script/main.py /kaggle/input/lets-go-neural-8-image-only-state-dict/fullmodel2_epoch7loss0.133auc0.933pauc0.175.pth
# Keep the result under a model-specific name; preprocess() reads submission_effnetb3.csv.
!mv submission.csv submission_effnetb3.csv


Densenet models:  ['efficientnet_b3', 'efficientnet_b3_g8_gn', 'efficientnet_b3_gn', 'efficientnet_b3_pruned', 'tf_efficientnet_b3']
set()
set()
set()
set()
set()
train_size before: 401059
train_size after: 320848
n_neg: 320532, n_pos: 316 in train
n_neg: 80134, n_pos: 77 in val
train_size even after: 632
n_neg: 316, n_pos: 316 in train
n_neg: 80134, n_pos: 77 in val
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]


# DATA PREPROCESS

In [7]:
# Competition data location and the three CSV files read by this notebook.
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
test_path = root / 'test-metadata.csv'
subm_path = root / 'sample_submission.csv'

# Key column names: row identifier, binary label, and the grouping key used for CV splits.
id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5             # small epsilon added to some denominators in read_data()
sampling_ratio = 0.01  # sampling_strategy passed to RandomUnderSampler in the model pipelines
seed = 42              # random_state shared by samplers, models and CV splitter

# Raw numeric metadata columns used as features.
# The descriptions below, including the trailing '+' / '++' markers, appear to be copied from
# the dataset's column documentation (the markers are presumably footnote references).
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    # (no description given) presumably a combined L/B contrast; TODO confirm against the data dictionary
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Engineered numeric features. Each comment states the formula used in read_data();
# keep the two in sync when editing either.
new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2 / tbp_lv_perimeterMM**2
    'hue_contrast',                      # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',                # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',           # sqrt(tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2)
    'border_complexity',                 # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',              # sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',              # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',         # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',             # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',            # border_complexity + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',            # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',               # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',                  # sqrt((tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2) / 3)
    'color_shape_composite_index',       # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',             # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',          # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',        # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',         # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm**2 + age_approx**2)  NOTE: despite the name, nevi confidence is not used
    'color_asymmetry_index',             # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',           # tbp_lv_areaMM2 * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',           # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

# Categorical metadata columns; preprocess() later replaces them with one-hot columns.
cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
# Names of the per-patient normalised copies produced in read_data() for every numeric feature.
norm_cols = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
special_cols = ['count_per_patient']
# Image-model prediction columns attached in preprocess().
# Other image models tried earlier and currently disabled: target_eva02, target_edgenext.
image_cols = ["target_effnetv1b0","target_3","target_effnetb3"]

# image_cols are deliberately NOT part of feature_cols yet; they are appended in the
# "MODEL INITIALIZATION" section so that an image-free feature list can be kept as well.
feature_cols = num_cols + new_num_cols + cat_cols + norm_cols + special_cols

In [8]:
def read_data(path):
    """Load a metadata CSV with polars, engineer features, and return a pandas DataFrame.

    Steps, in order (later stages read columns created by earlier ones):
      1. Parse ``age_approx``: cast to string, map the literal ``'NA'`` to NaN, cast to Float64.
      2. Fill NaN in every Float64 column with that column's median, computed on this same
         file (so train and test are imputed independently).
      3. Add the engineered columns listed in ``new_num_cols`` plus ``combined_anatomical_site``
         (the latter is not part of ``num_cols``/``new_num_cols``/``cat_cols``).
      4. For every column in ``num_cols + new_num_cols`` add ``<col>_patient_norm`` =
         (value - patient mean) / (patient std + err), grouped by ``patient_id``.
      5. Add ``count_per_patient`` (number of rows per ``patient_id``).
      6. Cast ``cat_cols`` to polars Categorical, convert to pandas, index by ``id_col``.

    Relies on the module-level names ``num_cols``, ``new_num_cols``, ``cat_cols``, ``err``
    and ``id_col``. Only ``color_uniformity`` and the patient-norm columns add ``err`` to
    their denominator; the other ratio features divide by the raw column.

    Args:
        path: path of the CSV file to read (passed straight to ``pl.read_csv``).

    Returns:
        pandas.DataFrame indexed by ``id_col``.
    """
    return (
        pl.read_csv(path)
        .with_columns(
            # 'NA' strings become NaN so the column can be stored as Float64
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            # depends on border_complexity and lesion_shape_index created two stages above
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # NOTE: despite its name this is the Euclidean norm of (lesion diameter, age)
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            # per-patient z-score; err keeps the denominator non-zero when the patient std is 0
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
        .set_index(id_col)
    )

In [9]:
def _attach_image_preds(df, csv_path, source_col, dest_col):
    """Add one column of image-model predictions to ``df``, matched by row position.

    The prediction file is assumed to list rows in the same order as ``df``;
    no id column is consulted. A length mismatch is reported with a warning
    because the index-aligned assignment below would otherwise silently
    truncate the predictions or leave NaN in the unmatched rows.

    Args:
        df: frame to extend. Its index is replaced by a fresh RangeIndex.
        csv_path: CSV file holding the predictions.
        source_col: column of the CSV to copy.
        dest_col: name of the column created in the returned frame.

    Returns:
        A new DataFrame with a RangeIndex and the extra ``dest_col`` column.
    """
    import warnings

    preds = pd.read_csv(csv_path)
    if len(preds) != len(df):
        warnings.warn(
            f"{csv_path}: {len(preds)} prediction rows for {len(df)} data rows; "
            f"'{dest_col}' will be misaligned or contain NaN",
            stacklevel=2,
        )
    # Both sides now carry a RangeIndex, so the assignment pairs rows by position.
    df = df.reset_index(drop=True)
    df[dest_col] = preds[source_col]
    return df


def preprocess(df_train, df_test):
    """One-hot encode the categorical columns and attach image-model predictions.

    Side effects on module-level state:
      * ``feature_cols`` (mutated in place): the original categorical names are
        removed and the generated ``onehot_<i>`` names are appended.
      * ``cat_cols`` (rebound): becomes the list of ``onehot_<i>`` names.
      * The frames passed in receive the ``onehot_<i>`` columns in place.
    Because of these side effects the function is meant to be called once per kernel.

    Args:
        df_train: training frame produced by ``read_data``.
        df_test: test frame produced by ``read_data``.

    Returns:
        ``(df_train, df_test)`` as new frames with a RangeIndex (the original
        index is dropped) and the columns ``target_effnetv1b0``, ``target_3``
        and ``target_effnetb3`` added.
    """
    global cat_cols

    # Fit on train only; categories unseen in train encode as all-zero rows in test.
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])

    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    for frame in (df_train, df_test):
        frame[new_cat_cols] = encoder.transform(frame[cat_cols])
        frame[new_cat_cols] = frame[new_cat_cols].astype('category')

    # (feature column, train predictions CSV, its column, test predictions CSV, its column)
    image_sources = [
        (
            "target_effnetv1b0",
            "/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv",
            "target_effnetv1b0",
            "submission_effnetv1b0.csv",
            "target",
        ),
        (
            "target_3",
            "/kaggle/input/isic-2024-pl-submission-script-and-preds/train_preds.csv",
            "pred",
            "submission_image3.csv",
            "target",
        ),
        (
            "target_effnetb3",
            "/kaggle/input/final-train-preds-149/train_preds_149.csv",
            "target",
            "submission_effnetb3.csv",
            "target",
        ),
    ]
    for dest_col, train_csv, train_src, test_csv, test_src in image_sources:
        df_train = _attach_image_preds(df_train, train_csv, train_src, dest_col)
        df_test = _attach_image_preds(df_test, test_csv, test_src, dest_col)

    # Swap the raw categorical names for their one-hot replacements in the feature list.
    for col in cat_cols:
        feature_cols.remove(col)

    feature_cols.extend(new_cat_cols)
    cat_cols = new_cat_cols

    return df_train, df_test

In [10]:
def custom_metric(estimator, X, y_true):
    """Partial AUC above an 80% true-positive rate, usable as a scikit-learn scorer.

    Labels and scores are both flipped so that the region "TPR >= min_tpr" of the
    original ROC curve becomes the region "FPR <= max_fpr" of the flipped one,
    which ``roc_auc_score(..., max_fpr=...)`` can integrate. That call returns a
    standardised value, which is then mapped back to the raw area, whose maximum
    is ``max_fpr``.

    Args:
        estimator: fitted classifier exposing ``predict_proba``.
        X: feature matrix passed to ``estimator.predict_proba``.
        y_true: binary labels (0/1) aligned with ``X``.

    Returns:
        float: the un-normalised partial AUC.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    # Positive-class probabilities, flipped in one vectorised step. Casting to
    # float64 first keeps the subtraction in double precision whatever dtype
    # the estimator returns.
    y_hat = np.asarray(estimator.predict_proba(X)[:, 1], dtype=np.float64)
    v_pred = 1.0 - y_hat
    v_gt = abs(y_true - 1)

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # Undo the standardisation applied by roc_auc_score when max_fpr is given.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [11]:
# Load and feature-engineer both metadata files; the sample submission supplies the output index.
df_train = read_data(train_path)
df_test = read_data(test_path)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# One-hot encodes categoricals and attaches image predictions.
# NOTE: preprocess() also rewrites the module-level cat_cols / feature_cols, so run this cell once per kernel.
df_train, df_test = preprocess(df_train, df_test)

  df_eff = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")


In [12]:
# One-hot columns found to be least important in an earlier run of this notebook.
least_important_features = ['onehot_32', 'onehot_6', 'onehot_33', 'onehot_30', 'onehot_26', 'onehot_22', 'onehot_36', 'onehot_4']

# Written so the cell can be re-executed safely: dropping ignores columns that are
# already gone, and the lists are filtered instead of calling list.remove(), which
# raises once the name is no longer present.
df_train = df_train.drop(columns=least_important_features, errors='ignore')
# Slice assignment mutates the existing list objects, as the original remove() calls did.
cat_cols[:] = [col for col in cat_cols if col not in least_important_features]
feature_cols[:] = [col for col in feature_cols if col not in least_important_features]

# MODEL INITIALIZATION

In [13]:
# Tabular-only feature list (used by the LightGBM pipeline) and the full list including
# image-model predictions. Filtering by membership instead of copy + "+=" makes the cell
# safe to re-run: image columns can neither leak into the tabular-only list nor be
# appended to feature_cols twice.
feature_cols_without_image_cols = [col for col in feature_cols if col not in image_cols]
feature_cols[:] = feature_cols_without_image_cols + image_cols

class SelectColumns(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that keeps only a fixed set of columns.

    Args:
        columns: column labels used to index the input in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured columns."""
        selected = X[self.columns]
        return selected

In [14]:
import optuna
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

In [15]:
# --- LightGBM hyper-parameters ---
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           200,
    'boosting_type':    'gbdt',
    'random_state':     seed,
    'lambda_l1':        0.012889168767402828,
    'lambda_l2':        3.721594573574584e-05,
    'learning_rate':    0.03134634860324871,
    'max_depth':        5,
    'num_leaves':       99,
    'colsample_bytree': 0.9942718837995181,
    'colsample_bynode': 0.5049540538048654,
    'bagging_fraction': 0.8789058331092248,
    'bagging_freq':     4,
    'min_data_in_leaf': 20,
    'scale_pos_weight': 1.7052854467822236,
}

# Re-stated here (same values as the configuration cell) before the remaining
# parameter dicts and the pipelines are built.
sampling_ratio = 0.01
seed = 42

# --- CatBoost hyper-parameters ---
cb_params = {
    'loss_function':     'Logloss',
    'iterations':        211,
    'verbose':           False,
    'random_state':      seed,
    'max_depth':         8,
    'learning_rate':     0.05821860895001415,
    'scale_pos_weight':  2.139674953542652,
    'l2_leaf_reg':       7.604361144096182,
    'subsample':         0.6143681181742084,
    'min_data_in_leaf':  46,
    'cat_features':      cat_cols,
}

# --- XGBoost hyper-parameters ---
xgb_params = {
    'enable_categorical': True,
    'tree_method':        'hist',
    'random_state':       seed,
    'learning_rate':      0.03358291583962914,
    'lambda':             4.939464912891554,
    'alpha':              1.9362954932444323,
    'max_depth':          8,
    'subsample':          0.6448469794293958,
    'colsample_bytree':   0.7016885417642298,
    'colsample_bylevel':  0.6393079225027909,
    'colsample_bynode':   0.6680932948467931,
    'scale_pos_weight':   3.4553765452582335,
}


def _resampled_pipeline(classifier, keep_columns=None):
    """Wrap ``classifier`` in the over-/under-sampling steps shared by all three models.

    Steps: RandomOverSampler(0.003) -> RandomUnderSampler(sampling_ratio)
    -> optional SelectColumns(keep_columns) -> classifier.
    """
    steps = [
        ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
        ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ]
    if keep_columns is not None:
        steps.append(('filter', SelectColumns(keep_columns)))
    steps.append(('classifier', classifier))
    return ImbPipeline(steps)


# LightGBM sees only the tabular features; CatBoost and XGBoost receive every column passed in.
lgb_model = _resampled_pipeline(
    lgb.LGBMClassifier(**lgb_params),
    keep_columns=feature_cols_without_image_cols,
)
cb_model = _resampled_pipeline(cb.CatBoostClassifier(**cb_params))
xgb_model = _resampled_pipeline(xgb.XGBClassifier(**xgb_params))

# Unweighted soft vote (weights [0.30, 0.47, 0.28] were tried and are currently disabled).
estimator = VotingClassifier(
    [('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model)],
    voting='soft',
)

In [18]:
# import optuna
# from sklearn.model_selection import cross_val_score
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
# from sklearn.pipeline import Pipeline
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import VotingClassifier
# from imblearn.pipeline import Pipeline as ImbPipeline

# # Define the objective function for Optuna
# def objective(trial):
    
    
#     # Define the pipelines for each model
#     sampling_ratio = 0.01
#     seed =42
    
#     # LightGBM hyperparameters
# #     lgb_params = {
# #         'objective': 'binary',
# #         'verbosity': -1,
# #         'n_estimator': 200,
# #         'boosting_type': 'gbdt',
# #         'random_state': seed,
# #         'lambda_l1': trial.suggest_loguniform('lgb_lambda_l1', 1e-8, 10.0),
# #         'lambda_l2': trial.suggest_loguniform('lgb_lambda_l2', 1e-8, 10.0),
# #         'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 0.1),
# #         'max_depth': trial.suggest_int('lgb_max_depth', 3, 10),
# #         'num_leaves': trial.suggest_int('lgb_num_leaves', 31, 128),
# #         'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0),
# #         'colsample_bynode': trial.suggest_uniform('lgb_colsample_bynode', 0.4, 1.0),
# #         'bagging_fraction': trial.suggest_uniform('lgb_bagging_fraction', 0.5, 1.0),
# #         'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 10),
# #         'min_data_in_leaf': trial.suggest_int('lgb_min_data_in_leaf', 20, 100),
# #         'scale_pos_weight': trial.suggest_uniform('lgb_scale_pos_weight', 1.0, 5.0),
# #     }

# #     # CatBoost hyperparameters
# #     cb_params = {
# #         'loss_function': 'Logloss',
# #         'iterations': trial.suggest_int('cb_iterations', 100, 1000),
# #         'verbose': False,
# #         'random_state': seed,
# #         'max_depth': trial.suggest_int('cb_max_depth', 4, 10),
# #         'learning_rate': trial.suggest_loguniform('cb_learning_rate', 1e-4, 0.1),
# #         'scale_pos_weight': trial.suggest_uniform('cb_scale_pos_weight', 1.0, 5.0),
# #         'l2_leaf_reg': trial.suggest_uniform('cb_l2_leaf_reg', 1.0, 10.0),
# #         'subsample': trial.suggest_uniform('cb_subsample', 0.5, 1.0),
# #         'min_data_in_leaf': trial.suggest_int('cb_min_data_in_leaf', 10, 50),
# #         'cat_features': cat_cols,
# #     }

# #     # XGBoost hyperparameters
#     xgb_params = {
#         'enable_categorical': True,
#         'tree_method': 'hist',
#         'random_state': seed,
#         'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 0.1),
#         'lambda': trial.suggest_uniform('xgb_lambda', 1.0, 10.0),
#         'alpha': trial.suggest_uniform('xgb_alpha', 0.1, 5.0),
#         'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
#         'subsample': trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.5, 1.0),
#         'colsample_bylevel': trial.suggest_uniform('xgb_colsample_bylevel', 0.5, 1.0),
#         'colsample_bynode': trial.suggest_uniform('xgb_colsample_bynode', 0.5, 1.0),
#         'scale_pos_weight': trial.suggest_uniform('xgb_scale_pos_weight', 1.0, 5.0),
#     }
    
# #     sample_pipe = Pipeline([
# #         ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
# #         ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
# #     ])
    
    

# #     lgb_model = ImbPipeline([
# #         ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
# #         ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
# #         ('filter', SelectColumns(feature_cols_without_image_cols)),
# #         ('classifier', lgb.LGBMClassifier(**lgb_params)),
# #     ])

# #     cb_model = ImbPipeline([
# #         ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
# #         ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
# #         ('classifier', cb.CatBoostClassifier(**cb_params)),
# #     ])

#     xgb_model = ImbPipeline([
#         ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
#         ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
#         ('classifier', xgb.XGBClassifier(**xgb_params)),
#     ])

#     # Voting classifier
# #     estimator = VotingClassifier([
# #         ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
# #     ], voting='soft') # ,weights=[0.30,0.47,0.28])

#     # Cross-validation
#     X = df_train[feature_cols]
#     y = df_train[target_col]
#     groups = df_train[group_col]
#     cv = StratifiedGroupKFold(2, shuffle=True, random_state=seed)
    
    
#     val_score = cross_val_score(
#         estimator=xgb_model, 
#         X=X, y=y, 
#         cv=cv, 
#         groups=groups,
#         scoring=custom_metric,
#     )
#     print("Score: ",np.mean(val_score))

#     return np.mean(val_score)

# # Optuna study
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=15)

# # Output best trial
# print("Best trial:")
# trial = study.best_trial
# print(trial.values)
# print(trial.params)

# # Update the tuned parameters in your models
# best_params = trial.params

# # Extract and set the best params for each model
# # lgb_params.update({
# #     'lambda_l1': best_params['lgb_lambda_l1'],
# #     'lambda_l2': best_params['lgb_lambda_l2'],
# #     'learning_rate': best_params['lgb_learning_rate'],
# #     'max_depth': best_params['lgb_max_depth'],
# #     'num_leaves': best_params['lgb_num_leaves'],
# #     'colsample_bytree': best_params['lgb_colsample_bytree'],
# #     'colsample_bynode': best_params['lgb_colsample_bynode'],
# #     'bagging_fraction': best_params['lgb_bagging_fraction'],
# #     'bagging_freq': best_params['lgb_bagging_freq'],
# #     'min_data_in_leaf': best_params['lgb_min_data_in_leaf'],
# #     'scale_pos_weight': best_params['lgb_scale_pos_weight'],
# # })

# # cb_params.update({
# #     'iterations': best_params['cb_iterations'],
# #     'max_depth': best_params['cb_max_depth'],
# #     'learning_rate': best_params['cb_learning_rate'],
# #     'scale_pos_weight': best_params['cb_scale_pos_weight'],
# #     'l2_leaf_reg': best_params['cb_l2_leaf_reg'],
# #     'subsample': best_params['cb_subsample'],
# #     'min_data_in_leaf': best_params['cb_min_data_in_leaf'],
# # })

# xgb_params.update({
#     'learning_rate': best_params['xgb_learning_rate'],
#     'lambda': best_params['xgb_lambda'],
#     'alpha': best_params['xgb_alpha'],
#     'max_depth': best_params['xgb_max_depth'],
#     'subsample': best_params['xgb_subsample'],
#     'colsample_bytree': best_params['xgb_colsample_bytree'],
#     'colsample_bylevel': best_params['xgb_colsample_bylevel'],
#     'colsample_bynode': best_params['xgb_colsample_bynode'],
#     'scale_pos_weight': best_params['xgb_scale_pos_weight'],
# })

# # Now you can retrain the models with the updated hyperparameters

# xgb_model = ImbPipeline([
#     ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
#     ('classifier', xgb.XGBClassifier(**xgb_params)),
# ])
# # cb_model.set_params(classifier__**cb_params)
# # xgb_model.set_params(classifier__**xgb_params)
# estimator = VotingClassifier([
#     ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
# ], voting='soft') # ,weights=[0.30,0.47,0.28])

# CROSS VALIDATION

In [19]:
# Inputs for cross-validation: features, binary target and patient ids as the grouping key.
X = df_train[feature_cols]
y = df_train[target_col]
groups = df_train[group_col]
# 5 shuffled folds, stratified on the target while keeping each group within one fold.
cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

# Cross-validation is currently disabled; un-comment to score the ensemble with custom_metric.
# val_score = cross_val_score(
#     estimator=estimator, 
#     X=X, y=y, 
#     cv=cv, 
#     groups=groups,
#     scoring=custom_metric,
# )

# np.mean(val_score), val_score


In [20]:
# Fit the voting ensemble on the full training set (no hold-out).
X, y = df_train[feature_cols], df_train[target_col]

estimator.fit(X, y)



In [21]:
import pickle
# Persist the fitted ensemble to the working directory.
with open('train_meta.pkl','wb') as f:
    pickle.dump(estimator,f)

In [22]:
# Positive-class probability for every test row.
# NOTE(review): predict_proba returns a plain array, so values are written by row position;
# this assumes sample_submission.csv and test-metadata.csv list isic_id in the same order — confirm.
df_subm['target'] = estimator.predict_proba(df_test[feature_cols])[:, 1]

df_subm.to_csv('submission.csv')
df_subm.head()

Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,0.248508
ISIC_0015729,0.225606
ISIC_0015740,0.317654
