# Ensemble CNNs + CatBoost + LGBM models

What to find in this notebook:
- Reload all trained models here.
- Load in tabular as well as image data in test dataset.
- Setup ensemble model.
- Evaluate models with appropriate data.
- Create submission.csv for Kaggle submission.

## Imports

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import torchvision.transforms as transforms
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

import cv2
import numpy as np
import h5py
from tqdm import tqdm
import io
import random
import pandas as pd
from PIL import Image


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


  warn(


In [2]:
SEED = 111
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

ROOT_DIR = "../data"
TRAIN_CSV = f"{ROOT_DIR}/train-metadata.csv"
TRAIN_HDF = f"{ROOT_DIR}/train-image.hdf5"
TEST_CSV = f'{ROOT_DIR}/test-metadata.csv'
TEST_HDF = f'{ROOT_DIR}/test-image.hdf5'
SAMPLE = f'{ROOT_DIR}/sample_submission.csv'

In [3]:
CONFIG = {
    "seed": 42,
    "epochs": 20,
    "img_size": 224,
    "train_batch_size": 150,
    "valid_batch_size": 200,
    "learning_rate": 1e-5,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    "fold" : 0,
    "n_fold": 5,
    "n_accumulate": 1,
    "device": device,

    }

## Competitive Score metric

In [4]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

def custom_lgbm_metric(y_true, y_hat):
    # TODO: Refactor with the above.
    min_tpr = 0.80
    v_gt = abs(y_true-1)
    v_pred = np.array([1.0 - x for x in y_hat])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return "pauc80", partial_auc, True

## Init CNN Architectures

In [5]:
"""
RESNET152
"""

class CustomResNet152(nn.Module):
    def __init__(self):
        super(CustomResNet152, self).__init__()
        # Load a pre-trained ResNet-152 model
        self.base_model = models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V1)
        
        # remove last fully connected layer for our purposes
        self.features = nn.Sequential(*list(self.base_model.children())[:-2])

        # Classifier that includes flattening the feature map and linear layer for class prediction
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(2048, 1),  # output single value for prob calculation
            nn.Sigmoid()  # sigmoid activation for probability
        )
    
    def forward(self, x):
        # extract features
        x = self.features(x)
        # classify features
        output = self.classifier(x)
        return output

model1 = CustomResNet152()

In [6]:
"""
RESNET50
"""

class CustomResNet50(nn.Module):
    def __init__(self):
        super(CustomResNet50, self).__init__()
        # Load a pre-trained ResNet-152 model
        self.base_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        
        # remove last fully connected layer for our purposes
        self.features = nn.Sequential(*list(self.base_model.children())[:-2])

        # Classifier that includes flattening the feature map and linear layer for class prediction
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(2048, 1),  # output single value for prob calculation
            nn.Sigmoid()  # sigmoid activation for probability
        )
    
    def forward(self, x):
        # extract features
        x = self.features(x)
        # classify features
        output = self.classifier(x)
        return output

model2 = CustomResNet50()

In [7]:
"""
MobileNetV2
"""

class CustomMobileNetV2(nn.Module):
    def __init__(self):
        super(CustomMobileNetV2, self).__init__()
        self.base_model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
        
        self.features = self.base_model.features
        
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(1280, 1),
            nn.Sigmoid()  
        )
    
    def forward(self, x):
        x = self.features(x)
        output = self.classifier(x)
        return output

model3 = CustomMobileNetV2()

In [8]:
"""
MNASNet-1.0
"""

class CustomMNASNet(nn.Module):
    def __init__(self):
        super(CustomMNASNet, self).__init__()
        self.base_model = models.mnasnet1_0(weights=models.MNASNet1_0_Weights.IMAGENET1K_V1)
        
        self.features = self.base_model.layers
        
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(1280, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.features(x)
        output = self.classifier(x)
        return output
    
model4 = CustomMNASNet()

In [9]:
"""
EfficientNetB4
"""

class CustomEfficientNetB4(nn.Module):
    def __init__(self):
        super(CustomEfficientNetB4, self).__init__()
        self.base_model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.IMAGENET1K_V1)
        
        self.base_model.classifier = nn.Sequential(
            nn.Dropout(p=0.4), 
            nn.Linear(self.base_model.classifier[1].in_features, 1),
            nn.Sigmoid() 
        )
    
    def forward(self, x):
        output = self.base_model(x)
        return output
    
model5 = CustomEfficientNetB4()

In [10]:
"""
DenseNet121
"""

class CustomDenseNet121(nn.Module):
    def __init__(self):
        super(CustomDenseNet121, self).__init__()
        self.base_model = models.densenet121(weights=models.DenseNet121_Weights.IMAGENET1K_V1)
        
        self.base_model.classifier = nn.Sequential(
            nn.Linear(1024, 1), 
            nn.Sigmoid()
        )
    
    def forward(self, x):
        output = self.base_model(x)
        return output

model6 = CustomDenseNet121()

## Load CNNs in

In [11]:
model1.load_state_dict(torch.load('./output/res152_ISIC_best.pth', map_location=torch.device('cuda')))
model2.load_state_dict(torch.load('./output/res50_ISIC_best.pth', map_location=torch.device('cuda')))
model3.load_state_dict(torch.load('./output/mobileV2_ISIC_best.pth', map_location=torch.device('cuda')))
model4.load_state_dict(torch.load('./output/mnas1_0_ISIC_best.pth', map_location=torch.device('cuda')))
model5.load_state_dict(torch.load('./output/effB4_ISIC_best.pth', map_location=torch.device('cuda')))
model6.load_state_dict(torch.load('./output/Dense121_ISIC_best.pth', map_location=torch.device('cuda')))

model1 = model1.to(device)
model2 = model2.to(device)
model3 = model3.to(device)
model4 = model4.to(device)
model5 = model5.to(device)
model6 = model6.to(device)

## Load CatBoost and LGBM in

In [12]:
"""
Catboost
"""

model7 = CatBoostClassifier()
model7.load_model("./output/catboost_model.cbm")

<catboost.core.CatBoostClassifier at 0x1352aa99b70>

In [13]:
"""
LGBM
"""
# Load the Booster model
model8 = lgb.Booster(model_file="./output/lightgbm_model.txt")


## Reinput data

In [14]:
"""
import just the testing dataset
"""

def read_images_from_hdf5(file_path):
    images = {}
    try:
        with h5py.File(file_path, 'r') as file:
            for key in tqdm(file.keys(), desc="Reading Files"):
                try:
                    image_data = file[key][()]
                    image = Image.open(io.BytesIO(image_data))
                    images[key] = image
                except Exception as e:
                    print(f"Error! from {key}: {e}")
    except Exception as e:
        print(f"Error occured while reading files : {e}")
    
    return images
train_images = read_images_from_hdf5(TRAIN_HDF)
test_images = read_images_from_hdf5(TEST_HDF)
train_metadata = pd.read_csv(TRAIN_CSV)
test_metadata = pd.read_csv(TEST_CSV)

Reading Files: 100%|██████████| 401059/401059 [02:19<00:00, 2880.98it/s]
Reading Files: 100%|██████████| 3/3 [00:00<00:00, 3007.39it/s]
  train_metadata = pd.read_csv(TRAIN_CSV)


In [15]:
# Define oversampling and undersampling strategy
oversample = RandomOverSampler(sampling_strategy=0.003)  
undersample = RandomUnderSampler(sampling_strategy=0.9)  

# Create a pipeline
pipeline = Pipeline([
    ('oversample', oversample),
    ('undersample', undersample)
])

# Fit and transform the data
X_sample, y_sample = pipeline.fit_resample(train_metadata.drop(["target"],axis=1),train_metadata["target"])
X_sample["target"] = y_sample

In [16]:
"""
import test image dataloaders for CNN evaluation
"""

def remove_hair(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
    blackhat = cv2.morphologyEx(gray,cv2.MORPH_BLACKHAT,kernel)

    _, thresh = cv2.threshold(blackhat, 10 ,255, cv2.THRESH_BINARY)
    inpainted_image = cv2.inpaint(image, thresh, 1, cv2.INPAINT_TELEA)
    return inpainted_image


class ISIC_2024(Dataset):
    def __init__(self,pil_images,metadata,transform=None,test=False):
        self.pil_images = pil_images
        self.metadata = metadata
        self.transform = transform
        self.test= test
        
    def __len__(self):
        return len(self.metadata)
    
    # This function from https://www.kaggle.com/competitions/isic-2024-challenge/discussion/519735
    def remove_hair(image):
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
        blackhat = cv2.morphologyEx(gray,cv2.MORPH_BLACKHAT,kernel)

        _, thresh = cv2.threshold(blackhat, 10 ,255, cv2.THRESH_BINARY)
        inpainted_image = cv2.inpaint(image, thresh, 1, cv2.INPAINT_TELEA)
        return inpainted_image
    
    def __getitem__(self,idx):
        isic_id = self.metadata.iloc[idx,0]
        cleaned_image = remove_hair(np.array(self.pil_images[isic_id]))
        image = Image.fromarray(cleaned_image)
        if self.transform:
            image = self.transform(image)
        if self.test:
            return image, isic_id
        label = self.metadata.iloc[idx,-1]
        return image,label,isic_id

train_transforms = transforms.Compose([
    transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
    transforms.RandomHorizontalFlip(p=0.5), 
    transforms.RandomVerticalFlip(p=0.5),    
    transforms.RandomRotation(20),           
    transforms.ColorJitter(brightness=0.4, contrast=0.5, saturation=0.2, hue=0.1),
    transforms.ToTensor(),                   
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # commonly used mean and std calculated from ImageNet
])

test_transforms = transforms.Compose([
     transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # commonly used mean and std calculated from ImageNet
])

In [17]:
train_dataset = ISIC_2024(train_images, X_sample, transform = train_transforms)
test_dataset = ISIC_2024(test_images, test_metadata, transform = test_transforms, test = True)
train_loader = DataLoader(train_dataset, batch_size=32, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, pin_memory=True)

In [24]:
df_train = pd.read_csv("../data/train-metadata.csv")
df_test = pd.read_csv("../data/test-metadata.csv")

def feature_engineering(df):
    df["lesion_size_ratio"]              = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"]             = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"]                   = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"]             = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"]        = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"]              = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"]               = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    
    df["3d_position_distance"]           = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
    df["perimeter_to_area_ratio"]        = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["area_to_perimeter_ratio"]        = df["tbp_lv_areaMM2"] / df["tbp_lv_perimeterMM"]
    df["lesion_visibility_score"]        = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["combined_anatomical_site"]       = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"]    = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["consistency_symmetry_border"]    = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"] / (df["tbp_lv_symm_2axis"] + df["tbp_lv_norm_border"])
    
    df["color_consistency"]              = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    df["consistency_color"]              = df["tbp_lv_stdL"] * df["tbp_lv_Lext"] / (df["tbp_lv_stdL"] + df["tbp_lv_Lext"])
    df["size_age_interaction"]           = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"]      = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"]          = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"]         = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"]           = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    
    df["log_lesion_area"]                = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"]         = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["mean_hue_difference"]            = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"]               = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"]    = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"]          = np.arctan2(df_train["tbp_lv_y"], df_train["tbp_lv_x"])
    df["overall_color_difference"]       = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"]     = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4
    df["color_variance_ratio"]           = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"]       = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"]      = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    df["color_asymmetry_index"]          = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    
    df["3d_volume_approximation"]        = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"]                    = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"]        = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"]            = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"]        = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    df["index_age_size_symmetry"]        = df["age_approx"] * df["tbp_lv_areaMM2"] * df["tbp_lv_symm_2axis"]

    
    new_num_cols = [
        "lesion_size_ratio",             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
        "lesion_shape_index",            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
        "hue_contrast",                  # tbp_lv_H                - tbp_lv_Hext              abs
        "luminance_contrast",            # tbp_lv_L                - tbp_lv_Lext              abs
        "lesion_color_difference",       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
        "border_complexity",             # tbp_lv_norm_border      + tbp_lv_symm_2axis
        "color_uniformity",              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max
        
        "3d_position_distance",          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
        "perimeter_to_area_ratio",       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
        "area_to_perimeter_ratio",       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
        "lesion_visibility_score",       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
        # "combined_anatomical_site"      # anatom_site_general     + "_" + tbp_lv_location ! categorical feature
        "symmetry_border_consistency",   # tbp_lv_symm_2axis       * tbp_lv_norm_border
        "consistency_symmetry_border",   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)
        
        "color_consistency",             # tbp_lv_stdL             / tbp_lv_Lext
        "consistency_color",             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
        "size_age_interaction",          # clin_size_long_diam_mm  * age_approx
        "hue_color_std_interaction",     # tbp_lv_H                * tbp_lv_color_std_mean
        "lesion_severity_index",         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
        "shape_complexity_index",        # border_complexity       + lesion_shape_index
        "color_contrast_index",          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm
        
        "log_lesion_area",               # tbp_lv_areaMM2          + 1  np.log
        "normalized_lesion_size",        # clin_size_long_diam_mm  / age_approx
        "mean_hue_difference",           # tbp_lv_H                + tbp_lv_Hext    / 2
        "std_dev_contrast",              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
        "color_shape_composite_index",   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
        "3d_lesion_orientation",         # tbp_lv_y                , tbp_lv_x  np.arctan2
        "overall_color_difference",      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3
        
        "symmetry_perimeter_interaction",# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
        "comprehensive_lesion_index",    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
        "color_variance_ratio",          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
        "border_color_interaction",      # tbp_lv_norm_border      * tbp_lv_norm_color
        "size_color_contrast_ratio",     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
        "age_normalized_nevi_confidence",# tbp_lv_nevi_confidence  / age_approx
        "color_asymmetry_index",         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max
        
        "3d_volume_approximation",       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
        "color_range",                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
        "shape_color_consistency",       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
        "border_length_ratio",           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
        "age_size_symmetry_index",       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
         #"index_age_size_symmetry",      # age_approx              * sqrt(tbp_lv_areaMM2 * tbp_lv_symm_2axis)
        "index_age_size_symmetry",       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis

    ]
    
    new_cat_cols = ["combined_anatomical_site"]
    
    return df, new_num_cols, new_cat_cols

num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].median())
df_test [num_cols] = df_test [num_cols].fillna(df_train[num_cols].median())

df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
df_test, _, _                        = feature_engineering(df_test.copy())

num_cols += new_num_cols

# anatom_site_general
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"] + new_cat_cols
#train_cols = num_cols + cat_cols
train_cols = num_cols + cat_cols

category_encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
    encoded_missing_value=-1,
)

X_cat1 = category_encoder.fit_transform(df_test[cat_cols])
X_cat2 = category_encoder.fit_transform(df_train[cat_cols])
for c, cat_col in enumerate(cat_cols):
    df_test[cat_col] = X_cat1[:, c]
    df_train[cat_col] = X_cat2[:, c]
    

  df_train = pd.read_csv("../data/train-metadata.csv")


## Ensemble model (Averaging)

There are 8 models to ensemble through averaging predictions. 

In [43]:
cnn_models = [model1, model2, model3, model4, model5, model6]

def get_test_predictions(dataloader, model, device, model_type='cnn'):
    predictions = []
    if model_type == 'cnn':
        model.eval()
        with torch.no_grad():
            for data, _ in dataloader:
                data = data.to(device)
                pred = model(data).cpu().numpy()
                predictions.append(pred)
    elif model_type == 'cat':
        pred = model.predict_proba(dataloader)[:, 1]
        predictions.append(pred)
    elif model_type == 'lgbm':
        pred = model.predict(dataloader, raw_score=False)
        predictions.append(pred)
    return np.concatenate(predictions, axis=0)


In [64]:
print('start')
# Get CNN predictions
cnn_preds_list = [get_test_predictions(test_loader, model, device, model_type='cnn') for model in cnn_models]
cnn_preds_list = np.stack(cnn_preds_list, axis=1).reshape(-1, len(cnn_models)) #reshape for concat later
print('combined cnn preds')


start
combined cnn preds


In [67]:
# Get tabular model predictions
catboost_preds = get_test_predictions(df_test[train_cols], model7, device, model_type='cat').reshape(-1, 1)
print('catboost done')
lgbm_preds = get_test_predictions(df_test[train_cols], model8, device, model_type='lgbm').reshape(-1, 1)
print('lgbm done')
# Combine all predictions into a feature matrix
ensemble_preds = np.column_stack([cnn_preds_list, catboost_preds, lgbm_preds])


catboost done
lgbm done


In [80]:
#prep submission
df_subm = pd.read_csv('../data/sample_submission.csv', index_col='isic_id')

# weighted average
weights = [0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.3, 0.3]  # Example weights
df_subm['target'] = np.average(ensemble_preds, axis=1, weights=weights)

#output submission csv file
df_subm.to_csv('./output/submission.csv')

# TESTING

In [None]:
def eval_cnn(model, dataloader, device):

    #init eval mode
    model.eval()

    all_preds = []
    all_ids = []

    for data, id in tqdm(dataloader):
        
        #init data and target into cuda
        data = data.to(device)

        #predict using input data
        curr_pred = model(data)
        print(curr_pred)
        _, predicted = torch.max(curr_pred, 1) 

        #store preds and ids
        all_preds.append(predicted.cpu())
        all_ids.extend(id)
    return predicted

In [None]:
eval_cnn(model1, test_loader, device)

100%|██████████| 1/1 [00:00<00:00,  5.04it/s]

tensor([[0.0158],
        [0.0173],
        [0.0021]], device='cuda:0', grad_fn=<SigmoidBackward0>)





tensor([0, 0, 0], device='cuda:0')

In [None]:
test1 = model7.predict_proba(df_test[train_cols])[:, 1]
print(test1)


[4.15957845e-05 3.14439194e-05 9.77989859e-05]


In [23]:
model8.predict(df_test[train_cols], raw_score=False)

NameError: name 'train_cols' is not defined

In [None]:
for images, label, ids in train_loader:
    print(ids)

['ISIC_0015670', 'ISIC_0015845', 'ISIC_0015864', 'ISIC_0015902', 'ISIC_0024200', 'ISIC_0035502', 'ISIC_0051648', 'ISIC_0051665', 'ISIC_0051710', 'ISIC_0051758', 'ISIC_0051812', 'ISIC_0051822', 'ISIC_0051896', 'ISIC_0051897', 'ISIC_0051958', 'ISIC_0051983', 'ISIC_0052003', 'ISIC_0052004', 'ISIC_0052026', 'ISIC_0052042', 'ISIC_0052068', 'ISIC_0052094', 'ISIC_0052109', 'ISIC_0052122', 'ISIC_0052164', 'ISIC_0052205', 'ISIC_0052213', 'ISIC_0052220', 'ISIC_0052231', 'ISIC_0052241', 'ISIC_0052259', 'ISIC_0052270']
['ISIC_0052310', 'ISIC_0052313', 'ISIC_0052328', 'ISIC_0052332', 'ISIC_0052355', 'ISIC_0052357', 'ISIC_0052367', 'ISIC_0061318', 'ISIC_0062556', 'ISIC_0065755', 'ISIC_0067881', 'ISIC_0068212', 'ISIC_0070972', 'ISIC_0071851', 'ISIC_0071852', 'ISIC_0073261', 'ISIC_0073270', 'ISIC_0073301', 'ISIC_0073316', 'ISIC_0073364', 'ISIC_0073396', 'ISIC_0073412', 'ISIC_0073426', 'ISIC_0073467', 'ISIC_0073505', 'ISIC_0073511', 'ISIC_0073521', 'ISIC_0073522', 'ISIC_0073555', 'ISIC_0073642', 'ISIC_

KeyboardInterrupt: 