In [1]:
!pip install catboost
!pip install xgboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.

In [2]:
import numpy as np
import pandas as pd
import pandas.api.types
import matplotlib.pyplot as plt
import polars as pl

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.ensemble import VotingClassifier

import optuna
import catboost as cb
import xgboost as xgb


In [3]:
%%capture
!unzip /content/drive/MyDrive/Colab\ Notebooks/SkinCancer_ISIC/isic-2024-challenge.zip

In [4]:
!unzip /content/drive/MyDrive/Colab\ Notebooks/SkinCancer_ISIC/train_preds.csv.zip

Archive:  /content/drive/MyDrive/Colab Notebooks/SkinCancer_ISIC/train_preds.csv.zip
  inflating: train_preds.csv         


In [5]:
# Base locations of the input files, kept in one place so they are easy to change.
CONTENT_DIR = "/content"
DRIVE_DIR = "/content/drive/MyDrive/Colab Notebooks/SkinCancer_ISIC"

# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning on
# this read (presumably a mixed-type DtypeWarning -- TODO confirm).
df_train = pd.read_csv(f"{CONTENT_DIR}/train-metadata.csv", low_memory=False)
df_test = pd.read_csv(f"{CONTENT_DIR}/test-metadata.csv")

# Predictions of other models, used later as extra input features.
df_eff = pd.read_csv(f"{DRIVE_DIR}/train_effnetv1b0.csv", usecols=["target_effnetv1b0"])
df_eva = pd.read_csv(f"{DRIVE_DIR}/train_eva02.csv", usecols=["target_eva02"])
df_image_3 = pd.read_csv(f"{CONTENT_DIR}/train_preds.csv")

  df_train = pd.read_csv("/content/train-metadata.csv")


In [6]:

# --- Notebook switches -------------------------------------------------------

# Gate for the Optuna search cell (that cell is currently commented out).
OPTIMIZE_OPTUNA: bool = False

# Presumably read by a later feature-importance cell -- not used in the cells shown here.
DISPLAY_FEATURE_IMPORTANCE: bool = True

# When True, the negative (majority) class is undersampled before the folds are built.
SUBSAMPLE: bool = False
# Fraction of negative rows kept; only has an effect when SUBSAMPLE is True.
SUBSAMPLE_RATIO: float = 0.5

**My current understanding of the code:**
The code first computes a set of engineered features and adds them to the training data. It imputes NaN values, encodes categorical values, and loads predictions from other models as additional features for the current model.
`num_cols` keeps track of the columns that should be treated as numerical values.
`cat_cols` keeps track of the categorical columns.

In [7]:


def feature_engineering(df):
    """Add hand-crafted lesion features to ``df`` and report their names.

    Every feature is derived only from columns of the frame passed in, so the
    function gives consistent results for the train frame, the test frame, or
    any other frame with the same metadata columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Lesion metadata containing the ``tbp_lv_*``, ``clin_size_long_diam_mm``,
        ``age_approx`` and ``anatom_site_general`` columns read below. The frame
        is modified in place (callers pass a copy when they need the original).

    Returns
    -------
    tuple
        ``(df, new_num_cols, new_cat_cols)``: the same frame with the new
        columns added, the names of the new numerical columns, and the names
        of the new categorical columns.

    Notes
    -----
    Several features are plain ratios; a zero denominator therefore produces
    ``inf``/``NaN`` values which downstream code has to handle.
    """
    # New features to try...
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    # String concatenation: the only categorical feature created here.
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]

    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    # Despite its name this is the MEAN of the inner and outer hue, not a difference.
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    # Must use the frame passed in (not a global training frame), otherwise the
    # test rows would receive index-aligned values from the training data.
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    # Perimeter divided by the circumference of a circle with the same area.
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    # Until here.

    # Names of the numerical columns created above, in creation order.
    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index",
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",

        "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
    ]
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols

# Raw numerical metadata columns used as model inputs.
num_cols = [
    'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
    'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean',
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
    'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
]

# Impute missing numerical values in BOTH frames with the training medians,
# computed once so train and test are guaranteed to use the same values.
train_medians = df_train[num_cols].median()
df_train[num_cols] = df_train[num_cols].fillna(train_medians)
df_test[num_cols] = df_test[num_cols].fillna(train_medians)

# Add the engineered features; copies are passed so the imputed frames above
# are not modified by the function.
df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
df_test, _, _ = feature_engineering(df_test.copy())
num_cols += new_num_cols

# anatom_site_general
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"] + new_cat_cols
train_cols = num_cols + cat_cols

df_eff = df_eff[["target_effnetv1b0"]]
df_eva = df_eva[["target_eva02"]]

# The prediction files are attached row by row (index-aligned). A row-count
# mismatch would silently produce NaN / misaligned features, so fail loudly.
for preds_name, preds_frame in (
    ("df_eff", df_eff),
    ("df_eva", df_eva),
    ("df_image_3", df_image_3),
):
    if len(preds_frame) != len(df_train):
        raise ValueError(
            f"{preds_name} has {len(preds_frame)} rows but df_train has {len(df_train)}; "
            "cannot attach its predictions row by row"
        )

df_train["target_effnetv1b0"] = df_eff["target_effnetv1b0"]
df_train["target_eva02"] = df_eva["target_eva02"]
df_train["target_3"] = df_image_3["pred"]

train_cols += ["target_effnetv1b0","target_eva02", "target_3"]

# Integer-encode the categorical columns: missing values become -1 and
# categories not seen during fit become -2.
category_encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
    encoded_missing_value=-1,
)

# The encoder is fitted on the training frame only. df_test is NOT encoded
# here; use category_encoder.transform on it before predicting.
X_cat = category_encoder.fit_transform(df_train[cat_cols])
df_train[cat_cols] = X_cat

In [8]:
# Display the column index of df_train as it stands at this point.
df_train.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [9]:
# Number of cross-validation folds; the training loops below iterate over it.
N_SPLITS = 5
# Stratified on the target and grouped by patient, so all rows of one patient
# land in the same fold.
gkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

if SUBSAMPLE:
    # Keep every positive row and a random fraction of the negative rows,
    # then shuffle and rebuild a clean 0..n-1 index.
    df_pos = df_train[df_train["target"] == 1]
    df_neg = df_train[df_train["target"] == 0]
    df_neg = df_neg.sample(frac=SUBSAMPLE_RATIO, random_state=42)
    df_train = pd.concat([df_pos, df_neg]).sample(frac=1.0, random_state=42).reset_index(drop=True)

# split() yields POSITIONAL row indices, so the fold ids are collected in a
# positional array instead of being written through label-based .loc; this
# stays correct even if df_train does not have a default RangeIndex.
fold_ids = np.full(len(df_train), -1, dtype=np.int64)
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    fold_ids[val_idx] = idx
df_train["fold"] = fold_ids

In [15]:
#comp metrics
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, min_tpr: float=0.80):
    """Partial AUC computed on flipped labels and flipped predictions.

    Labels and scores are both mirrored (``|y - 1|`` and ``1 - p``) and
    ``roc_auc_score`` is evaluated with ``max_fpr = |1 - min_tpr|``. The
    standardized value it returns (range [0.5, 1.0]) is then mapped linearly
    onto the range [0.5 * max_fpr**2, max_fpr].

    Parameters
    ----------
    solution : pandas.DataFrame
        Ground-truth labels; only ``.values`` is used.
    submission : pandas.DataFrame
        Predicted scores; only ``.values`` is used.
    min_tpr : float
        Used to derive ``max_fpr`` as ``|1 - min_tpr|``.

    Returns
    -------
    float
        The rescaled partial AUC.
    """
    flipped_truth = abs(np.asarray(solution.values) - 1)
    flipped_scores = 1.0 - np.asarray(submission.values)
    max_fpr = abs(1 - min_tpr)

    standardized_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    lower_bound = 0.5 * max_fpr**2
    return lower_bound + (max_fpr - lower_bound) / (1.0 - 0.5) * (standardized_auc - 0.5)


In [11]:
# def objective(trial):
#     param = {
#         "objective": "binary",
#         # "metric": "custom",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 256),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#         "device": "gpu"
#     }
#     scores = []
#     for fold in range(N_SPLITS):
#         _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
#         _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
#         dtrain = lgb.Dataset(_df_train[train_cols], label=_df_train["target"])
#         gbm = lgb.train(param, dtrain)
#         preds = gbm.predict(_df_valid[train_cols])
#         score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
#         scores.append(score)
#     return np.mean(scores)

In [12]:
# if OPTIMIZE_OPTUNA:
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=20)

#     print("Number of finished trials: {}".format(len(study.trials)))

#     print("Best trial:")
#     trial = study.best_trial

#     print("  Value: {}".format(trial.value))

#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))


In [38]:


# CatBoost hyper-parameters; the comments name the closest LightGBM settings.
cat_params = {
    "objective": "Logloss",  # Equivalent to "binary" in LightGBM
    "iterations": 200,  # Equivalent to "n_estimators" in LightGBM
    'learning_rate': 0.05,
    'l2_leaf_reg': 8.765240856362274,  # Equivalent to 'lambda_l2' in LightGBM
    'depth': 8,
    'random_strength': 0.5392005444882538,
    'bagging_temperature': 0.9577412548866563,  # Closest LightGBM setting: 'bagging_fraction'
    'border_count': 254,
    "task_type": "GPU",
    'devices': '0:1',
    'verbose': 0
}

cb_scores = []     # per-fold competition score
cb_models = []     # one fitted model per fold
cb_oof_parts = []  # per-fold out-of-fold predictions, concatenated once below

for fold in range(N_SPLITS):
    print('#'*25)
    print('### Fold', fold+1)
    print('#'*25)
    # Train on every fold except `fold`, validate on `fold`.
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    model = cb.CatBoostClassifier(**cat_params)
    model.fit(_df_train[train_cols], _df_train["target"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]))
    print(f"fold: {fold+1} - Partial AUC Score: {score:.5f}")
    cb_scores.append(score)
    cb_models.append(model)
    cb_oof_single = _df_valid[["isic_id", "target"]].copy()
    cb_oof_single["pred"] = preds
    cb_oof_parts.append(cb_oof_single)
    print()

cb_oof_df = pd.concat(cb_oof_parts)

# Take labels and predictions from the SAME out-of-fold rows so they are
# aligned by construction. Pairing df_train['target'] with predictions sorted
# by isic_id is only correct when df_train itself is sorted by isic_id, which
# is not the case after the SUBSAMPLE shuffle.
cb_oof_sorted = cb_oof_df.sort_values('isic_id')
cb_overall_true = cb_oof_sorted['target']
cb_overall_pred = cb_oof_sorted['pred']

cb_score = comp_score(
    pd.DataFrame(cb_overall_true),
    pd.DataFrame(cb_overall_pred)
)
print('Overall CV score before threshold optimization =', cb_score)




#########################
### Fold 1
#########################
fold: 1 - Partial AUC Score: 0.17833

#########################
### Fold 2
#########################
fold: 2 - Partial AUC Score: 0.17976

#########################
### Fold 3
#########################
fold: 3 - Partial AUC Score: 0.19385

#########################
### Fold 4
#########################
fold: 4 - Partial AUC Score: 0.17423

#########################
### Fold 5
#########################
fold: 5 - Partial AUC Score: 0.18733

Overall CV score before threshold optimization = 0.1812080074970339


In [45]:
# XGBoost hyper-parameters mirroring the CatBoost configuration where possible.
# NOTE(review): since XGBoost 2.0 `tree_method='gpu_hist'` and `gpu_id` are
# deprecated in favour of `tree_method='hist'` with `device='cuda'`; they are
# kept unchanged here to match the recorded run.
xgb_params = {
    "objective": "binary:logistic",  # Equivalent to "Logloss" in CatBoost
    "n_estimators": 200,  # Equivalent to "iterations" in CatBoost
    'learning_rate': 0.05,
    'reg_lambda': 8.765240856362274,  # Equivalent to 'l2_leaf_reg' in CatBoost
    'max_depth': 8,  # Equivalent to 'depth' in CatBoost
    'subsample': 0.9577412548866563,  # Row subsampling; value reused from CatBoost's 'bagging_temperature'
    'colsample_bytree': 0.9577412548866563,  # Additional parameter for feature subsampling
    'tree_method': 'gpu_hist',  # Use GPU
    'gpu_id': 0,  # Use first GPU
    'max_bin': 254,  # Equivalent to 'border_count' in CatBoost
    'verbosity': 0,  # Equivalent to 'verbose' in CatBoost
    'missing': np.inf  # Tells XGBoost to treat +inf as a missing value
}

xgb_scores = []     # per-fold competition score
xgb_models = []     # one fitted model per fold
xgb_oof_parts = []  # per-fold out-of-fold predictions, concatenated once below

for fold in range(N_SPLITS):
    print('#'*25)
    print('### Fold', fold+1)
    print('#'*25)

    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    model = xgb.XGBClassifier(**xgb_params)
    # model = VotingClassifier([
    #     (f"xgb_{i}", xgb.XGBClassifier(random_state=i, **xgb_params)) for i in range(3)
    # ], voting="soft")

    # Handle 'inf' values in the data: the ratio features can be +/-inf.
    # Map them to NaN in BOTH the training and the validation features so the
    # model sees missing values encoded the same way at fit and predict time.
    X_train = _df_train[train_cols].replace([np.inf, -np.inf], np.nan)
    X_valid = _df_valid[train_cols].replace([np.inf, -np.inf], np.nan)

    model.fit(X_train, _df_train["target"])
    preds = model.predict_proba(X_valid)[:, 1]

    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]))
    # 1-based fold label, consistent with the '### Fold' header above.
    print(f"fold: {fold+1} - Partial AUC Score: {score:.5f}")

    xgb_scores.append(score)
    xgb_models.append(model)
    xgb_oof_single = _df_valid[["isic_id", "target"]].copy()
    xgb_oof_single["pred"] = preds
    xgb_oof_parts.append(xgb_oof_single)
    print()

xgb_oof_df = pd.concat(xgb_oof_parts)

# Take labels and predictions from the SAME out-of-fold rows so they are
# aligned by construction. Pairing df_train['target'] with predictions sorted
# by isic_id is only correct when df_train itself is sorted by isic_id.
xgb_oof_sorted = xgb_oof_df.sort_values('isic_id')
xgb_overall_true = xgb_oof_sorted['target']
xgb_overall_pred = xgb_oof_sorted['pred']

print('#'*25)
xgb_score = comp_score(
    pd.DataFrame(xgb_overall_true),
    pd.DataFrame(xgb_overall_pred)
)
print('Overall CV score before threshold optimization =', xgb_score)


#########################
### Fold 1
#########################
fold: 0 - Partial AUC Score: 0.17839

#########################
### Fold 2
#########################
fold: 1 - Partial AUC Score: 0.19075

#########################
### Fold 3
#########################
fold: 2 - Partial AUC Score: 0.19483

#########################
### Fold 4
#########################
fold: 3 - Partial AUC Score: 0.17893

#########################
### Fold 5
#########################
fold: 4 - Partial AUC Score: 0.19059

#########################
Overall CV score before threshold optimization = 0.18520017161248398


In [None]:

# def find_optimal_threshold(y_true, y_prob, steps=1000):
#     best_score = -np.inf
#     best_threshold = 0
#     thresholds = np.linspace(0, 1, steps)
#     scores = []

#     v_gt = abs(np.asarray(y_true) - 1)
#     v_pred = 1.0 - y_prob
#     max_fpr = 0.2  # Corresponds to min_tpr of 0.80

#     for threshold in thresholds:
#         pred_binary = (v_pred <= threshold).astype(int)
#         partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
#         partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
#         scores.append(partial_auc)
#         if partial_auc > best_score:
#             best_score = partial_auc
#             best_threshold = threshold

#     return best_threshold, best_score, thresholds, scores

# # Usage
# best_threshold, best_score, thresholds, scores = find_optimal_threshold(df_train['target'].values, lgb_oof_df)
# print(f"Best threshold: {best_threshold}")
# print(f"Best score with threshold optimization: {best_score}")