In [None]:

%%capture
# Clone the rapidsai-csp-utils helper repository and run its Colab pip-install
# script; %%capture hides the (long) installer output.
# NOTE(review): presumably this is what provides cuml/cudf/cupy/dask_cudf
# imported below — confirm, and consider pinning a revision for reproducibility.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import gc
import os
import random
import re

import cv2
import h5py
import joblib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from PIL import Image
from safetensors import safe_open
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, ViTFeatureExtractor, ViTModel

import cudf
import cuml
import cupy as cp
import dask_cudf
from cuml.svm import SVR

print(f"CV2 VERSION: {cv2.__version__}")

def set_seed(seed=42):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG, PyTorch (CPU and
    every CUDA device) and CuPy, and puts cuDNN into deterministic mode.

    scikit-learn has no global seed of its own: ``check_random_state(seed)``
    only *returns* a fresh ``RandomState`` and does not change any global
    state, so calling it and discarding the result seeds nothing. Estimators
    and splitters must be given ``random_state=...`` explicitly; otherwise
    they fall back to NumPy's global RNG, which is seeded here.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the current device and any additional GPUs
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    cp.random.seed(seed)

    # Hash randomisation is fixed when the interpreter starts, so this value
    # only takes effect in Python subprocesses launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the seeds once, before any data loading or model code runs.
set_seed(42)


# Print the installed cuML version (labelled as the RAPIDS version).
print('RAPIDS version',cuml.__version__)
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()


CV2 VERSION: 4.10.0
RAPIDS version 24.04.00


True

In [None]:
# Metadata table for the training images; later cells read its `isic_id`
# column (to build image paths) and its `target` column (labels).
# NOTE(review): the saved cell output echoes this statement, which looks like a
# pandas warning (possibly a mixed-dtype DtypeWarning) — confirm, and pass
# explicit dtypes or low_memory=False if so.
train_metadata_csv = '/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/train-metadata.csv'
df_train_metadata = pd.read_csv(train_metadata_csv)

  df_train_metadata = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/train-metadata.csv')


In [None]:
# Backbones to extract embeddings from, as (checkpoint name, batch size) pairs.
# The extraction cell unpacks each entry as `model_name, batch_size` and passes
# both to get_image_embeddings(). Commented-out entries are currently disabled.
models = [
    ("google/vit-base-patch16-224", 32),
    # ("facebook/deit-base-distilled-patch16-224", 32),
    # ("microsoft/cvt-13", 32),

]


class ImageDataset(Dataset):
    """Dataset that loads images from disk and preprocesses them for a vision model.

    Args:
        image_paths: Sequence of image file paths, indexed by integer position.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=<PIL image>, return_tensors="pt")``.
            It must return a mapping whose values have a leading batch
            dimension of size 1 (that dimension is squeezed away per sample).
    """

    def __init__(self, image_paths, feature_extractor):
        self.image_paths = image_paths
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx``, convert it to RGB and return its preprocessed tensors."""
        # `convert` returns a fully loaded copy, so the underlying file can be
        # closed right away instead of lingering until garbage collection
        # (which matters when iterating over hundreds of thousands of files).
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')

        # Preprocess the image using the feature extractor
        inputs = self.feature_extractor(images=image, return_tensors="pt")
        # Drop the batch dimension; the DataLoader adds its own when collating.
        return {k: v.squeeze(0) for k, v in inputs.items()}


def load_model_from_safetensors(model_name, safetensors_path):
    """Instantiate ``model_name`` and overwrite its weights from a safetensors file.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        safetensors_path: Path of the ``.safetensors`` file to read tensors from.

    Returns:
        The model after ``load_state_dict`` has been applied with every tensor
        stored in the file (default strict loading, so key mismatches raise).
    """
    backbone = AutoModel.from_pretrained(model_name)

    # Read every tensor in the file into a plain state dict.
    with safe_open(safetensors_path, framework="pt") as handle:
        weights = {tensor_name: handle.get_tensor(tensor_name) for tensor_name in handle.keys()}

    backbone.load_state_dict(weights)
    return backbone

def get_image_embeddings(model_name='', batch_size=32, image_paths=None, safetensors_path=None):
    """Compute one L2-normalised embedding per image with a pretrained vision model.

    The embedding is the first token of the model's ``last_hidden_state``
    (the [CLS] token for ViT-style models).

    Args:
        model_name: Checkpoint name given to ``AutoModel`` and
            ``AutoFeatureExtractor``.
        batch_size: Number of images per forward pass.
        image_paths: Sequence of image file paths; ``None`` is treated as empty.
        safetensors_path: Optional safetensors file whose weights replace the
            pretrained ones (see ``load_model_from_safetensors``).

    Returns:
        ``np.ndarray`` of shape ``(n_images, hidden_dim)``. Errors are reported
        (message + traceback) rather than raised, in which case the array holds
        only the embeddings computed before the failure and may be empty —
        callers should compare its length with ``len(image_paths)``.
    """
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    use_cuda = DEVICE == "cuda"
    all_embeddings = []
    # Pre-bind the names released in `finally` so that a failure while loading
    # the model cannot turn into an UnboundLocalError during cleanup.
    model = dataset = dataloader = None

    try:
        # Load model and feature extractor
        if safetensors_path:
            model = load_model_from_safetensors(model_name, safetensors_path).to(DEVICE)
        else:
            model = AutoModel.from_pretrained(model_name).to(DEVICE)

        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        model.eval()

        # Create dataset and dataloader (shuffle=False keeps rows aligned with image_paths)
        dataset = ImageDataset(image_paths if image_paths is not None else [], feature_extractor)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        embedding_shape = None
        with torch.no_grad():
            for i, batch in enumerate(tqdm(dataloader, total=len(dataloader))):
                pixel_values = batch["pixel_values"].to(DEVICE)
                # Mixed precision is only meaningful on GPU.
                with torch.amp.autocast('cuda', enabled=use_cuda):
                    model_output = model(pixel_values=pixel_values)
                    # (batch, tokens, hidden) -> first token of every sequence
                    embeddings = model_output.last_hidden_state[:, 0, :]
                # Normalise in float32 even if autocast produced half precision.
                embeddings = F.normalize(embeddings.float(), p=2, dim=1)
                embedding_shape = embeddings.shape
                all_embeddings.extend(embeddings.cpu().numpy())

                if (i + 1) % 100 == 0:  # Clear cache every 100 batches
                    torch.cuda.empty_cache()

        if embedding_shape is not None:
            print(f"{model_name} has embedding shape:", embedding_shape)
        print(f"Processed {len(all_embeddings)} images for {model_name}")

    except Exception as e:
        # Best effort per model: report and fall through so the caller can
        # continue with the next backbone.
        print(f"An error occurred while processing {model_name}: {str(e)}")
        import traceback
        traceback.print_exc()

    finally:
        # Clear memory
        del dataset, dataloader, model
        gc.collect()
        torch.cuda.empty_cache()

    return np.array(all_embeddings)

def sanitize_filename(name):
    """Turn a model identifier into a string that is safe to use as a file name.

    Path separators and hyphens become underscores; every remaining character
    that is not a word character, hyphen, underscore or dot is dropped.
    """
    for separator in ('/', '-'):
        name = name.replace(separator, '_')
    return re.sub(r'[^\w\-_\.]', '', name)



In [None]:
# Number of images to embed. 10 is a smoke-test setting; set to None to embed
# every image listed in the metadata table.
MAX_IMAGES = 10

save_directory = "/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings"
os.makedirs(save_directory, exist_ok=True)

image_paths = [f"/content/train-image/image/{isic_id}.jpg" for isic_id in df_train_metadata.isic_id]
image_paths = image_paths[:MAX_IMAGES]

for model_name, batch_size in models:
    all_embeddings = get_image_embeddings(model_name=model_name, batch_size=batch_size, image_paths=image_paths)

    # get_image_embeddings reports errors instead of raising, so an empty or
    # short result means extraction failed part-way. Saving it would leave a
    # file whose rows no longer line up with the metadata table.
    if len(all_embeddings) != len(image_paths):
        print(f"Skipping save for {model_name}: got {len(all_embeddings)} embeddings for {len(image_paths)} images")
        continue

    # Sanitize the model_name to ensure it's a valid filename
    safe_model_name = sanitize_filename(model_name)
    file_path = os.path.join(save_directory, f"{safe_model_name}_image_embeddings.npy")

    print(f"Attempting to save to: {file_path}")
    print(f"embedding shape for {model_name}:",all_embeddings.shape)
    try:
        np.save(file_path, all_embeddings)
        print(f"Successfully saved embeddings for {model_name}")
    except Exception as e:
        # Print enough context to debug a Drive mount / path problem.
        print(f"Error saving embeddings for {model_name}: {str(e)}")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Directory contents: {os.listdir(save_directory)}")

Some weights of DeiTModel were not initialized from the model checkpoint at facebook/deit-base-distilled-patch16-224 and are newly initialized: ['deit.pooler.dense.bias', 'deit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:00<00:00, 10.42it/s]


facebook/deit-base-distilled-patch16-224 has embedding shape: torch.Size([10, 768])
Processed 10 images for facebook/deit-base-distilled-patch16-224
Attempting to save to: /content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings/facebook_deit_base_distilled_patch16_224_image_embeddings.npy
embedding shape for facebook/deit-base-distilled-patch16-224: (10, 768)
Successfully saved embeddings for facebook/deit-base-distilled-patch16-224


  0%|          | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "<ipython-input-22-be92fc65d6e2>", line 53, in get_image_embeddings
    embeddings = model_with_embedding(pixel_values)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "<ipython-input-22-be92fc65d6e2>", line 24, in forward
    features = self.model.forward_features(x)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1729, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'CvtModel' object has no attribute 'forward_features'


An error occurred while processing microsoft/cvt-13: 'CvtModel' object has no attribute 'forward_features'
Attempting to save to: /content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings/microsoft_cvt_13_image_embeddings.npy
embedding shape for microsoft/cvt-13: (1, 0)
Successfully saved embeddings for microsoft/cvt-13


Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:00<00:00, 11.29it/s]


facebook/dino-vitb16 has embedding shape: torch.Size([10, 768])
Processed 10 images for facebook/dino-vitb16
Attempting to save to: /content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings/facebook_dino_vitb16_image_embeddings.npy
embedding shape for facebook/dino-vitb16: (10, 768)
Successfully saved embeddings for facebook/dino-vitb16


100%|██████████| 1/1 [00:00<00:00, 14.11it/s]


facebook/vit-mae-base has embedding shape: torch.Size([10, 768])
Processed 10 images for facebook/vit-mae-base
Attempting to save to: /content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings/facebook_vit_mae_base_image_embeddings.npy
embedding shape for facebook/vit-mae-base: (10, 768)
Successfully saved embeddings for facebook/vit-mae-base


In [None]:
# The extraction cell writes its .npy files into the `ModelsEmbeddings`
# sub-folder; reading the google/vit file from the parent folder is what
# raised FileNotFoundError.
embeddings_dir = "/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/ModelsEmbeddings"
google_vit_embeddings  = np.load(os.path.join(embeddings_dir, "google_vit_base_patch16_224_image_embeddings.npy"))
# NOTE(review): nothing visible in this notebook produces a deit-small file
# (the `models` list has no such entry), so this path is left unchanged —
# confirm where that file lives and that it has the same number of rows.
facebook_vit_embeddings = np.load("/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/facebook_deit_small_patch16_224_image_embeddings.npy")
# Concatenate per-image feature vectors side by side (rows must match).
all_embeddings = np.concatenate((google_vit_embeddings, facebook_vit_embeddings), axis=1)
google_vit_embeddings.shape, facebook_vit_embeddings.shape,all_embeddings.shape

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/google_vit_base_patch16_224_image_embeddings.npy'

In [None]:
# Embeddings stored under the `finetuned` folder; used as the feature matrix X
# for the SVR cross-validation further down.
finetuned_embeddings_path = "/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/VisualEmbeddings/finetuned/google_finetuned_vit_base_patch16_224_image_embeddings.npy"
google_vit_embeddings_fine = np.load(finetuned_embeddings_path)


In [None]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, min_tpr: float=0.80):
    """Partial area under the ROC curve restricted to TPR >= ``min_tpr``.

    Labels and scores are both flipped (``|y - 1|`` and ``1 - score``) so the
    minimum-TPR constraint becomes a maximum-FPR constraint of
    ``|1 - min_tpr|``, which ``roc_auc_score(..., max_fpr=...)`` can handle.
    The standardised value it returns is then mapped back onto the raw area
    scale, i.e. the interval ``[0.5 * max_fpr**2, max_fpr]``.

    Args:
        solution: Ground-truth binary labels.
        submission: Predicted scores, same layout as ``solution``.
        min_tpr: Lower bound on the true positive rate.

    Returns:
        The un-standardised partial AUC as a float.
    """
    max_fpr = abs(1 - min_tpr)

    flipped_truth = np.abs(np.asarray(solution.values) - 1)
    flipped_scores = 1.0 - np.asarray(submission.values)

    standardized_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Undo the standardisation: 0.5 maps to the chance-level area, 1.0 to max_fpr.
    chance_area = 0.5 * max_fpr**2
    return chance_area + (max_fpr - chance_area) / (1.0 - 0.5) * (standardized_auc - 0.5)



In [None]:

# Features are the fine-tuned ViT embeddings; labels come from the metadata table.
X = google_vit_embeddings_fine
y = df_train_metadata['target'].values

# Row i of X must describe the same image as y[i]; fail early with a clear
# message instead of deep inside the splitter.
if len(X) != len(y):
    raise ValueError(f"Embeddings ({len(X)} rows) and labels ({len(y)} rows) are misaligned")

FOLDS = 5
skf_svr = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

svr_oof = np.zeros(len(X), dtype='float32')  # out-of-fold predictions
svr_fold_scores = []
mse_fold_scores = []
trained_models = []
for fold, (train_index, val_index) in enumerate(skf_svr.split(X, y)):
    print('#'*50)
    print(f'### Fold {fold+1}')
    print('#'*50)

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    print("Before undersampling:")
    print(f"Training set shape: {X_train.shape}, Training set distribution: {np.bincount(y_train)}")
    print(f"Validation set shape: {X_val.shape}, Validation set distribution: {np.bincount(y_val)}")

    # Undersample only the training data (sampling_strategy=0.1 -> 1 positive
    # per 10 negatives); the validation fold keeps its original distribution.
    undersampler = RandomUnderSampler(sampling_strategy=0.1, random_state=42)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
    print("\nAfter undersampling:")
    print(f"Training set shape: {X_train_resampled.shape}, Training set distribution: {np.bincount(y_train_resampled)}")
    print(f"Validation set shape: {X_val.shape}, Validation set distribution: {np.bincount(y_val)}")

    # Move the fold to the GPU; separate names keep the NumPy arrays intact.
    X_train_cp = cp.asarray(X_train_resampled)
    X_val_cp = cp.asarray(X_val)
    y_train_cp = cp.asarray(y_train_resampled)

    # Initialize and train RAPIDS SVR
    model = SVR(
        C=1.0,
        epsilon=0.1,
        kernel='rbf',
        cache_size=4096,
        max_iter=1000,
        tol=1e-3,
        verbose=True
    )

    model.fit(X_train_cp, y_train_cp)

    preds = model.predict(X_val_cp)
    print(preds[:10])

    # Min-max scale the fold's predictions to [0, 1]. A constant prediction
    # vector has zero range; map it to all zeros instead of dividing by zero
    # and filling the out-of-fold array with NaN.
    pred_min = preds.min()
    pred_range = preds.max() - pred_min
    if float(pred_range) > 0:
        preds = (preds - pred_min) / pred_range
    else:
        preds = preds - pred_min

    # Move predictions back to CPU for scoring
    preds_cpu = cp.asnumpy(preds)
    svr_oof[val_index] = preds_cpu
    fold_score = comp_score(pd.DataFrame(y_val), pd.DataFrame(preds_cpu))
    svr_fold_scores.append(fold_score)
    # Note: the MSE is computed on the min-max scaled predictions.
    mse_score = mean_squared_error(y_val, preds_cpu)
    mse_fold_scores.append(mse_score)
    trained_models.append(model)

    print(f"Fold {fold+1} MSE: {mse_score}")

    print(f"\n=> Fold score: {fold_score}")
    print("\n")

print(f"Average MSE: {np.mean(mse_fold_scores)}")
print(f"Overall MSE: {mean_squared_error(y, svr_oof)}")
print('#'*50)
overall_score = comp_score(pd.DataFrame(y), pd.DataFrame(svr_oof))
print(f'Mean fold score = {np.mean(svr_fold_scores)}')
print(f'Overall CV score = {overall_score}')
# Persist one model per fold in the current working directory.
for i, model in enumerate(trained_models):
    joblib.dump(model, f'svr_model_fold_{i+1}.joblib')



##################################################
### Fold 1
##################################################
Before undersampling:
Training set shape: (320847, 768), Training set distribution: [320532    315]
Validation set shape: (80212, 768), Validation set distribution: [80134    78]

After undersampling:
Training set shape: (3465, 768), Training set distribution: [3150  315]
Validation set shape: (80212, 768), Validation set distribution: [80134    78]
[D] [23:23:26.923061] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [23:23:27.002045] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 11 outer iterations, total inner 4202 iterations, and diff 0.000998
[ 0.18256041  0.02420256  0.15531597 -0.05643341  0.06001154 -0.03587881
 -0.03185198  0.20800409  0.09985235  0.04926333]
Fold 1 MSE: 0.08422665769561974

=> Fold score: 0.12072083746903421


##################################################
### Fold 2
###########

In [None]:
# Destination folder on Drive for the per-fold SVR models.
output_folder = '/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/VisualEmbeddingSVR/less_num_milder_aug_saved_models'

# Make sure the destination exists (no error if it already does).
os.makedirs(output_folder, exist_ok=True)

# Write one joblib file per fold, numbered from 1.
for i, model in enumerate(trained_models):
    joblib.dump(model, os.path.join(output_folder, f'svr_model_fold_{i+1}.joblib'))