In [None]:
import medmnist
from medmnist import INFO
import numpy as np
import torch
from torchvision import models, transforms


# Select the MedMNIST dataset by its flag and resolve the matching dataset
# class from the package's INFO registry.
data_flag = 'pathmnist'
info = INFO[data_flag]
DataClass = getattr(medmnist, info['python_class'])

# Channel statistics expected by torchvision's ImageNet-pretrained models.
# The images are fed to an ImageNet-pretrained ResNet further down, so they are
# normalised with these values rather than a generic 0.5 / 0.5.
# NOTE(review): assumes the selected dataset yields 3-channel images — confirm
# before switching data_flag to a grayscale MedMNIST subset.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Preprocessing: convert each image to a float tensor, then normalise per channel.
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Initialize the train and test splits (files are downloaded on first use).
train_data = DataClass(split='train', transform=data_transform, download=True)
test_data = DataClass(split='test', transform=data_transform, download=True)


In [None]:
# Load an ImageNet-pretrained ResNet-18 to use as a frozen feature extractor.
# `pretrained=True` is deprecated since torchvision 0.13 in favour of the
# `weights=` argument; IMAGENET1K_V1 is the checkpoint that `pretrained=True`
# used to load. Older torchvision releases have no weights enum, so fall back
# to the legacy flag there.
if hasattr(models, "ResNet18_Weights"):
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    model = models.resnet18(pretrained=True)

# Replace the classification head with a pass-through so the forward pass
# returns the backbone's pooled features instead of class logits.
model.fc = torch.nn.Identity()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # Set the model to evaluation mode


In [None]:
def get_embeddings(dataset, batch_size=64, net=None, net_device=None):
    """Run every sample of ``dataset`` through a network and collect the results.

    Args:
        dataset: Dataset yielding ``(image, target)`` pairs that the default
            ``DataLoader`` collation can batch into tensors.
        batch_size: Number of samples per forward pass.
        net: Module used to compute the embeddings. Defaults to the
            notebook-level ``model``.
        net_device: Device the image batches are moved to before the forward
            pass. Defaults to the device of ``net``'s first parameter, or the
            CPU when ``net`` has no parameters.

    Returns:
        Tuple ``(embeddings, labels)`` of NumPy arrays, each concatenated along
        axis 0 in dataset order (the loader does not shuffle).

    Raises:
        ValueError: If the dataset yields no batches.
    """
    if net is None:
        net = model  # fall back to the feature extractor defined in the notebook
    if net_device is None:
        first_param = next(net.parameters(), None)
        net_device = first_param.device if first_param is not None else torch.device("cpu")

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    embeddings = []
    labels = []

    # Force eval mode for the extraction so layers such as BatchNorm/Dropout
    # behave deterministically, then restore whatever mode the caller had set.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for images, targets in loader:
                out = net(images.to(net_device))
                embeddings.append(out.cpu().numpy())
                labels.append(targets.numpy())
    finally:
        net.train(was_training)

    if not embeddings:
        # np.concatenate would raise an opaque ValueError on an empty list.
        raise ValueError("get_embeddings: dataset produced no batches (is it empty?)")

    return np.concatenate(embeddings, axis=0), np.concatenate(labels, axis=0)


In [None]:


# Get embeddings for train and test datasets
train_embeddings, train_labels = get_embeddings(train_data)
test_embeddings, test_labels = get_embeddings(test_data)

# Sanity check: every embedding row must have a matching label row.
assert len(train_embeddings) == len(train_labels), "train embeddings/labels are misaligned"
assert len(test_embeddings) == len(test_labels), "test embeddings/labels are misaligned"

# Save the embeddings and labels in .npz format. The file names are derived
# from data_flag so that switching datasets cannot write another dataset's
# embeddings under the PathMNIST names.
np.savez(f"{data_flag}_train.npz", embeddings=train_embeddings, labels=train_labels)
np.savez(f"{data_flag}_test.npz", embeddings=test_embeddings, labels=test_labels)
