# PhenoProfiler Introduction

Welcome to the PhenoProfiler project! This notebook will guide you through using PhenoProfiler. For more detailed information, please refer to our preprint on bioRxiv.

## Main API Functions

PhenoProfiler provides two primary API functions:

1. **Process a Multi-Channel Image**:
   - **Input**: A multi-channel image, supplied as one single-channel image file per channel.
   - **Output**: Returns an embedding for the image.

2. **Process a Dataset**:
   - **Input**: A dataset containing multiple images.
   - **Output**: Returns an embedding for each image in the dataset.


# 1. Process a Multi-Channel Image from BBBC022

In [4]:
import numpy as np
import torch
from skimage.transform import resize
from PIL import Image
from models import PhenoProfiler

# Paths to the single-channel PNGs that together form one multi-channel image.
# The order of this list is the channel order of the stacked array below.
img_paths = [
    './Sample_imgs/IXMtest_A01_s2_w1AA6B1894-F561-42EE-9D1D-E21E5C741B75.png',
    './Sample_imgs/IXMtest_A01_s2_w3A597237B-C3D7-43AE-8399-83E76DA1532D.png',
    './Sample_imgs/IXMtest_A01_s2_w50F1562CD-EBCF-408E-9D8D-F6F0FDD746C8.png',
    './Sample_imgs/IXMtest_A01_s2_w246FFAEE1-BEB6-4C81-913B-B979EC0C4BC3.png',
    './Sample_imgs/IXMtest_A01_s2_w46657239A-5AFE-4C29-BB9B-49978EFE4791.png',
]

# Load every channel, resize it to 448x448 and stack the channels along axis 0.
images = np.stack([resize(np.array(Image.open(path)), (448, 448), anti_aliasing=True) for path in img_paths])
images_tensor = torch.tensor(images).float().cuda()

# Load the pretrained weights. `map_location='cpu'` keeps the load independent of
# the device the checkpoint was saved from; `load_state_dict` then copies the
# values into the parameters that already live on the GPU.
model = PhenoProfiler().cuda()
model.load_state_dict(torch.load('./PhenoProfiler.pt', map_location='cpu', weights_only=True))

# Inference mode: disables training-time behaviour of layers such as dropout and
# batch normalisation, matching what `get_image_embeddings` does further below.
model.eval()

# Generate embeddings. `unsqueeze(0)` adds the leading batch dimension;
# `no_grad` avoids building an autograd graph that is never used.
with torch.no_grad():
    image_features = model.image_encoder(images_tensor.unsqueeze(0))
    image_embeddings = model.image_projection(image_features)

# Print the shape of the embeddings
print(image_embeddings.shape)

torch.Size([1, 672])


# 2. Test BBBC022 dataset

In [None]:
# Step 1: create the directory that will hold the dataset

!mkdir -p '../dataset/bbbc022/'

# Step 2: download the BBBC022 dataset (very large, about 70 GB).
# `--recursive` copies everything under the S3 prefix; `--no-sign-request`
# sends the request without AWS credentials.
!aws s3 cp s3://cytodata/datasets/Bioactives-BBBC022-Gustafsdottir/ ../dataset/bbbc022/ --recursive --no-sign-request

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from dataset import PDDMtaskDataset
from models import PhenoProfiler
from utils import *

def build_loaders_inference(
    batch_size,
    image_path="../dataset/bbbc022/images/",
    embedding_path="../dataset/bbbc022/embedding/",
    csv_path="../dataset/bbbc022/profiling.csv",
    num_workers=4,
):
    """Build the (unshuffled) DataLoader used for inference.

    Args:
        batch_size: Number of samples per batch.
        image_path: Directory passed to ``PDDMtaskDataset`` as ``image_path``.
        embedding_path: Directory passed to ``PDDMtaskDataset`` as ``embedding_path``.
        csv_path: File passed to ``PDDMtaskDataset`` as ``CSV_path``.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``DataLoader`` that iterates the dataset in order (``shuffle=False``)
        and keeps the final partial batch (``drop_last=False``).

    The defaults reproduce the BBBC022 layout used throughout this notebook, so
    calling ``build_loaders_inference(batch_size)`` behaves as before.
    """
    print("Building loaders")
    dataset = PDDMtaskDataset(
        image_path=image_path,
        embedding_path=embedding_path,
        CSV_path=csv_path,
    )

    # Single-element concat: kept so the loader's `.dataset` is still a
    # ConcatDataset, as in the original implementation.
    dataset = torch.utils.data.ConcatDataset([dataset])
    test_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # keep dataset order so embeddings line up with the inputs
        num_workers=num_workers,
        pin_memory=True,
        drop_last=False,
    )

    print("Finished building loaders")
    return test_loader

def get_image_embeddings(model_path, model, batch_size):
    """Embed every sample from the inference loader with pretrained weights.

    Args:
        model_path: Path to a checkpoint holding the model's state dict.
        model: Model exposing ``image_encoder`` and ``image_projection``. It must
            already be on the CUDA device, because each batch is moved there
            with ``.cuda()``.
        batch_size: Batch size forwarded to ``build_loaders_inference``.

    Returns:
        A tensor made by concatenating the per-batch outputs of
        ``image_projection`` along dim 0, in loader order.
    """
    test_loader = build_loaders_inference(batch_size)

    # `weights_only=True` restricts unpickling to tensor data (the same way the
    # single-image example loads this checkpoint); `map_location="cpu"` makes
    # the load independent of the device the checkpoint was saved from.
    # `load_state_dict` copies the values onto the model's own device.
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    print("Finished loading model")
    test_image_embeddings = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(test_loader):
            image_features = model.image_encoder(batch["image"].cuda())
            image_embeddings = model.image_projection(image_features)
            test_image_embeddings.append(image_embeddings)

    return torch.cat(test_image_embeddings)

In [None]:
model_path = "PhenoProfiler.pt"
save_path = "output/bbbc022/PhenoProfiler/"

# Create the output directory if needed; `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(save_path, exist_ok=True)

model = PhenoProfiler().cuda()
img_embeddings = get_image_embeddings(model_path, model, batch_size=800)  # change batch_size to fit your device
features = img_embeddings.cpu().numpy()

# The array is saved transposed relative to the concatenated embeddings
# (presumably embedding dimension first, samples second - confirm against the
# downstream consumer of this file).
np.save(os.path.join(save_path, "PhenoProfiler_alltrain_22test.npy"), features.T)