# [KRONOS](https://github.com/mahmoodlab/KRONOS): mapping marker IDs

This tutorial demonstrates how to map dataset-specific marker IDs to the pretrained marker IDs used by KRONOS.

In [None]:
import harpy as hp

# Example CODEX dataset used throughout this tutorial.
sdata = hp.datasets.codex_example()

# Marker metadata belonging to the pretrained KRONOS model, resolved through the harpy
# dataset registry (the call returns the path that is read with pandas further below).
registry = hp.datasets.get_registry()
kronos_marker_metadata_key = "proteomics/codex/chl_maps_dataset/marker_metadata.csv"
marker_metadata_path = registry.fetch(kronos_marker_metadata_key)

In [2]:
import pandas as pd

# The channel coordinate of the image holds the dataset-specific marker names.
marker_names = sdata["image"]["scale4"]["image"].c.data

# Dataset-specific marker table: the marker id is the position of the channel in the image.
marker_data_specific = pd.DataFrame(
    {
        "marker_id": range(len(marker_names)),
        "marker_name": marker_names,
    }
)
marker_data_specific.head()

Unnamed: 0,marker_id,marker_name
0,0,BCL-2
1,1,CCR6
2,2,CD11B
3,3,CD11C
4,4,CD15


In [3]:
import pandas as pd

# Pretrained marker metadata. Its id column is renamed right away so that it stays
# distinguishable from the dataset-specific "marker_id" once both tables are merged.
marker_metadata = pd.read_csv(marker_metadata_path).rename(columns={"marker_id": "marker_id_pretrained"})
marker_metadata.head()

Unnamed: 0,marker_name,marker_id_pretrained,marker_mean,marker_std
0,DAPI,4,0.083207,0.095882
1,ARID1A,8,0.005042,0.010528
2,ATRX,10,0.021219,0.048886
3,BCL6,12,0.023329,0.054438
4,CDT1,14,0.00363,0.019132


In [4]:
# An inner join on the marker name keeps only the markers that occur, with exactly the
# same spelling, in both the pretrained metadata and the dataset-specific table.
matched_markers = marker_metadata.merge(marker_data_specific, on="marker_name", how="inner")
# Order the result by the dataset-specific marker id and renumber the rows.
matched_markers = matched_markers.sort_values(by="marker_id").reset_index(drop=True)

print(f"There are {matched_markers.shape[0]} matched markers.")
display(matched_markers)

There are 31 matched markers.


Unnamed: 0,marker_name,marker_id_pretrained,marker_mean,marker_std,marker_id
0,CCR6,166,0.044867,0.042833,1
1,CD11B,180,0.032169,0.052366,2
2,CD11C,182,0.019039,0.044336,3
3,CD15,194,0.016322,0.040416,4
4,CD16,196,0.041869,0.055626,5
5,CD162,198,0.012217,0.040094,6
6,CD163,200,0.014384,0.033087,7
7,CD2,212,0.161256,0.110404,8
8,CD20,214,0.045192,0.057727,9
9,CD206,216,0.014008,0.044501,10


In [5]:
# Dataset-specific markers whose name has no exact counterpart in the pretrained metadata.
_has_pretrained_name = marker_data_specific["marker_name"].isin(marker_metadata["marker_name"])
missing_markers = marker_data_specific[~_has_pretrained_name]

print(
    f"There are {missing_markers.shape[0]} markers that could not be matched to a marker from the pretraining dataset."
)
missing_markers.head()

There are 18 markers that could not be matched to a marker from the pretraining dataset.


Unnamed: 0,marker_id,marker_name
0,0,BCL-2
18,18,CD46
26,26,COLLAGEN 4
27,27,CYTOKERITIN
28,28,DAPI-01


In [6]:
# Pretrained markers whose name does not occur in the dataset-specific table. These are the
# candidates a missing dataset marker can still be mapped to manually.
_used_by_dataset = marker_metadata["marker_name"].isin(marker_data_specific["marker_name"])
unmatched_markers = marker_metadata[~_used_by_dataset]

print(f"There are {unmatched_markers.shape[0]} markers from the pretraining dataset that where not yet matched.")
unmatched_markers.head()

There are 146 markers from the pretraining dataset that where not yet matched.


Unnamed: 0,marker_name,marker_id_pretrained,marker_mean,marker_std
0,DAPI,4,0.083207,0.095882
1,ARID1A,8,0.005042,0.010528
2,ATRX,10,0.021219,0.048886
3,BCL6,12,0.023329,0.054438
4,CDT1,14,0.00363,0.019132


In [7]:
# For exact matches the dataset-specific name and the pretrained name coincide. The name is
# stored twice so that both can be tracked once differently spelled markers are added.
_column_order = [
    "marker_name",
    "marker_name_pretrained",
    "marker_id_pretrained",
    "marker_mean",
    "marker_std",
    "marker_id",
]
matched_markers = matched_markers.assign(marker_name_pretrained=matched_markers["marker_name"])[_column_order]
matched_markers.head()

Unnamed: 0,marker_name,marker_name_pretrained,marker_id_pretrained,marker_mean,marker_std,marker_id
0,CCR6,CCR6,166,0.044867,0.042833,1
1,CD11B,CD11B,180,0.032169,0.052366,2
2,CD11C,CD11C,182,0.019039,0.044336,3
3,CD15,CD15,194,0.016322,0.040416,4
4,CD16,CD16,196,0.041869,0.055626,5


### Let's match the unmatched markers

In [8]:
# idea taken from https://github.com/mahmoodlab/KRONOS/blob/1f57c51efd863968cfa491819aef0e37dd3e2fdb/tutorials/utils/marker_metadata.py#L60
from difflib import SequenceMatcher

import numpy as np

# For every dataset marker without an exact match, rank the still unused pretrained marker
# names by string similarity and keep the best ones as suggestions for the manual mapping.
_n_suggestions = 5
_candidate_names = unmatched_markers["marker_name"].to_numpy()
_missing_names = missing_markers["marker_name"].to_numpy()

_suggestion_rows = []
for _missing_marker in _missing_names:
    # Only the dataset name is upper-cased; the pretrained names are compared as stored
    # (they appear to be upper case already in the rows displayed above).
    _query = _missing_marker.upper()
    similarity_list = np.array(
        [SequenceMatcher(None, _query, _candidate).ratio() for _candidate in _candidate_names]
    )
    # kind="stable" is accepted by NumPy 1.x and 2.x alike (the stable=True keyword only
    # exists from NumPy 2.0 on); reversing the ascending order puts the best match first.
    sorted_index = np.argsort(similarity_list, kind="stable")[::-1]
    _suggestion_rows.append(_candidate_names[sorted_index][:_n_suggestions])

# Every row holds min(5, number of candidates) names.
_n_columns = min(_n_suggestions, len(_candidate_names))
if _suggestion_rows:
    _suggestion_matrix = np.stack(_suggestion_rows)
else:
    # np.stack raises on an empty list; when nothing is missing show an empty table instead.
    _suggestion_matrix = np.empty((0, _n_columns), dtype=object)

top_5_suggestions_names = [f"marker name suggestion {i + 1} (pretrained)" for i in range(_n_columns)]

# Combine into DataFrame: one row per missing dataset marker, one column per suggestion
top_5_suggestions = pd.DataFrame(_suggestion_matrix, columns=top_5_suggestions_names, index=_missing_names)
top_5_suggestions.index.name = "marker name (data specific)"
display(top_5_suggestions)

print("Following markers need to be manually mapped: ")

# Template that can be copied and filled in by hand (dataset name -> pretrained name).
dict.fromkeys(top_5_suggestions.index, "")

Unnamed: 0_level_0,marker name suggestion 1 (pretrained),marker name suggestion 2 (pretrained),marker name suggestion 3 (pretrained),marker name suggestion 4 (pretrained),marker name suggestion 5 (pretrained)
marker name (data specific),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BCL-2,BCL2,BDCA-2,BCL6,B2M,CD28
CD46,CD61,CD54,CD45,CD40,CD36
COLLAGEN 4,COLLAGEN,LANGERIN,CTLA4,CATHEPSIN L,LAG3
CYTOKERITIN,CYTOKERATIN,CLUSTERIN,E-CADHERIN,CATHEPSIN L,LANGERIN
DAPI-01,DAPI,VDAC1,IL-1B,BDCA-2,PD1
GRANZYME B,GZMB,LYSOZYME,RELB,LANGERIN,RB
HLA-DR,HLA_DR,HLA_DRA,HLA_DRBPB,HLA1,HLA_1
IDO-1,IDO1,IL-1B,ARID1A,PD1,IGD
LAG-3,LAG3,SIGELC-3,LANGERIN,COLLAGEN,HLA1
MMP-9,MMP9,MPO,LMP1,TMPRSS2,SIGLEC-9


Following markers need to be manually mapped: 


{'BCL-2': '',
 'CD46': '',
 'COLLAGEN 4': '',
 'CYTOKERITIN': '',
 'DAPI-01': '',
 'GRANZYME B': '',
 'HLA-DR': '',
 'IDO-1': '',
 'LAG-3': '',
 'MMP-9': '',
 'MUC-1': '',
 'PD-1': '',
 'PD-L1': '',
 'T-BET': '',
 'TCR-G-D': '',
 'TCRB': '',
 'TIM-3': '',
 'VISA': ''}

In [9]:
# Manual mapping, filled in by hand from the suggestions above:
#   key   -> marker name as it appears in the dataset ("marker_name" of marker_data_specific)
#   value -> marker name in the pretrained metadata ("marker_name" of marker_metadata)
# Dataset markers that are not listed here stay unmatched after this step.
manual_map = {
    "BCL-2": "BCL2",
    "COLLAGEN 4": "COLLAGEN",
    "CYTOKERITIN": "CYTOKERATIN",
    "DAPI-01": "DAPI",
    "GRANZYME B": "GZMB",
    "IDO-1": "IDO1",
    "LAG-3": "LAG3",
    "MMP-9": "MMP9",
    "MUC-1": "MUC1",
    "PD-1": "PD1",
    "PD-L1": "PDL1",
    "T-BET": "TBET",
    "TIM-3": "TIM3",
    "TCR-G-D": "TCR_GD",
    "TCRB": "TCR_B",
    "HLA-DR": "HLA_DR",
}
manual_map

{'BCL-2': 'BCL2',
 'COLLAGEN 4': 'COLLAGEN',
 'CYTOKERITIN': 'CYTOKERATIN',
 'DAPI-01': 'DAPI',
 'GRANZYME B': 'GZMB',
 'IDO-1': 'IDO1',
 'LAG-3': 'LAG3',
 'MMP-9': 'MMP9',
 'MUC-1': 'MUC1',
 'PD-1': 'PD1',
 'PD-L1': 'PDL1',
 'T-BET': 'TBET',
 'TIM-3': 'TIM3',
 'TCR-G-D': 'TCR_GD',
 'TCRB': 'TCR_B',
 'HLA-DR': 'HLA_DR'}

In [10]:
# append the manual mapped ones to the matched_markers dataframe
#
# All entries of manual_map are validated first and the new rows are appended with a single
# concat afterwards. If any check fails, matched_markers is left untouched, so the mapping
# can be corrected and this cell simply run again.

# Names that are already in use. Both sets grow inside the loop so that the entries of
# manual_map are also validated against each other (e.g. two keys mapped to the same value).
_taken_specific = set(matched_markers["marker_name"].values)
_taken_pretrained = set(matched_markers["marker_name_pretrained"].values)
# Names that exist at all in the dataset / in the pretrained metadata.
_known_specific = set(marker_data_specific["marker_name"].values)
_known_pretrained = set(marker_metadata["marker_name"].values)

_manual_rows = []
for _key, _value in manual_map.items():
    # some sanity checks
    if _value in _taken_pretrained:
        raise ValueError(f"'{_value}' already exists, please map to a marker not already in matched_markers dataframe.")
    if _value in _taken_specific:
        raise ValueError(
            f"You mapped to a marker name '{_value}' that is in the data specific column. Please verify your manual mapping."
        )
    if _key in _taken_specific:
        raise ValueError(f"Marker '{_key}' is already mapped.")
    if _key in _taken_pretrained:
        raise ValueError(f"Marker '{_key}' is in the pretrained column. Please verify your manual mapping.")
    if _key not in _known_specific:
        raise ValueError(f"Marker '{_key}' is not in data specific marker data.")
    if _value not in _known_pretrained:
        raise ValueError(f"marker '{_value}' not in pretrained marker metadata file.")

    # marker id, is the id in the data specific marker data.
    _marker_id = marker_data_specific.loc[marker_data_specific["marker_name"] == _key, "marker_id"].item()
    # unmatched_markers holds pretrained marker names; keep track of both the pretrained
    # name ('marker_name_pretrained') and the data specific marker name ('marker_name').
    _match = (
        unmatched_markers[unmatched_markers["marker_name"] == _value]
        .rename(columns={"marker_name": "marker_name_pretrained"})
        .assign(marker_name=_key, marker_id=_marker_id)
    )
    _manual_rows.append(_match)
    _taken_specific.add(_key)
    _taken_pretrained.add(_value)

matched_markers = (
    pd.concat([matched_markers, *_manual_rows], ignore_index=True)
    .sort_values(by="marker_id")
    .reset_index(drop=True)
)

In [11]:
# inspect the mapping table after the manually mapped markers were appended
matched_markers

Unnamed: 0,marker_name,marker_name_pretrained,marker_id_pretrained,marker_mean,marker_std,marker_id
0,BCL-2,BCL2,150,0.047104,0.060276,0
1,CCR6,CCR6,166,0.044867,0.042833,1
2,CD11B,CD11B,180,0.032169,0.052366,2
3,CD11C,CD11C,182,0.019039,0.044336,3
4,CD15,CD15,194,0.016322,0.040416,4
5,CD16,CD16,196,0.041869,0.055626,5
6,CD162,CD162,198,0.012217,0.040094,6
7,CD163,CD163,200,0.014384,0.033087,7
8,CD2,CD2,212,0.161256,0.110404,8
9,CD20,CD20,214,0.045192,0.057727,9


### Manually set metadata:

Taken from Kronos tutorials, manually set metadata:

If some markers are still unmatched with the pretrained dataset and you cannot ignore these markers, you can manually assign their marker ID, mean, and standard deviation values:

- **Marker ID**: Choose an unassigned ID from the range 1–512 in marker_metadata.csv. Ideally, select an ID close to a biologically similar marker.
- **Mean & Std Values**: Calculate these from your dataset for the corresponding markers. Ensure marker intensities are converted to float type and intensities are in range of 0-1 before computing the mean and standard deviation.


Marker IDs are assigned as integers from 1 to 512. In the pretrained dataset, nuclear markers are assigned IDs from 1 to 127, while non-nuclear markers receive IDs from 128 to 512. This grouping helps capture high-level similarities between markers of the same type. Within each category, markers are arranged alphabetically, but only even-numbered IDs are assigned to those included in the pretrained dataset. The odd-numbered IDs are intentionally left unassigned, reserved for biologically similar markers that were not part of the pretrained dataset. This approach allows end-users to assign marker IDs from the odd-numbered values, ensuring that any newly added markers remain closely linked to the existing structure while preserving biological relevance.

In [12]:
# Dataset markers whose marker_id does not occur in matched_markers yet, i.e. the markers
# that neither matched by name nor were covered by the manual mapping.
_is_matched = marker_data_specific["marker_id"].isin(matched_markers["marker_id"])
missing_markers = marker_data_specific[~_is_matched]
missing_markers

Unnamed: 0,marker_id,marker_name
18,18,CD46
45,45,VISA


In [13]:
# Markers that have no counterpart in the pretrained dataset: their pretrained id, mean and
# std are set by hand. These rows carry no "marker_name_pretrained", so that column is
# left empty (NaN) for them after the concat below.
df_manually_set_rows = pd.DataFrame(
    [
        {
            "marker_name": "CD46",
            "marker_id_pretrained": 295,
            "marker_mean": 0.051,
            "marker_std": 0.085,
            "marker_id": 18,
        },
        {
            "marker_name": "VISA",
            # NOTE(review): this was 150, which is the id of the pretrained marker BCL2 and
            # therefore not an unassigned id. 151 is the neighbouring odd (reserved) id;
            # confirm that this is the biologically most sensible choice for this marker.
            "marker_id_pretrained": 151,
            "marker_mean": 0.015,
            "marker_std": 0.014,
            "marker_id": 45,
        },
    ]
)

# Sanity checks: neither the marker name nor the chosen pretrained id may be in use already.
_pretrained_ids_in_use = set(marker_metadata["marker_id_pretrained"]) | set(matched_markers["marker_id_pretrained"])
if df_manually_set_rows["marker_id_pretrained"].duplicated().any():
    raise ValueError("The manually set rows contain duplicated 'marker_id_pretrained' values.")

for _value, _id_pretrained in zip(
    df_manually_set_rows["marker_name"].values, df_manually_set_rows["marker_id_pretrained"].values
):
    if _value in matched_markers["marker_name"].values:
        raise ValueError(f"Marker '{_value}' already matched.")
    if _value in matched_markers["marker_name_pretrained"].values:
        raise ValueError(
            f"Marker '{_value}' found in the 'marker_name_pretrained' column. Please verify your manual mapping."
        )
    if _id_pretrained in _pretrained_ids_in_use:
        raise ValueError(
            f"'marker_id_pretrained' {_id_pretrained} chosen for marker '{_value}' is already assigned. "
            "Please choose an unassigned ID."
        )


matched_markers = (
    pd.concat([matched_markers, df_manually_set_rows], ignore_index=True)
    .sort_values(by="marker_id")
    .reset_index(drop=True)
)
matched_markers

Unnamed: 0,marker_name,marker_name_pretrained,marker_id_pretrained,marker_mean,marker_std,marker_id
0,BCL-2,BCL2,150,0.047104,0.060276,0
1,CCR6,CCR6,166,0.044867,0.042833,1
2,CD11B,CD11B,180,0.032169,0.052366,2
3,CD11C,CD11C,182,0.019039,0.044336,3
4,CD15,CD15,194,0.016322,0.040416,4
5,CD16,CD16,196,0.041869,0.055626,5
6,CD162,CD162,198,0.012217,0.040094,6
7,CD163,CD163,200,0.014384,0.033087,7
8,CD2,CD2,212,0.161256,0.110404,8
9,CD20,CD20,214,0.045192,0.057727,9


In [16]:
# export to csv:
import os

output_path = "/data/groups/technologies/spatial.catalyst/Arne/harpy/notebooks/kronos/marker_metadata_mapped.csv"  # change the output path

# Write the mapping only when the target does not exist yet, so that a previously exported
# file is never overwritten by accident.
if not os.path.exists(output_path):
    matched_markers.to_csv(output_path, index=False)
    print(f"File saved to: {output_path}")
else:
    print(f"File already exists: {output_path}")
    print("Aborting to prevent overwrite.")

File already exists: /data/groups/technologies/spatial.catalyst/Arne/harpy/notebooks/kronos/marker_metadata_mapped.csv
Aborting to prevent overwrite.


In [None]:
# Read the exported mapping back from disk.
# feed this to kronos, do mean and std normalization inside the chunk
mapped_metadata_csv = "/data/groups/technologies/spatial.catalyst/Arne/harpy/notebooks/kronos/marker_metadata_mapped.csv"
df = pd.read_csv(mapped_metadata_csv)
df.head()

Unnamed: 0,marker_name,marker_name_pretrained,marker_id_pretrained,marker_mean,marker_std,marker_id
0,BCL-2,BCL2,150,0.047104,0.060276,0
1,CCR6,CCR6,166,0.044867,0.042833,1
2,CD11B,CD11B,180,0.032169,0.052366,2
3,CD11C,CD11C,182,0.019039,0.044336,3
4,CD15,CD15,194,0.016322,0.040416,4
