# Create a table with the compartment annotations for the human organelles

These annotations come from different sources and are harmonized to the UniProt controlled vocabulary (CV) terms for subcellular location.

## Human Organelle annotations

| Publication | Dataset | Download URL | DOI |
|------------|---------|-----|-----|
| Hein & Peng et al. 2024 | Ground truth (mmc2.xlsx) | https://www.cell.com/cms/10.1016/j.cell.2024.11.028/attachment/b3bcdc15-9cc8-4fa3-9e21-9184bc057f68/mmc2.xlsx | https://doi.org/10.1016/j.cell.2024.11.028 |
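| Hein & Peng et al. 2024 | Subcellular annotations (mmc4.xlsx, the file read by the code below) | https://www.cell.com/cms/10.1016/j.cell.2024.11.028/attachment/b3bcdc15-9cc8-4fa3-9e21-9184bc057f68/mmc2.xlsx (TODO: this link points to mmc2.xlsx; replace it with the mmc4.xlsx attachment URL) | https://doi.org/10.1016/j.cell.2024.11.028 |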
| Itzhak et al. 2016 | elife-16950-supp1-v3.xlsx | https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMTY5NTAvZWxpZmUtMTY5NTAtc3VwcDEtdjMueGxzeA--/elife-16950-supp1-v3.xlsx?_hash=poOexSxgbevvH2UEZx5nzzh6K7agaizjz75KasKBr2E%3D | https://doi.org/10.7554/eLife.16950.015 |


## Subcellular localization annotations

UniProtKB: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/subcell.txt

## Protein ID mapping
UniProtKB accession to gene name (uniprot_sp_accessions.tsv; the URL requests a gzip-compressed download, so decompress it before saving under this name): https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Cgene_names%2Cgene_primary&format=tsv&query=%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29


In [60]:
import pandas as pd

# Limit how many rows pandas renders when a DataFrame is displayed in a cell.
pd.set_option("display.max_rows", 50)

In [61]:
# Load the UniProt accession -> gene name table and normalise its column names.
symbol_map = pd.read_csv("external/uniprot_sp_accessions.tsv", sep="\t").rename(
    columns={
        "Gene Names (primary)": "gene_symbol",
        "Gene Names": "gene_symbol_alt",
        "Entry": "uniprot_id",
    }
)

# Entries without a primary gene symbol fall back to the first name listed in
# the space-separated alternative gene names.
lacks_primary = symbol_map["gene_symbol"].isna()
first_alternative = (
    symbol_map.loc[lacks_primary, "gene_symbol_alt"].str.split(" ").str[0]
)
symbol_map.loc[lacks_primary, "gene_symbol"] = first_alternative

# The alternative names are only needed for the fallback above.
symbol_map = symbol_map.drop(columns=["gene_symbol_alt"])

In [62]:
# Build the Hein & Peng 2024 annotation table: one row per protein with its
# gene symbol, UniProt accession and graph-based localization annotation.
hein = pd.read_excel("external/mmc4.xlsx", sheet_name="subcell_annotation_summary")
# Some gene names are not correct (they contain "[p]"), so we drop them
hein = hein[~hein.Gene_name_canonical.str.contains("\\[p\\]")]
# There are duplicates in the Hein dataset, so we drop them (they cannot be distinguished by the name)
hein = hein.drop_duplicates(subset="Gene_name_canonical").iloc[:, :3]
hein = hein.rename(
    columns={
        "Gene_name_canonical": "gene_symbol",
        "graph_localization_annotation": "hein2024_component",
    }
)
hein = hein[hein.hein2024_component != "unclassified"]
hein = hein.merge(symbol_map, on="gene_symbol", how="left")

# A gene symbol can match several UniProt entries, which duplicates the row in
# the merge above. For those symbols keep only the candidates whose accession
# appears in the row's "Majority protein IDs" string.
ambiguous_rows = hein[hein.gene_symbol.duplicated(keep=False)]
not_in_majority = [
    row_index
    for row_index, row in ambiguous_rows.iterrows()
    if str(row["uniprot_id"]) not in str(row["Majority protein IDs"])
]
hein = hein.drop(index=not_in_majority)

# Rows whose gene symbol is unknown to UniProt: fall back to the first
# accession listed in "Majority protein IDs", but only when that accession is
# present in the UniProt map.
unmatched = hein.uniprot_id.isna()
first_majority_id = (
    hein.loc[unmatched, "Majority protein IDs"].str.split(";").str[0]
)
resolved_ids = first_majority_id[first_majority_id.isin(symbol_map.uniprot_id)]
# Store the accession itself in `uniprot_id` (the previous code wrote the
# mapped gene symbol into this column).
hein.loc[resolved_ids.index, "uniprot_id"] = resolved_ids
# Use UniProt's gene symbol for these rows so that later merges on
# (gene_symbol, uniprot_id) line up with the other datasets; keep the original
# symbol if UniProt has none.
accession_to_symbol = symbol_map.set_index("uniprot_id")["gene_symbol"]
hein.loc[resolved_ids.index, "gene_symbol"] = resolved_ids.map(
    accession_to_symbol
).fillna(hein.loc[resolved_ids.index, "gene_symbol"])

# The rest we cannot resolve and drop
hein = hein[hein.uniprot_id.notna()]
hein = hein.drop_duplicates(subset="uniprot_id")
hein = hein.drop_duplicates(subset="gene_symbol")

hein = hein.drop(columns=["Majority protein IDs"])

In [63]:
# We cannot distinguish duplicated gene symbols in the uniprot map, so we
# choose the first entry per symbol.
first_entry_per_symbol = symbol_map.drop_duplicates(subset="gene_symbol")

# Ground-truth organelle markers: one row per gene symbol, first two columns only.
hein_gt = (
    pd.read_excel("external/mmc2.xlsx", sheet_name="organelle_markers")
    .drop_duplicates(subset="gene_name_canonical")
    .iloc[:, :2]
    .rename(
        columns={
            "gene_name_canonical": "gene_symbol",
            "organelle": "hein2024_gt_component",
        }
    )
    .merge(first_entry_per_symbol, on="gene_symbol", how="left")
    # Drop gene symbols that could not be mapped to uniprot
    .dropna(subset=["uniprot_id"])
)
hein_gt


Unnamed: 0,gene_symbol,hein2024_gt_component,uniprot_id
0,ACTN1,actin-binding_protein,P12814
1,ACTN4,actin-binding_protein,O43707
2,COBL,actin-binding_protein,O75128
3,CORO1B,actin-binding_protein,Q9BR76
4,CORO2A,actin-binding_protein,Q92828
...,...,...,...
2379,YBX3,stress granule,P16989
2380,YTHDF1,stress granule,Q9BYJ9
2381,YTHDF2,stress granule,Q9Y5A9
2382,YTHDF3,stress granule,Q7Z739


In [64]:
# Itzhak 2016 HeLa organellar markers. Only the columns at positions 0, 2 and 4
# are kept. The sheet's own gene name is discarded and the gene symbol is taken
# from the UniProt map via the accession, because the sheet's gene name is not
# always the primary one UniProt uses.
itzhak = (
    pd.read_excel(
        "external/elife-16950-supp1-v3.xlsx", sheet_name="Organellar Markers HeLa"
    )
    .iloc[:, [0, 2, 4]]
    .rename(
        columns={
            "Gene name": "gene_symbol",
            "Compartment": "itzhak2016_component",
            "Protein ID (canonical)": "uniprot_id",
        }
    )
    .drop(columns=["gene_symbol"])
    .merge(symbol_map, on="uniprot_id", how="left")
)
itzhak

Unnamed: 0,uniprot_id,itzhak2016_component,gene_symbol
0,O14639,Actin binding proteins,ABLIM1
1,P60709,Actin binding proteins,ACTB
2,P63261,Actin binding proteins,ACTG1
3,P12814,Actin binding proteins,ACTN1
4,O43707,Actin binding proteins,ACTN4
...,...,...,...
1071,O00186,Plasma membrane,STXBP3
1072,Q03167,Plasma membrane,TGFBR3
1073,Q8TAA9,Plasma membrane,VANGL1
1074,Q9UBH6,Plasma membrane,XPR1


In [65]:
# Combine the three sources into one table keyed by UniProt accession.
merge_keys = ["gene_symbol", "uniprot_id"]
all_annotations = hein.merge(hein_gt, on=merge_keys, how="outer").merge(
    itzhak, on=merge_keys, how="outer"
)

# Keep the first row per gene symbol. Rows without a gene symbol (Itzhak
# accessions that are missing from the UniProt map) must not be treated as
# duplicates of each other: a plain drop_duplicates on gene_symbol would
# collapse all of them into a single row.
repeated_symbol = (
    all_annotations.gene_symbol.notna() & all_annotations.gene_symbol.duplicated()
)
all_annotations = all_annotations[~repeated_symbol]

# Keep the first row per accession, then use the accession as the index.
all_annotations = all_annotations.drop_duplicates(subset=["uniprot_id"])
all_annotations = all_annotations.set_index("uniprot_id")

all_annotations

Unnamed: 0_level_0,gene_symbol,hein2024_component,hein2024_gt_component,itzhak2016_component
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q9NRG9,AAAS,ER,,
Q86V21,AACS,cytosol,,
Q6PD74,AAGAB,cytosol,cytosol,
Q2M2I8,AAK1,plasma_membrane,,
Q9H7C9,AAMDC,cytosol,,
...,...,...,...,...
P16066,NPR1,,,Plasma membrane
P78380,OLR1,,,Plasma membrane
Q03405,PLAUR,,,Plasma membrane
O75051,PLXNA2,,,Plasma membrane


In [66]:
# Translation tables from each source's compartment labels to the harmonized
# location names. Entries marked "No uniprot CV equivalent" keep a
# source-specific label.
hein_location_mapping = {
    "14-3-3_scaffold": "14-3-3 scaffold",  # No uniprot CV equivalent
    "actin_cytoskeleton": "Cytoskeleton",
    "centrosome": "Centrosome",
    "cytosol": "Cytosol",
    "early_endosome": "Early endosome",
    "ER": "Endoplasmic reticulum",
    "ERGIC": "Endoplasmic reticulum-Golgi intermediate compartment",
    "Golgi": "Golgi apparatus",
    "lysosome": "Lysosome",
    "mitochondrion": "Mitochondrion",
    "nucleolus": "Nucleolus",
    "nucleus": "Nucleus",
    "p-body": "P-body",
    "peroxisome": "Peroxisome",
    "plasma_membrane": "Cell membrane",
    "proteasome": "Proteasome",  # No uniprot CV equivalent
    "recycling_endosome": "Recycling endosome",
    "stress_granule": "Stress granule",
    "trans-Golgi": "trans-Golgi network",
    "translation": "Translation",  # No uniprot CV equivalent
}

hein_gt_location_mapping = {
    "actin-binding_protein": "Cytoskeleton",
    "centrosome": "Centrosome",
    "cytosol": "Cytosol",
    "early_endosome": "Early endosome",
    "ER": "Endoplasmic reticulum",
    "ERGIC": "Endoplasmic reticulum-Golgi intermediate compartment",
    "Golgi": "Golgi apparatus",
    "lysosome": "Lysosome",
    "mitochondria": "Mitochondrion",
    "nuclear_pore": "Nuclear pore complex",
    "nucleus": "Nucleus",
    "p-body": "P-body",
    "peroxisome": "Peroxisome",
    "plasma membrane": "Cell membrane",
    "ribosome": "Translation",  # No uniprot CV equivalent going with hein2024_component
    "stress granule": "Stress granule",
}

itzhak_location_mapping = {
    "Actin binding proteins": "Actin binding proteins",  # No uniprot CV equivalent
    "Endosome": "Endosome",
    "ER": "Endoplasmic reticulum",
    "ER_high_curvature": "Endoplasmic reticulum membrane",  # Best match for high curvature regions
    "Ergic/cisGolgi": "Endoplasmic reticulum-Golgi intermediate compartment",
    "Golgi": "Golgi apparatus",
    "Large Protein Complex": "Large protein complex",  # No uniprot CV equivalent
    "Lysosome": "Lysosome",
    "Mitochondrion": "Mitochondrion",
    "Nuclear pore complex": "Nuclear pore complex",
    "Peroxisome": "Peroxisome",
    "Plasma membrane": "Cell membrane",
}

# Replace each source column with its harmonized labels. Series.map turns any
# value that is not a key of the mapping into NaN, so this cell should be run
# once per freshly built `all_annotations`.
mapping_by_column = {
    "hein2024_component": hein_location_mapping,
    "hein2024_gt_component": hein_gt_location_mapping,
    "itzhak2016_component": itzhak_location_mapping,
}
for component_column, label_mapping in mapping_by_column.items():
    all_annotations[component_column] = all_annotations[component_column].map(
        label_mapping
    )

all_annotations


Unnamed: 0_level_0,gene_symbol,hein2024_component,hein2024_gt_component,itzhak2016_component
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q9NRG9,AAAS,Endoplasmic reticulum,,
Q86V21,AACS,Cytosol,,
Q6PD74,AAGAB,Cytosol,Cytosol,
Q2M2I8,AAK1,Cell membrane,,
Q9H7C9,AAMDC,Cytosol,,
...,...,...,...,...
P16066,NPR1,,,Cell membrane
P78380,OLR1,,,Cell membrane
Q03405,PLAUR,,,Cell membrane
O75051,PLXNA2,,,Cell membrane


In [67]:
# Print, for every source column, how many proteins carry each harmonized label.
annotation_columns = (
    "hein2024_component",
    "hein2024_gt_component",
    "itzhak2016_component",
)
for column_name in annotation_columns:
    label_counts = all_annotations[column_name].value_counts()
    # Column name, counts, then the same blank-line separator as before.
    print(column_name, label_counts, "\n", sep="\n")


hein2024_component
hein2024_component
Cytosol                                                 1723
Nucleus                                                 1424
Mitochondrion                                            830
Endoplasmic reticulum                                    711
Cell membrane                                            609
Cytoskeleton                                             251
Stress granule                                           217
Early endosome                                           212
Nucleolus                                                200
Golgi apparatus                                          180
Lysosome                                                 164
Translation                                              152
14-3-3 scaffold                                          121
trans-Golgi network                                      105
Recycling endosome                                        82
Proteasome                                     

In [68]:
# Write the harmonized annotation table (indexed by uniprot_id) as a TSV file.
output_path = "external/subcellular_annotations.tsv"
all_annotations.to_csv(output_path, sep="\t")