In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import muon as mu
import anndata as ad
from muon import prot as pt
import squidpy as sq
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import liana as li
from liana.method import MistyData, genericMistyData, lrMistyData
li.__version__

'1.0.0a0'

In [3]:
from pybiomart import Dataset

In [4]:
# from https://www.hcdm.org/index.php/molecule-information
cd_to_ncbi = pd.read_csv("CD_to_ncbi.txt", sep="\t", header=None, names=["CD_NAME", "NCBI_NAME", "GENE_NAME", "NCBI_OTHER_NAME"])
cd_to_ncbi.NCBI_OTHER_NAME = [str(s).split(";") for s in cd_to_ncbi.NCBI_OTHER_NAME]
cd_to_ncbi

Unnamed: 0,CD_NAME,NCBI_NAME,GENE_NAME,NCBI_OTHER_NAME
0,CD1a,CD1A,CD1A,[CD1]
1,CD1b,CD1B,CD1B,[CD1]
2,CD1c,CD1C,CD1C,[CD1]
3,CD1d,CD1D,CD1D,[none]
4,CD1e,CD1E,CD1E,[HSCDIEL]
...,...,...,...,...
412,CD367,CLEC4A,CD367,"[DCIR, DDB27, CLECSF6]"
413,CD368,CLEC4D,CD368,"[MCL, CLECSF8, CLEC-6, MPCL]"
414,CD369,CLEC7A,CD369,"[DECTIN-1, CLECSF12,]"
415,CD370,CLEC9A,CD370,"[HEEE9341, UNQ9341, DNGR1]"


In [5]:
#dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
##with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
##    display(dataset.list_attributes())
## or check attributes here: https://www.ensembl.org/biomart/martview
#
#gene_annot = dataset.query(attributes=["ensembl_gene_id", "external_gene_name", "hgnc_symbol", "uniprot_gn_symbol", "wikigene_name"])
#gene_annot
#
#gene_annot_2 = dataset.query(attributes=["ensembl_gene_id", "external_gene_name", "hgnc_id"])
#hgnc = pd.read_csv("group-471.csv", skiprows=1).rename(columns={'HGNC ID (gene)':'HGNC ID'})
#gene_annot_2 = gene_annot_2.set_index("HGNC ID").join(hgnc, how="left")
#gene_annot_2

In [8]:
rna_path = "/Users/pschafer/Liana_Vignette/data/GSE213264/GSM6578062_humantonsil_RNA.tsv.gz"
prot_path = "/Users/pschafer/Liana_Vignette/data/GSE213264/GSM6578071_humantonsil_protein.tsv.gz"
ab_path = "/Users/pschafer/Liana_Vignette/data/41587_2023_1676_MOESM3_ESM.xlsx"

In [9]:
# sheets: HUMAN_273, MOUSE_189
ab_df = pd.read_excel(ab_path, sheet_name="HUMAN_273")

In [10]:
rna = pd.read_csv(rna_path, sep="\t")
prot = pd.read_csv(prot_path, sep="\t")
assert set(rna.X) == set(prot.X)

rna = rna.set_index("X")
prot = prot.set_index("X")
prot = prot.loc[rna.index]
assert np.all(rna.index == prot.index)

In [11]:
# get coordinates
def get_coord(col):
    x_coord = [s.split("x")[0] for s in col]
    y_coord = [s.split("x")[1] for s in col]
    df_coord = pd.DataFrame({"x": x_coord, "y": y_coord})
    df_coord = df_coord.apply(pd.to_numeric)
    return df_coord
coord = get_coord(rna.index)

In [12]:
# remove spike ins from rna
print(rna.shape)
spike_bool = np.array(["ERCC" in c for c in rna.columns])
print(spike_bool.sum())
rna = rna.loc[:,~spike_bool]
print(rna.shape)

(2492, 28417)
29
(2492, 28388)


In [13]:
# TODO: remove unmapped from protein?
print(prot.shape)
unmapped_bool = np.array(["unmapped" in c for c in prot.columns])
print(unmapped_bool.sum())
prot = prot.loc[:,~unmapped_bool]
print(prot.shape)

(2492, 283)
1
(2492, 282)


In [14]:
# clean protein names
clean_names = prot.columns
clean_names = [re.sub("\\.[AGCT]{15}", "", s) for s in prot.columns]
clean_names = [re.sub("anti\\.mouse\\.human", "", s) for s in clean_names]
clean_names = [re.sub("anti\\.human\\.mouse", "", s) for s in clean_names]
clean_names = [re.sub("anti\\.mouse\\.rat\\.human", "", s) for s in clean_names]
clean_names = [re.sub("anti\\.mouse\\.rat", "", s) for s in clean_names]
clean_names = [re.sub("Mouse", "", s) for s in clean_names]
clean_names = [re.sub("Rat", "", s) for s in clean_names]
clean_names = [re.sub("Human", "", s) for s in clean_names]
clean_names = [re.sub("isotype\\.Ctrl", "", s) for s in clean_names]
clean_names = [re.sub("Isotype\\.Ctrl", "", s) for s in clean_names]
clean_names = [re.sub("\\.\\.k", "", s) for s in clean_names]
clean_names = [re.sub("\\.+$", "", s) for s in clean_names]
clean_names = [re.sub("^\\.+", "", s) for s in clean_names]

In [15]:
# merge from barcode excel sheet
barcodes = np.array([re.search("[AGCT]{15}|unmapped", s).group() for s in prot.columns])
barcodes = pd.DataFrame({"barcode": barcodes}).join(ab_df.set_index("Barcode"), on="barcode")
barcodes["Clean"] = clean_names
#barcodes["Target"] = [re.sub("anti-human ", "", str(s)) for s in barcodes["Target"].to_numpy()]
#barcodes["Target"] = [re.sub("anti-human/mouse ", "", str(s)) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("anti-human ", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("anti-human/mouse ", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("anti-mouse/human ", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("Recombinant", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("anti-Human", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [re.sub("anti-", "", s) for s in barcodes["Target"].to_numpy()]
barcodes["Target"] = [s.strip() for s in barcodes["Target"].to_numpy()]

# extract all names in the bracket in s and put into a list
def extract_names(s):
    pattern = re.compile(r'\((.*?)\)')
    return pattern.findall(s)

# remove the bracket in s and put into a list
def remove_bracket(s):
    pattern = re.compile(r' \((.*?)\)')
    return pattern.sub('', s)

primary_names = []
alternative_names = []
for bc, target in zip(barcodes.barcode, barcodes.Target):
    primary_names.append(remove_bracket(target))
    alternative_names.append(extract_names(target))

barcodes["Primary"] = [s.strip() for s in primary_names]
barcodes["Alternative"] = alternative_names
barcodes = barcodes.set_index("Primary").join(cd_to_ncbi.set_index("CD_NAME"), how="left")
barcodes = barcodes.reset_index().rename(columns={"index": "Primary"})
barcodes = barcodes.fillna("none")
barcodes

Unnamed: 0,Primary,barcode,DNA_ID,Target,Clean,Alternative,NCBI_NAME,GENE_NAME,NCBI_OTHER_NAME
0,CD64,AAGTATGCCCTACGA,A0162,CD64,CD64,[],none,none,none
1,CD1d,TCGAGTCGCTTATCA,A0164,CD1d,CD1d,[],CD1D,CD1D,[none]
2,CD270,TGATAGAAACAGACC,A0020,"CD270 (HVEM, TR2)",CD270..HVEM_.TR2,"[HVEM, TR2]",TNFRSF14,CD270,"[ATAR, CD270, ""herpesvirus entry mediator"", ..."
3,CD319,AGTATGCCATGTCTT,A0830,CD319 (CRACC),CD319..CRACC,[CRACC],SLAMF7,CD319,"[19A, CRACC, CS1]"
4,CD34,GCAGAAATCTCCCTT,A0054,CD34,CD34,[],CD34,CD34,[none]
...,...,...,...,...,...,...,...,...,...
277,CD37,ACAGTCACTGGGCAA,A0941,CD37,CD37,[],CD37,CD37,[GP52-40]
278,TNF-α,CCTATGAACGTAACG,A0945,TNF-α,TNF.a,[],none,none,none
279,CD215,CATATTCCGCCGTAA,A0947,CD215 (IL-15Rα),CD215..IL.15Ra,[IL-15Rα],IL15RA,CD215,"[CD215, IL-15RA]"
280,CD321,GACAGTACCGACACT,A0948,CD321,CD321,[],F11R,CD321,"[JAM, KAT, JAM1, JCAM, JAM-1, PAM-1]"


In [77]:
#li.rs.show_resources()
db = li.rs.select_resource("consensus")
ligands_orig = np.unique(db.ligand.to_numpy())
receptors_orig = np.unique(db.receptor.to_numpy())
print(ligands_orig.shape)
print(receptors_orig.shape)

(1036,)
(1059,)


In [78]:
# any overlap between ligands and receptors?
print(len(set(ligands_orig) & set(receptors_orig)))
np.array(list(set(ligands_orig) & set(receptors_orig)))

62


array(['NECTIN2', 'CEACAM1', 'CEACAM5', 'ITGB2', 'CD96', 'NTRK3', 'C1QB',
       'PDPN', 'NLGN3', 'HLA-DPA1', 'FGFR3', 'IL1RAPL1', 'NRXN3', 'APP',
       'CD28', 'HAVCR1', 'SDC2', 'DSC3', 'CADM1', 'CD22', 'CRTAM',
       'HLA-DPB1', 'PODXL', 'SELL', 'MADCAM1', 'MUC1', 'DSC1', 'GP1BA',
       'CD200R1', 'TNFSF8', 'SCUBE2', 'CSPG4', 'SIRPA', 'CD47', 'NLGN1',
       'TIGIT', 'CNTN1', 'NLGN2', 'CD72', 'SEMA4C', 'ACTR2', 'NECTIN1',
       'NRXN2', 'CD86', 'SPN', 'CLEC1B', 'CFC1', 'FGFR4', 'CLEC2D',
       'DSCAM', 'NECTIN4', 'SIRPG', 'CD80', 'CXADR', 'CD200', 'THY1',
       'NRXN1', 'JAML', 'ADAM23', 'CD70', 'RGMB', 'ITGAV'], dtype='<U8')

In [107]:
# for now we remove entries that are annotated as both receptors and ligands
# TODO: Change in future
ligands = np.array(list(set(ligands_orig) - (set(ligands_orig) & set(receptors_orig))))
receptors = np.array(list(set(receptors_orig) - (set(ligands_orig) & set(receptors_orig))))
print(ligands.shape)
print(receptors.shape)

(974,)
(997,)


In [108]:
translate_names = {
    "CD270": "TNFRSF14",
    "CD335": "NCR1",
    "CD117": "KIT",
    "CD357": "TNFRSF18",
    "CD268": "TNFRSF13C",
    "CD252": "TNFSF4",
    "CD137L": "TNFSF9", # ligand for CD137: TNFRSF9
    "TNF-α" : "TNF",
    "LOX-1": "OLR1",
    "TSLPR": "CRLF2",
    "CD273": "PDCD1LG2",
    "CD169": "SIGLEC1",
    "β2-microglobulin": "B2M",
    "FcεRIα": "FCER2",
    "CD294": "PTGDR2",
    "integrin β7": "ITGB7",
    "CD32": "FCGR2A",
    "CD85j": "LILRB1",
    "CD158": "CD40LG",
    "Mac-2": "LGALS3",
    "Podoplanin": "PDPN",
    "CD8": "CD8A",
    "CD122": "IL2RB",
    "Galectin-9": "LGALS9",
    "Cadherin 11": "CDH11",
    "Notch 1": "NOTCH1",
    "CD119": "IFNGR1",
    "Notch 3": "NOTCH3",
    "CD85g": "LILRA4",
    "TIM-4": "TIMD4",
    "C5L2": "C5AR2",
    "mast cell tryptase": "TPSAB1",
    "GP130": "IL6ST",
    "CD218a": "IL18R1",
    "VEGFR-3": "FLT4",
    "GPR56": "ADGRG1",
    "MICA/MICB": "MICA",
    "NKp80": "KLRF1",
    "CD131": "CSF2RB",
    "Lymphotoxin β Receptor": "LTBR",
    "Annexin A1": "ANXA1",
}
barcodes["MANUAL"] = [translate_names.get(s, "none") for s in barcodes.Primary]

In [109]:
match_bool_receptors = np.zeros(len(barcodes), dtype=bool)
matched_receptors = []

for i, row_dict in enumerate(barcodes.to_dict(orient="records")):
    
    if row_dict["Primary"] in receptors:
        matched_receptors.append(row_dict["Primary"])
        match_bool_receptors[i] = True
        continue

    if row_dict["NCBI_NAME"] in receptors:
        matched_receptors.append(row_dict["NCBI_NAME"])
        match_bool_receptors[i] = True
        continue

    if row_dict["MANUAL"] in receptors:
        matched_receptors.append(row_dict["MANUAL"])
        match_bool_receptors[i] = True
        continue

    tmp_flag = False
    for entry in row_dict["NCBI_OTHER_NAME"]:
        if entry.strip() in receptors:
            matched_receptors.append(entry.strip() )
            match_bool_receptors[i] = True
            tmp_flag = True
            break
    if tmp_flag:
        continue

    tmp_flag = False
    for entry in row_dict["Alternative"]:
        if entry.strip() in receptors:
            matched_receptors.append(entry.strip() )
            match_bool_receptors[i] = True
            tmp_flag = True
            break
    if tmp_flag:
        continue

    matched_receptors.append("none")

barcodes["matched_receptor_bool"] = match_bool_receptors
match_bool_receptors.sum()

133

In [110]:
match_bool_ligands = np.zeros(len(barcodes), dtype=bool)
matched_ligands = []

for i, row_dict in enumerate(barcodes.to_dict(orient="records")):
    
    if row_dict["Primary"] in ligands:
        matched_ligands.append(row_dict["Primary"])
        match_bool_ligands[i] = True
        continue

    if row_dict["NCBI_NAME"] in ligands:
        matched_ligands.append(row_dict["NCBI_NAME"])
        match_bool_ligands[i] = True
        continue

    if row_dict["MANUAL"] in ligands:
        matched_ligands.append(row_dict["MANUAL"])
        match_bool_ligands[i] = True
        continue

    tmp_flag = False
    for entry in row_dict["NCBI_OTHER_NAME"]:
        if entry.strip() in ligands:
            matched_ligands.append(entry.strip() )
            match_bool_ligands[i] = True
            tmp_flag = True
            break
    if tmp_flag:
        continue

    tmp_flag = False
    for entry in row_dict["Alternative"]:
        if entry.strip() in ligands:
            matched_ligands.append(entry.strip() )
            match_bool_ligands[i] = True
            tmp_flag = True
            break
    if tmp_flag:
        continue

    matched_ligands.append("none")

barcodes["matched_ligand_bool"] = match_bool_ligands
match_bool_ligands.sum()

52

In [111]:
barcodes["matched"] = np.logical_or(barcodes["matched_receptor_bool"], barcodes["matched_ligand_bool"])

In [112]:
barcodes.matched_receptor = matched_receptors
barcodes.matched_ligand = matched_ligands

In [115]:
matched_receptors = np.array(matched_receptors)[[s != "none" for s in matched_receptors]]
matched_ligands = np.array(matched_ligands)[[s != "none" for s in matched_ligands]]

In [116]:
barcodes.loc[barcodes.matched]

Unnamed: 0,Primary,barcode,DNA_ID,Target,Clean,Alternative,NCBI_NAME,GENE_NAME,NCBI_OTHER_NAME,MANUAL,matched_receptor,matched_ligand,matched,matched_receptor_bool,matched_ligand_bool
1,CD1d,TCGAGTCGCTTATCA,A0164,CD1d,CD1d,[],CD1D,CD1D,[none],none,none,CD1D,True,False,True
2,CD270,TGATAGAAACAGACC,A0020,"CD270 (HVEM, TR2)",CD270..HVEM_.TR2,"[HVEM, TR2]",TNFRSF14,CD270,"[ATAR, CD270, ""herpesvirus entry mediator"", ...",TNFRSF14,TNFRSF14,none,True,True,False
4,CD34,GCAGAAATCTCCCTT,A0054,CD34,CD34,[],CD34,CD34,[none],none,none,CD34,True,False,True
5,CD335,ACAATTTGAACAGCG,A0101,CD335 (NKp46),CD335..NKp46,[NKp46],NCR1,CD335,"[LY94, NK-p46, NKP46]",NCR1,NCR1,none,True,True,False
6,CD193,ACCAATCCTTTCGTC,A0397,CD193 (CCR3),CD193..CCR3,[CCR3],CCR3,CD193,"[CC-CKR-3, CKR3, CMKBR3]",none,CCR3,none,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,CD317,AAGAGCCGTTGTGAA,A0936,"CD317 (BST2, Tetherin)",CD317..BST2_.Tetherin,"[BST2, Tetherin]",BST2,CD317,[none],none,none,BST2,True,False,True
276,CD116,ATGGACAGTTCGTGT,A0940,CD116,CD116,[],CSF2RA,CD116,"[CD116, CDw116, CSF2R, CSF2RAX, CSF2RAY, ...",none,CSF2RA,none,True,True,False
278,TNF-α,CCTATGAACGTAACG,A0945,TNF-α,TNF.a,[],none,none,none,TNF,none,TNF,True,False,True
279,CD215,CATATTCCGCCGTAA,A0947,CD215 (IL-15Rα),CD215..IL.15Ra,[IL-15Rα],IL15RA,CD215,"[CD215, IL-15RA]",none,none,IL15RA,True,False,True


In [120]:
# almost all of the matched ligands and receptors are in the rna data (only 6 are missing)
set(list(matched_ligands) + list(matched_receptors)) - set(rna.columns.to_list())

{'ADGRG1', 'CD177', 'CD24', 'HLA-E', 'MPL', 'PECAM1'}

In [87]:
#g = "KLRF1"
#print(receptors[np.array([g in l for l in receptors])])
#print(ligands[np.array([g in l for l in ligands])])

In [88]:
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
#    display(barcodes.loc[~barcodes.matched, :].loc[:, ["Primary"]])

In [121]:
db_subset = db.loc[np.logical_or(np.array([s in np.unique(matched_receptors) for s in db.receptor.to_numpy()]),
                                 np.array([s in np.unique(matched_ligands) for s in db.ligand.to_numpy()]))]
db_subset

Unnamed: 0,ligand,receptor
0,LGALS9,PTPRC
1,LGALS9,MET
2,LGALS9,CD44
3,LGALS9,LRP1
4,LGALS9,CD47
...,...,...
4684,NRG3,EGFR
4685,CSF1,CSF2RA
4686,CSF3,CSF2RA
4688,MAML2,NOTCH3


In [122]:
print(db.shape)

db_subset = db.loc[np.logical_or(np.array([s in np.unique(matched_receptors) for s in db.receptor.to_numpy()]),
                                 np.array([s in np.unique(matched_ligands) for s in db.ligand.to_numpy()]))]
print(db_subset.shape)

all_genes = np.unique(db_subset.ligand.to_list() + db_subset.receptor.to_list())
all_genes_in_rna = all_genes[[s in rna.columns for s in all_genes]]

db_subset = db_subset.loc[np.logical_and(np.array([s in all_genes_in_rna for s in db_subset.receptor.to_numpy()]),
                                         np.array([s in all_genes_in_rna for s in db_subset.ligand.to_numpy()]))]
print(db_subset.shape)

(4701, 2)
(972, 2)
(819, 2)


In [126]:
misty_predictors = np.union1d(np.unique(matched_receptors), np.unique(matched_ligands))
print(misty_predictors.shape)
misty_targets = np.array(list(set(list(all_genes_in_rna)) - set(list(misty_predictors))))
print(misty_targets.shape)

(180,)
(447,)


In [None]:
markers = ["CD19", "CD21", "CD23"]
fig, axs = plt.subplots(1, len(markers), figsize=(15, 5))
for i, marker in enumerate(markers):
    plot_df = coord.copy()
    plot_df["marker"] = prot[marker].to_numpy()
    sns.scatterplot(x='x', y='y', hue='marker', data=plot_df, ax=axs[i], s=20)
    axs[i].axis('equal')
    axs[i].invert_yaxis()
    axs[i].set_title(marker)
plt.show()

In [None]:
markers = ["IgM", "IgG.Fc", "IgD"]
fig, axs = plt.subplots(1, len(markers), figsize=(15, 5))
for i, marker in enumerate(markers):
    plot_df = coord.copy()
    plot_df["marker"] = prot[marker].to_numpy()
    sns.scatterplot(x='x', y='y', hue='marker', data=plot_df, ax=axs[i], s=20)
    axs[i].axis('equal')
    axs[i].invert_yaxis()
    axs[i].set_title(marker)
plt.show()

In [None]:
obs_df = rna.index.to_frame(name="name")
obs_df["x"] = coord["x"].to_numpy()
obs_df["y"] = coord["y"].to_numpy()
obs_df

In [None]:
spatial = obs_df.copy().loc[:, ["x", "y"]]
spatial

In [None]:
rna_var = rna.columns.to_frame(name="gene")
rna_var.index = ["rna:" + str(i) for i in rna_var.index]
rna_var

In [None]:
prot_var = prot.columns.to_frame(name="prot")
prot_var.index = ["prot:" + str(i) for i in prot_var.index]
prot_var

In [None]:
rna_ad = ad.AnnData(X=rna.to_numpy(), var=rna_var, obs=obs_df, obsm={"spatial": spatial})
rna_ad

In [None]:
prot_ad = ad.AnnData(X=prot.to_numpy(), var=prot_var, obs=obs_df, obsm={"spatial": spatial})
prot_ad

In [None]:
mdata = mu.MuData({"rna": rna_ad, "protein": prot_ad})
mdata

# RNA Analysis Only

In [None]:
li.ut.spatial_neighbors(mdata.mod["rna"], cutoff=0, bandwidth=2.5, set_diag=False)
li.pl.connectivity(mdata.mod["rna"], idx=0)

In [None]:
sc.pp.highly_variable_genes(mdata.mod["rna"])
hvg = mdata.mod["rna"].var[mdata.mod["rna"].var['highly_variable']].index

In [None]:
hvg

In [None]:
sc.pp.normalize_total(mdata.mod["rna"], target_sum=1e4)
sc.pp.log1p(mdata.mod["rna"])

In [None]:
#li.rs.show_resources()
db = li.rs.select_resource("cellphonedb")
db

In [None]:
ligands = np.unique(db.ligand.to_numpy())
receptors = np.unique(db.ligand.to_numpy())
genes = [x.split(":")[1] for x in mdata.mod["rna"].var_names]
proteins = [x.split(":")[1] for x in mdata.mod["protein"].var_names]

In [None]:
print(len(ligands))
print(len(receptors))

In [None]:
print(np.array([l in genes for l in ligands]).sum())
print(np.array([l in genes for l in receptors]).sum())

In [None]:
"CD270" in receptors

In [None]:
proteins

In [None]:
print(np.array([l in proteins for l in ligands]).sum())
print(np.array([l in proteins for l in receptor]).sum())

In [None]:
orig_names

In [None]:
# Appendix
print(np.array([s in gene_annot["Gene name"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot["HGNC symbol"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot["UniProtKB Gene Name symbol"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot["WikiGene name"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())

print(np.array([s in gene_annot_2["Approved symbol"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot_2["Approved name"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot_2["Previous symbols"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())
print(np.array([s in gene_annot_2["Aliases"].to_numpy() for s in barcodes.loc[~barcodes.matched, :].Primary.to_numpy()]).sum())