# Cell-Annotation

###  Cell annotation by large language model GPT-4
Hou, W., Ji, Z. Assessing GPT-4 for cell type annotation in single-cell RNA-seq analysis. Nat Methods 21, 1462–1465 (2024). https://doi.org/10.1038/s41592-024-02235-4

In [1]:
import scanpy as sc
from cell_annotator import CellAnnotator
import pandas as pd
import numpy as np
import glob
import os
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load Gene Orthology Reference


In [5]:
# Load gene otholog reference table (adapted with `gene_reference.ipynb`)
gene_reference = pd.read_csv('/home/raquelcr/scanpy/gene_reference.csv')
gene_reference.head()

Unnamed: 0,NMR gene name,gene_ids,Human gene stable ID,Human gene name,Mouse gene stable ID,Mouse gene name,Local_ID,Global_name,Global_ID
0,ZMYND10,ENSHGLG00000010267,ENSG00000004838,ZMYND10,ENSMUSG00000010044,Zmynd10,LOCAL00000000000,ZMYND10,ENSHGLG00000010267
1,AMIGO3,ENSHGLG00000010542,,,,,LOCAL00000000001,AMIGO3,ENSHGLG00000010542
2,TNFSF10,ENSHGLG00000010558,,,,,LOCAL00000000002,TNFSF10,ENSHGLG00000010558
3,ENSHGLG00000010617,ENSHGLG00000010617,,,,,LOCAL00000000003,ENSHGLG00000010617,ENSHGLG00000010617
4,NPRL2,ENSHGLG00000010655,ENSG00000114388,NPRL2,ENSMUSG00000010057,Nprl2,LOCAL00000000004,NPRL2,ENSHGLG00000010655


In [6]:
nmr2human = pd.read_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/nmr_to_human.csv')
nmr2human = nmr2human[nmr2human['perc_id']>60] # filter out by perc_id
nmr2human

Unnamed: 0,heterocephalus_glaber_female_gene,homo_sapiens_gene,orthology_type,perc_id,dn_ds
0,U6,ENSG00000252980,ortholog_one2one,67.0000,
1,C3orf38,ENSG00000179021,ortholog_one2one,85.1064,
2,ZNF654,ENSG00000175105,ortholog_one2one,90.5142,
3,CGGBP1,ENSG00000163320,ortholog_one2one,99.4012,
4,HTR1F,ENSG00000179097,ortholog_one2one,93.9891,
...,...,...,...,...,...
12378,COX2,ENSG00000198712,ortholog_one2one,69.1630,
12380,ND3,ENSG00000198840,ortholog_one2one,67.8261,
12381,ND4,ENSG00000198886,ortholog_one2one,69.4989,
12382,ND5,ENSG00000198786,ortholog_one2one,65.1741,


In [7]:
human2nmr = pd.read_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/human_to_nmr.csv')
human2nmr = human2nmr[human2nmr['perc_id']>60] # filter out by perc_id
human2nmr

Unnamed: 0,homo_sapiens_gene,heterocephalus_glaber_female_gene,orthology_type,perc_id,dn_ds
0,MT-ND1,ENSHGLG00000000006,ortholog_one2one,75.2351,
2,MT-CO1,ENSHGLG00000000016,ortholog_one2one,91.0331,
3,MT-CO2,ENSHGLG00000000019,ortholog_one2one,69.1630,
5,MT-CO3,ENSHGLG00000000023,ortholog_one2one,85.8238,
6,MT-ND3,ENSHGLG00000000025,ortholog_one2one,67.8261,
...,...,...,...,...,...
16610,S100A13,ENSHGLG00000042410,ortholog_one2one,81.6327,
16611,TADA1,ENSHGLG00000015074,ortholog_one2one,97.0149,
16613,TSPAN1,ENSHGLG00000001782,ortholog_one2one,76.8908,
16614,JMJD4,ENSHGLG00000001365,ortholog_one2one,70.8061,


In [None]:
nmr_genes = pd.read_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/nmr_genes_biomart.csv')
human_genes = pd.read_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/human_genes_biomart.csv')
nmr_genes['heterocephalus_glaber_female_gene'] = nmr_genes['gene_id']
human_genes['homo_sapiens_gene'] = human_genes['gene_id']
full_hum2nmr = pd.merge(human2nmr.drop('dn_ds',axis=1), nmr_genes.drop('gene_id',axis=1), on='heterocephalus_glaber_female_gene')
full_nmr2hum = pd.merge(nmr2human.drop('dn_ds',axis=1), human_genes.drop('gene_id',axis=1), on='homo_sapiens_gene')
full_hum2nmr.rename(columns={'homo_sapiens_gene':'human_gene_name', 'heterocephalus_glaber_female_gene':'nmr_gene_acc','gene_name':'nmr_gene_name'},inplace=True)
full_nmr2hum.rename(columns={'heterocephalus_glaber_female_gene':'nmr_gene_name','homo_sapiens_gene':'human_gene_acc','gene_name':'human_gene_name'},inplace=True)
full_hum2nmr

Unnamed: 0,human_gene_name,nmr_gene_acc,orthology_type,perc_id,nmr_gene_name,chromosome,biotype,description,species
0,MT-ND1,ENSHGLG00000000006,ortholog_one2one,75.2351,ND1,MT,protein_coding,NADH dehydrogenase subunit 1 [Source:NCBI gene...,naked_mole_rat
1,MT-CO1,ENSHGLG00000000016,ortholog_one2one,91.0331,COX1,MT,protein_coding,cytochrome c oxidase subunit I [Source:NCBI ge...,naked_mole_rat
2,MT-CO2,ENSHGLG00000000019,ortholog_one2one,69.1630,COX2,MT,protein_coding,cytochrome c oxidase subunit II [Source:NCBI g...,naked_mole_rat
3,MT-CO3,ENSHGLG00000000023,ortholog_one2one,85.8238,COX3,MT,protein_coding,cytochrome c oxidase subunit III [Source:NCBI ...,naked_mole_rat
4,MT-ND3,ENSHGLG00000000025,ortholog_one2one,67.8261,ND3,MT,protein_coding,NADH dehydrogenase subunit 3 [Source:NCBI gene...,naked_mole_rat
...,...,...,...,...,...,...,...,...,...
14909,S100A13,ENSHGLG00000042410,ortholog_one2one,81.6327,,OX090908.1,protein_coding,,naked_mole_rat
14910,TADA1,ENSHGLG00000015074,ortholog_one2one,97.0149,TADA1,28,protein_coding,transcriptional adaptor 1 [Source:HGNC Symbol;...,naked_mole_rat
14911,TSPAN1,ENSHGLG00000001782,ortholog_one2one,76.8908,TSPAN1,7,protein_coding,tetraspanin 1 [Source:HGNC Symbol;Acc:HGNC:20657],naked_mole_rat
14912,JMJD4,ENSHGLG00000001365,ortholog_one2one,70.8061,JMJD4,1,protein_coding,jumonji domain containing 4 [Source:HGNC Symbo...,naked_mole_rat


In [30]:
full_hum2nmr.groupby('human_gene_name').count()['species'][full_hum2nmr.groupby('human_gene_name').count()['species']>1]

human_gene_name
AAAS       2
AADACL2    2
AATF       2
ABCB11     2
ABCC1      2
          ..
ZNHIT3     2
ZP2        2
ZSCAN2     2
ZSWIM4     2
ZYX        2
Name: species, Length: 790, dtype: int64

In [25]:
full_nmr2hum.to_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/gene_ref_nmr2hum.csv', index= False)
full_hum2nmr.to_csv('/home/raquelcr/NMR-snRNA-seq/auto_annotation/gene_ref_hum2nmr.csv', index= False)

In [19]:
full_hum2nmr

Unnamed: 0,homo_sapiens_gene,heterocephalus_glaber_female_gene,orthology_type,perc_id,gene_name,chromosome,biotype,description,species
0,MT-ND1,ENSHGLG00000000006,ortholog_one2one,75.2351,ND1,MT,protein_coding,NADH dehydrogenase subunit 1 [Source:NCBI gene...,naked_mole_rat
1,MT-CO1,ENSHGLG00000000016,ortholog_one2one,91.0331,COX1,MT,protein_coding,cytochrome c oxidase subunit I [Source:NCBI ge...,naked_mole_rat
2,MT-CO2,ENSHGLG00000000019,ortholog_one2one,69.1630,COX2,MT,protein_coding,cytochrome c oxidase subunit II [Source:NCBI g...,naked_mole_rat
3,MT-CO3,ENSHGLG00000000023,ortholog_one2one,85.8238,COX3,MT,protein_coding,cytochrome c oxidase subunit III [Source:NCBI ...,naked_mole_rat
4,MT-ND3,ENSHGLG00000000025,ortholog_one2one,67.8261,ND3,MT,protein_coding,NADH dehydrogenase subunit 3 [Source:NCBI gene...,naked_mole_rat
...,...,...,...,...,...,...,...,...,...
14909,S100A13,ENSHGLG00000042410,ortholog_one2one,81.6327,,OX090908.1,protein_coding,,naked_mole_rat
14910,TADA1,ENSHGLG00000015074,ortholog_one2one,97.0149,TADA1,28,protein_coding,transcriptional adaptor 1 [Source:HGNC Symbol;...,naked_mole_rat
14911,TSPAN1,ENSHGLG00000001782,ortholog_one2one,76.8908,TSPAN1,7,protein_coding,tetraspanin 1 [Source:HGNC Symbol;Acc:HGNC:20657],naked_mole_rat
14912,JMJD4,ENSHGLG00000001365,ortholog_one2one,70.8061,JMJD4,1,protein_coding,jumonji domain containing 4 [Source:HGNC Symbo...,naked_mole_rat


In [None]:
# Map mouse gene names to global names using gene_reference
mouse_gene_map = dict(zip(gene_reference['Mouse gene name'], gene_reference['Global_name']))
mouse_data.var['global_name'] = mouse_data.var_names.map(mouse_gene_map)


## Load NMR data 

In [4]:
nmr_files = glob.glob('/home/raquelcr/scanpy/cellbender/denoised/*_filtered.h5')
nmr_files

['/home/raquelcr/scanpy/cellbender/denoised/NMR2_cerebral_cortex_denoised_filtered.h5',
 '/home/raquelcr/scanpy/cellbender/denoised/NMR1_cerebral_cortex_denoised_filtered.h5',
 '/home/raquelcr/scanpy/cellbender/denoised/NMR3_hippocampus_denoised_filtered.h5',
 '/home/raquelcr/scanpy/cellbender/denoised/NMR4_hippocampus_denoised_filtered.h5',
 '/home/raquelcr/scanpy/cellbender/denoised/NMR5_midbrain_denoised_filtered.h5',
 '/home/raquelcr/scanpy/cellbender/denoised/NMR6_midbrain_denoised_filtered.h5']

In [5]:
nmr_adatas = []
nmr_gene_map = dict(zip(gene_reference['NMR gene name'], gene_reference['Global_name']))

for file in nmr_files:
    # CellBender output should be read with read_10x_h5
    try: 
        adata = sc.read_10x_h5(file)
    except:
        adata = sc.read(file)

    adata.var['species'] = 'nmr'

    sample = file.replace('/home/raquelcr/scanpy/cellbender/denoised/NMR', '').replace('_denoised_filtered.h5', '').split('_', 1)
    sample_id = sample[0]
    tissue = sample[1]

    print(f'Loading sample {sample_id} of {tissue}')

    adata.obs['species'] = 'nmr'
    adata.obs['tissue'] = tissue
    adata.obs['sample_id'] = sample_id
    adata.obs['replicate'] = (int(sample_id)+1)%2+1

    # Map NMR gene IDs to global names
    adata.var['global_name'] = adata.var_names.map(nmr_gene_map)
    
    nmr_adatas.append(adata)

nmr_adatas

Loading sample 2 of cerebral_cortex
Loading sample 1 of cerebral_cortex
Loading sample 3 of hippocampus
Loading sample 4 of hippocampus
Loading sample 5 of midbrain
Loading sample 6 of midbrain


[AnnData object with n_obs × n_vars = 19487 × 20774
     obs: 'species', 'tissue', 'sample_id', 'replicate'
     var: 'gene_ids', 'feature_types', 'genome', 'species', 'global_name',
 AnnData object with n_obs × n_vars = 14268 × 20774
     obs: 'species', 'tissue', 'sample_id', 'replicate'
     var: 'gene_ids', 'feature_types', 'genome', 'species', 'global_name',
 AnnData object with n_obs × n_vars = 6743 × 20774
     obs: 'species', 'tissue', 'sample_id', 'replicate'
     var: 'gene_ids', 'feature_types', 'genome', 'species', 'global_name',
 AnnData object with n_obs × n_vars = 8179 × 20774
     obs: 'species', 'tissue', 'sample_id', 'replicate'
     var: 'gene_ids', 'feature_types', 'genome', 'species', 'global_name',
 AnnData object with n_obs × n_vars = 7369 × 20774
     obs: 'species', 'tissue', 'sample_id', 'replicate'
     var: 'gene_ids', 'feature_types', 'genome', 'species', 'global_name',
 AnnData object with n_obs × n_vars = 10080 × 20774
     obs: 'species', 'tissue', 'samp

In [None]:
# Change gene names to human gene names


### Load few cells from human (adult and developing human) to perform cross-validation by triangulation

## GPT-Cell-Annotation

In [None]:
cell_ann = CellAnnotator(
    adata, species="human", tissue="brain", cluster_key="leiden", sample_key="samples",
).annotate_clusters()