# Basic usage

This notebook shows some basic usage of the genomic-features package.

In [1]:
import pandas as pd

import genomic_features as gf

## Retrieving an annotation:

We can load annotation tables using {func}`genomic_features.ensembl.annotation`.

In [2]:
# Load the annotation for species "Hsapiens", version "97"
# (presumably the Ensembl release number — confirm).
ensdb = gf.ensembl.annotation(species="Hsapiens", version="97")

These tables have been created for the [`ensembldb` Bioconductor package](https://bioconductor.org/packages/release/bioc/html/ensembldb.html) {cite:p}`Rainer_2019`, and are automatically downloaded and cached from the [`AnnotationHub`](https://bioconductor.org/packages/release/bioc/html/AnnotationHub.html) resource.

## Using annotations

In [3]:
# Fetch the gene table from the annotation and preview its first rows.
genes = ensdb.genes()
genes.head()

Unnamed: 0,gene_id,gene_name,gene_biotype,gene_seq_start,gene_seq_end,seq_name,seq_strand,seq_coord_system,description,gene_id_version
0,ENSG00000000003,TSPAN6,protein_coding,100627109,100639991,X,-1,chromosome,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],ENSG00000000003.14
1,ENSG00000000005,TNMD,protein_coding,100584936,100599885,X,1,chromosome,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],ENSG00000000005.6
2,ENSG00000000419,DPM1,protein_coding,50934867,50958555,20,-1,chromosome,dolichyl-phosphate mannosyltransferase subunit...,ENSG00000000419.12
3,ENSG00000000457,SCYL3,protein_coding,169849631,169894267,1,-1,chromosome,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,ENSG00000000457.14
4,ENSG00000000460,C1orf112,protein_coding,169662007,169854080,1,1,chromosome,chromosome 1 open reading frame 112 [Source:HG...,ENSG00000000460.17


In [4]:
# Display the gene table as a whole; the rendered output elides the middle rows.
genes


Unnamed: 0,gene_id,gene_name,gene_biotype,gene_seq_start,gene_seq_end,seq_name,seq_strand,seq_coord_system,description,gene_id_version
0,ENSG00000000003,TSPAN6,protein_coding,100627109,100639991,X,-1,chromosome,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],ENSG00000000003.14
1,ENSG00000000005,TNMD,protein_coding,100584936,100599885,X,1,chromosome,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],ENSG00000000005.6
2,ENSG00000000419,DPM1,protein_coding,50934867,50958555,20,-1,chromosome,dolichyl-phosphate mannosyltransferase subunit...,ENSG00000000419.12
3,ENSG00000000457,SCYL3,protein_coding,169849631,169894267,1,-1,chromosome,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,ENSG00000000457.14
4,ENSG00000000460,C1orf112,protein_coding,169662007,169854080,1,1,chromosome,chromosome 1 open reading frame 112 [Source:HG...,ENSG00000000460.17
...,...,...,...,...,...,...,...,...,...,...
67662,LRG_995,FUBP1,LRG_gene,77948402,77979205,1,-1,chromosome,far upstream element binding protein 1 [Source...,LRG_995.1
67663,LRG_996,ERBB3,LRG_gene,56080025,56103507,12,1,chromosome,erb-b2 receptor tyrosine kinase 3 [Source:HGNC...,LRG_996.1
67664,LRG_997,ROS1,LRG_gene,117288367,117425855,6,-1,chromosome,"ROS proto-oncogene 1, receptor tyrosine kinase...",LRG_997.1
67665,LRG_998,CCND3,LRG_gene,41934933,42048894,6,-1,chromosome,cyclin D3 [Source:HGNC Symbol;Acc:HGNC:1585],LRG_998.1


In [5]:
# List the sequences in the annotation; the output reports each one's
# name, length and circularity flag.
ensdb.chromosomes()

Unnamed: 0,seq_name,seq_length,is_circular
0,X,156040895,0
1,20,64444167,0
2,1,248956422,0
3,6,170805979,0
4,3,198295559,0
...,...,...,...
419,LRG_311,115492,0
420,LRG_721,33396,0
421,LRG_741,231167,0
422,LRG_763,176286,0


### Adding annotations to an AnnData object:

In [18]:
# Load scanpy's `pbmc3k` example dataset to have an object to annotate.
import scanpy as sc
pbmc = sc.datasets.pbmc3k()

In [19]:
# Preview the variable table: per the output, it is indexed by gene name
# and carries the ENSG identifiers in the `gene_ids` column.
pbmc.var.head()

Unnamed: 0_level_0,gene_ids
index,Unnamed: 1_level_1
MIR1302-10,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945


In [8]:
# Annotate `pbmc.var` with the gene table, matching on its `gene_ids` column.
# In the output, the trailing rows have empty annotation fields
# (presumably IDs with no match in `genes` — confirm).
annotated_var = gf.annotate_anndata(pbmc.var, genes, on='gene_ids')
annotated_var

  annotated_var = gf.annotate_anndata(pbmc.var, genes, on='gene_ids')


Unnamed: 0_level_0,gene_ids,gene_id,gene_name,gene_biotype,gene_seq_start,gene_seq_end,seq_name,seq_strand,seq_coord_system,description,gene_id_version
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MIR1302-10,ENSG00000243485,ENSG00000243485,MIR1302-2HG,lncRNA,29554.0,31109.0,1,1.0,chromosome,MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG...,ENSG00000243485.5
FAM138A,ENSG00000237613,ENSG00000237613,FAM138A,lncRNA,34554.0,36081.0,1,-1.0,chromosome,family with sequence similarity 138 member A [...,ENSG00000237613.2
OR4F5,ENSG00000186092,ENSG00000186092,OR4F5,protein_coding,65419.0,71585.0,1,1.0,chromosome,olfactory receptor family 4 subfamily F member...,ENSG00000186092.6
RP11-34P13.7,ENSG00000238009,ENSG00000238009,AL627309.1,lncRNA,89295.0,133723.0,1,-1.0,chromosome,novel transcript,ENSG00000238009.6
RP11-34P13.8,ENSG00000239945,ENSG00000239945,AL627309.3,lncRNA,89551.0,91105.0,1,-1.0,chromosome,novel transcript,ENSG00000239945.1
...,...,...,...,...,...,...,...,...,...,...,...
AC145205.1,ENSG00000215635,,,,,,,,,,
BAGE5,ENSG00000268590,,,,,,,,,,
CU459201.1,ENSG00000251180,,,,,,,,,,
AC002321.2,ENSG00000215616,,,,,,,,,,


In [9]:
# Store a copy of the annotated table back on the `pbmc` object.
pbmc.var = annotated_var.copy()

In [23]:
# Bind the alias in this cell so it runs on a fresh kernel; `adata_var` is
# otherwise first assigned in a later cell, which makes a top-to-bottom run
# fail here with NameError.
adata_var = pbmc.var
# Mirror the identifiers from `gene_ids` into `gene_id`, the column name
# used by the `genes` table.
adata_var['gene_id'] = adata_var['gene_ids'].copy()

In [24]:
# Columns of the `genes` table that also exist in `pbmc.var`,
# kept in the order they appear in `genes`.
in_var_mask = genes.columns.isin(pbmc.var.columns)
common_cols = genes.columns[in_var_mask]

In [25]:
# Short alias for `pbmc.var`; this is a plain assignment, no `.copy()` is made here.
adata_var = pbmc.var

In [27]:
# Inspect the `gene_id` column by name. The loop variable `c` is only bound
# by a later cell, so relying on it here fails on a fresh top-to-bottom run.
adata_var['gene_id']

index
MIR1302-10      ENSG00000243485
FAM138A         ENSG00000237613
OR4F5           ENSG00000186092
RP11-34P13.7    ENSG00000238009
RP11-34P13.8    ENSG00000239945
                     ...       
AC145205.1      ENSG00000215635
BAGE5           ENSG00000268590
CU459201.1      ENSG00000251180
AC002321.2      ENSG00000215616
AC002321.1      ENSG00000215611
Name: gene_id, Length: 32738, dtype: object

In [26]:
# NOTE(review): as the recorded output shows, this comparison raises
# ValueError: the two Series are labelled differently (`adata_var` is indexed
# by gene name, `genes` by row number). The comparison result is also discarded.
# TODO: align both tables on the gene ID before comparing, or drop this cell.
for c in common_cols:
    adata_var[c] == genes[c]

ValueError: Can only compare identically-labeled Series objects

In [12]:
# NOTE(review): the recorded output of this call is a KeyError. The earlier
# working call passed on='gene_ids' explicitly — confirm which column is used
# for matching when `on` is omitted.
gf.annotate_anndata(pbmc.var, genes)

KeyError: "None of [Index(['gene_id', 'gene_name', 'gene_biotype', 'gene_seq_start',\n       'gene_seq_end', 'seq_name', 'seq_strand', 'seq_coord_system',\n       'description', 'gene_id_version'],\n      dtype='object')] are in the [columns]"