# Ensembl genes table extraction EDA

This notebook is useful for development as well as exploratory data analysis on the extracted tables.
It is currently automatically executed and saved as part of exports using `papermill`.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from ensembl_genes import ensembl_genes
from bioregistry import normalize_prefix

In [3]:
# parameters cell: default species and release used when running interactively.
# NOTE(review): presumably tagged "parameters" so papermill can override these
# values at export time — confirm the cell tag.
species = "human"
release = "111"

In [4]:
# Parameters
# NOTE(review): this looks like the cell papermill injects on execution; these
# assignments replace the default species/release set earlier in the notebook.
species = "rat"
release = "113"


In [5]:
# Instantiate the query helper for the selected release and species,
# then display the connection URL it will use.
ensg = ensembl_genes.Ensembl_Gene_Queries(release=release, species=species)
ensg.connection_url

'mysql+mysqlconnector://anonymous@ensembldb.ensembl.org:3306/rattus_norvegicus_core_113_72'

In [6]:
# Keep the database name in a variable and display it.
database = ensg.database
database

'rattus_norvegicus_core_113_72'

## Extract data

## gene attrib counts

In [7]:
# Run the "gene_attrib_counts" query and preview up to the first 15 rows.
ensg.run_query("gene_attrib_counts").head(15)

Unnamed: 0,attrib_type_id,code,name,description,attrib_type_count,attrib_type_examples
0,142,GeneGC,Gene GC,Percentage GC content for this gene,30562,"49.33, 44.32, 39.05, 40.58, 36.24, 47.10, 55.1..."
1,127,cds_end_NF,CDS end not found,,0,
2,32,KnwnPCCount,protein_coding_KNOWN,Number of Known Protein Coding,0,
3,90,bacend_well_nam,BACend well name,,0,
4,118,ensembl_name,Ensembl name,Name of equivalent Ensembl chromosome,0,
5,520,proj_parent_g,projection parent gene,Stable identifier of the parent gene this gene...,0,
6,358,PHIbase_mutant,PHI-base mutant,PHI-base phenotype of the mutants,0,
7,532,dbnsfp_revel_pred,dbNSFP REVEL prediction,dbNSFP REVEL prediction,0,
8,16,non_ref,Non Reference,Non Reference Sequence Region,0,
9,55,transcr_class,Transcript class,Transcript class,0,


## genes

In [8]:
# Preview the first rows of the gene table.
ensg.gene_df.head()

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
0,ENSRNOG00000000001,6,Arsj,RGD,1307640,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,214774654,214854612,1,True,,,"arylsulfatase family, member J",RGD Symbol,1307640,ENSRNOG00000000001
1,ENSRNOG00000000007,8,Gad1,RGD,2652,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,55369704,55410333,1,True,,,glutamate decarboxylase 1,RGD Symbol,2652,ENSRNOG00000000007
2,ENSRNOG00000000008,8,Alx4,RGD,1310201,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,79611719,79648260,1,True,,,ALX homeobox 4,RGD Symbol,1310201,ENSRNOG00000000008
3,ENSRNOG00000000009,6,Tmco5b,RGD,1561237,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,100064979,100083289,1,True,,,transmembrane and coiled-coil domains 5B,RGD Symbol,1561237,ENSRNOG00000000009
4,ENSRNOG00000000010,6,Cbln1,RGD,1562813,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,19608716,19612572,1,True,,,cerebellin 1 precursor,RGD Symbol,1562813,ENSRNOG00000000010


In [9]:
# Clone-based genes no longer get a symbol; their gene_symbol is filled with the stable ID.
# https://www.ensembl.info/2021/03/15/retirement-of-clone-based-gene-names/
# Show two genes whose symbol equals their Ensembl gene ID.
ensg.gene_df.query("gene_symbol == ensembl_gene_id").head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
4841,ENSRNOG00000009738,6,ENSRNOG00000009738,,,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,5549839,5593185,1,True,,,,,,ENSRNOG00000009738
5326,ENSRNOG00000010657,5,ENSRNOG00000010657,,,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,80719012,80721104,1,True,,,,,,ENSRNOG00000010657


In [10]:
# Cross-tabulate the Ensembl annotation source (rows) against the external
# database each gene symbol derives from (columns), including totals.
# Genes with no symbol source are grouped under "missing (clone-based)".
gene_table = ensg.gene_df
symbol_source_db = gene_table.gene_symbol_source_db.fillna("missing (clone-based)")
pd.crosstab(index=gene_table.ensembl_source, columns=symbol_source_db, margins=True)

gene_symbol_source_db,EntrezGene,MGI,RFAM,RGD,missing (clone-based),All
ensembl_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RefSeq,0,0,0,37,0,37
ensembl,2,7,113,26023,4380,30525
All,2,7,113,26060,4380,30562


In [11]:
# Most frequent coord_system values among genes (top 10).
ensg.gene_df.coord_system.value_counts().head(10)

primary_assembly    30562
Name: coord_system, dtype: int64

In [12]:
# Most frequent gene_biotype values among genes (top 10).
ensg.gene_df.gene_biotype.value_counts().head(10)

protein_coding          23098
lncRNA                   2488
snoRNA                   1706
snRNA                    1512
pseudogene                726
miRNA                     444
rRNA                      210
processed_pseudogene      192
IG_V_gene                  38
scaRNA                     37
Name: gene_biotype, dtype: int64

In [13]:
# Compare coord_system (rows) with the primary_assembly flag (columns), including totals.
pd.crosstab(ensg.gene_df.coord_system, ensg.gene_df.primary_assembly, margins=True)

primary_assembly,False,True,All
coord_system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
primary_assembly,108,30454,30562
All,108,30454,30562


In [14]:
# Counts of each mhc value (value_counts omits missing values by default).
ensg.gene_df.mhc.value_counts()

Series([], Name: mhc, dtype: int64)

In [15]:
# Total number of rows in the gene table.
len(ensg.gene_df)

30562

## alternative gene alleles

Related:

- [OTP: Origin of genes_with_non_reference_ensembl_ids.tsv](https://github.com/opentargets/platform/issues/702)
- [biostars: map between different assemblies of one ensembl release](https://www.biostars.org/p/143956/)
- using `attrib_type.code = "non_ref"` for `primary_assembly` doesn't appear to return any results

In [16]:
# Preview the first rows of the representative-gene table.
ensg.representative_gene_df.head()

Unnamed: 0,rs_allele_group,ensembl_gene_id,gene_symbol,ensembl_created_date,seq_region,primary_assembly,alt_allele_group_id,alt_allele_attrib,alt_allele_is_representative,ensembl_representative_gene_id,is_representative_gene
0,1700001K19Rikl,ENSRNOG00000007184,1700001K19Rikl,2009-07-29 15:36:02,6,True,,,False,ENSRNOG00000007184,True
1,1700006A11Rikl,ENSRNOG00000024928,1700006A11Rikl,2009-07-29 15:36:02,2,True,,,False,ENSRNOG00000024928,True
2,1700009N14Rikl,ENSRNOG00000031013,1700009N14Rikl,2005-03-02 00:00:00,5,True,,,False,ENSRNOG00000031013,True
3,1700012A03Rikl,ENSRNOG00000027055,1700012A03Rikl,2009-07-29 15:36:02,4,True,,,False,ENSRNOG00000027055,True
4,1700012B07Rkl,ENSRNOG00000024233,1700012B07Rkl,2009-07-29 15:36:02,10,True,,,False,ENSRNOG00000024233,True


In [17]:
# Check whether any gene carries the "non_ref" attribute in the connected database.
# Originally observed to be unset for human genes; this query runs against whichever
# species/release the notebook was parameterized with.
# The WHERE clause filters on attrib_type, so only matched rows can survive: the join
# is written as INNER JOIN. The string literal uses single quotes (standard SQL), so
# it is never parsed as an identifier under ANSI_QUOTES mode.
query = """
SELECT *
FROM gene_attrib
INNER JOIN attrib_type
  ON gene_attrib.attrib_type_id = attrib_type.attrib_type_id
WHERE attrib_type.code = 'non_ref'
LIMIT 5
"""
pd.read_sql(sql=query, con=ensg.connection_url)

Unnamed: 0,gene_id,attrib_type_id,value,attrib_type_id.1,code,name,description


In [18]:
# Counts of each alt_allele_attrib value (missing values are not counted).
ensg.representative_gene_df.alt_allele_attrib.value_counts()

Series([], Name: alt_allele_attrib, dtype: int64)

In [19]:
# Genes whose representative gene ID differs from their own ID (first two rows).
ensg.representative_gene_df.query("ensembl_gene_id != ensembl_representative_gene_id").head(2)

Unnamed: 0,rs_allele_group,ensembl_gene_id,gene_symbol,ensembl_created_date,seq_region,primary_assembly,alt_allele_group_id,alt_allele_attrib,alt_allele_is_representative,ensembl_representative_gene_id,is_representative_gene
9,1700020N15Rikl,ENSRNOG00000067249,1700020N15Rikl,2021-02-26 12:35:27,X,True,,,False,ENSRNOG00000064273,False
47,4930596D02Rikl2,ENSRNOG00000063038,4930596D02Rikl2,2021-02-26 12:35:27,17,True,,,False,ENSRNOG00000042120,False


# replaced ID converter

A single `old_stable_id` can map to multiple `new_stable_id` values. For example, `ENSG00000152006` maps to three new IDs:

https://uswest.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=AzhM62SpkvdiLC4H-6808613

Requested ID | Matched ID(s) | Releases
-- | -- | --
ENSG00000152006 | ENSG00000196273 | 26: ENSG00000196273.1
ENSG00000152006 | ENSG00000197016 | 26: ENSG00000197016.1
ENSG00000152006 | ENSG00000196239 | 26: ENSG00000196239.1

In [20]:
# Preview the old-to-new Ensembl gene ID mapping.
ensg.old_to_new_df.head(2)

Unnamed: 0,old_ensembl_gene_id,new_ensembl_gene_id
0,ENSRNOG00000000132,ENSRNOG00000031425
1,ENSRNOG00000000194,ENSRNOG00000031589


In [21]:
# Some old Ensembl gene IDs were replaced by many new Ensembl gene IDs:
# show the two old IDs with the most rows in old_to_new_df.
ensg.old_to_new_df.old_ensembl_gene_id.value_counts().head(2)

ENSRNOG00000034138    202
ENSRNOG00000050259     89
Name: old_ensembl_gene_id, dtype: int64

In [22]:
# Example: look up the replacement(s) for a single old gene ID.
# NOTE(review): ENSG00000152006 is the human ID from the IDMapper example; when the
# notebook is run for another species this ID is presumably absent from the mapping,
# so the example is only informative for human — consider a species-specific ID.
ensg._update_ensembl_gene("ENSG00000152006")

{'ENSG00000152006'}

In [23]:
# Preview the old-to-newest gene ID table.
ensg.old_to_newest_df.head(2)

Unnamed: 0,old_ensembl_gene_id,newest_ensembl_gene_id,is_current
0,ENSRNOG00000000132,ENSRNOG00000031425,True
1,ENSRNOG00000000194,ENSRNOG00000031589,False


In [24]:
# Number of rows in the old-to-newest table.
len(ensg.old_to_newest_df)

34351

In [25]:
# How many old-to-newest rows are flagged is_current versus not.
ensg.old_to_newest_df.is_current.value_counts()

True     28635
False     5716
Name: is_current, dtype: int64

## omni-updater

The omni-updater dataset is designed to convert ensembl gene IDs from input data to the current, representative ensembl_gene_ids for this ensembl release. It assumes:

- users want to update outdated genes with their replacements
- users want a dataset of representative genes only, and want to convert alternative alleles to representative genes

An inner join of a dataset with `update_df` on `input_ensembl_gene_id` will do the following:

- produce output ensembl_gene_ids that are current and representatives
- update outdated genes with their current identifiers. Outdated genes with no current replacement will be removed by the inner join.
- update alternative gene alleles with their representatives
- genes that are already representative and current will map to themselves

In [26]:
# Preview the omni-updater table.
ensg.update_df.head(2)

input_ensembl_gene_id    ensembl_gene_id  input_current  input_representative
   ENSRNOG00000033805 ENSRNOG00000033805           True                  True
   ENSRNOG00000033805 ENSRNOG00000033805          False                 False
   ENSRNOG00000034061 ENSRNOG00000034061           True                  True
   ENSRNOG00000034061 ENSRNOG00000034061          False                 False


Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
0,ENSRNOG00000000001,ENSRNOG00000000001,True,True,1,1
1,ENSRNOG00000000007,ENSRNOG00000000007,True,True,1,1


In [27]:
# Rows with the largest input_maps_to_n_genes, i.e. inputs that fan out to the most genes.
ensg.update_df.sort_values("input_maps_to_n_genes", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
6131,ENSRNOG00000027512,ENSRNOG00000069344,False,True,83,26
4443,ENSRNOG00000027512,ENSRNOG00000067897,False,True,83,40


In [28]:
# Rows with the largest n_inputs_map_to_gene, i.e. output genes reached by the most inputs.
ensg.update_df.sort_values("n_inputs_map_to_gene", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
7952,ENSRNOG00000026399,ENSRNOG00000062967,False,True,60,147
8020,ENSRNOG00000055339,ENSRNOG00000062967,False,True,13,147


In [29]:
# Fraction of omni-updater rows whose input ID maps to exactly one gene.
(ensg.update_df.input_maps_to_n_genes == 1).mean()

0.5605829580511329

In [30]:
# Omni-updater rows that map to a specific output gene.
# NOTE(review): ENSG00000256263 looks like a human ID; for other species this filter
# is expected to match nothing — confirm or swap in a species-specific example.
ensg.update_df.query("ensembl_gene_id == 'ENSG00000256263'")

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene


In [31]:
# One-sentence size summary of the omni-updater: total rows, distinct input IDs,
# and distinct output gene IDs.
omni_updater_summary = (
    f"The omni-updater contains {len(ensg.update_df):,} rows for mapping "
    f"{ensg.update_df.input_ensembl_gene_id.nunique():,} input genes to "
    f"{ensg.update_df.ensembl_gene_id.nunique():,} current, representative genes."
)
print(omni_updater_summary)

The omni-updater contains 56,402 rows for mapping 34,587 input genes to 29,978 current, representative genes.


In [32]:
# Omni-updater rows for a specific input gene; compare with the IDMapper result at
# https://useast.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=P45VLMbogubpI0QA-6815464
# NOTE(review): the ID and link are for Homo sapiens; for other species this filter
# is expected to match nothing — confirm or swap in a species-specific example.
ensg.update_df.query("input_ensembl_gene_id == 'ENSG00000201456'").head(3)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene


## cross-references (xrefs)

In [33]:
# Preview the first rows of the cross-reference table.
ensg.xref_df.head()

Unnamed: 0,ensembl_representative_gene_id,ensembl_gene_id,gene_symbol,xref_source,xref_accession,xref_label,xref_description,xref_info_type,xref_linkage_annotation,xref_curie
0,ENSRNOG00000000001,ENSRNOG00000000001,Arsj,ArrayExpress,ENSRNOG00000000001,ENSRNOG00000000001,,DIRECT,,arrayexpress:ENSRNOG00000000001
1,ENSRNOG00000000001,ENSRNOG00000000001,Arsj,EntrezGene,311013,Arsj,"arylsulfatase family, member J",DEPENDENT,,ncbigene:311013
11,ENSRNOG00000000001,ENSRNOG00000000001,Arsj,RGD,1307640,Arsj,"arylsulfatase family, member J",DIRECT,,rgd:1307640
12,ENSRNOG00000000001,ENSRNOG00000000001,Arsj,RGD,15003202,AABR07013255.1,,DIRECT,,rgd:15003202
2,ENSRNOG00000000001,ENSRNOG00000000001,Arsj,Reactome_gene,R-RNO-1430728,R-RNO-1430728,Metabolism,DIRECT,,reactome:R-RNO-1430728


In [34]:
# Check whether (ensembl_gene_id, xref_source, xref_accession) triples are unique:
# collect every row that shares its triple with another row, then count those rows
# per xref source.
xref_key_columns = ["ensembl_gene_id", "xref_source", "xref_accession"]
has_repeated_key = ensg.xref_df.duplicated(subset=xref_key_columns, keep=False)
xref_dup_df = ensg.xref_df[has_repeated_key]
xref_dup_df.xref_source.value_counts()

RGD    166
Name: xref_source, dtype: int64

In [35]:
# Count xrefs per source (rows) and info type (columns) with totals, then add a
# column holding the Bioregistry-normalized prefix for each source name.
xref_table = ensg.xref_df
df = pd.crosstab(index=xref_table.xref_source, columns=xref_table.xref_info_type, margins=True)
# apply the project's source-name overrides before normalizing
updated_source_names = df.index.to_series().replace(ensg._xref_prefix_updater)
df["bioregistry_prefix"] = updated_source_names.map(normalize_prefix)
df

xref_info_type,DEPENDENT,DIRECT,MISC,PROJECTION,All,bioregistry_prefix
xref_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ArrayExpress,0,30562,0,0,30562,arrayexpress
EntrezGene,31587,0,0,0,31587,ncbigene
MGI,0,0,0,7,7,mgi
RFAM,0,2029,0,0,2029,rfam
RGD,2519,27920,0,0,30439,rgd
Reactome_gene,0,84358,0,0,84358,reactome
Uniprot_gn,53833,0,0,0,53833,uniprot
WikiGene,31587,0,0,0,31587,wikigenes
miRBase,0,0,428,0,428,mirbase
All,119526,144869,428,7,264830,


## Gene Ontology xrefs

In [36]:
# Preview the first three Gene Ontology xref rows.
ensg.xref_go_df.head(3)

Unnamed: 0,ensembl_gene_id,go_id,go_label,go_evidence_codes,xref_info_types,xref_info_texts,ensembl_transcript_ids,ensembl_representative_gene_id
0,ENSRNOG00000000001,GO:0015629,actin cytoskeleton,IEA,PROJECTION,from homo_sapiens translation ENSP00000320219,ENSRNOT00000055633,ENSRNOG00000000001
1,ENSRNOG00000000007,GO:0004351,glutamate decarboxylase activity,IEA,PROJECTION,"from homo_sapiens translation ENSP00000350928,...",ENSRNOT00000087134,ENSRNOG00000000007
2,ENSRNOG00000000007,GO:0005737,cytoplasm,IEA,PROJECTION,from mus_musculus translation ENSMUSP00000092539,ENSRNOT00000087134,ENSRNOG00000000007


In [37]:
# GO terms for CCR5 (sorted labels)
# compare to http://useast.ensembl.org/Homo_sapiens/Gene/Ontologies/molecular_function?g=ENSG00000160791
# NOTE(review): the gene ID and link are for Homo sapiens; for other species this
# filter is expected to yield an empty list — confirm or use a species-specific gene.
sorted(ensg.xref_go_df.query("ensembl_gene_id == 'ENSG00000160791'").go_label)

[]

## lrg xrefs

In [38]:
# Preview the LRG xref table.
ensg.xref_lrg_df.head(2)

Unnamed: 0,ensembl_gene_id,lrg_gene_id


In [39]:
# Number of LRG xref rows.
len(ensg.xref_lrg_df)

0

### ncbigene xrefs

In [40]:
# Preview the first rows of the ncbigene xref table.
ensg.xref_ncbigene_df.head()

Unnamed: 0,ensembl_representative_gene_id,ncbigene_id,gene_symbol,ncbigene_symbol
1,ENSRNOG00000000001,311013,Arsj,Arsj
17,ENSRNOG00000000007,24379,Gad1,Gad1
30,ENSRNOG00000000008,296511,Alx4,Alx4
40,ENSRNOG00000000010,498922,Cbln1,Cbln1
46,ENSRNOG00000000012,296272,Tcf15,Tcf15


In [41]:
# Ensembl genes mapped to by multiple ncbigenes: the three representative gene IDs
# with the most rows in the ncbigene xref table.
ensg.xref_ncbigene_df.ensembl_representative_gene_id.value_counts().head(3)

ENSRNOG00000000001    1
ENSRNOG00000037100    1
ENSRNOG00000037428    1
Name: ensembl_representative_gene_id, dtype: int64

In [42]:
# (row count, number of rows repeating an earlier ensembl_representative_gene_id)
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ensembl_representative_gene_id.duplicated().sum()

(20569, 0)

In [43]:
# ncbigenes mapped to by multiple Ensembl genes, likely due to alt gene alleles:
# the three ncbigene IDs with the most rows in the ncbigene xref table.
ensg.xref_ncbigene_df.ncbigene_id.value_counts().head(3)

120094551    14
120101277     6
502966        3
Name: ncbigene_id, dtype: int64

In [44]:
# (row count, number of rows repeating an earlier ncbigene_id)
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ncbigene_id.duplicated().sum()

(20569, 64)

In [45]:
# Optional spot checks, left disabled. NOTE(review): the IDs below look human-specific;
# update them for the species at hand before uncommenting.
# ensg.xref_ncbigene_df.query("ensembl_representative_gene_id == 'ENSG00000231500'")
# ensg.xref_ncbigene_df.query("ncbigene_id == '51206'")

In [46]:
# Distinct representative gene IDs present in the gene table, and how many there are.
repr_ensembl_gene_ids = set(ensg.gene_df.ensembl_representative_gene_id)
len(repr_ensembl_gene_ids)

29978

In [47]:
# Genes that are not on the primary assembly yet are their own representative.
# Many of these genes should probably be alternative alleles rather than representative.
ensg.gene_df.query("not primary_assembly and ensembl_gene_id==ensembl_representative_gene_id")

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
16357,ENSRNOG00000046163,3,Iqcm,RGD,1359286,protein_coding,ensembl,2012-11-09 06:35:19,2021-02-26 12:35:27,mRatBN7.2,...,26164,201125,-1,False,,,IQ motif containing M,RGD Symbol,1359286,ENSRNOG00000046163
16772,ENSRNOG00000047746,2,AABR07000398.1,RGD,15005339,protein_coding,ensembl,2012-11-09 06:35:19,2021-02-26 12:35:27,mRatBN7.2,...,68076,73179,-1,False,,,,,,ENSRNOG00000047746
17833,ENSRNOG00000051956,2,LOC108350980,RGD,11448573,lncRNA,ensembl,2015-04-02 16:53:59,2021-02-26 12:35:27,mRatBN7.2,...,202745,211209,1,False,,,uncharacterized LOC108350980,RGD Symbol,11448573,ENSRNOG00000051956
18249,ENSRNOG00000053042,3,Zfy1,RGD,9203794,protein_coding,ensembl,2015-04-02 16:53:59,2021-02-26 12:35:27,mRatBN7.2,...,234470,273381,1,False,,,"zinc finger protein 1, Y-linked",RGD Symbol,9203794,ENSRNOG00000053042
18848,ENSRNOG00000054648,3,Tspy1,RGD,3912,protein_coding,ensembl,2015-04-02 16:53:59,2021-02-26 12:35:27,mRatBN7.2,...,201814,205128,1,False,,,"testis specific protein, Y-linked 1",RGD Symbol,3912,ENSRNOG00000054648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30073,ENSRNOG00000070731,1,ENSRNOG00000070731,,,lncRNA,ensembl,2021-02-26 12:35:27,2021-02-26 12:35:27,mRatBN7.2,...,9954,25269,-1,False,,,,,,ENSRNOG00000070731
30080,ENSRNOG00000070738,1,ENSRNOG00000070738,,,protein_coding,ensembl,2021-02-26 12:35:27,2021-02-26 12:35:27,mRatBN7.2,...,7877,11254,1,False,,,,,,ENSRNOG00000070738
30161,ENSRNOG00000070819,1,ENSRNOG00000070819,,,protein_coding,ensembl,2021-02-26 12:35:27,2021-02-26 12:35:27,mRatBN7.2,...,58779,63710,1,False,,,,,,ENSRNOG00000070819
30237,ENSRNOG00000070895,1,ENSRNOG00000070895,,,lncRNA,ensembl,2021-02-26 12:35:27,2021-02-26 12:35:27,mRatBN7.2,...,12273,13973,1,False,,,,,,ENSRNOG00000070895
