# Ensembl genes table extraction EDA

This notebook is useful for development as well as exploratory data analysis on the extracted tables.
It is currently automatically executed and saved as part of exports using `papermill`.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from ensembl_genes import ensembl_genes

In [3]:
# Default parameters, used when the notebook is run interactively.
# NOTE(review): presumably this cell carries papermill's "parameters" tag so that
# automated exports can inject overriding values in the cell that follows — confirm the tag is set.
species = "human"
release = "104"

In [4]:
# Parameters
species = "rat"
release = "105"


In [5]:
# Build the query helper for the selected species and release,
# then display the database connection URL it will use.
ensg = ensembl_genes.Ensembl_Gene_Queries(
    species=species,
    release=release,
)
ensg.connection_url

'mysql+mysqlconnector://anonymous@ensembldb.ensembl.org:3306/rattus_norvegicus_core_105_72'

In [6]:
database = ensg.database
database

'rattus_norvegicus_core_105_72'

## Extract data

## gene attrib counts

In [7]:
ensg.run_query("gene_attrib_counts").head(15)

Unnamed: 0,attrib_type_id,code,name,description,attrib_type_count,attrib_type_examples
0,142,GeneGC,Gene GC,Percentage GC content for this gene,30560,"43.48, 40.93, 49.33, 44.32, 39.05, 40.58, 36.2..."
1,127,cds_end_NF,CDS end not found,,0,
2,32,KnwnPCCount,protein_coding_KNOWN,Number of Known Protein Coding,0,
3,90,bacend_well_nam,BACend well name,,0,
4,118,ensembl_name,Ensembl name,Name of equivalent Ensembl chromosome,0,
5,520,proj_parent_g,projection parent gene,Stable identifier of the parent gene this gene...,0,
6,358,PHIbase_mutant,PHI-base mutant,PHI-base phenotype of the mutants,0,
7,532,dbnsfp_revel_pred,dbNSFP REVEL prediction,dbNSFP REVEL prediction,0,
8,16,non_ref,Non Reference,Non Reference Sequence Region,0,
9,55,transcr_class,Transcript class,Transcript class,0,


## genes

In [8]:
ensg.gene_df.head()

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
0,ENSRNOG00000000001,6,Arsj,RGD,1307640,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,214774654,214854612,1,True,,,"arylsulfatase family, member J",RGD Symbol,1307640,ENSRNOG00000000001
1,ENSRNOG00000000007,8,Gad1,RGD,2652,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,55369704,55410333,1,True,,,glutamate decarboxylase 1,RGD Symbol,2652,ENSRNOG00000000007
2,ENSRNOG00000000008,8,Alx4,RGD,1310201,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,79611719,79648260,1,True,,,ALX homeobox 4,RGD Symbol,1310201,ENSRNOG00000000008
3,ENSRNOG00000000009,6,Tmco5b,RGD,1561237,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,100064979,100083289,1,True,,,transmembrane and coiled-coil domains 5B,RGD Symbol,1561237,ENSRNOG00000000009
4,ENSRNOG00000000010,6,Cbln1,RGD,1562813,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,19608716,19612572,1,True,,,cerebellin 1 precursor,RGD Symbol,1562813,ENSRNOG00000000010


In [9]:
# clone-based genes no longer get a symbol and are filled with the stable ID
# https://www.ensembl.info/2021/03/15/retirement-of-clone-based-gene-names/
ensg.gene_df.query("gene_symbol == ensembl_gene_id").head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
20,ENSRNOG00000000053,8,ENSRNOG00000000053,,,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,85124977,85175178,1,True,,,,,,ENSRNOG00000000053
36,ENSRNOG00000000097,7,ENSRNOG00000000097,,,protein_coding,ensembl,2009-07-29 15:36:02,2021-02-26 12:35:27,mRatBN7.2,...,184340599,184375834,-1,True,,,,,,ENSRNOG00000000097


In [10]:
# Cross-tabulate the Ensembl annotation source (rows) against the external database
# each gene symbol derives from (columns). Genes without a symbol source are grouped
# under a placeholder label so they still appear in the table.
pd.crosstab(
    index=ensg.gene_df.ensembl_source,
    columns=ensg.gene_df.gene_symbol_source_db.fillna("missing (clone-based)"),
    margins=True,
)

gene_symbol_source_db,EntrezGene,MGI,RFAM,RGD,Uniprot_gn,miRBase,missing (clone-based),All
ensembl_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RefSeq,0,0,0,37,0,0,0,37
ensembl,43,672,1024,22239,51,4,6490,30523
All,43,672,1024,22276,51,4,6490,30560


In [11]:
ensg.gene_df.coord_system.value_counts().head(10)

primary_assembly    30560
Name: coord_system, dtype: int64

In [12]:
ensg.gene_df.gene_biotype.value_counts().head(10)

protein_coding          23096
lncRNA                   2488
snoRNA                   1706
snRNA                    1512
pseudogene                726
miRNA                     444
rRNA                      210
processed_pseudogene      192
IG_V_gene                  38
scaRNA                     37
Name: gene_biotype, dtype: int64

In [13]:
ensg.gene_df.seq_region_exc_type.value_counts(dropna=False)

NaN    30560
Name: seq_region_exc_type, dtype: int64

In [14]:
ensg.gene_df.mhc.value_counts()

Series([], Name: mhc, dtype: int64)

In [15]:
len(ensg.gene_df)

30560

## alternative gene alleles

Related:

- [OTP: Origin of genes_with_non_reference_ensembl_ids.tsv](https://github.com/opentargets/platform/issues/702)
- [biostars: map between different assemblies of one ensembl release](https://www.biostars.org/p/143956/)
- using `attrib_type.code = "non_ref"` for `primary_assembly` doesn't appear to return any results

In [16]:
ensg.alt_allele_df.head()

Unnamed: 0_level_0,ensembl_gene_id,alt_allele_group_id,alt_allele_is_representative,primary_assembly,seq_region,alt_allele_attrib,ensembl_created_date,ensembl_representative_gene_id,is_representative_gene,representative_gene_method
alt_allele_group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [17]:
# Check whether any gene carries the "non_ref" attribute in the selected core database.
# No rows have come back in the runs recorded so far (originally observed for human).
#
# INNER JOIN states the intent directly: the WHERE filter on attrib_type.code already
# discarded the unmatched rows that a LEFT JOIN would otherwise have kept.
# The literal is single-quoted because MySQL treats double quotes as identifier
# quotes when the ANSI_QUOTES SQL mode is enabled.
query = """
SELECT *
FROM gene_attrib
INNER JOIN attrib_type
  ON gene_attrib.attrib_type_id = attrib_type.attrib_type_id
WHERE attrib_type.code = 'non_ref'
LIMIT 5
"""
pd.read_sql(sql=query, con=ensg.connection_url)

Unnamed: 0,gene_id,attrib_type_id,value,attrib_type_id.1,code,name,description


In [18]:
ensg.alt_allele_df.alt_allele_attrib.value_counts()

Series([], Name: alt_allele_attrib, dtype: int64)

In [19]:
# Restrict to rows flagged as the representative gene, then tally the
# representative_gene_method values recorded for them.
(
    ensg.alt_allele_df
    .query("is_representative_gene")
    .representative_gene_method
    .value_counts()
)

Series([], Name: representative_gene_method, dtype: int64)

In [20]:
# Genes whose representative is a different gene, i.e. rows that are not their own representative.
ensg.gene_df.query("ensembl_gene_id != ensembl_representative_gene_id").head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id


# replaced ID converter

A single `old_stable_id` can map to multiple `new_stable_id` values. For example, `ENSG00000152006`:

https://uswest.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=AzhM62SpkvdiLC4H-6808613

Requested ID | Matched ID(s) | Releases
-- | -- | --
ENSG00000152006 | ENSG00000196273 | 26: ENSG00000196273.1
ENSG00000152006 | ENSG00000197016 | 26: ENSG00000197016.1
ENSG00000152006 | ENSG00000196239 | 26: ENSG00000196239.1

In [21]:
ensg.old_to_new_df.head(2)

Unnamed: 0,old_ensembl_gene_id,new_ensembl_gene_id
0,ENSRNOG00000000132,ENSRNOG00000031425
1,ENSRNOG00000000194,ENSRNOG00000031589


In [22]:
# some ensembl genes replaced by many new ensembl genes
ensg.old_to_new_df.old_ensembl_gene_id.value_counts().head(2)

ENSRNOG00000034138    202
ENSRNOG00000050259     89
Name: old_ensembl_gene_id, dtype: int64

In [23]:
# Example using ENSG00000152006, a human identifier (ENSG prefix).
# NOTE(review): in runs for other species the ID is presumably unknown to the converter,
# so the result may simply echo the input — confirm, or pick a species-appropriate example.
ensg._update_ensembl_gene("ENSG00000152006")

{'ENSG00000152006'}

In [24]:
ensg.old_to_newest_df.head(2)

Unnamed: 0,old_ensembl_gene_id,newest_ensembl_gene_id,is_current
0,ENSRNOG00000000132,ENSRNOG00000031425,True
1,ENSRNOG00000000194,ENSRNOG00000031589,False


In [25]:
len(ensg.old_to_newest_df)

34351

In [26]:
ensg.old_to_newest_df.is_current.value_counts()

True     28635
False     5716
Name: is_current, dtype: int64

## omni-updater

The omni-updater dataset is designed to convert ensembl gene IDs from input data to the current, representative ensembl_gene_ids for this ensembl release. It assumes:

- users want to update outdated genes with their replacements
- users want a dataset of representative genes only, and want to convert alternative alleles to representative genes

An inner join of a dataset with `update_df` on `input_ensembl_gene_id` will do the following:

- produce output ensembl_gene_ids that are current and representatives
- update outdated genes with their current identifiers. Outdated genes with no current replacement will be removed by the inner join.
- update alternative gene alleles with their representatives
- genes that are already representative and current will map to themselves

In [27]:
ensg.update_df.head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
0,ENSRNOG00000000001,ENSRNOG00000000001,True,True,1,1
1,ENSRNOG00000000007,ENSRNOG00000000007,True,True,1,1


In [28]:
ensg.update_df.sort_values("input_maps_to_n_genes", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
18878,ENSRNOG00000034630,ENSRNOG00000064343,False,True,95,59
20641,ENSRNOG00000035405,ENSRNOG00000067020,False,True,95,59


In [29]:
ensg.update_df.sort_values("n_inputs_map_to_gene", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
7950,ENSRNOG00000026365,ENSRNOG00000062967,False,True,22,147
8012,ENSRNOG00000049973,ENSRNOG00000062967,False,True,15,147


In [30]:
(ensg.update_df.input_maps_to_n_genes == 1).mean()

0.5335078976264888

In [31]:
# Look up every omni-updater row that maps onto one specific output gene.
# NOTE(review): ENSG00000256263 is a human-style ID (ENSG prefix); expect an empty
# result when the notebook is executed for another species.
ensg.update_df.query("ensembl_gene_id == 'ENSG00000256263'")

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene


In [32]:
print(
    f"The omni-updater contains {len(ensg.update_df):,} rows for mapping "
    f"{ensg.update_df.input_ensembl_gene_id.nunique():,} input genes to "
    f"{ensg.update_df.ensembl_gene_id.nunique():,} current, representative genes."
)

The omni-updater contains 59,195 rows for mapping 34,585 input genes to 30,560 current, representative genes.


In [33]:
# Compare with the Ensembl IDMapper result for the same input ID (a Homo_sapiens query):
# https://useast.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=P45VLMbogubpI0QA-6815464
# NOTE(review): the ID is human-specific, so expect an empty result when the notebook
# is executed for another species.
ensg.update_df.query("input_ensembl_gene_id == 'ENSG00000201456'").head(3)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene


## cross-references (xrefs)

In [34]:
ensg.xref_df.head()

Unnamed: 0,ensembl_gene_id,xref_source,xref_accession,xref_label,xref_description,xref_info_type,xref_linkage_annotation
0,ENSRNOG00000000001,ArrayExpress,ENSRNOG00000000001,ENSRNOG00000000001,,DIRECT,
1,ENSRNOG00000000001,EntrezGene,311013,Arsj,"arylsulfatase family, member J",DEPENDENT,
2,ENSRNOG00000000001,RGD,1307640,Arsj,"arylsulfatase family, member J",DEPENDENT,
3,ENSRNOG00000000001,RGD,15003202,AABR07013255.1,,DIRECT,
4,ENSRNOG00000000001,Uniprot_gn,Q32KJ7,Arsj,,DEPENDENT,


In [35]:
# Find xref rows whose (ensembl_gene_id, xref_source, xref_accession) triple occurs
# more than once, keeping every copy, then count the offending rows per xref source.
xref_dup_df = ensg.xref_df.loc[
    ensg.xref_df.duplicated(
        subset=["ensembl_gene_id", "xref_source", "xref_accession"],
        keep=False,
    )
]
xref_dup_df.xref_source.value_counts()

Series([], Name: xref_source, dtype: int64)

In [36]:
# xref sources versus info_types
pd.crosstab(ensg.xref_df.xref_source, ensg.xref_df.xref_info_type, margins=True)

xref_info_type,DEPENDENT,DIRECT,MISC,PROJECTION,All
xref_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ArrayExpress,0,30559,0,0,30559
EntrezGene,23564,0,0,0,23564
MGI,0,0,0,672,672
RFAM,0,2029,0,0,2029
RGD,23235,4396,0,0,27631
Reactome_gene,0,76351,0,0,76351
Uniprot_gn,15124,0,0,0,15124
WikiGene,23564,0,0,0,23564
miRBase,0,0,428,0,428
All,85487,113335,428,672,199922


## Gene Ontology xrefs

In [37]:
ensg.xref_go_df.head(3)

Unnamed: 0,ensembl_gene_id,go_id,go_label,go_evidence_codes,xref_info_types,ensembl_transcript_ids,ensembl_representative_gene_id
0,ENSRNOG00000000001,GO:0003824,catalytic activity,IEA,"DEPENDENT,DIRECT",ENSRNOT00000055633,ENSRNOG00000000001
1,ENSRNOG00000000001,GO:0008484,sulfuric ester hydrolase activity,IEA,"DEPENDENT,DIRECT",ENSRNOT00000055633,ENSRNOG00000000001
2,ENSRNOG00000000001,GO:0015629,actin cytoskeleton,IEA,PROJECTION,ENSRNOT00000055633,ENSRNOG00000000001


In [38]:
# GO terms for CCR5
# compare to http://useast.ensembl.org/Homo_sapiens/Gene/Ontologies/molecular_function?g=ENSG00000160791
# NOTE(review): the gene ID is human-specific (the link points at Homo_sapiens), so
# expect an empty list when the notebook is executed for another species.
sorted(ensg.xref_go_df.query("ensembl_gene_id == 'ENSG00000160791'").go_label)

[]

## lrg xrefs

In [39]:
ensg.xref_lrg_df.head(2)

Unnamed: 0,ensembl_gene_id,lrg_gene_id


In [40]:
len(ensg.xref_lrg_df)

0

### ncbigene xrefs

In [41]:
ensg.xref_ncbigene_df.head()

Unnamed: 0,ensembl_representative_gene_id,ncbigene_id
0,ENSRNOG00000000001,311013
1,ENSRNOG00000000007,24379
2,ENSRNOG00000000008,296511
3,ENSRNOG00000000009,366158
4,ENSRNOG00000000010,498922


In [42]:
# ensembl gene mapped to by multiple ncbigenes
ensg.xref_ncbigene_df.ensembl_representative_gene_id.value_counts().head(3)

ENSRNOG00000063070    13
ENSRNOG00000020119    11
ENSRNOG00000018740     8
Name: ensembl_representative_gene_id, dtype: int64

In [43]:
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ensembl_representative_gene_id.duplicated().sum()

(23564, 488)

In [44]:
# ncbigene mapped to by multiple ensembl genes, likely due to alt gene alleles
# NOTE(review): alt alleles are presumably not the only cause — repeated ncbigene IDs can
# also show up for species whose alt-allele table is empty; verify per species.
ensg.xref_ncbigene_df.ncbigene_id.value_counts().head(3)

120100122    46
120100121    46
364723        9
Name: ncbigene_id, dtype: int64

In [45]:
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ncbigene_id.duplicated().sum()

(23564, 167)

In [46]:
# ensg.xref_ncbigene_df.query("ensembl_representative_gene_id == 'ENSG00000231500'")
# ensg.xref_ncbigene_df.query("ncbigene_id == '51206'")

In [47]:
repr_ensembl_gene_ids = set(ensg.gene_df.ensembl_representative_gene_id)
len(repr_ensembl_gene_ids)

30560

In [48]:
# many of these genes should probably be alternative alleles rather than representative
ensg.gene_df.query("not primary_assembly and ensembl_gene_id==ensembl_representative_gene_id")

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
