# Ensembl genes table extraction EDA

This notebook is useful for development as well as exploratory data analysis on the extracted tables.
It is currently automatically executed and saved as part of exports using `papermill`.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from ensembl_genes import ensembl_genes
from bioregistry import normalize_prefix

In [3]:
# parameters cell: default species and Ensembl release used for interactive runs.
# NOTE(review): presumably this cell carries papermill's "parameters" tag, so the
# injected "# Parameters" cell that follows overrides these values — confirm.
species = "human"
release = "104"

In [4]:
# Parameters
species = "human"
release = "105"


In [5]:
# create the query helper for the chosen species and release,
# then display its database connection URL
ensg = ensembl_genes.Ensembl_Gene_Queries(release=release, species=species)
ensg.connection_url

'mysql+mysqlconnector://anonymous@ensembldb.ensembl.org:3306/homo_sapiens_core_105_38'

In [6]:
database = ensg.database
database

'homo_sapiens_core_105_38'

## Extract data

## gene attrib counts

In [7]:
ensg.run_query("gene_attrib_counts").head(15)

Unnamed: 0,attrib_type_id,code,name,description,attrib_type_count,attrib_type_examples
0,142,GeneGC,Gene GC,Percentage GC content for this gene,68005,"38.39, 63.16, 42.65, 58.40, 48.00, 46.32, 46.3..."
1,4,name,Name,Alternative/long name,65256,"RP11-165F24.3, CCDC85C, GABRB1, RP11-380P13.1,..."
2,395,xref_id,Xref ID,ID of associated database reference,57107,"OTTHUMG00000177418, OTTHUMG00000172907, OTTHUM..."
3,538,legacy_biotype,Legacy biotype,Obsolete biotype previously assigned to this E...,18812,"sense_intronic, processed_transcript, antisens..."
4,536,Ensembl_Select,Ensembl Select,The Ensembl Select is a transcript identified ...,18584,"ENST00000318602, ENST00000286479, ENST00000370..."
5,380,havana_cv,Havana CV term,Controlled vocabulary terms from Havana,12222,"overlapping locus, retrogene, ncRNA host, refe..."
6,54,remark,Remark,Annotation remark,4459,"Assembled from SLRseq reads (SRP049776), Assem..."
7,382,NoTransRefError,No translations due to reference error,This gene is believed to include protein codin...,8,1
8,1,embl_acc,European Nucleotide Archive (was EMBL) accession,ENA,0,
9,2,status,Status,,0,


## genes

In [8]:
ensg.gene_df.head()

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
0,ENSG00000000003,15,TSPAN6,HGNC,HGNC:11858,protein_coding,ensembl_havana,2008-04-29 11:17:41,2019-06-15 05:41:31,GRCh38,...,100627108,100639991,-1,True,,no,tetraspanin 6,HGNC Symbol,HGNC:11858,ENSG00000000003
1,ENSG00000000005,6,TNMD,HGNC,HGNC:17757,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,100584936,100599885,1,True,,no,tenomodulin,HGNC Symbol,HGNC:17757,ENSG00000000005
2,ENSG00000000419,14,DPM1,HGNC,HGNC:3005,protein_coding,ensembl_havana,2008-04-29 11:17:41,2020-12-11 08:28:43,GRCh38,...,50934867,50959140,-1,True,,no,dolichyl-phosphate mannosyltransferase subunit...,HGNC Symbol,HGNC:3005,ENSG00000000419
3,ENSG00000000457,14,SCYL3,HGNC,HGNC:19285,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,169849631,169894267,-1,True,,no,SCY1 like pseudokinase 3,HGNC Symbol,HGNC:19285,ENSG00000000457
4,ENSG00000000460,17,C1orf112,HGNC,HGNC:25565,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,169662007,169854080,1,True,,no,chromosome 1 open reading frame 112,HGNC Symbol,HGNC:25565,ENSG00000000460


In [9]:
# Clone-based gene names were retired, so such genes carry their stable ID in
# place of a symbol; show two examples where the symbol equals the gene ID.
# https://www.ensembl.info/2021/03/15/retirement-of-clone-based-gene-names/
ensg.gene_df[ensg.gene_df["gene_symbol"] == ensg.gene_df["ensembl_gene_id"]].head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
1599,ENSG00000083622,8,ENSG00000083622,,,lncRNA,havana,2008-04-29 11:17:41,2009-05-19 09:47:17,GRCh38,...,117604791,117647415,-1,True,,no,"novel transcript, antisense to CFTR",,,ENSG00000083622
1996,ENSG00000093100,13,ENSG00000093100,,,lncRNA,havana,2008-04-29 11:17:41,2012-06-07 23:07:01,GRCh38,...,17787652,17811497,-1,True,,no,novel transcript,,,ENSG00000093100


In [10]:
# cross-tabulate the ensembl source of each gene (rows) against the external
# database its gene symbol derives from (columns); genes with no symbol source
# are counted under "missing (clone-based)"
pd.crosstab(
    index=ensg.gene_df["ensembl_source"],
    columns=ensg.gene_df["gene_symbol_source_db"].fillna("missing (clone-based)"),
    margins=True,
)

gene_symbol_source_db,EntrezGene,HGNC,RFAM,miRBase,missing (clone-based),All
ensembl_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ensembl,40,4171,1460,0,930,6601
ensembl_havana,26,21262,1,0,195,21484
ensembl_havana_tagene,0,4,0,0,1,5
havana,133,16431,0,2,18305,34871
havana_tagene,6,250,0,0,2827,3083
insdc,0,37,0,0,0,37
mirbase,0,1898,0,0,26,1924
All,205,44053,1461,2,22284,68005


In [11]:
ensg.gene_df.coord_system.value_counts().head(10)

chromosome    67951
scaffold         54
Name: coord_system, dtype: int64

In [12]:
ensg.gene_df.gene_biotype.value_counts().head(10)

protein_coding                        22818
lncRNA                                18812
processed_pseudogene                  10812
unprocessed_pseudogene                 3339
misc_RNA                               2407
snRNA                                  2072
miRNA                                  1924
transcribed_unprocessed_pseudogene     1121
TEC                                    1113
snoRNA                                 1010
Name: gene_biotype, dtype: int64

In [13]:
ensg.gene_df.seq_region_exc_type.value_counts(dropna=False)

None           61541
HAP             4709
PATCH_FIX       1200
PATCH_NOVEL      555
Name: seq_region_exc_type, dtype: int64

In [14]:
ensg.gene_df.mhc.value_counts()

no      65403
MHC      2422
xMHC      180
Name: mhc, dtype: int64

In [15]:
len(ensg.gene_df)

68005

## alternative gene alleles

Related:

- [OTP: Origin of genes_with_non_reference_ensembl_ids.tsv](https://github.com/opentargets/platform/issues/702)
- [biostars: map between different assemblies of one ensembl release](https://www.biostars.org/p/143956/)
- using `attrib_type.code = "non_ref"` for `primary_assembly` doesn't appear to return any results

In [16]:
ensg.alt_allele_df.head()

Unnamed: 0,ensembl_gene_id,alt_allele_group_id,alt_allele_is_representative,primary_assembly,seq_region,alt_allele_attrib,ensembl_created_date,ensembl_representative_gene_id,is_representative_gene,representative_gene_method
0,ENSG00000282572,44429,False,True,7,AUTOMATICALLY_ASSIGNED,2015-06-01 18:57:05,ENSG00000282572,True,primary_assembly
1,ENSG00000281951,44429,False,False,CHR_HSCHR7_2_CTG1,AUTOMATICALLY_ASSIGNED,2015-06-01 18:57:05,ENSG00000282572,False,primary_assembly
2,ENSG00000273644,44430,False,True,7,AUTOMATICALLY_ASSIGNED,2014-06-09 10:49:07,ENSG00000273644,True,primary_assembly
3,ENSG00000282333,44430,False,False,CHR_HSCHR7_2_CTG1,AUTOMATICALLY_ASSIGNED,2015-06-01 18:57:05,ENSG00000273644,False,primary_assembly
4,ENSG00000232325,44431,False,True,7,AUTOMATICALLY_ASSIGNED,2009-05-19 09:47:17,ENSG00000232325,True,primary_assembly


In [17]:
# looks like non_ref isn't set for human genes
# - INNER JOIN: the WHERE clause filters on attrib_type.code, so the unmatched
#   rows a LEFT JOIN would add are discarded anyway
# - 'non_ref' is single-quoted, the standard SQL string-literal delimiter;
#   double quotes denote identifiers when MySQL runs with ANSI_QUOTES enabled
# - SELECT * is kept so both attrib_type_id columns remain visible in the output
query = """
SELECT *
FROM gene_attrib
INNER JOIN attrib_type
  ON gene_attrib.attrib_type_id = attrib_type.attrib_type_id
WHERE attrib_type.code = 'non_ref'
LIMIT 5
"""
pd.read_sql(sql=query, con=ensg.connection_url)

Unnamed: 0,gene_id,attrib_type_id,value,attrib_type_id.1,code,name,description


In [18]:
ensg.alt_allele_df.alt_allele_attrib.value_counts()

AUTOMATICALLY_ASSIGNED    7302
IS_REPRESENTATIVE         1291
Name: alt_allele_attrib, dtype: int64

In [19]:
# among the rows flagged as representative genes, tally the values recorded
# in representative_gene_method
(
    ensg.alt_allele_df
    .query("is_representative_gene")["representative_gene_method"]
    .value_counts()
)

primary_assembly                1799
alt_allele_is_representative    1291
first_added                       50
Name: representative_gene_method, dtype: int64

In [20]:
# genes whose representative gene ID differs from their own ID; show two examples
ensg.gene_df.query("ensembl_gene_id != ensembl_representative_gene_id").head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
764,ENSG00000056678,11,KIFC1,HGNC,HGNC:6389,lncRNA,havana,2008-04-29 11:17:41,2010-03-18 16:07:21,GRCh38,...,33561891,33562220,1,False,,no,kinesin family member C1,HGNC Symbol,HGNC:6389,ENSG00000237649
2059,ENSG00000096150,9,RPS18,HGNC,HGNC:10401,protein_coding,ensembl_havana,2008-04-29 11:17:41,2015-06-01 18:57:05,GRCh38,...,33200834,33205337,1,False,,MHC,ribosomal protein S18,HGNC Symbol,HGNC:10401,ENSG00000231500


## replaced ID converter

A single `old_stable_id` can map to multiple `new_stable_id` values. For example, `ENSG00000152006` maps to three new IDs:

https://uswest.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=AzhM62SpkvdiLC4H-6808613

Requested ID | Matched ID(s) | Releases
-- | -- | --
ENSG00000152006 | ENSG00000196273 | 26: ENSG00000196273.1
ENSG00000152006 | ENSG00000197016 | 26: ENSG00000197016.1
ENSG00000152006 | ENSG00000196239 | 26: ENSG00000196239.1

In [21]:
ensg.old_to_new_df.head(2)

Unnamed: 0,old_ensembl_gene_id,new_ensembl_gene_id
0,ENSG00000011319,ENSG00000187391
1,ENSG00000077754,ENSG00000197226


In [22]:
# some ensembl genes replaced by many new ensembl genes
ensg.old_to_new_df.old_ensembl_gene_id.value_counts().head(2)

ENSG00000201456    91
ENSG00000193147    90
Name: old_ensembl_gene_id, dtype: int64

In [23]:
# example
ensg._update_ensembl_gene("ENSG00000152006")

{'ENSG00000196239', 'ENSG00000196273', 'ENSG00000197016'}

In [24]:
ensg.old_to_newest_df.head(2)

Unnamed: 0,old_ensembl_gene_id,newest_ensembl_gene_id,is_current
0,ASMPATCHG00000000170,ENSG00000256229,True
1,ASMPATCHG00000000174,ENSG00000188171,True


In [25]:
len(ensg.old_to_newest_df)

19321

In [26]:
ensg.old_to_newest_df.is_current.value_counts()

True     10289
False     9032
Name: is_current, dtype: int64

## omni-updater

The omni-updater dataset is designed to convert ensembl gene IDs from input data to the current, representative ensembl_gene_ids for this ensembl release. It assumes:

- users want to update outdated genes with their replacements
- users want a dataset of representative genes only, and want to convert alternative alleles to representative genes

An inner join of a dataset with `update_df` on `input_ensembl_gene_id` will do the following:

- produce output ensembl_gene_ids that are current and representative
- update outdated genes with their current identifiers. Outdated genes with no current replacement will be removed by the inner join.
- update alternative gene alleles with their representatives
- genes that are already representative and current will map to themselves

In [27]:
ensg.update_df.head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
0,ASMPATCHG00000000170,ENSG00000256229,False,True,1,3
2,ASMPATCHG00000000174,ENSG00000188171,False,True,1,3


In [28]:
ensg.update_df.sort_values("input_maps_to_n_genes", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
5604,ENSG00000201456,ENSG00000206832,False,True,90,4
3606,ENSG00000201456,ENSG00000206812,False,True,90,11


In [29]:
ensg.update_df.sort_values("n_inputs_map_to_gene", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
2254,ENSG00000198806,ENSG00000205936,False,True,2,58
2231,ENSG00000197368,ENSG00000205936,False,True,2,58


In [30]:
# fraction of omni-updater rows whose input_maps_to_n_genes equals 1
(ensg.update_df.input_maps_to_n_genes == 1).mean()

0.9164806235562839

In [31]:
ensg.update_df.query("ensembl_gene_id == 'ENSG00000256263'")

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
635,ASMPATCHG00000001513,ENSG00000256263,False,True,1,13
45002,ENSG00000256263,ENSG00000256263,True,True,1,13
636,ENSG00000260270,ENSG00000256263,False,True,1,13
637,ENSG00000262195,ENSG00000256263,False,True,1,13
638,ENSG00000263289,ENSG00000256263,False,True,1,13
639,ENSG00000269129,ENSG00000256263,False,True,1,13
640,ENSG00000269829,ENSG00000256263,False,True,1,13
641,ENSG00000270320,ENSG00000256263,False,True,1,13
642,ENSG00000271301,ENSG00000256263,False,True,1,13
643,ENSG00000271747,ENSG00000256263,False,True,1,13


In [32]:
# one-sentence summary of the omni-updater: total row count, number of distinct
# input gene IDs, and number of distinct output gene IDs
print(
    f"The omni-updater contains {len(ensg.update_df):,} rows for mapping "
    f"{ensg.update_df.input_ensembl_gene_id.nunique():,} input genes to "
    f"{ensg.update_df.ensembl_gene_id.nunique():,} current, representative genes."
)

The omni-updater contains 77,491 rows for mapping 71,807 input genes to 62,552 current, representative genes.


In [33]:
# https://useast.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=P45VLMbogubpI0QA-6815464
ensg.update_df.query("input_ensembl_gene_id == 'ENSG00000201456'").head(3)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
5675,ENSG00000201456,ENSG00000199482,False,True,90,2
3451,ENSG00000201456,ENSG00000199601,False,True,90,7
5676,ENSG00000201456,ENSG00000199664,False,True,90,3


## cross-references (xrefs)

In [34]:
ensg.xref_df.head()

Unnamed: 0,ensembl_representative_gene_id,ensembl_gene_id,gene_symbol,xref_source,xref_accession,xref_label,xref_description,xref_info_type,xref_linkage_annotation,xref_curie
0,ENSG00000000003,ENSG00000000003,TSPAN6,ArrayExpress,ENSG00000000003,ENSG00000000003,,DIRECT,,arrayexpress:ENSG00000000003
1,ENSG00000000003,ENSG00000000003,TSPAN6,EntrezGene,7105,TSPAN6,tetraspanin 6,DEPENDENT,,ncbigene:7105
2,ENSG00000000003,ENSG00000000003,TSPAN6,HGNC,HGNC:11858,TSPAN6,tetraspanin 6,DIRECT,,hgnc:11858
3,ENSG00000000003,ENSG00000000003,TSPAN6,MIM_GENE,300191,TETRASPANIN 6; TSPAN6 [*300191],TETRASPANIN 6; TSPAN6;;TRANSMEMBRANE 4 SUPERFA...,DEPENDENT,,mim:300191
4,ENSG00000000003,ENSG00000000003,TSPAN6,Uniprot_gn,A0A087WYV6,TSPAN6,,DEPENDENT,,uniprot:A0A087WYV6


In [35]:
# xref sources in which some ensembl_gene_id / xref_source / xref_accession
# combinations occur more than once (keep=False retains every repeated row)
xref_dup_df = ensg.xref_df.loc[
    ensg.xref_df.duplicated(
        subset=["ensembl_gene_id", "xref_source", "xref_accession"],
        keep=False,
    )
]
xref_dup_df["xref_source"].value_counts()

HGNC    20
Name: xref_source, dtype: int64

In [36]:
# xref sources (rows) versus info_types (columns), with marginal totals
df = pd.crosstab(
    index=ensg.xref_df["xref_source"],
    columns=ensg.xref_df["xref_info_type"],
    margins=True,
)
# add the bioregistry prefix for each source: names are first passed through
# ensg._xref_prefix_updater and then through bioregistry's normalize_prefix
df["bioregistry_prefix"] = (
    df.index.to_series()
    .replace(ensg._xref_prefix_updater)
    .map(normalize_prefix)
)
df

xref_info_type,DEPENDENT,DIRECT,MISC,All,bioregistry_prefix
xref_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ArrayExpress,0,68005,0,68005,arrayexpress
DBASS3,0,19,0,19,
DBASS5,0,13,0,13,
ENS_LRG_gene,0,1324,0,1324,lrg
EntrezGene,29122,0,0,29122,ncbigene
HGNC,1651,42419,0,44070,hgnc
MIM_GENE,18283,0,0,18283,mim
MIM_MORBID,7391,0,0,7391,mim
RFAM,0,5080,0,5080,rfam
Reactome_gene,0,136408,0,136408,reactome


## Gene Ontology xrefs

In [37]:
ensg.xref_go_df.head(3)

Unnamed: 0,ensembl_gene_id,go_id,go_label,go_evidence_codes,xref_info_types,ensembl_transcript_ids,ensembl_representative_gene_id
0,ENSG00000000003,GO:0005515,protein binding,IPI,DIRECT,ENST00000373020,ENSG00000000003
1,ENSG00000000003,GO:0005887,integral component of plasma membrane,IBA,DIRECT,ENST00000373020,ENSG00000000003
2,ENSG00000000003,GO:0016020,membrane,IEA,DIRECT,"ENST00000373020,ENST00000612152,ENST00000614008",ENSG00000000003


In [38]:
# sorted GO term labels for CCR5 (ENSG00000160791)
# compare to http://useast.ensembl.org/Homo_sapiens/Gene/Ontologies/molecular_function?g=ENSG00000160791
sorted(
    ensg.xref_go_df.query("ensembl_gene_id == 'ENSG00000160791'")["go_label"]
)

['C-C chemokine binding',
 'C-C chemokine receptor activity',
 'G protein-coupled receptor activity',
 'G protein-coupled receptor signaling pathway',
 'MAPK cascade',
 'actin binding',
 'calcium ion transport',
 'calcium-mediated signaling',
 'cell chemotaxis',
 'cell surface',
 'cell surface receptor signaling pathway',
 'cell-cell signaling',
 'cellular defense response',
 'cellular response to lipopolysaccharide',
 'chemokine (C-C motif) ligand 5 binding',
 'chemokine receptor activity',
 'chemokine-mediated signaling pathway',
 'chemotaxis',
 'coreceptor activity',
 'cytoplasm',
 'defense response',
 'dendritic cell chemotaxis',
 'endosome',
 'external side of plasma membrane',
 'identical protein binding',
 'immune response',
 'inflammatory response',
 'integral component of membrane',
 'integral component of plasma membrane',
 'membrane',
 'negative regulation of macrophage apoptotic process',
 'phosphatidylinositol phospholipase C activity',
 'plasma membrane',
 'positive regul

## lrg xrefs

In [39]:
ensg.xref_lrg_df.head(2)

Unnamed: 0,ensembl_gene_id,lrg_gene_id
79,ENSG00000000971,LRG_47
109,ENSG00000001084,LRG_1166


In [40]:
len(ensg.xref_lrg_df)

1324

### ncbigene xrefs

In [41]:
ensg.xref_ncbigene_df.head()

Unnamed: 0,ensembl_representative_gene_id,ncbigene_id,gene_symbol,ncbigene_symbol
1,ENSG00000000003,7105,TSPAN6,TSPAN6
9,ENSG00000000005,64102,TNMD,TNMD
15,ENSG00000000419,8813,DPM1,DPM1
39,ENSG00000000457,57147,SCYL3,SCYL3
46,ENSG00000000460,55732,C1orf112,C1orf112


In [42]:
# ensembl gene mapped to by multiple ncbigenes
ensg.xref_ncbigene_df.ensembl_representative_gene_id.value_counts().head(3)

ENSG00000000003    1
ENSG00000199132    1
ENSG00000199158    1
Name: ensembl_representative_gene_id, dtype: int64

In [43]:
# (number of ncbigene xref rows, number of rows repeating an earlier ensembl_representative_gene_id)
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ensembl_representative_gene_id.duplicated().sum()

(25930, 0)

In [44]:
# ncbigene mapped to by multiple ensembl genes, likely due to alt gene alleles
ensg.xref_ncbigene_df.ncbigene_id.value_counts().head(3)

112724    9
55655     9
51206     9
Name: ncbigene_id, dtype: int64

In [45]:
# (number of ncbigene xref rows, number of rows repeating an earlier ncbigene_id)
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ncbigene_id.duplicated().sum()

(25930, 363)

In [46]:
# ensg.xref_ncbigene_df.query("ensembl_representative_gene_id == 'ENSG00000231500'")
# ensg.xref_ncbigene_df.query("ncbigene_id == '51206'")

In [47]:
# distinct representative gene IDs found in the gene table, and how many there are
repr_ensembl_gene_ids = set(ensg.gene_df.ensembl_representative_gene_id)
len(repr_ensembl_gene_ids)

62552

In [48]:
# many of these genes should probably be alternative alleles rather than representative
ensg.gene_df.query("not primary_assembly and ensembl_gene_id==ensembl_representative_gene_id")

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
16993,ENSG00000196101,9,HLA-DRB3,HGNC,HGNC:4951,protein_coding,ensembl_havana,2008-04-29 11:17:41,2016-06-03 16:05:49,GRCh38,...,32449765,32462852,-1,False,,MHC,"major histocompatibility complex, class II, DR...",HGNC Symbol,HGNC:4951,ENSG00000196101
17066,ENSG00000196299,10,ZNRD1ASP,HGNC,HGNC:13924,lncRNA,havana,2008-04-29 11:17:41,2017-01-25 12:02:05,GRCh38,...,29990282,30052027,-1,False,,MHC,"zinc ribbon domain containing 1 antisense, pse...",HGNC Symbol,HGNC:13924,ENSG00000196299
20405,ENSG00000206232,8,PSMB8-AS1,EntrezGene,100507463,lncRNA,havana,2006-03-10 00:00:00,2015-06-01 18:57:05,GRCh38,...,32822063,32824472,1,False,,MHC,PSMB8 antisense RNA 1 (head to head),NCBI gene (formerly Entrezgene),100507463,ENSG00000206232
20463,ENSG00000206380,11,SNHG32,HGNC,HGNC:19078,protein_coding,ensembl_havana,2006-03-10 00:00:00,2015-06-01 18:57:05,GRCh38,...,31824855,31830009,1,False,,MHC,small nucleolar RNA host gene 32,HGNC Symbol,HGNC:19078,ENSG00000206380
20492,ENSG00000206446,9,HLA-F-AS1,HGNC,HGNC:26645,lncRNA,ensembl_havana,2006-03-10 00:00:00,2015-06-01 18:57:05,GRCh38,...,29720576,29743014,-1,False,,MHC,HLA-F antisense RNA 1,HGNC Symbol,HGNC:26645,ENSG00000206446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67024,ENSG00000288647,1,ENSG00000288647,,,protein_coding,ensembl,2020-06-04 16:53:11,2020-06-04 16:53:11,GRCh38,...,1638229,1665406,-1,False,,no,family with sequence similarity 53 member A,NCBI gene (formerly Entrezgene),152877,ENSG00000288647
67027,ENSG00000288650,1,ENSG00000288650,,,protein_coding,ensembl,2020-06-04 16:53:11,2020-06-04 16:53:11,GRCh38,...,54876141,54890110,1,False,,no,,,,ENSG00000288650
67028,ENSG00000288651,1,ENSG00000288651,,,protein_coding,ensembl,2020-06-04 16:53:11,2020-06-04 16:53:11,GRCh38,...,54871744,54878424,1,False,,no,natural cytotoxicity triggering receptor 1,NCBI gene (formerly Entrezgene),9437,ENSG00000288651
67039,ENSG00000288668,1,ENSG00000288668,,,protein_coding,ensembl,2020-06-04 16:53:11,2020-06-04 16:53:11,GRCh38,...,121652732,121669478,-1,False,,no,,,,ENSG00000288668
