# Ensembl genes table extraction EDA

This notebook is useful for development as well as exploratory data analysis on the extracted tables.
It is currently automatically executed and saved as part of exports using `papermill`.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from ensembl_genes import ensembl_genes
from bioregistry import normalize_prefix

In [3]:
# parameters cell: default values used when the notebook is run by hand.
# NOTE(review): the notebook is executed via papermill (see the introduction), which
# presumably overrides these defaults through the injected "# Parameters" cell that
# follows — confirm this cell carries the `parameters` tag.
species = "human"
release = "111"

In [4]:
# Parameters
species = "human"
release = "113"


In [5]:
# Create the query helper for the selected species and release, then display
# its database connection URL.
ensg = ensembl_genes.Ensembl_Gene_Queries(release=release, species=species)
ensg.connection_url

'mysql+mysqlconnector://anonymous@ensembldb.ensembl.org:3306/homo_sapiens_core_113_38'

In [6]:
database = ensg.database
database

'homo_sapiens_core_113_38'

## Extract data

## gene attrib counts

In [7]:
ensg.run_query("gene_attrib_counts").head(15)

Unnamed: 0,attrib_type_id,code,name,description,attrib_type_count,attrib_type_examples
0,142,GeneGC,Gene GC,Percentage GC content for this gene,86402,"38.16, 50.54, 54.06, 52.45, 41.70, 55.59, 42.6..."
1,4,name,Name,Alternative/long name,84579,"RP11-1217F2.29, RP11-37O16.6, NAT8, ENSG100101..."
2,395,xref_id,Xref ID,ID of associated database reference,56041,"OTTHUMG00000176014, OTTHUMG00000165943, OTTHUM..."
3,538,legacy_biotype,Legacy biotype,Obsolete biotype previously assigned to this E...,36524,"processed_transcript, lincrna, antisense, sens..."
4,54,remark,Remark,Annotation remark,20829,"TAGENE_gene, Assembled from RACEseq reads, Ass..."
5,536,Ensembl_Select,Ensembl Select,The Ensembl Select is a transcript identified ...,19287,"ENST00000440869, ENST00000304749, ENST00000366..."
6,380,havana_cv,Havana CV term,Controlled vocabulary terms from Havana,19208,"retrogene, overlapping locus, ncRNA host, EnsE..."
7,560,artef_dupl,Artifactual duplication,Annotation on artifactual regions of the genom...,19,Artifactual duplication. Real copy of this gen...
8,382,NoTransRefError,No translations due to reference error,This gene is believed to include protein codin...,8,1
9,1,embl_acc,European Nucleotide Archive (was EMBL) accession,ENA,0,


## genes

In [8]:
ensg.gene_df.head()

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
0,ENSG00000000003,16,TSPAN6,HGNC,HGNC:11858,protein_coding,ensembl_havana,2008-04-29 11:17:41,2022-12-27 00:10:08,GRCh38,...,100627108,100639991,-1,True,,no,tetraspanin 6,HGNC Symbol,HGNC:11858,ENSG00000000003
1,ENSG00000000005,6,TNMD,HGNC,HGNC:17757,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,100584936,100599885,1,True,,no,tenomodulin,HGNC Symbol,HGNC:17757,ENSG00000000005
2,ENSG00000000419,14,DPM1,HGNC,HGNC:3005,protein_coding,ensembl_havana,2008-04-29 11:17:41,2020-12-11 08:28:43,GRCh38,...,50934867,50959140,-1,True,,no,dolichyl-phosphate mannosyltransferase subunit...,HGNC Symbol,HGNC:3005,ENSG00000000419
3,ENSG00000000457,14,SCYL3,HGNC,HGNC:19285,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,169849631,169894267,-1,True,,no,SCY1 like pseudokinase 3,HGNC Symbol,HGNC:19285,ENSG00000000457
4,ENSG00000000460,17,FIRRM,HGNC,HGNC:25565,protein_coding,ensembl_havana,2008-04-29 11:17:41,2018-11-21 17:23:49,GRCh38,...,169662007,169854080,1,True,,no,FIGNL1 interacting regulator of recombination ...,HGNC Symbol,HGNC:25565,ENSG00000000460


In [9]:
# Clone-based gene names were retired, so these genes carry their stable ID in
# place of a symbol; show two of them.
# https://www.ensembl.info/2021/03/15/retirement-of-clone-based-gene-names/
symbol_is_stable_id = ensg.gene_df.gene_symbol == ensg.gene_df.ensembl_gene_id
ensg.gene_df.loc[symbol_is_stable_id].head(2)

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
1599,ENSG00000083622,8,ENSG00000083622,,,lncRNA,havana,2008-04-29 11:17:41,2009-05-19 09:47:17,GRCh38,...,117604791,117647415,-1,True,,no,"novel transcript, antisense to CFTR",,,ENSG00000083622
1996,ENSG00000093100,13,ENSG00000093100,,,lncRNA,havana,2008-04-29 11:17:41,2012-06-07 23:07:01,GRCh38,...,17787652,17811497,-1,True,,no,novel transcript,,,ENSG00000093100


In [10]:
# Tabulate the ensembl source of each gene against the external database its
# symbol derives from. Genes without a symbol source are grouped under an
# explicit label instead of being dropped from the table.
symbol_source_db = ensg.gene_df.gene_symbol_source_db.fillna("missing (clone-based)")
pd.crosstab(index=ensg.gene_df.ensembl_source, columns=symbol_source_db, margins=True)

gene_symbol_source_db,EntrezGene,HGNC,RFAM,miRBase,missing (clone-based),All
ensembl_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ensembl,0,4219,1470,0,855,6544
ensembl_havana,30,21637,1,0,277,21945
ensembl_havana_tagene,0,6,0,0,1,7
havana,269,18463,0,2,17236,35970
havana_tagene,14,312,0,0,19628,19954
insdc,0,37,0,0,0,37
mirbase,0,1918,0,1,26,1945
All,313,46592,1471,3,38023,86402


In [11]:
ensg.gene_df.coord_system.value_counts().head(10)

chromosome    78724
scaffold       7678
Name: coord_system, dtype: int64

In [12]:
ensg.gene_df.gene_biotype.value_counts().head(10)

lncRNA                                36524
protein_coding                        23258
processed_pseudogene                  10235
unprocessed_pseudogene                 2809
misc_RNA                               2419
snRNA                                  2094
miRNA                                  1945
transcribed_unprocessed_pseudogene     1814
transcribed_processed_pseudogene       1229
TEC                                    1085
Name: gene_biotype, dtype: int64

In [13]:
pd.crosstab(ensg.gene_df.coord_system, ensg.gene_df.primary_assembly, margins=True)

primary_assembly,False,True,All
coord_system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chromosome,0,78724,78724
scaffold,7678,0,7678
All,7678,78724,86402


In [14]:
ensg.gene_df.mhc.value_counts()

no      85674
MHC       494
xMHC      234
Name: mhc, dtype: int64

In [15]:
len(ensg.gene_df)

86402

## alternative gene alleles

Related:

- [OTP: Origin of genes_with_non_reference_ensembl_ids.tsv](https://github.com/opentargets/platform/issues/702)
- [biostars: map between different assemblies of one ensembl release](https://www.biostars.org/p/143956/)
- using `attrib_type.code = "non_ref"` for `primary_assembly` doesn't appear to return any results

In [16]:
ensg.representative_gene_df.head()

Unnamed: 0,rs_allele_group,ensembl_gene_id,gene_symbol,ensembl_created_date,seq_region,primary_assembly,alt_allele_group_id,alt_allele_attrib,alt_allele_is_representative,ensembl_representative_gene_id,is_representative_gene
0,44430,ENSG00000273644,,2014-06-09 10:49:07,7,True,44430.0,IS_REPRESENTATIVE,True,ENSG00000273644,True
2,44430,ENSG00000282333,,2015-06-01 18:57:05,HSCHR7_2_CTG1,False,44430.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000273644,False
3,44431,ENSG00000281993,,2015-06-01 18:57:05,HSCHR7_1_CTG1,False,44431.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000281993,True
4,44431,ENSG00000282645,,2015-06-01 18:57:05,HSCHR7_2_CTG1,False,44431.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000281993,False
5,44431,ENSG00000288372,,2019-06-15 05:41:31,HG1309_PATCH,False,44431.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000281993,False


In [17]:
# looks like non_ref isn't set for human genes: this lookup of gene attributes
# with code "non_ref" comes back empty.
# NOTE(review): the WHERE clause filters on attrib_type.code, so the LEFT JOIN
# effectively behaves as an inner join here.
query = '''
SELECT *
FROM gene_attrib
LEFT JOIN attrib_type
  ON gene_attrib.attrib_type_id = attrib_type.attrib_type_id
WHERE attrib_type.code = "non_ref"
LIMIT 5
'''
pd.read_sql(sql=query, con=ensg.connection_url)

Unnamed: 0,gene_id,attrib_type_id,value,attrib_type_id.1,code,name,description


In [18]:
ensg.representative_gene_df.alt_allele_attrib.value_counts()

AUTOMATICALLY_ASSIGNED    6469
IS_REPRESENTATIVE         3905
MANUALLY_ASSIGNED           74
IS_PAR                      45
Name: alt_allele_attrib, dtype: int64

In [19]:
ensg.representative_gene_df.query("ensembl_gene_id != ensembl_representative_gene_id").head(2)

Unnamed: 0,rs_allele_group,ensembl_gene_id,gene_symbol,ensembl_created_date,seq_region,primary_assembly,alt_allele_group_id,alt_allele_attrib,alt_allele_is_representative,ensembl_representative_gene_id,is_representative_gene
2,44430,ENSG00000282333,,2015-06-01 18:57:05,HSCHR7_2_CTG1,False,44430.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000273644,False
4,44431,ENSG00000282645,,2015-06-01 18:57:05,HSCHR7_2_CTG1,False,44431.0,AUTOMATICALLY_ASSIGNED,False,ENSG00000281993,False


# replaced ID converter

A single `old_stable_id` can map to multiple `new_stable_id` values. For example, `ENSG00000152006` maps to three:

https://uswest.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=AzhM62SpkvdiLC4H-6808613

Requested ID | Matched ID(s) | Releases
-- | -- | --
ENSG00000152006 | ENSG00000196273 | 26: ENSG00000196273.1
ENSG00000152006 | ENSG00000197016 | 26: ENSG00000197016.1
ENSG00000152006 | ENSG00000196239 | 26: ENSG00000196239.1

In [20]:
ensg.old_to_new_df.head(2)

Unnamed: 0,old_ensembl_gene_id,new_ensembl_gene_id
0,ENSG00000011319,ENSG00000187391
1,ENSG00000077754,ENSG00000197226


In [21]:
# some ensembl genes replaced by many new ensembl genes
ensg.old_to_new_df.old_ensembl_gene_id.value_counts().head(2)

ENSG00000201456    91
ENSG00000193147    90
Name: old_ensembl_gene_id, dtype: int64

In [22]:
# example: ENSG00000152006 (the IDMapper case shown above) resolves to the set
# of its replacement gene IDs
ensg._update_ensembl_gene("ENSG00000152006")

{'ENSG00000196239', 'ENSG00000196273', 'ENSG00000197016'}

In [23]:
ensg.old_to_newest_df.head(2)

Unnamed: 0,old_ensembl_gene_id,newest_ensembl_gene_id,is_current
0,ASMPATCHG00000000170,ENSG00000256229,True
1,ASMPATCHG00000000174,ENSG00000188171,True


In [24]:
len(ensg.old_to_newest_df)

20470

In [25]:
ensg.old_to_newest_df.is_current.value_counts()

True     11432
False     9038
Name: is_current, dtype: int64

## omni-updater

The omni-updater dataset is designed to convert ensembl gene IDs from input data to the current, representative ensembl_gene_ids for this ensembl release. It assumes:

- users want to update outdated genes with their replacements
- users want a dataset of representative genes only, and want to convert alternative alleles to representative genes

An inner join of a dataset with `update_df` on `input_ensembl_gene_id` will do the following:

- produce output ensembl_gene_ids that are current and representatives
- update outdated genes with their current identifiers. Outdated genes with no current replacement will be removed by the inner join.
- update alternative gene alleles with their representatives
- genes that are already representative and current will map to themselves

In [26]:
ensg.update_df.head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
0,ASMPATCHG00000000170,ENSG00000256229,False,True,1,3
2,ASMPATCHG00000000174,ENSG00000188171,False,True,1,3


In [27]:
ensg.update_df.sort_values("input_maps_to_n_genes", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
3610,ENSG00000201456,ENSG00000206801,False,True,90,6
4046,ENSG00000201456,ENSG00000206918,False,True,90,6


In [28]:
ensg.update_df.sort_values("n_inputs_map_to_gene", ascending=False).head(2)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
36216,ENSG00000238560,ENSG00000202251,True,False,1,914
21056,ENSG00000207231,ENSG00000202251,True,False,1,914


In [29]:
(ensg.update_df.input_maps_to_n_genes == 1).mean()

0.9439004062923221

In [30]:
ensg.update_df.query("ensembl_gene_id == 'ENSG00000256263'")

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
636,ASMPATCHG00000001513,ENSG00000256263,False,True,1,13
44509,ENSG00000256263,ENSG00000256263,True,True,1,13
637,ENSG00000260270,ENSG00000256263,False,True,1,13
638,ENSG00000262195,ENSG00000256263,False,True,1,13
639,ENSG00000263289,ENSG00000256263,False,True,1,13
640,ENSG00000269129,ENSG00000256263,False,True,1,13
641,ENSG00000269829,ENSG00000256263,False,True,1,13
642,ENSG00000270320,ENSG00000256263,False,True,1,13
643,ENSG00000271301,ENSG00000256263,False,True,1,13
644,ENSG00000271747,ENSG00000256263,False,True,1,13


In [31]:
# Summarize the omni-updater: total mapping rows, distinct inputs, and distinct outputs.
n_update_rows = len(ensg.update_df)
n_input_genes = ensg.update_df.input_ensembl_gene_id.nunique()
n_output_genes = ensg.update_df.ensembl_gene_id.nunique()
print(
    f"The omni-updater contains {n_update_rows:,} rows for mapping "
    f"{n_input_genes:,} input genes to "
    f"{n_output_genes:,} current, representative genes."
)

The omni-updater contains 95,990 rows for mapping 91,361 input genes to 78,043 current, representative genes.


In [32]:
# https://useast.ensembl.org/Homo_sapiens/Tools/IDMapper/Results?tl=P45VLMbogubpI0QA-6815464
ensg.update_df.query("input_ensembl_gene_id == 'ENSG00000201456'").head(3)

Unnamed: 0,input_ensembl_gene_id,ensembl_gene_id,input_current,input_representative,input_maps_to_n_genes,n_inputs_map_to_gene
5701,ENSG00000201456,ENSG00000199482,False,True,90,2
3469,ENSG00000201456,ENSG00000199601,False,True,90,7
5702,ENSG00000201456,ENSG00000199664,False,True,90,3


## cross-references (xrefs)

In [33]:
ensg.xref_df.head()

Unnamed: 0,ensembl_representative_gene_id,ensembl_gene_id,gene_symbol,xref_source,xref_accession,xref_label,xref_description,xref_info_type,xref_linkage_annotation,xref_curie
0,ENSG00000000003,ENSG00000000003,TSPAN6,ArrayExpress,ENSG00000000003,ENSG00000000003,,DIRECT,,arrayexpress:ENSG00000000003
1,ENSG00000000003,ENSG00000000003,TSPAN6,EntrezGene,7105,TSPAN6,tetraspanin 6,DEPENDENT,,ncbigene:7105
2,ENSG00000000003,ENSG00000000003,TSPAN6,GeneCards,11858,TSPAN6,tetraspanin 6,DEPENDENT,,genecards:11858
3,ENSG00000000003,ENSG00000000003,TSPAN6,HGNC,HGNC:11858,TSPAN6,tetraspanin 6,DIRECT,,hgnc:11858
4,ENSG00000000003,ENSG00000000003,TSPAN6,MIM_GENE,300191,TETRASPANIN 6; TSPAN6 [*300191],TETRASPANIN 6; TSPAN6;;TRANSMEMBRANE 4 SUPERFA...,DEPENDENT,,omim:300191


In [34]:
# xref sources with rows whose (ensembl_gene_id, xref_source, xref_accession)
# combination is not distinct; an empty result means every combination is unique
xref_key_columns = ["ensembl_gene_id", "xref_source", "xref_accession"]
is_duplicated_xref = ensg.xref_df.duplicated(subset=xref_key_columns, keep=False)
xref_dup_df = ensg.xref_df[is_duplicated_xref]
xref_dup_df.xref_source.value_counts()

Series([], Name: xref_source, dtype: int64)

In [35]:
# Count xrefs per source and info type, then annotate each source with the
# prefix that bioregistry normalizes it to.
# NOTE(review): ensg._xref_prefix_updater presumably renames ensembl source
# names that bioregistry would not otherwise recognize — confirm.
df = pd.crosstab(
    index=ensg.xref_df.xref_source,
    columns=ensg.xref_df.xref_info_type,
    margins=True,
)
xref_source_names = df.index.to_series()
df["bioregistry_prefix"] = xref_source_names.replace(ensg._xref_prefix_updater).map(normalize_prefix)
df

xref_info_type,DEPENDENT,DIRECT,MISC,All,bioregistry_prefix
xref_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ArrayExpress,0,86402,0,86402,arrayexpress
DBASS3,0,19,0,19,
DBASS5,0,14,0,14,
ENS_LRG_gene,0,1324,0,1324,lrg
EntrezGene,36711,0,0,36711,ncbigene
GeneCards,44881,0,0,44881,genecards
HGNC,1715,44881,0,46596,hgnc
MIM_GENE,19206,0,0,19206,omim
MIM_MORBID,8173,0,0,8173,omim
RFAM,0,5122,0,5122,rfam


## Gene Ontology xrefs

In [36]:
ensg.xref_go_df.head(3)

Unnamed: 0,ensembl_gene_id,go_id,go_label,go_evidence_codes,xref_info_types,xref_info_texts,ensembl_transcript_ids,ensembl_representative_gene_id
0,ENSG00000000003,GO:0005515,protein binding,IPI,DIRECT,UniProt,ENST00000373020,ENSG00000000003
1,ENSG00000000003,GO:0005886,plasma membrane,IBA,DIRECT,GO_Central,ENST00000373020,ENSG00000000003
2,ENSG00000000003,GO:0016020,membrane,IEA,"DEPENDENT,DIRECT",",InterPro,UniProt","ENST00000373020,ENST00000612152",ENSG00000000003


In [37]:
# GO term labels for CCR5 (ENSG00000160791), sorted alphabetically
# compare to http://useast.ensembl.org/Homo_sapiens/Gene/Ontologies/molecular_function?g=ENSG00000160791
# NOTE(review): the linked page covers molecular_function only, whereas this
# list contains every GO label in xref_go_df for the gene.
sorted(ensg.xref_go_df.query("ensembl_gene_id == 'ENSG00000160791'").go_label)

['C-C chemokine binding',
 'C-C chemokine receptor activity',
 'G protein-coupled receptor activity',
 'G protein-coupled receptor signaling pathway',
 'MAPK cascade',
 'actin binding',
 'calcium ion transport',
 'calcium-mediated signaling',
 'cell chemotaxis',
 'cell surface',
 'cell surface receptor signaling pathway',
 'cell-cell signaling',
 'cellular defense response',
 'cellular response to lipopolysaccharide',
 'chemokine (C-C motif) ligand 5 binding',
 'chemokine receptor activity',
 'chemokine-mediated signaling pathway',
 'chemotaxis',
 'coreceptor activity',
 'cytoplasm',
 'defense response',
 'dendritic cell chemotaxis',
 'endosome',
 'external side of plasma membrane',
 'identical protein binding',
 'immune response',
 'inflammatory response',
 'membrane',
 'negative regulation of macrophage apoptotic process',
 'phosphatidylinositol phospholipase C activity',
 'plasma membrane',
 'positive regulation of cytosolic calcium ion concentration',
 'protein binding',
 'release 

## lrg xrefs

In [38]:
ensg.xref_lrg_df.head(2)

Unnamed: 0,ensembl_gene_id,lrg_gene_id
100,ENSG00000000971,LRG_47
155,ENSG00000001084,LRG_1166


In [39]:
len(ensg.xref_lrg_df)

1324

### ncbigene xrefs

In [40]:
ensg.xref_ncbigene_df.head()

Unnamed: 0,ensembl_representative_gene_id,ncbigene_id,gene_symbol,ncbigene_symbol
1,ENSG00000000003,7105,TSPAN6,TSPAN6
9,ENSG00000000005,64102,TNMD,TNMD
16,ENSG00000000419,8813,DPM1,DPM1
45,ENSG00000000457,57147,SCYL3,SCYL3
53,ENSG00000000460,55732,FIRRM,FIRRM


In [41]:
# check for representative ensembl genes mapped to by multiple ncbigenes:
# the largest per-gene row counts are shown, so a top count of 1 means there are none
ensg.xref_ncbigene_df.ensembl_representative_gene_id.value_counts().head(3)

ENSG00000000003    1
ENSG00000207653    1
ENSG00000207651    1
Name: ensembl_representative_gene_id, dtype: int64

In [42]:
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ensembl_representative_gene_id.duplicated().sum()

(27536, 0)

In [43]:
# ncbigene mapped to by multiple ensembl genes, likely due to alt gene alleles
ensg.xref_ncbigene_df.ncbigene_id.value_counts().head(3)

124906683    9
9437         7
147798       7
Name: ncbigene_id, dtype: int64

In [44]:
len(ensg.xref_ncbigene_df), ensg.xref_ncbigene_df.ncbigene_id.duplicated().sum()

(27536, 93)

In [45]:
# Ad hoc lookups kept for debugging; uncomment one to inspect a single gene's ncbigene mapping.
# ensg.xref_ncbigene_df.query("ensembl_representative_gene_id == 'ENSG00000231500'")
# ensg.xref_ncbigene_df.query("ncbigene_id == '51206'")

In [46]:
# Collect the distinct representative gene IDs referenced by gene_df and show how many there are.
repr_ensembl_gene_ids = set(ensg.gene_df.ensembl_representative_gene_id)
len(repr_ensembl_gene_ids)

78043

In [47]:
# many of these genes should probably be alternative alleles rather than representative
ensg.gene_df.query("not primary_assembly and ensembl_gene_id==ensembl_representative_gene_id")

Unnamed: 0,ensembl_gene_id,ensembl_gene_version,gene_symbol,gene_symbol_source_db,gene_symbol_source_id,gene_biotype,ensembl_source,ensembl_created_date,ensembl_modified_date,coord_system_version,...,seq_region_start,seq_region_end,seq_region_strand,primary_assembly,lrg_gene_id,mhc,gene_description,gene_description_source_db,gene_description_source_id,ensembl_representative_gene_id
16981,ENSG00000196101,10,HLA-DRB3,HGNC,HGNC:4951,protein_coding,ensembl_havana,2008-04-29 11:17:41,2023-04-14 17:13:51,GRCh38,...,3715355,3728422,-1,False,,no,"major histocompatibility complex, class II, DR...",HGNC Symbol,HGNC:4951,ENSG00000196101
23326,ENSG00000215523,7,ENSG00000215523,,,lncRNA,havana,2007-09-07 00:01:32,2015-06-01 18:57:05,GRCh38,...,2053740,2085765,-1,False,,no,chromosome 6 open reading frame 214 (putative),,,ENSG00000215523
24775,ENSG00000223359,2,ENSG00000223359,,,unprocessed_pseudogene,havana,2009-05-19 09:47:17,2010-03-18 16:07:21,GRCh38,...,3789961,3790056,-1,False,,no,"major histocompatibility complex, class II, DR...",,,ENSG00000223359
24914,ENSG00000223557,1,ENSG00000223557,,,lncRNA,havana,2009-05-19 09:47:17,2009-05-19 09:47:17,GRCh38,...,3361186,3361479,1,False,,no,novel transcript,,,ENSG00000223557
25134,ENSG00000223844,1,ENSG00000223844,,,processed_pseudogene,havana,2009-05-19 09:47:17,2009-05-19 09:47:17,GRCh38,...,3948261,3949391,1,False,,no,"family with sequence similarity 8, member A5 p...",,,ENSG00000223844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86126,ENSG00000310277,1,ENSG00000310277,,,lncRNA,havana_tagene,2024-04-29 16:53:40,2024-04-29 16:53:40,GRCh38,...,67572,97360,-1,False,,no,novel transcript,,,ENSG00000310277
86196,ENSG00000310347,1,ENSG00000310347,,,lncRNA,havana_tagene,2024-04-29 16:53:40,2024-04-29 16:53:40,GRCh38,...,6600,8034,1,False,,no,novel transcript,,,ENSG00000310347
86225,ENSG00000310376,1,ENSG00000310376,,,lncRNA,havana_tagene,2024-04-29 16:53:40,2024-04-29 16:53:40,GRCh38,...,113837,140657,1,False,,no,novel transcript,,,ENSG00000310376
86241,ENSG00000310392,1,ENSG00000310392,,,lncRNA,havana_tagene,2024-04-29 16:53:40,2024-04-29 16:53:40,GRCh38,...,120506,121345,-1,False,,no,novel transcript,,,ENSG00000310392
