# To obtain the Human Reference files
### (Genome Sequence + Annotation coordinates)

Load the required packages:

In [1]:
import os
import pyensembl
from Scripts.manipulationOfGenomicFeatures import GenomicFeatures

Download and index the human **hg38** (GRCh38) reference annotations and sequence files from Ensembl (**release 90**):

In [2]:
# Set pyensembl's cache-directory variable to the current working directory
# before the release object is created.
os.environ.update(PYENSEMBL_CACHE_DIR=".")

# Human Ensembl release 90; `ensemblDB` is reused by every later cell.
ensemblDB = pyensembl.EnsemblRelease(release="90", species="human")

# Download the release data, then index it.
ensemblDB.download()
ensemblDB.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rluis/DataspellProjects/POINT_Protocol/GenomicReferences/pyensembl/GRCh38/ensembl90/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rluis/DataspellProjects/POINT_Protocol/GenomicReferences/pyensembl/GRCh38/ensembl90/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rluis/DataspellProjects/POINT_Protocol/GenomicReferences/pyensembl/GRCh38/ensembl90/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


Based on the GTF file, BED files are created for the different transcriptional units.

**Gene coordinates (BED format):**

In [3]:
# One BED record per gene ID known to the release, sorted with the project helper.
geneIDs = ensemblDB.gene_ids()
genes_bedEntries = GenomicFeatures.sortBedList(
    [GenomicFeatures.createGeneBED(ensemblDB, gene_id) for gene_id in geneIDs]
)

# Preview the first five records.
genes_bedEntries[:5]

[['1', '11869', '14409', 'ENSG00000223972', '0', '+'],
 ['1', '14404', '29570', 'ENSG00000227232', '0', '-'],
 ['1', '17369', '17436', 'ENSG00000278267', '0', '-'],
 ['1', '29554', '31109', 'ENSG00000243485', '0', '+'],
 ['1', '30366', '30503', 'ENSG00000284332', '0', '+']]

**Transcript coordinates (BED format):**

In [4]:
# `transcritpsIDs` is kept as a notebook-level name: the exon and intron cells
# below iterate over it again.
transcritpsIDs = ensemblDB.transcript_ids()

# One BED record per transcript ID, sorted with the project helper.
transcritps_bedEntries = GenomicFeatures.sortBedList(
    [
        GenomicFeatures.createTransBED(ensemblDB, transcript_id)
        for transcript_id in transcritpsIDs
    ]
)

# Preview the first five records.
transcritps_bedEntries[:5]

[['1', '11869', '14409', 'ENST00000456328', '0', '+'],
 ['1', '12010', '13670', 'ENST00000450305', '0', '+'],
 ['1', '14404', '29570', 'ENST00000488147', '0', '-'],
 ['1', '17369', '17436', 'ENST00000619216', '0', '-'],
 ['1', '29554', '31097', 'ENST00000473358', '0', '+']]

**Exon coordinates (BED format):**


In [5]:
# One BED record per (exon, transcript) pair: every exon ID of every transcript
# is converted, so an exon shared by several transcripts yields several records.
exons_bedEntries = GenomicFeatures.sortBedList(
    [
        GenomicFeatures.createExonBED(ensemblDB, exon_id, transcript_id)
        for transcript_id in transcritpsIDs
        for exon_id in ensemblDB.exon_ids_of_transcript_id(transcript_id)
    ]
)

# Preview the first five records.
exons_bedEntries[:5]

[['1', '11869', '12227', 'ENSE00002234944_ENST00000456328', 0, '+'],
 ['1', '12010', '12057', 'ENSE00001948541_ENST00000450305', 0, '+'],
 ['1', '12179', '12227', 'ENSE00001671638_ENST00000450305', 0, '+'],
 ['1', '12613', '12697', 'ENSE00001758273_ENST00000450305', 0, '+'],
 ['1', '12613', '12721', 'ENSE00003582793_ENST00000456328', 0, '+']]

**Intron coordinates (BED format):**


In [6]:
# Flatten the per-transcript intron record lists into one list, then sort it
# with the project helper.
introns_bedEntries = GenomicFeatures.sortBedList(
    [
        intron_entry
        for transcript_id in transcritpsIDs
        for intron_entry in GenomicFeatures.transcriptID_intronsBedList(
            ensemblDB, transcript_id
        )
    ]
)

# Preview the first five records.
introns_bedEntries[:5]

[['1',
  '12057',
  '12179',
  'ENSE00001948541_ENSE00001671638_ENST00000450305',
  '0',
  '+'],
 ['1',
  '12227',
  '12613',
  'ENSE00001671638_ENSE00001758273_ENST00000450305',
  '0',
  '+'],
 ['1',
  '12227',
  '12613',
  'ENSE00002234944_ENSE00003582793_ENST00000456328',
  '0',
  '+'],
 ['1',
  '12697',
  '12975',
  'ENSE00001758273_ENSE00001799933_ENST00000450305',
  '0',
  '+'],
 ['1',
  '12721',
  '13221',
  'ENSE00003582793_ENSE00002312635_ENST00000456328',
  '0',
  '+']]

Number of transcription features per class (in order: genes, transcripts, exons, introns):

In [7]:
# Print the record counts on one space-separated line, in the order
# genes, transcripts, exons, introns.
print(
    *map(
        len,
        (
            genes_bedEntries,
            transcritps_bedEntries,
            exons_bedEntries,
            introns_bedEntries,
        ),
    )
)

58302 200310 1199851 999541


Save into BED files:

In [8]:
# Write each feature list to its own BED file; the file names are relative
# paths and the write order matches the original cell.
for bed_entries, bed_filename in (
    (genes_bedEntries, "allGenes_ensembl_hg38_v90.bed"),
    (transcritps_bedEntries, "allTranscripts_ensembl_hg38_v90.bed"),
    (exons_bedEntries, "allExons_ensembl_hg38_v90.bed"),
    (introns_bedEntries, "allIntrons_ensembl_hg38_v90.bed"),
):
    GenomicFeatures.writeBedFile(bed_entries, bed_filename)