<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/data/Random_Exon_Seqs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Random Exon Sequences

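This notebook generates `N` random sequences of length `K`, each taken from within an Ensembl exon on one of the chromosomes listed in `CHRS`. (`MIN_DIST`, the minimal distance to any Ensembl gene, is defined in the configuration but is not applied to the exon sampling below.)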

Note that some sequences may be missing from the human genome assembly (i.e., they consist of 'N' letters only).

## Setup

Installation for colab environment.

In [1]:
# Colab-only setup: install the two third-party packages imported by the notebook.
# The recorded output of this cell shows biopython 1.77 and pyensembl 1.8.7 being downloaded.
!pip install biopython pyensembl

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/a8/66/134dbd5f885fc71493c61b6cf04c9ea08082da28da5ed07709b02857cbd0/biopython-1.77-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.8MB/s 
[?25hCollecting pyensembl
[?25l  Downloading https://files.pythonhosted.org/packages/6b/d2/ef4b305af520ff7cc478d24c9b78560db20bb37be63fde7c2f83e9c6460e/pyensembl-1.8.7.tar.gz (57kB)
[K     |████████████████████████████████| 61kB 6.9MB/s 
Collecting typechecks>=0.0.2
  Downloading https://files.pythonhosted.org/packages/62/21/15129201c1f52f6af1e7809e96dce46da4b406c2c07fe9425e92f51edc5c/typechecks-0.1.0.tar.gz
Collecting datacache>=1.1.4
  Downloading https://files.pythonhosted.org/packages/f0/b9/a7114c1ac7b18bec9e0d232e65620aeb469b396ceaa85bcb6e81fe89a19d/datacache-1.1.5.tar.gz
Collecting memoized-property>=1.0.2
  Downloading https://files.pythonhosted.org/packages/70/db/23f8b5d86c9385299586c2469b58087f658f58eaeb414be0fd64cfd

In [2]:
# Mount Google Drive at /content/drive; the FASTA input path and the CSV output path
# configured later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Fetch the Ensembl release 97 human annotation files into the local pyensembl cache.
# Keep the release number here in sync with ENSEMBL_RELEASE in the configuration cell.
!pyensembl install --release 97 --species human

2020-06-27 15:39:24,691 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=97, species='homo_sapiens')
2020-06-27 15:39:24,691 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz from URL ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 15:39:24,691 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz to /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 15:39:27,732 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2020-06-27 15:39:27,732 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz to /root/.

In [4]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm

from Bio import SeqIO   # for reading fasta files
from pyensembl import EnsemblRelease   # to get the gene list

# --- Inputs ---------------------------------------------------------------------
ENSEMBL_RELEASE = 97   # Ensembl annotation release (built on human reference genome GRCh38)
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"

# --- Sampling parameters --------------------------------------------------------
N = 50_000     # how many sequences to generate
K = 200        # how long each sequence is (bases)
MIN_DIST = 50  # how far sequences should be from the gene
# NOTE(review): MIN_DIST is not referenced by any cell visible in this notebook -- confirm whether it is still needed.
SEED = 42      # seed for NumPy's global random generator

# --- Output ---------------------------------------------------------------------
OUTPUT_FILE = '/content/drive/My Drive/data/random/random_exons.csv'   # where to save them
# For a run without Google Drive, point OUTPUT_FILE at a local path such as 'random_seqs.csv'.

# Chromosomes to sample from: autosomes 1-22 plus X, Y and MT.
CHRS = [str(number) for number in range(1, 23)] + ['X', 'Y', 'MT']

# Seed the legacy global RNG so that the random draws made later in the notebook
# (np.random.randint and DataFrame.sample, which falls back to NumPy's global state)
# give the same result on every Restart & Run All.
np.random.seed(SEED)

## Get exon list

In [5]:
# Open the Ensembl annotation for the configured release
# (release 97 uses human reference genome GRCh38).
data = EnsemblRelease(ENSEMBL_RELEASE)

In [6]:
# IDs of every exon in the annotation; the cell displays how many there are.
human_exons = data.exon_ids()
len(human_exons)

745513

In [7]:
# Peek at the first exon ID together with the full Exon record it resolves to.
human_exons[0], data.exon_by_id(human_exons[0])


('ENSE00000327880',
 Exon(exon_id='ENSE00000327880', gene_id='ENSG00000009780', gene_name='FAM76A', contig='1', start=27732603, end=27732657, strand='+'))

In [8]:
# Resolve every exon ID to its full Exon record (one lookup per ID).
exons_full_info = list(map(data.exon_by_id, human_exons))

In [9]:
# Flatten the Exon records into a table: one row per exon, one column per attribute.
exon_attributes = ("exon_id", "gene_id", "gene_name", "contig", "start", "end", "strand")
table_columns = ["id", "gene_id", "gene_symbol", "chr", "start", "end", "strand"]

human_exon_tuples = [
    tuple(getattr(exon, attribute) for attribute in exon_attributes)
    for exon in exons_full_info
]
human_exon_table = pd.DataFrame.from_records(human_exon_tuples, columns=table_columns)

# Sanity check: start never exceeds end for any exon.
assert (human_exon_table.start <= human_exon_table.end).all()

human_exon_table.head()

Unnamed: 0,id,gene_id,gene_symbol,chr,start,end,strand
0,ENSE00000327880,ENSG00000009780,FAM76A,1,27732603,27732657,+
1,ENSE00000328922,ENSG00000034533,ASTE1,3,131018506,131018716,-
2,ENSE00000329326,ENSG00000129295,LRRC6,8,132583694,132583779,-
3,ENSE00000330966,ENSG00000040608,RTN4R,22,20241417,20243110,-
4,ENSE00000331106,ENSG00000063515,GSC2,22,19148576,19149095,-


In [12]:
# Keep exons that lie on one of the chromosomes in CHRS and whose end - start span is at least K.
on_listed_chromosome = human_exon_table.chr.isin(CHRS)
spans_at_least_k = (human_exon_table.end - human_exon_table.start) >= K
selected_regions = human_exon_table.loc[
    on_listed_chromosome & spans_at_least_k, ["chr", "start", "end"]
].copy()

# Number of retained exons per chromosome.
selected_regions.chr.value_counts()

1     23714
2     18929
3     15756
11    15213
19    15123
17    15102
12    14259
16    12452
7     12396
5     12357
6     12345
4     10457
8     10156
15     9543
14     9305
9      9156
10     8957
X      7854
20     5810
22     5591
18     4871
13     4699
21     3189
Y       914
MT       15
Name: chr, dtype: int64

In [13]:
# Choose one K-long window at a uniformly random offset inside each selected exon.
selected_regions['length'] = selected_regions.end - selected_regions.start + 1   # start and end both count

# An exon of `length` bases contains length - K + 1 distinct K-long windows (offsets 0 .. length - K).
# np.random.randint excludes its upper bound, so the bound has to be length - K + 1; a bound of
# length - K would never pick the window that ends on the exon's last base.
window_count = (selected_regions.length - K + 1).values
selected_regions['random_pos'] = np.random.randint(0, window_count)
selected_regions['random_start'] = selected_regions.start + selected_regions.random_pos
selected_regions.head()

Unnamed: 0,chr,start,end,length,random_pos,random_start
1,3,131018506,131018716,211,1,131018507
3,22,20241417,20243110,1694,931,20242348
4,22,19148576,19149095,520,2,19148578
8,11,2412754,2413012,259,40,2412794
31,15,40899144,40901014,1871,190,40899334


## Random exon selection

In [14]:
# Draw N of the selected exons at random. DataFrame.sample draws without replacement by default,
# so N must not exceed the number of selected exons.
sample_regions = selected_regions.sample(N)
sample_regions.shape

(50000, 6)

## Get actual genomic sequences

In [15]:
# Start the output table: chromosome plus the [random_start, random_end] window of K bases.
# `seq` is an empty placeholder that is filled from the genome FASTA further down.
seqs = (
    sample_regions.loc[:, ['chr', 'random_start']]
    .reset_index(drop=True)
    .assign(
        random_end=lambda frame: frame['random_start'] + K - 1,
        seq='',
    )
)

In [16]:
# Preview the table before the sequences are filled in.
seqs.head()

Unnamed: 0,chr,random_start,random_end,seq
0,1,236763182,236763381,
1,17,8289582,8289781,
2,3,100455979,100456178,
3,10,94852754,94852953,
4,1,209756085,209756284,


In [17]:
def which(self):
    """Return the positions of the truthy elements of an iterable.

    Parameters
    ----------
    self : iterable
        Any iterable (list, generator, boolean pandas Series, ...). The
        parameter keeps its original name for backward compatibility; this is
        a plain function, not a method.

    Returns
    -------
    list of int
        Zero-based positions, in iteration order, of the elements that
        evaluate as true.

    Raises
    ------
    TypeError
        If the argument is not iterable. TypeError is a subclass of Exception,
        so callers that catch Exception keep working.
    """
    try:
        items = list(iter(self))
    except TypeError as e:
        # Re-raise with a clearer message while keeping the original error chained.
        raise TypeError("""'which' method can only be applied to iterables.
        {}""".format(str(e))) from e
    return [position for position, item in enumerate(items) if item]

# Fill `seqs.seq` by streaming the genome FASTA one record (chromosome / contig) at a time.
#
# Reading stops as soon as every chromosome that occurs in `seqs` has been processed.
# Stopping at the "MT" record instead would depend on MT coming after all other chromosomes
# of interest in the file; any chromosome stored after MT would silently keep an empty `seq`.
pending_chromosomes = set(seqs.chr.unique())

with gzip.open(DNA_TOPLEVEL_FASTA_PATH, "rt") as handle:
    with tqdm(total=len(pending_chromosomes)) as progress:
        for record in SeqIO.parse(handle, "fasta"):
            if record.id in pending_chromosomes:
                on_this_chromosome = seqs.chr == record.id
                window_bounds = zip(
                    seqs.loc[on_this_chromosome, "random_start"],
                    seqs.loc[on_this_chromosome, "random_end"],
                )
                # Positions are 1-based and inclusive, hence the -1 on the slice start only.
                seqs.loc[on_this_chromosome, "seq"] = [
                    str(record.seq[first - 1:last]) for first, last in window_bounds
                ]
                pending_chromosomes.discard(record.id)
                progress.update(1)

            if not pending_chromosomes:
                # Everything needed has been extracted; do not read the remaining contigs.
                break

if pending_chromosomes:
    # Report the problem instead of failing: the affected rows keep an empty `seq`.
    print("Chromosomes not found in the FASTA file:", sorted(pending_chromosomes))

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))

In [18]:
# Preview the table again now that `seq` has been filled from the FASTA file.
seqs.head()

Unnamed: 0,chr,random_start,random_end,seq
0,1,236763182,236763381,GACTACCCAAAAGATTTTTGGCACTTACAATTTTTAAAATAGTTTA...
1,17,8289582,8289781,AGCCCCCGTAAGGAGGAAGGAACGGGCGGGTATCTCATGACTGGGT...
2,3,100455979,100456178,TGAGCCCCAATTCACCATTTCAGGATGTGGATGGGGGCGGGGTTGG...
3,10,94852754,94852953,AGGGCCTGGCCCGCATGGAGCTGTTTTTATTCCTGACCTTCATTTT...
4,1,209756085,209756284,GGGTTTGCGAACCTTGTGGCTTTGTCTGTTTCCTGTTTCAGCAAGG...


In [19]:
# Length of the first extracted sequence (the window was built to span K positions).
len(seqs.seq.values[0])

200

In [20]:
# Show the second extracted sequence in full.
seqs.seq.values[1]

'AGCCCCCGTAAGGAGGAAGGAACGGGCGGGTATCTCATGACTGGGTTCCCAGGAGAATCGGGCTGGGAGGGACAGAACAGGGAGACTCACTGGTGGGATCCTCCAAGGAAGCAGGAGTGGGCCAGAGGTTTGGGGTAACTGATACCCAGGTCCTCTGGGGAACAGGAATTGGCAATTTCCAAACCTCAGGCCTGGATGAT'

In [21]:
# Check whether any extracted sequence contains the letter 'N'.
# NOTE(review): rows whose `seq` is still the empty-string placeholder are not flagged by this check.
any(seqs.seq.str.contains('N'))

False

## Save generated sequences to file

In [22]:
# Write the final table (chr, random_start, random_end, seq) to OUTPUT_FILE without the row index.
seqs.to_csv(OUTPUT_FILE, index=False)