<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/data/Random_Intron_Seqs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Random Intron Sequences

## Setup

Installation for colab environment.

In [1]:
!pip install biopython pyensembl

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/a8/66/134dbd5f885fc71493c61b6cf04c9ea08082da28da5ed07709b02857cbd0/biopython-1.77-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 8.4MB/s 
[?25hCollecting pyensembl
[?25l  Downloading https://files.pythonhosted.org/packages/6b/d2/ef4b305af520ff7cc478d24c9b78560db20bb37be63fde7c2f83e9c6460e/pyensembl-1.8.7.tar.gz (57kB)
[K     |████████████████████████████████| 61kB 8.8MB/s 
Collecting typechecks>=0.0.2
  Downloading https://files.pythonhosted.org/packages/62/21/15129201c1f52f6af1e7809e96dce46da4b406c2c07fe9425e92f51edc5c/typechecks-0.1.0.tar.gz
Collecting datacache>=1.1.4
  Downloading https://files.pythonhosted.org/packages/f0/b9/a7114c1ac7b18bec9e0d232e65620aeb469b396ceaa85bcb6e81fe89a19d/datacache-1.1.5.tar.gz
Collecting memoized-property>=1.0.2
  Downloading https://files.pythonhosted.org/packages/70/db/23f8b5d86c9385299586c2469b58087f658f58eaeb414be0fd64cfd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!pyensembl install --release 97 --species human

2020-06-27 23:01:41,272 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=97, species='homo_sapiens')
2020-06-27 23:01:41,272 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz from URL ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 23:01:41,272 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz to /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 23:01:42,038 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2020-06-27 23:01:42,038 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz to /root/.

In [4]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm

from Bio import SeqIO   # for reading fasta files
from pyensembl import EnsemblRelease   # to get the gene list

ENSEMBL_RELEASE = 97
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"

# to generate random sequences
N = 50_000    # how many
K = 200       # how long
OUTPUT_FILE = '/content/drive/My Drive/data/random/random_introns.csv'   # where to save them

CHRS = [str(chr) for chr in range(1,23)] + ['X', 'Y', 'MT']

## Get exon list

In [5]:
# release 97 uses human reference genome GRCh38
data = EnsemblRelease(ENSEMBL_RELEASE)

In [6]:
human_exons = data.exon_ids()
len(human_exons)

745513

In [7]:
human_exons[0], data.exon_by_id(human_exons[0])


('ENSE00000327880',
 Exon(exon_id='ENSE00000327880', gene_id='ENSG00000009780', gene_name='FAM76A', contig='1', start=27732603, end=27732657, strand='+'))

In [8]:
exons_full_info  = [data.exon_by_id(exon) for exon in human_exons]

In [9]:
human_exon_tuples = [(x.exon_id, x.gene_id, x.gene_name, x.contig, x.start, x.end, x.strand) for x in exons_full_info]
human_exon_table = pd.DataFrame.from_records(human_exon_tuples, columns=["id", "gene_id", "gene_symbol", "chr", "start", "end", "strand"])
assert all(human_exon_table.start <= human_exon_table.end)

human_exon_table['exon_noneverlaping_id'] = 0
human_exon_table = human_exon_table.sort_values(['gene_id', 'start', 'end'])
human_exon_table.head()

Unnamed: 0,id,gene_id,gene_symbol,chr,start,end,strand,exon_noneverlaping_id
634617,ENSE00003730948,ENSG00000000003,TSPAN6,X,100627109,100629986,-,0
79011,ENSE00001459322,ENSG00000000003,TSPAN6,X,100628670,100629986,-,0
14691,ENSE00000868868,ENSG00000000003,TSPAN6,X,100630759,100630866,-,0
634956,ENSE00003731560,ENSG00000000003,TSPAN6,X,100632063,100632068,-,0
197,ENSE00000401072,ENSG00000000003,TSPAN6,X,100632485,100632568,-,0


In [11]:
human_exon_table_grouped = human_exon_table.groupby('gene_id')
human_exon_table_grouped.groups

{'ENSG00000000003': Int64Index([634617,  79011,  14691, 634956,    197, 191550, 502849, 586420,
             213112, 501640, 586426, 583014, 598849, 469616, 589333, 635984,
             209012, 194362, 198020, 182183],
            dtype='int64'),
 'ENSG00000000005': Int64Index([79026, 196, 239672, 2936, 463117, 570997, 206526, 2937, 14690,
             79022],
            dtype='int64'),
 'ENSG00000000419': Int64Index([449306, 520614, 176662,  95469, 506965, 662691,   2159, 213338,
             492329, 498730,  77952, 434496, 491847, 462698, 571216, 235382,
             492125, 556961, 465244, 596259, 580994, 597235,  96528,  77959,
             477543, 534873, 211038],
            dtype='int64'),
 'ENSG00000000457': Int64Index([619880,  75286,   9893, 142436,  58990,  11416, 207713, 506024,
             612718, 515827, 553265, 454640, 614570,  11415, 182566, 478999,
             533617, 517724, 564784, 464117, 535860, 531090, 556279,  75285,
             545176, 584972, 706936, 178032

In [12]:
human_exon_table_grouped.get_group('ENSG00000066827')

Unnamed: 0,id,gene_id,gene_symbol,chr,start,end,strand,exon_noneverlaping_id
282639,ENSE00002128386,ENSG00000066827,ZFAT,8,134477788,134478721,-,0
274686,ENSE00002100849,ENSG00000066827,ZFAT,8,134477792,134478701,-,0
491833,ENSE00003540302,ENSG00000066827,ZFAT,8,134477792,134478721,-,0
591972,ENSE00003665707,ENSG00000066827,ZFAT,8,134477792,134478721,-,0
540808,ENSE00003601450,ENSG00000066827,ZFAT,8,134478025,134478721,-,0
...,...,...,...,...,...,...,...,...
279775,ENSE00002118161,ENSG00000066827,ZFAT,8,134696431,134696558,-,0
281171,ENSE00002123129,ENSG00000066827,ZFAT,8,134712845,134712962,-,0
698989,ENSE00003841474,ENSG00000066827,ZFAT,8,134712845,134713031,-,0
92792,ENSE00001535723,ENSG00000066827,ZFAT,8,134712845,134713038,-,0


In [18]:
def get_introns(df):
  if df.shape[0] <=1: 
    return pd.DataFrame({'gene_id': [], 'chr': [], 'start': [], 'end': [], 'length': []})
  else:
    candidates = pd.DataFrame({'gene_id': df.gene_id.values[:-1], 'chr': df.chr.values[:-1], 'start': df.end.values[:-1]+1, 'end': df.start.values[1:]-1})
    candidates['length'] = candidates['end'] - candidates['start'] + 1 
    return(candidates[candidates.length > K])

get_introns(human_exon_table_grouped.get_group('ENSG00000066827'))

Unnamed: 0,gene_id,chr,start,end,length
7,ENSG00000066827,8,134478722,134509618,30897
11,ENSG00000066827,8,134509750,134510061,312
12,ENSG00000066827,8,134510183,134510695,513
13,ENSG00000066827,8,134511083,134512474,1392
16,ENSG00000066827,8,134512602,134520882,8281
18,ENSG00000066827,8,134521002,134532833,11832
20,ENSG00000066827,8,134532973,134564986,32014
24,ENSG00000066827,8,134565672,134583831,18160
27,ENSG00000066827,8,134584006,134588245,4240
29,ENSG00000066827,8,134588396,134590267,1872


In [19]:
human_introns = human_exon_table_grouped.apply(get_introns)
human_introns

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_id,chr,start,end,length
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,1,ENSG00000000003,X,100629987.0,100630758.0,772.0
ENSG00000000003,2,ENSG00000000003,X,100630867.0,100632062.0,1196.0
ENSG00000000003,3,ENSG00000000003,X,100632069.0,100632484.0,416.0
ENSG00000000003,5,ENSG00000000003,X,100632569.0,100633404.0,836.0
ENSG00000000003,8,ENSG00000000003,X,100633540.0,100633930.0,391.0
...,...,...,...,...,...,...
ENSG00000288110,0,ENSG00000288110,8,4496495.0,4497861.0,1367.0
ENSG00000288110,1,ENSG00000288110,8,4498056.0,4499121.0,1066.0
ENSG00000288110,2,ENSG00000288110,8,4499222.0,4501372.0,2151.0
ENSG00000288111,0,ENSG00000288111,3,130181410.0,130182746.0,1337.0


In [22]:
human_introns = human_introns.reset_index(drop=True)
human_introns['start'] = human_introns['start'].astype('int')
human_introns['end'] = human_introns['end'].astype('int')
human_introns['length'] = human_introns['length'].astype('int')
human_introns

Unnamed: 0,gene_id,chr,start,end,length
0,ENSG00000000003,X,100629987,100630758,772
1,ENSG00000000003,X,100630867,100632062,1196
2,ENSG00000000003,X,100632069,100632484,416
3,ENSG00000000003,X,100632569,100633404,836
4,ENSG00000000003,X,100633540,100633930,391
...,...,...,...,...,...
258769,ENSG00000288110,8,4496495,4497861,1367
258770,ENSG00000288110,8,4498056,4499121,1066
258771,ENSG00000288110,8,4499222,4501372,2151
258772,ENSG00000288111,3,130181410,130182746,1337


In [25]:
selected_regions = human_introns.copy()
selected_regions['random_pos'] = [np.random.randint(c_len-K) for c_len in selected_regions.length]
selected_regions['random_start'] = selected_regions.start + selected_regions.random_pos
selected_regions['random_end'] = selected_regions['random_start'] + K - 1
selected_regions.head()

Unnamed: 0,gene_id,chr,start,end,length,random_pos,random_start,random_end
0,ENSG00000000003,X,100629987,100630758,772,369,100630356,100630555
1,ENSG00000000003,X,100630867,100632062,1196,836,100631703,100631902
2,ENSG00000000003,X,100632069,100632484,416,161,100632230,100632429
3,ENSG00000000003,X,100632569,100633404,836,281,100632850,100633049
4,ENSG00000000003,X,100633540,100633930,391,142,100633682,100633881


## Random exon selection

In [34]:
sample_regions = selected_regions.sample(int(N*1.1))
sample_regions.shape

(55000, 8)

## Get actual genomic sequences

In [35]:
seqs = sample_regions[['gene_id', 'chr', 'random_start', 'random_end']].copy().reset_index(drop=True)
seqs['seq'] = ''

In [36]:
seqs.head()

Unnamed: 0,gene_id,chr,random_start,random_end,seq
0,ENSG00000114416,3,180950056,180950255,
1,ENSG00000144596,3,14493879,14494078,
2,ENSG00000169398,8,140740948,140741147,
3,ENSG00000106299,7,123695595,123695794,
4,ENSG00000197147,1,89552494,89552693,


In [37]:
def which(self):
    try:
        self = list(iter(self))
    except TypeError as e:
        raise Exception("""'which' method can only be applied to iterables.
        {}""".format(str(e)))
    indices = [i for i, x in enumerate(self) if bool(x) == True]
    return(indices)

with gzip.open(DNA_TOPLEVEL_FASTA_PATH, "rt") as handle:
    for record in tqdm(SeqIO.parse(handle, "fasta"), total=24):
        sel_seqs = which(seqs.chr == record.id)
        for i in sel_seqs:
            seqs.loc[i, "seq"] = str(record.seq[(seqs.random_start[i]-1):seqs.random_end[i]])
        
        if record.id == "MT": 
            # stop, do not read small contigs
            break

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))

In [38]:
seqs.head()

Unnamed: 0,gene_id,chr,random_start,random_end,seq
0,ENSG00000114416,3,180950056,180950255,GTAGTATTAAAATATTAAGCTTATCATTATATCTCTGTCTTATCCT...
1,ENSG00000144596,3,14493879,14494078,TGAAGGGAGCAAGAGGCATGTGATGTCCTAAAGATGGGTCCTGCCC...
2,ENSG00000169398,8,140740948,140741147,ATCTAGGCAACAAAGTGAGATCTCATCTCTACAAAAAAATCAAAAA...
3,ENSG00000106299,7,123695595,123695794,ACAGTGTATAAAGCCAAGGTACACAAAACCTTAGTCTAAGTCTTTA...
4,ENSG00000197147,1,89552494,89552693,CTGAGTCAGTTAAAAGCAGACTAATCAGTAAAATACATTTTCTGGT...


In [39]:
len(seqs.seq.values[0]), seqs.seq.values[0]

(200,
 'GTAGTATTAAAATATTAAGCTTATCATTATATCTCTGTCTTATCCTAAGGTACTCTGTTTTGTCCTGTTCTTCCCTGCTACACTAGTAGGATACTTGTTCTTCCCACTCACTCACACTTTCTAGATCTAGATGAACACATGTGCATGCACACACTGTGATGTCTCAGAGAAATTAAGATATGTTTATTATCCTTAGGACA')

In [40]:
sum(seqs.seq.str.contains('N'))

6

## Save generated sequences to file

In [41]:
output = seqs[~seqs.seq.str.contains('N')]
output.shape

(54994, 5)

In [42]:
output[:N].to_csv(OUTPUT_FILE, index=False)