<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/data/Random_3UTR_Seqs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Random 3'UTR Sequences


## Setup

Installation for colab environment.

In [1]:
!pip install biopython pyensembl

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/a8/66/134dbd5f885fc71493c61b6cf04c9ea08082da28da5ed07709b02857cbd0/biopython-1.77-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 5.5MB/s 
[?25hCollecting pyensembl
[?25l  Downloading https://files.pythonhosted.org/packages/6b/d2/ef4b305af520ff7cc478d24c9b78560db20bb37be63fde7c2f83e9c6460e/pyensembl-1.8.7.tar.gz (57kB)
[K     |████████████████████████████████| 61kB 7.1MB/s 
Collecting typechecks>=0.0.2
  Downloading https://files.pythonhosted.org/packages/62/21/15129201c1f52f6af1e7809e96dce46da4b406c2c07fe9425e92f51edc5c/typechecks-0.1.0.tar.gz
Collecting datacache>=1.1.4
  Downloading https://files.pythonhosted.org/packages/f0/b9/a7114c1ac7b18bec9e0d232e65620aeb469b396ceaa85bcb6e81fe89a19d/datacache-1.1.5.tar.gz
Collecting memoized-property>=1.0.2
  Downloading https://files.pythonhosted.org/packages/70/db/23f8b5d86c9385299586c2469b58087f658f58eaeb414be0fd64cfd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!pyensembl install --release 97 --species human

2020-06-27 22:05:44,208 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=97, species='homo_sapiens')
2020-06-27 22:05:44,208 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz from URL ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 22:05:44,208 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz to /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz
2020-06-27 22:05:48,702 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2020-06-27 22:05:48,702 - datacache.download - INFO - Downloading ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz to /root/.

In [4]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm

from Bio import SeqIO   # for reading fasta files
from pyensembl import EnsemblRelease   # to get the gene list

ENSEMBL_RELEASE = 97
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"

# to generate random sequences
N = 50_000    # how many
K = 200       # how long
OUTPUT_FILE = '/content/drive/My Drive/data/random/random_3utr.csv'   # where to save them

CHRS = [str(chr) for chr in range(1,23)] + ['X', 'Y', 'MT']

## Get transcript list

In [5]:
# release 97 uses human reference genome GRCh38
data = EnsemblRelease(ENSEMBL_RELEASE)

In [6]:
human_transcripts = data.transcript_ids()
len(human_transcripts)

226788

In [7]:
human_transcripts[0], data.transcript_by_id(human_transcripts[0])


('ENST00000000233',
 Transcript(transcript_id='ENST00000000233', transcript_name='ARF5-201', gene_id='ENSG00000004059', biotype='protein_coding', contig='7', start=127588411, end=127591700, strand='+', genome='GRCh38'))

In [8]:
transcripts_full_info  = [data.transcript_by_id(transcript) for transcript in human_transcripts]

In [19]:
human_transcript_tuples = [(x.transcript_id, x.gene_id, x.biotype, x.contig, x.start, x.end, x.strand, x.three_prime_utr_sequence) for x in transcripts_full_info if x.contains_start_codon & x.contains_stop_codon]
human_transcript_table = pd.DataFrame.from_records(human_transcript_tuples, columns=["id", "gene_id", "biotype", "chr", "start", "end", "strand", "three_prime_utr_sequence"])
assert all(human_transcript_table.start <= human_transcript_table.end)

human_transcript_table.head()

Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,three_prime_utr_sequence
0,ENST00000000233,ENSG00000004059,protein_coding,7,127588411,127591700,+,CCAGCCAGGGGCAGGCCCCTGATGCCCGGAAGCTCCTGCGTGCATC...
1,ENST00000000412,ENSG00000003056,protein_coding,12,8940361,8949645,-,ATTGCACTTTATATGTCCAGCCTCTTCCTCAGTCCCCCAAACCAAA...
2,ENST00000000442,ENSG00000173153,protein_coding,11,64305524,64316743,+,GGCAAGGGGTGGGACTGGTGGGGGTTCTGGCAGGACCTGCCTAGCA...
3,ENST00000001008,ENSG00000004478,protein_coding,12,2794970,2805423,+,CCCCTCTCCACCAGCCCTACTCCTGCGGCTGCCTGCCCCCCAGTCT...
4,ENST00000001146,ENSG00000003137,protein_coding,2,72129238,72148038,-,CCCAAGACCCACCCGCCTCAGCCCAGCCCAGGCAGCGGGGTGGTGG...


In [20]:
assert ~human_transcript_table.three_prime_utr_sequence.str.contains('N').any()

human_transcript_table['length'] = human_transcript_table.three_prime_utr_sequence.apply(len)
selected_regions = human_transcript_table[human_transcript_table.length > K].copy()

human_transcript_table.shape, selected_regions.shape

((68734, 9), (53061, 9))

In [21]:
selected_regions['random_start'] = [np.random.randint(c_len - K) for c_len in selected_regions.length]
selected_regions['random_end'] = selected_regions['random_start'] + K - 1
selected_regions['seq'] = ''
for i in range(selected_regions.shape[0]):
  selected_regions['seq'].iloc[i] = (selected_regions['three_prime_utr_sequence'].iloc[i])[selected_regions['random_start'].iloc[i]:selected_regions['random_start'].iloc[i]+200]
selected_regions.head()

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,three_prime_utr_sequence,length,random_start,random_end,seq
0,ENST00000000233,ENSG00000004059,protein_coding,7,127588411,127591700,+,CCAGCCAGGGGCAGGCCCCTGATGCCCGGAAGCTCCTGCGTGCATC...,401,2,201,AGCCAGGGGCAGGCCCCTGATGCCCGGAAGCTCCTGCGTGCATCCC...
1,ENST00000000412,ENSG00000003056,protein_coding,12,8940361,8949645,-,ATTGCACTTTATATGTCCAGCCTCTTCCTCAGTCCCCCAAACCAAA...,1457,278,477,TACAATAGGCATATGGGCAAAATGGTGTAGCAGGCTCACTGGCCGT...
2,ENST00000000442,ENSG00000173153,protein_coding,11,64305524,64316743,+,GGCAAGGGGTGGGACTGGTGGGGGTTCTGGCAGGACCTGCCTAGCA...,777,1,200,GCAAGGGGTGGGACTGGTGGGGGTTCTGGCAGGACCTGCCTAGCAT...
3,ENST00000001008,ENSG00000004478,protein_coding,12,2794970,2805423,+,CCCCTCTCCACCAGCCCTACTCCTGCGGCTGCCTGCCCCCCAGTCT...,2165,1607,1806,GACTGTGCCACTGCACGCCAGCCTGGGCAAGAGAGTGAGACCATCT...
4,ENST00000001146,ENSG00000003137,protein_coding,2,72129238,72148038,-,CCCAAGACCCACCCGCCTCAGCCCAGCCCAGGCAGCGGGGTGGTGG...,2989,974,1173,CCTGGACTGAAGTCCGGTGCCTCTGCCTTATCCCTGGTGGAGATGG...


## Random transcript selection

In [22]:
sample_regions = selected_regions.sample(N)
sample_regions.shape

(50000, 12)

In [23]:
seqs = sample_regions[['id', 'chr', 'random_start', 'random_end', 'seq']].copy().reset_index(drop=True)
seqs.head()

Unnamed: 0,id,chr,random_start,random_end,seq
0,ENST00000552870,12,116,315,AAAACAAGGACTGCAGCCTAAATTCCAAATACCAGAGACTGAAATT...
1,ENST00000496961,1,2645,2844,GGAGTCATTTATATTCTGCAGGAGGAAGGGGCCCCAGCTGTCGCCT...
2,ENST00000355654,2,428,627,GTAGAGGGTGTTTTCACCTTCCAAGACATGGGGCAAAGTTTGGAGA...
3,ENST00000616793,5,3835,4034,GGGTTATCACTTAGGTAGAGAGCAAATGTGTTCTCCACTAGTAATA...
4,ENST00000541924,12,508,707,CCCCACGGTGAGCGCCCTGTGCCCCACACAGCAGGAGATGATGATA...


In [25]:
len(seqs.seq.values[0]), seqs.seq.values[0]

(200,
 'AAAACAAGGACTGCAGCCTAAATTCCAAATACCAGAGACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAGGGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTATTTTCTATTCCATACTTCTGCCCACGTTGTTT')

## Save generated sequences to file

In [26]:
seqs.to_csv(OUTPUT_FILE, index=False)