<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/data/Random_5UTR_Seqs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Random 5'UTR Sequences


## Setup

Install the required packages for the Colab environment.

In [None]:
!pip install biopython pyensembl

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyensembl
  Downloading pyensembl-2.3.13-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting typechecks<1.0.0,>=0.0.2 (from pyensembl)
  Downloading typechecks-0.1.0.tar.gz (3.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datacache<2.0.0,>=1.4.0 (from pyensembl)
  Downloading datacache-1.4.1-py3-none-any.whl (20 kB)
Collecting memoized-property>=1.0.2 (from pyensembl)
  Downloading memoized-property-1.0.3.tar.gz (5.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinytimer<1.0.0,>=0.0.0 (from pyensembl)
  Downloading tinytimer-0.0.0.tar.gz (2.1 kB)
  Preparing metadata (setup.py) ... [?25l

In [None]:
# Make Google Drive available under /content/drive; the input FASTA path and
# the output CSV path used later in this notebook both live there.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the Ensembl release 97 human annotation data into pyensembl's local
# cache (the log below shows the GTF and cDNA FASTA being downloaded).
# The release number here must match ENSEMBL_RELEASE in the configuration cell.
!pyensembl install --release 97 --species human

2024-05-13 18:49:20,016 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=97, species='homo_sapiens')
2024-05-13 18:49:20,016 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz from URL https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
2024-05-13 18:49:20,017 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz to /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz
2024-05-13 18:49:22,712 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL https://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2024-05-13 18:49:22,714 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz to

In [None]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm

from Bio import SeqIO   # for reading fasta files
from pyensembl import EnsemblRelease   # to get the gene list
import random

# --- Configuration -----------------------------------------------------------
# Ensembl release to query; must match the release installed via `pyensembl install`.
ENSEMBL_RELEASE = 97
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"

# Seed both RNGs used by this notebook (`random` for the window lengths,
# NumPy for row sampling and window starts) so that Restart & Run All
# reproduces the same sequences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# to generate random sequences
N = 5500  # number of sequences to draw
# One window length (200-500 bp, inclusive) per sequence. Derived from N so the
# two can never get out of sync when N is changed.
K = [random.randint(200, 500) for _ in range(N)]
OUTPUT_FILE = '/content/drive/My Drive/data/random/5utr.csv'

# Autosomes 1-22 plus the sex chromosomes and the mitochondrial contig.
CHRS = [str(number) for number in range(1, 23)] + ['X', 'Y', 'MT']

## Get transcript list

In [None]:
# Handle on the locally cached Ensembl annotation; release 97 is built on the
# human reference genome GRCh38.
data = EnsemblRelease(release=ENSEMBL_RELEASE)

In [None]:
# All transcript IDs known to this Ensembl release; the cell displays how many there are.
human_transcripts = data.transcript_ids()
len(human_transcripts)

226788

In [None]:
# Peek at the first ID and the Transcript object it resolves to.
first_transcript_id = human_transcripts[0]
first_transcript_id, data.transcript_by_id(first_transcript_id)


('ENST00000000233',
 Transcript(transcript_id='ENST00000000233', transcript_name='ARF5-201', gene_id='ENSG00000004059', biotype='protein_coding', contig='7', start=127588411, end=127591700, strand='+', genome='GRCh38'))

In [None]:
# Resolve every transcript ID to its full Transcript object (same order as human_transcripts).
transcripts_full_info = list(map(data.transcript_by_id, human_transcripts))

In [None]:
# Build one record per transcript that is annotated with both a start and a
# stop codon; the 5'UTR sequence is only read for transcripts passing that filter.
human_transcript_tuples = [
    (
        t.transcript_id,
        t.gene_id,
        t.biotype,
        t.contig,
        t.start,
        t.end,
        t.strand,
        t.five_prime_utr_sequence,
    )
    for t in transcripts_full_info
    if t.contains_start_codon and t.contains_stop_codon
]
table_columns = ["id", "gene_id", "biotype", "chr", "start", "end", "strand", "five_prime_utr_sequence"]
human_transcript_table = pd.DataFrame.from_records(human_transcript_tuples, columns=table_columns)

# Coordinates are expected to be ordered regardless of strand.
assert (human_transcript_table.start <= human_transcript_table.end).all()

human_transcript_table.head()

Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,five_prime_utr_sequence
0,ENST00000000233,ENSG00000004059,protein_coding,7,127588411,127591700,+,CTGCTGCTGCTGCGCCCCATCCCCCCGCGGCCGGCCAGTTCCAGCC...
1,ENST00000000412,ENSG00000003056,protein_coding,12,8940361,8949645,-,AGAGTGGGGCACAGCGAGGCGCTAGGGGGAACGCTGGCCTCTGAAA...
2,ENST00000000442,ENSG00000173153,protein_coding,11,64305524,64316743,+,GTCAGCTGGAGGAAGCGGAGTAGGAAGCGGCCGCGATGTCCTTTTG...
3,ENST00000001008,ENSG00000004478,protein_coding,12,2794970,2805423,+,CCTACCCCAGCTCTCGCGCCGCGTGCAGAGGTGCTCAAGCCTCCTC...
4,ENST00000001146,ENSG00000003137,protein_coding,2,72129238,72148038,-,AGGCAATTTTTTTCCTCCCTCTCTCCGCTCCCCTCGCAGCCTCCAC...


In [None]:
# Guard: no 5'UTR may contain an ambiguous base.
# `not` is used instead of `~`: bitwise inversion only behaves like logical
# negation on numpy.bool_; on a plain Python bool it yields -1/-2 (both truthy),
# which would make this assertion impossible to fail.
assert not human_transcript_table.five_prime_utr_sequence.str.contains('N', regex=False).any(), \
    "5'UTR sequences contain ambiguous 'N' bases"

# 5'UTR length in bases for every transcript.
human_transcript_table['length'] = human_transcript_table.five_prime_utr_sequence.str.len()

# Keep only UTRs strictly longer than 500 bp, i.e. longer than the largest
# window length drawn in the configuration cell, so every window fits.
selected_regions = human_transcript_table[human_transcript_table.length > 500].copy()

human_transcript_table.shape, selected_regions.shape

((68734, 9), (5927, 9))

In [None]:
# Draw N distinct transcripts and cut one random window out of each 5'UTR.
sample_regions = selected_regions.sample(N)

# K holds one window length per sampled row; fail loudly if the two diverge.
assert len(K) == len(sample_regions), "K must contain exactly one window length per sampled transcript"
window_lengths = np.asarray(K)

# Valid 0-based window starts are 0 .. (utr_len - k) inclusive. randint's upper
# bound is exclusive, hence the +1; without it the last base of a UTR could
# never be part of a sampled window.
sample_regions['random_start'] = [
    np.random.randint(utr_len - k + 1)
    for utr_len, k in zip(sample_regions['length'], window_lengths)
]
# Inclusive end coordinate of the window within the UTR.
sample_regions['random_end'] = sample_regions['random_start'] + window_lengths - 1

# Assign the whole column at once. The previous per-row chained assignment
# (`df['seq'].iloc[i] = ...`) emitted a SettingWithCopyWarning per row and does
# not write through at all when pandas copy-on-write is active.
sample_regions['seq'] = [
    utr[start:start + k]
    for utr, start, k in zip(
        sample_regions['five_prime_utr_sequence'],
        sample_regions['random_start'],
        window_lengths,
    )
]
sample_regions.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_regions['seq'].iloc[i] = (sample_regions['five_prime_utr_sequence'].iloc[i])[sample_regions['random_start'].iloc[i]:sample_regions['random_start'].iloc[i]+K[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_regions['seq'].iloc[i] = (sample_regions['five_prime_utr_sequence'].iloc[i])[sample_regions['random_start'].iloc[i]:sample_regions['random_start'].iloc[i]+K[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,five_prime_utr_sequence,length,random_start,random_end,seq
48284,ENST00000534834,ENSG00000198393,protein_coding,12,133004404,133012127,+,TATGTTCTGCTGATTTCAGTGCTCACCGTTGTATCCAGCTTCCCAC...,2665,2056,2469,TGCATGTATAAATGTTCATTCCCATCTACAAGATAGAAAGTTTCTT...
11941,ENST00000335524,ENSG00000186143,protein_coding,2,27136848,27139410,-,AGCTCCCATTGGGGTCGGCGAACCTGGTGCCACCCCTTAGACAAAG...,526,128,413,CCTGTGAAACACAGCCCAGAGGAGTTCTCATTGGTCCTATGGGCTT...
26956,ENST00000396420,ENSG00000085998,protein_coding,1,46188682,46220291,-,GAATCAATTTTGGGGGAGACCACATCTAATGGATAAAAGAGCCACA...,637,26,512,TAATGGATAAAAGAGCCACAGGATTACAAAAGGGTGGAATAGTTGT...
51121,ENST00000546933,ENSG00000186298,protein_coding,12,110720921,110732983,-,ATCAGAACGAGACTTAGTTACATATTTTGGGGTCCAAAAGTGAACT...,1000,572,993,TTTCTTTTTTTTTTTTGAGACGGAGTCTCGCTGTGTTGCCAGGCTG...
9779,ENST00000320122,ENSG00000048540,protein_coding,12,16548424,16605379,-,ATATCCTAATAGATTGAGATTCAAACTGATGCAGCACATTTTTTAC...,523,0,385,ATATCCTAATAGATTGAGATTCAAACTGATGCAGCACATTTTTTAC...


## Random transcript selection

In [None]:
# Sanity check: the row count should equal N (one row per sampled sequence).
sample_regions.shape

(5500, 12)

In [None]:
# Keep only the columns needed downstream and renumber the rows from 0,
# discarding the index inherited from the transcript table.
export_columns = ['id', 'chr', 'start', 'end', 'random_start', 'random_end', 'seq']
seqs = sample_regions.loc[:, export_columns].copy()
seqs = seqs.reset_index(drop=True)
seqs.head()

Unnamed: 0,id,chr,start,end,random_start,random_end,seq
0,ENST00000534834,12,133004404,133012127,2056,2469,TGCATGTATAAATGTTCATTCCCATCTACAAGATAGAAAGTTTCTT...
1,ENST00000335524,2,27136848,27139410,128,413,CCTGTGAAACACAGCCCAGAGGAGTTCTCATTGGTCCTATGGGCTT...
2,ENST00000396420,1,46188682,46220291,26,512,TAATGGATAAAAGAGCCACAGGATTACAAAAGGGTGGAATAGTTGT...
3,ENST00000546933,12,110720921,110732983,572,993,TTTCTTTTTTTTTTTTGAGACGGAGTCTCGCTGTGTTGCCAGGCTG...
4,ENST00000320122,12,16548424,16605379,0,385,ATATCCTAATAGATTGAGATTCAAACTGATGCAGCACATTTTTTAC...


In [None]:
# Show the first sampled sequence together with its length.
first_seq = seqs['seq'].to_numpy()[0]
len(first_seq), first_seq

(414,
 'TGCATGTATAAATGTTCATTCCCATCTACAAGATAGAAAGTTTCTTGAAGACAGAACTGGATTTTATTCATCCTGAAATCTTCAGGACCCAAGAGAGTACCGGCAAATAGAAGCTCCTTGATTGATTTTTTCTTTCTTTCTTTTTTTTTTTTTTAGATGGAGTCTCACTCTGTCGCCGAGGCTGGAGTGCAGTGGCACGATTTTGGCTTGCTGCAACCTCTGCTGCCTGGGTTCAAGTGATTCTCCTGCCTCAGCCACCTGAGTAGCTAGGATTACAAGTGCCTGCCACTGCGCCCAGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATCTTGGTCAGGCTGGTCTTGAACTCCTGCCCTTGTGATCCACCCGCCTTGGCCTCCGAAAGTGCTGGCGTGAGCCACC')

## Save generated sequences to file

In [None]:
# Write the sampled sequences to Google Drive as CSV, without the row index.
# NOTE(review): the configuration cell defines OUTPUT_FILE
# ('/content/drive/My Drive/data/random/5utr.csv') but this cell writes to a
# different hard-coded location — confirm which destination is intended.
csv_destination = '/content/drive/My Drive/genomic_data/5utr.csv'
seqs.to_csv(csv_destination, index=False)