# Generate Random Transcript Sequences


## Setup

Install the required packages (Biopython and pyensembl) in the Colab environment.

In [None]:
!pip install biopython pyensembl

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyensembl
  Downloading pyensembl-2.3.13-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting typechecks<1.0.0,>=0.0.2 (from pyensembl)
  Downloading typechecks-0.1.0.tar.gz (3.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datacache<2.0.0,>=1.4.0 (from pyensembl)
  Downloading datacache-1.4.1-py3-none-any.whl (20 kB)
Collecting memoized-property>=1.0.2 (from pyensembl)
  Downloading memoized-property-1.0.3.tar.gz (5.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinytimer<1.0.0,>=0.0.0 (from pyensembl)
  Downloading tinytimer-0.0.0.tar.gz (2.1 kB)
  Preparing metadata (setup.py) ... [?25l

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used in this
# notebook all live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the Ensembl release 97 human data that pyensembl needs (the install log shows
# the GTF annotation and the cDNA FASTA being downloaded into pyensembl's cache).
# The release number here must stay in sync with ENSEMBL_RELEASE in the config cell.
!pyensembl install --release 97 --species human

2024-05-13 17:27:44,590 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=97, species='homo_sapiens')
2024-05-13 17:27:44,590 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz from URL https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
2024-05-13 17:27:44,590 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz to /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.97.gtf.gz
2024-05-13 17:27:47,538 - pyensembl.download_cache - INFO - Fetching /root/.cache/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL https://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2024-05-13 17:27:47,539 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz to

In [None]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm
from scipy.stats import truncnorm

from Bio import SeqIO   # for reading fasta files
from pyensembl import EnsemblRelease   # to get the gene list
import random

ENSEMBL_RELEASE = 97
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"

# Seed both global RNGs so the lengths drawn here, and any later draws from
# `random` / `np.random`, are repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# to generate random sequences
N = 20000    # how many
# how long: one length per sequence, drawn uniformly from [200, 500] (both inclusive).
# The count is tied to N so the two can never drift apart.
K = [random.randint(200, 500) for _ in range(N)]

# Disabled alternative: draw the lengths from a truncated normal distribution instead.
#
# # target mean and standard deviation
# target_mean = 401
# target_std = 184.3
# lower, upper = 200, 650
#
# # parameters of the truncated normal distribution
# a, b = (lower - target_mean) / target_std, (upper - target_mean) / target_std
# K = truncnorm(a=a, b=b, loc=target_mean, scale=target_std).rvs(size=N)
# K = K.astype(int)
#
# # verify the result
# actual_mean = np.mean(K)
# actual_std = np.std(K)
# actual_max = np.max(K)
#
# print(f"Actual Mean: {actual_mean}, Actual Std: {actual_std}, Actual Max: {actual_max}")


OUTPUT_FILE = '/content/drive/My Drive/data/random/random_transcripts.csv'   # where to save them

# Chromosome names: autosomes 1-22 plus X, Y and the mitochondrial contig.
CHRS = [str(chrom) for chrom in range(1, 23)] + ['X', 'Y', 'MT']

## Get transcript list

In [None]:
# Annotation handle for the configured Ensembl release
# (release 97 is built on the human reference genome GRCh38).
data = EnsemblRelease(release=ENSEMBL_RELEASE)

In [None]:
# Every transcript ID known to the loaded Ensembl release; the bare len() on the
# last line displays how many there are.
human_transcripts = data.transcript_ids()
len(human_transcripts)

226788

In [None]:
# Peek at the first transcript ID next to its full Transcript record.
first_transcript_id = human_transcripts[0]
first_transcript_id, data.transcript_by_id(first_transcript_id)


('ENST00000000233',
 Transcript(transcript_id='ENST00000000233', transcript_name='ARF5-201', gene_id='ENSG00000004059', biotype='protein_coding', contig='7', start=127588411, end=127591700, strand='+', genome='GRCh38'))

In [None]:
# Resolve every transcript ID to its Transcript object, preserving the ID order.
transcripts_full_info = list(map(data.transcript_by_id, human_transcripts))

In [None]:
# Keep only transcripts annotated with both a start and a stop codon, and flatten
# the fields needed downstream into one row per transcript.
# `and` is the boolean operator meant here; `&` is the bitwise one.
human_transcript_tuples = [
  (t.transcript_id, t.gene_id, t.biotype, t.contig, t.start, t.end, t.strand, t.coding_sequence)
  for t in transcripts_full_info
  if t.contains_start_codon and t.contains_stop_codon
]
human_transcript_table = pd.DataFrame.from_records(
  human_transcript_tuples,
  columns=["id", "gene_id", "biotype", "chr", "start", "end", "strand", "coding_sequence"],
)
# Sanity check on the coordinates: start must never exceed end.
assert (human_transcript_table.start <= human_transcript_table.end).all(), "found a transcript with start > end"

human_transcript_table.head()

Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,coding_sequence
0,ENST00000000233,ENSG00000004059,protein_coding,7,127588411,127591700,+,ATGGGCCTCACCGTGTCCGCGCTCTTTTCGCGGATCTTCGGGAAGA...
1,ENST00000000412,ENSG00000003056,protein_coding,12,8940361,8949645,-,ATGTTCCCTTTCTACAGCTGCTGGAGGACTGGACTGCTACTACTAC...
2,ENST00000000442,ENSG00000173153,protein_coding,11,64305524,64316743,+,ATGTCCAGCCAGGTGGTGGGCATTGAGCCTCTCTACATCAAGGCAG...
3,ENST00000001008,ENSG00000004478,protein_coding,12,2794970,2805423,+,ATGACAGCCGAGGAGATGAAGGCGACCGAGAGCGGGGCGCAGTCGG...
4,ENST00000001146,ENSG00000003137,protein_coding,2,72129238,72148038,-,ATGCTCTTTGAGGGCTTGGATCTGGTGTCGGCGCTGGCCACCCTCG...


In [None]:
# Inspect the first transcript: genomic span vs. CDS length, per-exon lengths,
# the exon records and the genomic ranges of the coding sequence.
first_row = human_transcript_table.iloc[0]
first_transcript = data.transcript_by_id(human_transcripts[0])

# Coordinates are inclusive on both ends, so a span covers end - start + 1 bases
# (with + 1 the CDS ranges printed below sum to the CDS length minus the 3-base stop codon).
print(first_row['end'] - first_row['start'] + 1, len(first_row['coding_sequence']))
for exon in first_transcript.exons:
  print(exon.end - exon.start + 1)
print(first_transcript.exons)
print(first_transcript.coding_sequence_position_ranges)

3289 543
154
80
109
71
125
487
[Exon(exon_id='ENSE00001872691', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127588411, end=127588565, strand='+'), Exon(exon_id='ENSE00003494180', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127589083, end=127589163, strand='+'), Exon(exon_id='ENSE00003504066', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127589485, end=127589594, strand='+'), Exon(exon_id='ENSE00003678978', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127590066, end=127590137, strand='+'), Exon(exon_id='ENSE00003676786', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127590963, end=127591088, strand='+'), Exon(exon_id='ENSE00000882271', gene_id='ENSG00000004059', gene_name='ARF5', contig='7', start=127591213, end=127591700, strand='+')]
[(127588499, 127588565), (127589083, 127589163), (127589485, 127589594), (127590066, 127590137), (127590963, 127591088), (127591213, 127591296)]


In [None]:
# Guard: no coding sequence may contain an ambiguous 'N' base.
# `not` is used instead of `~`: on a plain Python bool `~` is bitwise (~True == -2,
# which is truthy), so the assertion could silently never fail.
assert not human_transcript_table.coding_sequence.str.contains('N').any()

# CDS length in bases. Only CDSs longer than 700 bases are kept, which exceeds the
# largest window length configured in K, so every kept CDS can host a full window.
human_transcript_table['length'] = human_transcript_table.coding_sequence.apply(len)
selected_regions = human_transcript_table[human_transcript_table.length > 700].copy()

# Display the table shapes before and after the length filter.
human_transcript_table.shape, selected_regions.shape

((68734, 9), (43931, 9))

In [None]:
# Draw N transcripts and cut one random window of K[i] bases out of each CDS.
# .copy() makes the sample own its data before new columns are attached.
sample_regions = selected_regions.sample(N).copy()
assert len(K) == len(sample_regions), "K must hold exactly one window length per sampled transcript"

# Valid 0-based window starts are 0 .. c_len - k inclusive. np.random.randint's upper
# bound is exclusive, hence the + 1 (without it the last base of a CDS is never sampled).
sample_regions['random_start'] = [
  np.random.randint(c_len - k + 1) for c_len, k in zip(sample_regions['length'], K)
]
# random_end is the 0-based, inclusive index of the last base in the window.
sample_regions['random_end'] = sample_regions['random_start'] + np.asarray(K) - 1

# Build the whole column in one assignment. The previous per-row chained assignment
# (df['seq'].iloc[i] = ...) raised SettingWithCopyWarning on every row and does not
# write through to the frame under pandas Copy-on-Write.
sample_regions['seq'] = [
  cds[start:start + k]
  for cds, start, k in zip(sample_regions['coding_sequence'], sample_regions['random_start'], K)
]
sample_regions.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_regions['seq'].iloc[i] = (sample_regions['coding_sequence'].iloc[i])[sample_regions['random_start'].iloc[i]:sample_regions['random_start'].iloc[i]+K[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_regions['seq'].iloc[i] = (sample_regions['coding_sequence'].iloc[i])[sample_regions['random_start'].iloc[i]:sample_regions['random_start'].iloc[i]+K[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Unnamed: 0,id,gene_id,biotype,chr,start,end,strand,coding_sequence,length,random_start,random_end,seq
34560,ENST00000435907,ENSG00000182022,protein_coding,10,124007668,124093598,-,ATGAGGCACTGCATTAATTGCTGCATACAGCTGTTACCCGACGGCG...,1686,386,797,TGACACAAAGGAGCATCACCACCAATCCTCTGTAAATAATATTTCA...
64590,ENST00000640107,ENSG00000124155,nonsense_mediated_decay,20,45416141,45426195,+,ATGGCGGCGGCTATGCCGCTTGCTCTGCTCGTCCTGTTGCTCCTGG...,723,12,285,ATGCCGCTTGCTCTGCTCGTCCTGTTGCTCCTGGGGCCCGGCGGCT...
61952,ENST00000620430,ENSG00000136051,protein_coding,12,105107772,105167000,+,ATGGCGGTGGAGACTCTGTCCCCGGACTGGGAGTTTGACCGCGTTG...,3525,787,1038,CCTGTATAGAACAACAATTTGATTCTCTCAATGGAGGAGTATCTGT...
11699,ENST00000334293,ENSG00000161082,nonsense_mediated_decay,19,3225485,3297016,+,ATGGCGCGGCCAATCCAGGTGAAGCCTGCGGACAGTGAAAGCCGCG...,963,14,235,CCAGGTGAAGCCTGCGGACAGTGAAAGCCGCGGAGGTAGGGACCGG...
41468,ENST00000493237,ENSG00000196549,protein_coding,3,155080111,155180628,+,ATGGGCAAGTCAGAAAGTCAGATGGATATAACTGATATCAACACTC...,2253,1447,1707,CTGATGACATTGTTTCAAATGATAACAAACTGAATAATGAGTACCT...


In [None]:
# Breakdown of the sampled transcripts by biotype.
sample_regions['biotype'].value_counts()

biotype
protein_coding             19026
nonsense_mediated_decay      967
polymorphic_pseudogene         7
Name: count, dtype: int64

## Random transcript selection

In [None]:
# Sanity check on the sample size: (rows, columns); the recorded run shows (20000, 12).
sample_regions.shape

(20000, 12)

In [None]:
# Keep only the identifying columns plus the sampled window, and renumber rows from 0.
export_columns = ['id', 'chr', 'start', 'end', 'random_start', 'random_end', 'seq']
seqs = sample_regions[export_columns].copy().reset_index(drop=True)
seqs.head()

Unnamed: 0,id,chr,start,end,random_start,random_end,seq
0,ENST00000435907,10,124007668,124093598,386,797,TGACACAAAGGAGCATCACCACCAATCCTCTGTAAATAATATTTCA...
1,ENST00000640107,20,45416141,45426195,12,285,ATGCCGCTTGCTCTGCTCGTCCTGTTGCTCCTGGGGCCCGGCGGCT...
2,ENST00000620430,12,105107772,105167000,787,1038,CCTGTATAGAACAACAATTTGATTCTCTCAATGGAGGAGTATCTGT...
3,ENST00000334293,19,3225485,3297016,14,235,CCAGGTGAAGCCTGCGGACAGTGAAAGCCGCGGAGGTAGGGACCGG...
4,ENST00000493237,3,155080111,155180628,1447,1707,CTGATGACATTGTTTCAAATGATAACAAACTGAATAATGAGTACCT...


In [None]:
# Spot-check the first sampled sequence: its length, then the sequence itself.
first_seq = seqs['seq'].values[0]
len(first_seq), first_seq

(412,
 'TGACACAAAGGAGCATCACCACCAATCCTCTGTAAATAATATTTCATACATGAAGGACTATCCAAGCATTAAATTAATTATCAACAGCATCACAACTAGGATTGAGTTCACGACCAGACAGCTCCCAGACTTAGAAGACCTTAAGAAGCAGGAGTTGCATATGTTTTCAGTCATCCCCAACAAATTCCTTCCAAACAGTAAGAGCCCCTGTTGGTACGAGGAGTTCTCGGGGCAGAACACCACCGACCCCTACCTCACCAACTCCTACGTGCTCTACTCCAAGCGCTTCCGCTCCACCTTCGACGCCCTGCGCAAGGCCTTCTGGGGCCACCTGGCGCACGCGCACGGGAAGCACTTCCGCCTGCGCTGCCTGCCGCACTTCTACATCATAGGGCAGCCCAAGTGCGGGACC')

## Save generated sequences to file

In [None]:
# Write the sampled sequences to Google Drive as CSV, without the row index.
# NOTE(review): OUTPUT_FILE from the config cell points to a different path and is
# never used — confirm which destination is intended.
coding_sequences_csv = '/content/drive/My Drive/genomic_data/coding_sequences.csv'
seqs.to_csv(coding_sequences_csv, index=False)