In [50]:
from collections import Counter

In [51]:
#متن را از فایل بخوانید
with open('RNA_Sequence_Data.txt', 'r') as file:
    seq = file.read().replace('\n', '').upper()

In [52]:
# تعداد n-grams متن را محاسبه کنید
def count_ngrams(text, n):
    return len(text) - n + 1

In [53]:
# n-grams بالا را در متن پیدا کنید
def find_top_ngrams(text, n, top):
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    ngram_counts = Counter(ngrams)
    top_ngrams = ngram_counts.most_common(top)
    return top_ngrams

In [54]:
# جانشین های احتمالی بعد از یک n-gram خاص را در متن پیدا کنید
def find_successors(text, ngram):
    n = len(ngram)
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    successors = [ngrams[i+n-1] for i in range(len(ngrams)-n) if ngrams[i:i+n-1] == ngram[:-1]]
    return successors

In [55]:
# محاسبه احتمال حداکثر احتمال (MLE) برای یک n-grams معین
def calculate_MLE_probability(ngram, text):
    n = len(ngram)
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    ngram_counts = Counter(ngrams)
    total_count = len(ngrams)
    mle_probability = ngram_counts[ngram] / total_count
    return mle_probability

In [56]:
# با استفاده از مدل n-gram دنباله ای از نوکلئوتیدهای RNA تولید کنید
def generate_sequence(text, n, length=20):
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    sequence = []
    while len(sequence) < length:
        candidates = [ngrams[i+n-1] for i in range(len(ngrams)-n) if ngrams[i:i+n-1] == sequence[-n+1:]]
        if candidates:
            next_nucleotide = max(set(candidates), key=candidates.count)
        else:
            next_nucleotide = max(set([ngram[-1] for ngram in ngrams]), key=[ngram[-1] for ngram in ngrams].count)
        sequence.append(next_nucleotide)
    return ''.join(sequence)

In [58]:
# تعداد unigrams و  trigrams متن را محاسبه کنید
num_unigrams = count_ngrams(seq, 1)
num_trigrams = count_ngrams(seq, 3)
print("Number of unigrams in the text:", num_unigrams)
print("Number of trigrams in the text:", num_trigrams)

Number of unigrams in the text: 75578
Number of trigrams in the text: 75576


In [59]:
# 10 unigrams برتر را در متن پیدا کنید
top_10_unigrams = find_top_ngrams(seq, 1, 10)
print("Top 10 most frequent unigrams:")
for unigram, count in top_10_unigrams:
    print(unigram, "-", count)

Top 10 most frequent unigrams:
G - 30092
C - 16064
A - 15382
T - 13420
N - 620


In [61]:
# 10 trigrams برتر را در متن پیدا کنید
top_10_trigrams = find_top_ngrams(seq, 3, 10)
print("Top 10 most frequent trigrams:")
for trigram, count in top_10_trigrams:
    print(trigram, "-", count)

Top 10 most frequent trigrams:
GGG - 6245
GTG - 4036
GAG - 3505
GGA - 2922
TGG - 2763
AGG - 2594
CCC - 2323
GGC - 2008
CTG - 1911
GAA - 1828


In [62]:
# جانشین های احتمالی بعد از "GTG" را در متن پیدا کنید
successors_GTG = find_successors(seq, "GTG")
print("Successors after 'GTG':", successors_GTG)

Successors after 'GTG': []


In [63]:
# محاسبه احتمال MLE برای "GTG"
mle_probability_GTG = calculate_MLE_probability("GTG", seq)
print("MLE probability for 'GTG':", mle_probability_GTG)

MLE probability for 'GTG': 0.05340319678204721


In [65]:
# محاسبه احتمال MLE برای  trigrams     "GTA "، "TAC" ، "ACC"
mle_probability_GTA = calculate_MLE_probability("GTA", seq)
mle_probability_TAC = calculate_MLE_probability("TAC", seq)
mle_probability_ACC = calculate_MLE_probability("ACC", seq)
print("MLE probability for trigrams 'GTA':", mle_probability_GTA)
print("MLE probability for trigrams 'TAC':", mle_probability_TAC)
print("MLE probability for trigrams 'ACC':", mle_probability_ACC)

MLE probability for trigrams 'GTA': 0.005014819519424156
MLE probability for trigrams 'TAC': 0.003744574997353657
MLE probability for trigrams 'ACC': 0.013456652905684345


In [66]:
# با استفاده از  trigram model  ، دنباله ای از 20 نوکلئوتید RNA تولید کنید
generated_sequence = generate_sequence(seq, 3, length=20)
print("دنباله تولید شده:", generated_sequence)

دنباله تولید شده: GGGGGGGGGGGGGGGGGGGG
