In [1]:
# based on https://www.nltk.org/api/nltk.lm.html

In [19]:
# Order of the n-gram language model (1=unigram, 2=bigram, ...).
# The same value is passed to the padded everygram preprocessing and to the
# MLE model constructor in later cells, so change it only here.
n = 4

In [20]:
from nltk.util import bigrams, trigrams, ngrams
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from Bio import SeqIO, SeqRecord 
from Bio.Seq import Seq

# Read every record of the training FASTA file. Each sequence is stored as a
# list of single-character tokens (the form the n-gram pipeline consumes), and
# its length is recorded for the summary statistics printed below.
seqs = []
lens = []
# for record in SeqIO.parse("data/reference_data/Russ_994_random.fasta", "fasta"):
for record in SeqIO.parse("data/reference_data/tautomerase_2953.fasta", "fasta"):
    seqs.append(list(record.seq))
    lens.append(len(record.seq))

print(sum(lens)/len(lens)) # avg length
# The median is the middle element of the *sorted* lengths; indexing the
# unsorted list would just report the length of whichever record happens to
# sit in the middle of the file. For an even count this is the upper middle.
print(sorted(lens)[len(lens)//2]) # median length

59.80121909922113
60


In [21]:
# Default NLTK preprocessing for a sequence of sentences (here: protein
# sequences, each already split into single-character tokens).

# Creates two iterators:
# - sentences padded and turned into sequences of `nltk.util.everygrams`
# - sentences padded as above and chained together for a flat stream of words

# :param order: Largest ngram length produced by `everygrams`.
# :param text: Text to iterate over. Expected to be an iterable of sentences:
# Iterable[Iterable[str]]
# :return: iterator over text as ngrams, iterator over text as vocabulary data
#
# NOTE: both return values are iterators, so they can be consumed only once.
# Re-run this cell before re-fitting a model, otherwise the fit sees no data.
train, vocab = padded_everygram_pipeline(n, seqs)

In [22]:
# Build a maximum-likelihood n-gram model of order n and train it on the
# everygram stream and vocabulary stream produced by the preprocessing cell.
lm = MLE(n)
lm.fit(train, vocab)

# Summaries of the fitted vocabulary and of the n-gram counter, one per line.
print(lm.vocab, lm.counts, sep="\n")

<Vocabulary with cutoff=1 unk_label='<UNK>' and 23 items>
<NgramCounter with 4 ngram orders and 759526 ngrams>


In [23]:
# NOTE that <s> and </s> delineate the start and end of each sequence

In [24]:
# Sample 5 tokens from the model without any conditioning text.
# Passing a fixed random_seed makes the sampled output reproducible.
lm.generate(5, random_seed=0)

['T', 'R', 'E', 'G', 'S']

In [25]:
# Sample 50 tokens conditioned on preceding text: the start-of-sequence symbol.
# The returned list holds only the newly sampled tokens, not the seed itself.
# NOTE(review): the training data was padded for order n, so a complete start
# context would presumably be n-1 '<s>' tokens -- verify that seeding with a
# single '<s>' conditions the first residues as intended.
print(lm.generate(50, text_seed=['<s>']))

['P', 'F', 'I', 'N', 'D', 'M', 'P', 'E', 'G', 'T', 'N', 'T', 'A', 'S', 'E', 'I', 'T', 'R', 'V', 'M', 'V', 'K', 'V', 'T', 'N', 'A', 'Q', 'K', 'Q', 'K', 'L', 'E', 'L', 'R', 'L', 'T', 'E', 'V', 'V', 'S', 'R', 'S', 'L', 'A', 'E', 'H', 'V', 'H', 'V', 'L']


In [17]:
# Draw 92-token samples seeded with the start symbol until 226 of them are
# free of padding symbols, then write the accepted samples to a FASTA file.
results = []
sequences = []
while len(results) < 226:
    tokens = lm.generate(92, text_seed=['<s>'])
    # Reject any draw that contains a start or end padding symbol.
    if '<s>' in tokens or '</s>' in tokens:
        continue
    # The record id is the zero-based index of the accepted sample.
    record = SeqRecord.SeqRecord(id=str(len(results)), seq=Seq(''.join(tokens)))
    sequences.append(record)
    results.append(tokens)

with open("Russ_994_random_4gram_generation_seed_<s>_len92.fasta", "w") as fasta_out:
    SeqIO.write(sequences, fasta_out, "fasta")

In [18]:
# Draw 92-token samples conditioned on a fixed 20-residue prefix (preceded by
# the start symbol) until 226 of them are free of padding symbols, then write
# the accepted samples to a FASTA file.
results = []
sequences = []
while len(results) < 226:
    tokens = lm.generate(92, text_seed=['<s>'] + list('MTSENPLLALREKISALDEK'))
    # Reject any draw that contains a start or end padding symbol.
    if '<s>' in tokens or '</s>' in tokens:
        continue
    # The record id is the zero-based index of the accepted sample.
    record = SeqRecord.SeqRecord(id=str(len(results)), seq=Seq(''.join(tokens)))
    sequences.append(record)
    results.append(tokens)

with open("Russ_994_random_4gram_generation_seed_<s>MTSENPLLALREKISALDEK_len92.fasta", "w") as fasta_out:
    SeqIO.write(sequences, fasta_out, "fasta")

In [None]:
# Draw 60-token samples seeded with the start symbol until 738 of them are
# free of padding symbols, then write the accepted samples to a FASTA file.
results = []
sequences = []
while len(results) < 738:
    tokens = lm.generate(60, text_seed=['<s>'])
    # Reject any draw that contains a start or end padding symbol.
    if '<s>' in tokens or '</s>' in tokens:
        continue
    # The record id is the zero-based index of the accepted sample.
    record = SeqRecord.SeqRecord(id=str(len(results)), seq=Seq(''.join(tokens)))
    sequences.append(record)
    results.append(tokens)

with open("tautomerase_2953_4gram_generation_seed_<s>_len60.fasta", "w") as fasta_out:
    SeqIO.write(sequences, fasta_out, "fasta")

In [None]:
# Draw 60-token samples conditioned on a fixed 20-residue prefix (preceded by
# the start symbol) until 738 of them are free of padding symbols, then write
# the accepted samples to a FASTA file.
# NOTE(review): generate() returns only the newly sampled tokens (compare the
# output of the earlier seeded 50-token call), so the seed residues are not
# part of the written sequences -- confirm this is the intended output.
results = []
sequences = []
while len(results) < 738:
    # The keyword was previously written twice ("text_seed=text_seed=[...]"),
    # which is a syntax error and prevented this cell from running at all.
    r = lm.generate(60, text_seed=['<s>', 'M', 'T', 'S', 'E', 'N', 'P', 'L', 'L', 'A', 'L', 'R', 'E', 'K', 'I', 'S', 'A', 'L', 'D', 'E', 'K'])
    # Keep only draws that contain no start or end padding symbol.
    if '<s>' not in r and '</s>' not in r:
        # The record id is the zero-based index of the accepted sample.
        sequences.append(SeqRecord.SeqRecord(id=str(len(results)),seq=Seq(''.join(r))))
        results.append(r)

with open("tautomerase_2953_4gram_generation_seed_<s>MTSENPLLALREKISALDEK_len60.fasta", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

### Other stuff

In [59]:
# Unigram count: how many times the token 'A' occurs in the training data.
lm.counts['A']

9779

In [61]:
# Bigram count: how many times 'C' occurs immediately after the context ['A'].
lm.counts[['A']]['C']

73

In [62]:
# Unconditional (unigram) probability the model assigns to the token 'A'.
lm.score("A")

0.09160055078355518

In [65]:
# Conditional probability of 'C' given that the preceding token is 'A',
# i.e. count of 'C' after 'A' divided by the count of 'A' (73 / 9779 here).
lm.score("C", ["A"])

0.007464975968912977

In [66]:
# Base-2 log of the probability of 'A'. Working in log space avoids numeric
# underflow when many small probabilities would otherwise be multiplied.
lm.logscore("A")

-3.448499916667411

In [79]:
# Evaluation text: a single n-gram made of the tokens 'A' and 'C'.
test = [("A", "C")]

# Cross-entropy of the model on the evaluation text. Only the last expression
# of a cell is displayed, so this value is computed but not shown.
lm.entropy(test)

# Perplexity of the same text, which is simply 2 ** cross-entropy and takes
# the same arguments; this is the value the cell displays.
lm.perplexity(test)

In [73]:
# Standalone demonstration of the vocabulary cutoff, unrelated to the protein data.
# Tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary.
# With unk_cutoff=2, 'a', 'c' and 'd' (seen at least twice) qualify, while
# '-', 'b' and 'r' (seen once each) do not.
words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
from nltk.lm import Vocabulary
# NOTE: this rebinds `vocab`, the name the preprocessing cell uses for the
# pipeline's vocabulary stream; re-run that cell before fitting a model again.
vocab = Vocabulary(words, unk_cutoff=2)