In [43]:
from nltk import word_tokenize
from nltk.corpus import gutenberg

from src.data.dataset import TextGenerationDataset
from src.data.tokenizer import Tokenizer
from src.utils import load_config

In [44]:
# Load the run configuration once; later cells read settings such as
# `config.data` from this object.
config = load_config()
# Bare expression so the notebook displays the loaded settings as cell output.
config

Namespace(data='milton-paradise.txt', test_fraction=0.1, seed=42, window_step=1, context_length=40, output_length=1, batch_size=64, epochs=100, save_interval=10, eval_interval=1, best_score_metric='accuracy', learning_rate=0.0001, weight_decay=1e-05, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.25)

In [45]:
# Read the raw, unprocessed corpus text from the NLTK Gutenberg collection,
# using the file id stored in the config.
original_text = gutenberg.raw(config.data)
# Preview only the first 600 characters instead of dumping the whole text.
print(original_text[:600])

[Paradise Lost by John Milton 1667] 
 
 
Book I 
 
 
Of Man's first disobedience, and the fruit 
Of that forbidden tree whose mortal taste 
Brought death into the World, and all our woe, 
With loss of Eden, till one greater Man 
Restore us, and regain the blissful seat, 
Sing, Heavenly Muse, that, on the secret top 
Of Oreb, or of Sinai, didst inspire 
That shepherd who first taught the chosen seed 
In the beginning how the heavens and earth 
Rose out of Chaos: or, if Sion hill 
Delight thee more, and Siloa's brook that flowed 
Fast by the oracle of God, I thence 
Invoke thy aid to my adventur


In [46]:
# Size statistics for the raw corpus text: character counts are taken directly
# from the string, word counts from NLTK's word tokenization of it.
total_characters, unique_characters = len(original_text), len(set(original_text))
words = word_tokenize(original_text)
total_words, unique_words = len(words), len(set(words))

print(f"Total characters: {total_characters}")
print(f"Unique characters: {unique_characters}")
print(f"Total words: {total_words}")
print(f"Unique words: {unique_words}")

Total characters: 468220
Unique characters: 80
Total words: 95716
Unique words: 10986


In [47]:
# Build a tokenizer from the raw text and show its token frequencies,
# ordered from the most to the least frequent token.
tokenizer = Tokenizer.init_from_text(original_text)
print(
    dict(
        sorted(
            tokenizer.vocab.token_freq.items(),
            key=lambda pair: pair[1],  # sort on the frequency, not the token
            reverse=True,
        )
    )
)

{' ': 81196, 'e': 44815, 't': 29572, 'o': 26121, 'a': 24655, 'n': 24539, 'h': 23596, 's': 22899, 'r': 22665, 'i': 22214, 'd': 16773, 'l': 15000, 'u': 10718, '\n': 10635, ',': 10228, 'm': 8269, 'f': 8030, 'w': 7698, 'c': 7275, 'g': 7231, 'p': 5776, 'y': 5067, 'b': 4683, 'v': 3756, ';': 2326, 'T': 2168, 'k': 1982, 'A': 1729, 'H': 1295, '.': 1283, 'I': 1243, 'O': 1146, 'S': 1137, 'W': 993, ':': 771, 'B': 686, 'F': 666, "'": 606, 'M': 600, '-': 594, 'E': 493, 'G': 476, 'x': 452, 'C': 408, 'N': 405, 'j': 399, 'D': 398, 'P': 365, '?': 322, 'L': 272, '!': 267, 'q': 245, 'R': 208, 'z': 177, 'U': 160, 'Y': 99, 'V': 73, 'J': 63, ')': 60, '(': 59, '"': 57, 'K': 57, '$': 9, '0': 9, 'Z': 8, 'Q': 7, '*': 6, 'X': 5, '1': 4, '2': 4, '[': 2, '6': 2, ']': 2, '8': 2, '4': 2, '9': 2, '\x1a': 2, '7': 1, '5': 1, '3': 1, '#': 0}


In [48]:
# Build the text-generation dataset from the same config used above.
# NOTE(review): judging by the printed preview, `dataset.text` is a
# preprocessed version of the corpus (lowercased, leading title lines removed)
# -- confirm against TextGenerationDataset.
dataset = TextGenerationDataset(config)
# Preview the first 600 characters of the dataset's text for comparison with
# the raw-text preview.
print(dataset.text[:600])

of man's first disobedience, and the fruit
of that forbidden tree whose mortal taste
brought death into the world, and all our woe,
with loss of eden, till one greater man
restore us, and regain the blissful seat,
sing, heavenly muse, that, on the secret top
of oreb, or of sinai, didst inspire
that shepherd who first taught the chosen seed
in the beginning how the heavens and earth
rose out of chaos: or, if sion hill
delight thee more, and siloa's brook that flowed
fast by the oracle of god, i thence
invoke thy aid to my adventurous song,
that with no middle flight intends to soar
above th' ao


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\robert\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [49]:
# Same size statistics as for the raw corpus, now computed on the dataset's
# text so the two sets of numbers can be compared. Note that this rebinds the
# statistic variables that were set for the raw text.
total_characters, unique_characters = len(dataset.text), len(set(dataset.text))
words = word_tokenize(dataset.text)
total_words, unique_words = len(words), len(set(words))

print(f"Total characters: {total_characters}")
print(f"Unique characters: {unique_characters}")
print(f"Total words: {total_words}")
print(f"Unique words: {unique_words}")

Total characters: 449068
Unique characters: 36
Total words: 94080
Unique words: 9202


In [50]:
# Token frequencies of the dataset's own tokenizer, most frequent first.
print(
    dict(
        sorted(
            dataset.tokenizer.vocab.token_freq.items(),
            key=lambda pair: pair[1],  # sort on the frequency, not the token
            reverse=True,
        )
    )
)

{' ': 68053, 'e': 44620, 't': 31299, 'o': 26843, 'a': 26002, 'n': 24582, 'h': 24513, 's': 23712, 'i': 23125, 'r': 22515, 'd': 16940, 'l': 15032, 'u': 10720, '\n': 10402, ',': 10085, 'm': 8740, 'f': 8572, 'w': 8562, 'g': 7594, 'c': 7566, 'p': 6046, 'b': 5286, 'y': 5092, 'v': 3764, ';': 2312, 'k': 2003, '.': 1258, ':': 764, "'": 596, '-': 576, 'j': 458, 'x': 444, '?': 305, '!': 256, 'q': 248, 'z': 183, '#': 0}


In [51]:
# Split the dataset into train and test subsets. The fraction is read from the
# loaded config instead of being repeated as a literal, so this notebook stays
# in sync with the configured value if the configuration changes.
train_subset, test_subset = dataset.split(test_fraction=config.test_fraction)

# Sanity check: the two subsets together must account for every sample.
assert len(train_subset) + len(test_subset) == len(dataset), (
    "train/test subsets do not add up to the full dataset"
)

print("Number of sequence pairs in data:", len(dataset))
print("Vocabulary size:", len(dataset.tokenizer.vocab))
print("Training subset size:", len(train_subset))
print("Test subset size:", len(test_subset))

Number of sequence pairs in data: 449027
Vocabulary size: 37
Training subset size: 404125
Test subset size: 44902


In [52]:
# Inspect the last 1500 characters of the dataset's text to see how the corpus
# ends after preprocessing.
print(dataset.text[-1500:])

d i fell asleep: but now lead on;
in me is no delay; with thee to go,
is to stay here; without thee here to stay,
is to go hence unwilling; thou to me
who for my wilful crime art banished hence.
this further consolation yet secure
i carry hence; though all by me is lost,
such favour i unworthy am vouchsafed,
by me the promised seed shall all restore.
so spake our mother eve; and adam heard
well pleased, but answered not: for now, too nigh
the arch-angel stood; and, from the other hill
to their fixed station, all in bright array
the cherubim descended; on the ground
gliding meteorous, as evening-mist
risen from a river o'er the marish glides,
and gathers ground fast at the labourer's heel
homeward returning. high in front advanced,
the brandished sword of god before them blazed,
fierce as a comet; which with torrid heat,
and vapour as the libyan air adust,
began to parch that temperate clime; whereat
in either hand the hastening angel caught
our lingering parents, and to the eastern gat