In [42]:
import tqdm
import tensorflow as tf
import numpy as np 
import pickle
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout
from string import punctuation
import requests
import re

In [32]:
content = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt").text
open("wonderland.txt",'w',encoding='utf-8').write(content)

167674

In [43]:
seq_length = 100
batch_sz = 128
epochs = 30
f_path = 'wonderland.txt'
basename = os.path.basename(f_path)
txt = open(f_path,encoding='utf-8').read()
txt = txt.lower()
txt = txt.translate(str.maketrans("","",punctuation))

In [44]:
chars_to_remove = "ù—‘’“”•™﻿"
txt = txt.translate(str.maketrans("", "", chars_to_remove))

In [45]:
n_chars = len(txt)
vocab = ''.join(sorted(set(txt)))
n_uq_chars = len(vocab)
print("Vocabulary:",vocab)
print("Number of Characters:",n_chars)
print("Number of unique Characters",n_uq_chars)

Vocabulary: 
 0123456789abcdefghijklmnopqrstuvwxyz
Number of Characters: 158608
Number of unique Characters 38


In [16]:
char2int = {c:i for i,c in enumerate(vocab)}
int2char = {i:c for i,c in enumerate(vocab)}

In [17]:
pickle.dump(char2int,open(f"{basename}-char2int.pickle",'wb'))
pickle.dump(int2char,open(f"{basename}-int2char.pickle",'wb'))

In [19]:
encoded_txt = [char2int[x] for x in txt]
encoded_txt[:5]

[31, 19, 16, 1, 27]

In [20]:
encoded_txt = np.array(encoded_txt)

In [21]:
char_data = tf.data.Dataset.from_tensor_slices(encoded_txt)

In [24]:
for s in char_data.take(8):
    print(s.numpy(),int2char[s.numpy()])

31 t
19 h
16 e
1  
27 p
29 r
26 o
21 j


In [36]:
seq = char_data.batch(2*seq_length+1,drop_remainder=True)
for s in seq.take(2):
    print(''.join([int2char[i] for i in s.numpy()]))
    print(len(''.join([int2char[i] for i in s.numpy()])))
    print('-'*50)

the project gutenberg ebook of alices adventures in wonderland

    

this ebook is for the use of anyone anywhere in the united states and

most other parts of the world at no cost and with almost no 
201
--------------------------------------------------
restrictions

whatsoever you may copy it give it away or reuse it under the terms

of the project gutenberg license included with this ebook or online

at wwwgutenbergorg if you are not located in the 
201
--------------------------------------------------


In [55]:
def split_sample(sample):
    ds = tf.data.Dataset.from_tensors((sample[:seq_length], sample[seq_length]))
    for i in range(1, len(sample) - seq_length):
        input_ = sample[i:i+seq_length]
        target = sample[i+seq_length]
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds

dataset = seq.flat_map(split_sample)

In [56]:
def one_hot_samples(input_, target):
    return tf.one_hot(input_, n_uq_chars), tf.one_hot(target, n_uq_chars)

dataset = dataset.map(one_hot_samples)

In [57]:
dataset

<MapDataset element_spec=(TensorSpec(shape=(100, 38), dtype=tf.float32, name=None), TensorSpec(shape=(38,), dtype=tf.float32, name=None))>

In [58]:
# print first 2 samples
for element in dataset.take(2):
    print("Input:", ''.join([int2char[np.argmax(char_vector)] for char_vector in element[0].numpy()]))
    print("Target:", int2char[np.argmax(element[1].numpy())])
    print("Input shape:", element[0].shape)
    print("Target shape:", element[1].shape)
    print("="*50, "\n")

Input: the project gutenberg ebook of alices adventures in wonderland

    

this ebook is for the use of a
Target: n
Input shape: (100, 38)
Target shape: (38,)

Input: he project gutenberg ebook of alices adventures in wonderland

    

this ebook is for the use of an
Target: y
Input shape: (100, 38)
Target shape: (38,)



In [59]:
ds = dataset.repeat().shuffle(1024).batch(batch_sz,drop_remainder=True)

In [62]:
model = Sequential([
    LSTM(256, input_shape=(seq_length, n_uq_chars), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(n_uq_chars, activation="softmax"),
])

In [65]:
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(ds, steps_per_epoch=(len(encoded_txt) - seq_length) // batch_sz, epochs=1)



<keras.callbacks.History at 0x2237e666100>

In [75]:
# As We can see below we need more data, epochs, computation power

In [69]:
vocab_sz = len(char2int)
seed = "chapter xiii"
s = seed
generated = ""
n = 400
for i in tqdm.tqdm(range(n), "Generating text"):
    X = np.zeros((1, seq_length, vocab_sz))
    for t, char in enumerate(seed):
        X[0, (seq_length - len(seed)) + t, char2int[char]] = 1
    predicted = model.predict(X, verbose=0)[0]
    next_index = np.argmax(predicted)
    next_char = int2char[next_index]
    generated += next_char
    seed = seed[1:] + next_char

print("Seed:", s)
print("Generated text:")
print(generated)

Generating text: 100%|███████████████████████████████████████████████████████████████| 400/400 [00:59<00:00,  6.76it/s]

Seed: chapter xiii
Generated text:
cation and the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of the project of





In [76]:
X.max(axis=1)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0.]])