In [1]:
import requests
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

In [2]:
urls = ["https://www.gutenberg.org/cache/epub/1342/pg1342.txt",

        "https://www.gutenberg.org/cache/epub/158/pg158.txt",

        "https://www.gutenberg.org/cache/epub/161/pg161.txt"
        ]

In [3]:
pride_text = ""
emma_text = ""
sense_text = ""


pride_text = urls[0]
emma_text = urls[1]
sense_text = urls[2]

In [4]:
# Pride and Prejudice text
response = requests.get(pride_text)
pride_text = response.text

# Emma text
response = requests.get(emma_text)
emma_text = response.text

# Sense and Sensibility text
response = requests.get(sense_text)
sense_text = response.text

In [5]:
# Pride and Prejudice - Text Clean Up
pride_text=pride_text[35866:]
pride_text=pride_text[:708210]

In [6]:
# Emma - Text Clean Up
emma_text=emma_text[1699:]
emma_text=emma_text[:895997]

In [7]:
# Sense and Sensibility - Text Clean Up
sense_text=sense_text[1612:]
sense_text=sense_text[:682514]

In [8]:
#combine all three texts into one variable
all_text = ""

all_text = pride_text + emma_text + sense_text

In [9]:
# place all_text into a .txt file
with open("combined_austen.txt", "w", encoding="utf-8") as file:
  file.write(all_text)

In [10]:
import re
file_path = '/content/combined_austen.txt'

# Function to load the raw text from a file
def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def clean_text(text):
    # Remove anything in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Clean up extra newlines or spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)

    # Remove spaces before punctuation
    text = re.sub(r' \.', '.', text)
    text = re.sub(r' \,', ',', text)
    text = re.sub(r' \?', '?', text)

    return text.strip()

# Split the text into chapters
def split_into_chapters(text):
    chapter_pattern = r'(CHAPTER [IVXLCDM]+[\.\s]?)|(\bChapter \d+\b)'  # Match variations of chapter headings

    # Split the text into chapters using the regex pattern
    chapters = re.split(chapter_pattern, text)

    # Filter out empty strings and clean up extra spaces, ignoring chapter headings
    chapters = [chapter.strip() for chapter in chapters if chapter and not re.match(chapter_pattern, chapter.strip())]

    return chapters

# Concatenate all chapters
def format_chapters(chapters):
    return ''.join(chapters)

# Function to save the updated text back to a file
def save_text_to_file(cleaned_text, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

# Main function to process and format the text
def process_text(input_file, output_file):
    # Load the raw text
    raw_text = load_text(input_file)

    # Clean the text
    cleaned_text = clean_text(raw_text)

    # Step 3: Split the text into chapters
    chapters = split_into_chapters(cleaned_text)

    # Step 4: Format the chapters with appropriate headers or formatting
    formatted_text = format_chapters(chapters)

    # Step 5: Save the cleaned and formatted text
    save_text_to_file(formatted_text, output_file)

# Example usage
input_file = '/content/combined_austen.txt'  # Path to your raw text file
output_file = '/content/formatted_austen.txt'  # Path where you want to save the cleaned text

process_text(input_file, output_file)

In [11]:
# Function to read the formatted text from a file
def load_processed_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Load the formatted text into a variable
formatted_text = load_processed_text('/content/formatted_austen.txt')

# Check the first 500 characters to make sure it's loaded correctly
print(formatted_text[:500])


Chapter I.] It is a truth universally acknowledged, that a single man in possession of a good fortune must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. “My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?” Mr.


In [12]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = pad_punctuation(formatted_text)

In [13]:
import nltk
nltk.download('punkt_tab')  # Download sentence tokenizer

# Split formatted_text into sentences
text_data = nltk.sent_tokenize(text_data)  # Tokenize by sentence



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [14]:
BATCH_SIZE = 32
MAX_LEN = 200

In [15]:
#create text dataset from raw text
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [16]:
#create vectorization layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    #max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [17]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [18]:
# Print the size of the vocabulary and the first 10 tokens
print(f"Vocabulary size: {len(vocab)}")
print(f"Sample vocabulary: {vocab[:10]}")

Vocabulary size: 13207
Sample vocabulary: ['', '[UNK]', ',', '.', 'the', 'to', 'of', 'and', 'her', 'a']


In [19]:
VOCAB_SIZE = len(vocab)

In [20]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")


0: 
1: [UNK]
2: ,
3: .
4: the
5: to
6: of
7: and
8: her
9: a


In [21]:
# Create the training set text and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

In [22]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("Text Generated = ", max_tokens=100, temperature=1.0)

In [23]:
# Tokenize starting prompt
text_generator = TextGenerator(vocab)

**MODEL 1: one layer, 128 N_UNITS**

In [None]:
#VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [None]:
# train with one layer

inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

In [None]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [None]:
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 1.7694
generated text:
Text Generated =  inmates subscribe saying of and , sorry . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 56ms/step - loss: 1.7682
Epoch 2/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.5718
generated text:
Text Generated =  place , within not ! 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 57ms/step - loss: 0.5717
Epoch 3/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.5262
generated text:
Text Generated =  argued for it . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 57ms/step - loss: 0.5262
Epoch 4/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.4969
generated text:
Text Generated =  head up ; and she had been in your friends . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7f6812c1c280>

In [None]:
# print probabilities of next token
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [None]:
info = text_generator.generate(
    "Emma saw that", max_tokens=25, temperature=0.25
)


generated text:
Emma saw that he had been in town , and was the time of her own , and the most natural and frozen maid .



In [None]:
print_probs(info, vocab)


PROMPT: Emma saw that
he:   	79.85%
she:   	15.4%
the:   	2.35%
there:   	0.85%
it:   	0.73%
--------


PROMPT: Emma saw that he
had:   	92.2%
was:   	7.49%
should:   	0.27%
would:   	0.04%
might:   	0.0%
--------


PROMPT: Emma saw that he had
been:   	99.34%
not:   	0.63%
a:   	0.02%
never:   	0.0%
done:   	0.0%
--------


PROMPT: Emma saw that he had been
in:   	53.06%
a:   	22.4%
the:   	9.82%
so:   	9.22%
at:   	4.33%
--------


PROMPT: Emma saw that he had been in
the:   	57.13%
town:   	38.65%
love:   	2.26%
a:   	1.42%
london:   	0.53%
--------


PROMPT: Emma saw that he had been in town
,:   	87.74%
;:   	8.92%
.:   	3.21%
before:   	0.04%
to:   	0.02%
--------


PROMPT: Emma saw that he had been in town ,
and:   	99.95%
but:   	0.02%
or:   	0.02%
for:   	0.0%
was:   	0.0%
--------


PROMPT: Emma saw that he had been in town , and
the:   	50.7%
she:   	28.64%
was:   	7.6%
had:   	3.91%
to:   	2.07%
--------


PROMPT: Emma saw that he had been in town , and was
a:   	47.31%
th

In [None]:
info = text_generator.generate(
    "Pride had taken it all", max_tokens=50, temperature=0.50
)


generated text:
Pride had taken it all over in her power to be done . 



In [None]:
info = text_generator.generate(
    "How is it that", max_tokens=50, temperature=1.0
)


generated text:
How is it that i have not occurred to him ? 



**MODEL 2: two layers + 128 N_UNITS**

In [None]:
#VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [None]:
#train with two layers
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.LSTM(N_UNITS)(x)

outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

In [None]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - loss: 1.8085
generated text:
Text Generated =  and from _ to to , the has at say ; communication borne it the not shall the to must to was of good all aye own blunders insipidity i to had better the declare—and eldest . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 67ms/step - loss: 1.8072
Epoch 2/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.5717
generated text:
Text Generated =  and and take , you was held raise him of his claims . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 65ms/step - loss: 0.5717
Epoch 3/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.5188
generated text:
Text Generated =  company , as the curiosity could he had always foolishly indifference to give one sinking ; that he had without that buried to quite what hurt the twenty farewell me , i not off

<keras.src.callbacks.history.History at 0x7ef9141165c0>

In [None]:
info = text_generator.generate(
    "In a situation that", max_tokens=25, temperature=1.0
)


generated text:
In a situation that painful solemnity could be in beauty . 



In [None]:
info = text_generator.generate(
    "In a situation that", max_tokens=25, temperature=0.5
)


generated text:
In a situation that is so much preferable to his own . 



In [None]:
info = text_generator.generate(
    "Jane had given", max_tokens=25, temperature=0.2
)


generated text:
Jane had given her pleasure in her own mind , she was not in the least to say that she was not to be supposed



In [None]:
info = text_generator.generate(
    "Mr. Darcy wanted", max_tokens=25, temperature=1.0
)


generated text:
Mr. Darcy wanted to render their excuse , that my mind left no further amends to own the distress of the feelings and while he



**MODEL 3: one layer and 256 N_UNITS**

In [None]:
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [None]:
#train with N_UNITS = 256 and one layer (trained on GPU)
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)

loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 1.7767
generated text:
Text Generated =  cow approbation without sat together to ? 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 55ms/step - loss: 1.7755
Epoch 2/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 0.5683
generated text:
Text Generated =  invalids “never of nothing it as ridicule means felt . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 56ms/step - loss: 0.5683
Epoch 3/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.5246
generated text:
Text Generated =  throats to on the body and make , i look with common dear else . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 58ms/step - loss: 0.5246
Epoch 4/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.5020
generated text:
Text Generated =  giving delaf

<keras.src.callbacks.history.History at 0x7936b9141f00>

In [None]:
info = text_generator.generate(
    "I was certain that", max_tokens=25, temperature=1.0
)


generated text:
I was certain that it was impossible for sir john there . 



In [None]:
print_probs(info, vocab)


PROMPT: I was certain that
he:   	26.49%
she:   	12.5%
i:   	6.77%
it:   	5.58%
they:   	4.8%
--------


PROMPT: I was certain that it
was:   	41.62%
would:   	11.63%
should:   	9.38%
is:   	5.02%
had:   	4.57%
--------


PROMPT: I was certain that it was
not:   	21.06%
to:   	5.11%
.:   	3.42%
in:   	2.8%
a:   	2.54%
--------


PROMPT: I was certain that it was impossible
for:   	29.89%
to:   	23.38%
.:   	22.63%
;:   	8.27%
that:   	6.19%
--------


PROMPT: I was certain that it was impossible for
it:   	10.16%
him:   	8.97%
the:   	8.91%
her:   	6.85%
mr:   	6.72%
--------


PROMPT: I was certain that it was impossible for sir
john:   	40.0%
william:   	24.25%
john’s:   	10.36%
.:   	5.23%
,:   	4.94%
--------


PROMPT: I was certain that it was impossible for sir john
and:   	21.44%
.:   	16.38%
,:   	12.45%
;:   	7.07%
de:   	3.46%
--------


PROMPT: I was certain that it was impossible for sir john there
was:   	27.77%
,:   	17.49%
.:   	7.61%
;:   	7.55%
would:   	7.2%
--------

In [None]:
info = text_generator.generate(
    "I was certain that", max_tokens=25, temperature=0.5
)


generated text:
I was certain that it was not so . 



In [None]:
print_probs(info, vocab)


PROMPT: I was certain that
he:   	69.85%
she:   	15.55%
i:   	4.57%
it:   	3.1%
they:   	2.3%
--------


PROMPT: I was certain that it
was:   	84.34%
would:   	6.59%
should:   	4.28%
is:   	1.23%
had:   	1.02%
--------


PROMPT: I was certain that it was
not:   	82.52%
to:   	4.85%
.:   	2.18%
in:   	1.46%
a:   	1.2%
--------


PROMPT: I was certain that it was not
in:   	32.54%
to:   	32.22%
so:   	8.67%
a:   	3.83%
at:   	2.73%
--------


PROMPT: I was certain that it was not so
much:   	47.02%
,:   	10.16%
very:   	8.58%
;:   	6.67%
.:   	6.59%
--------


PROMPT: I was certain that it was not so .
:   	100.0%
):   	0.0%
and:   	0.0%
]:   	0.0%
on:   	0.0%
--------



In [None]:
info = text_generator.generate(
      "Emma said that", max_tokens=50, temperature=1.0
)


generated text:
Emma said that he was not in their conclusions . 



In [None]:
info = text_generator.generate(
      "Emma said that", max_tokens=50, temperature=0.5
)


generated text:
Emma said that he would not allow him to be the best of the gentleman . 



**MODEL 4: two layers, 256 N_UNITS**

In [24]:
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [25]:
#train with two layers
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)

outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()


In [26]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 1.3631
generated text:
Text Generated =  woodhouse but agitated her till ; ; know befall before and as know the gouldings the desirous . 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 95ms/step - loss: 1.3623
Epoch 2/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 0.5536
generated text:
Text Generated =  inquiries , harriet there send as it is down . represented 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 96ms/step - loss: 0.5535
Epoch 3/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - loss: 0.4933
generated text:
Text Generated =  on his son , you are no persuaded , ” “we was my son very sentiments ? 

[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 97ms/step - loss: 0.4933
Epoch 4/25
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/st

<keras.src.callbacks.history.History at 0x7a5087ed7580>

In [27]:
info = text_generator.generate(
      "Mr. Bingley was good-looking", max_tokens=50, temperature=1.0
)


generated text:
Mr. Bingley was good-looking enough ; it was not mistaken so ; and did not give off the opportunity of saying to miss bennet , who considered the impression of some others , but by its uncertain situation , that they did not enjoy them with each melancholy . 



In [30]:
info = text_generator.generate(
      "He danced all the dances", max_tokens=50, temperature=1.0
)


generated text:
He danced every dance towards the table , when sir john went on , which must be just supported , and by supplying the sound of the meeting , were all to resist harriet’s happiness . 

