In [18]:
import os
import re
import random

In [3]:
# Define the path to the data folder
data_folder = '../data/'

In [4]:
# Load one of the downloaded books
file_path = os.path.join(data_folder, 'pg1342.txt')

In [5]:
# Read the content of the file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Display the first 1000 characters
print(text[:1000])

In [9]:
# Identify the start and end of the main text
start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
end_marker = '*** END OF THIS PROJECT GUTENBERG EBOOK'

# Find the positions of the markers
start_pos = text.find(start_marker)
end_pos = text.find(end_marker)

# Display the positions of the start and end markers
print(f"Start marker found at: {start_pos}")
print(f"End marker found at: {end_pos}")


Start marker found at: -1
End marker found at: -1


In [None]:
# Remove the preamble and postamble
if start_pos != -1:
    text = text[start_pos + len(start_marker):]

if end_pos != -1:
    text = text[:end_pos]

# Remove non-letter characters
cleaned_text = re.sub(r'[^A-Za-z. ]', '', text).upper()

# Display some text after removing preamble and postamble
print(text[:1000])

In [None]:
# Strip any remaining trailing whitespace
text = text.strip()

# Display the first 1000 characters of the cleaned text
print(text[:1000])

In [None]:
# loop through the cleaned text, starting from index 0 and stopping two characters before the end (len(cleaned_text) - 2) to avoid going out of bounds.
# At each iteration, extract a trigram using cleaned_text[i:i+3].
# If the trigram is already in the dictionary (trigram_model), increment its count.
# If it's not in the dictionary, initialize the count to 1.

In [None]:
# Initialize an empty dictionary
trigram_model = {}

# Iterate over the cleaned text to extract trigrams
for i in range(len(cleaned_text) - 2):  # Stop 2 characters before the end
    trigram = cleaned_text[i:i+3]  # Extract a sequence of 3 characters
    
    # Increment the count for this trigram in the dictionary
    if trigram in trigram_model:
        trigram_model[trigram] += 1  # If the trigram is already in the dictionary, increase the count
    else:
        trigram_model[trigram] = 1   # Otherwise, initialize the count at 1

# Display the first 10 trigrams to test
print({k: trigram_model[k] for k in list(trigram_model)[:10]})


In [19]:
def get_next_char(bigram, trigram_model):
    """
    Given a bigram, find all trigrams that start with this bigram
    and use the trigram model to choose the next character based on frequencies.
    """
    # Find trigrams that start with the given bigram
    candidates = {tri: count for tri, count in trigram_model.items() if tri.startswith(bigram)}
    
    if not candidates:
        # If no trigrams are found, return a space
        return ' '
    
    # Extract the third characters and their corresponding counts
    next_chars = [tri[2] for tri in candidates]  # The third character of each trigram
    weights = [count for count in candidates.values()]  # Counts of each trigram
    
    # Randomly choose the next character based on the trigram frequencies
    return random.choices(next_chars, weights=weights, k=1)[0]

In [None]:
# Parameters for text generation
generated_length = 10000  # Total number of characters
seed = "TH"  # Start string

# Initialize the generated text with the seed
generated_text = seed

# Loop to generate characters until we reach the desired length
for _ in range(generated_length - len(seed)):  # Subtract seed length from the target length
    bigram = generated_text[-2:]  # Get the last two characters
    next_char = get_next_char(bigram, trigram_model)  # Get the next character using the trigram model
    generated_text += next_char  # Append the next character to the generated string

# Output the first 1000 characters of the generated text for verification
print(generated_text[:1000])

# Display the total length to verify it is 10,000 characters
print(f"Generated text length: {len(generated_text)}")
