In [24]:
import os
import re
import random
import json

In [3]:
# Define the path to the data folder
data_folder = '../data/'

In [31]:
def load_text(file_path):
    """
    Reads the content of a file and returns the text as a string.
        
    Returns:
        str: The raw text from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [34]:
def clean_text(text):
    """
    Cleans the input text by removing non-letter characters, 
    keeping spaces and periods, and converting to uppercase.
        
    Returns:
        str: The cleaned text.
    """
    # Markers to remove preamble and postamble from Project Gutenberg texts
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_marker = '*** END OF THIS PROJECT GUTENBERG EBOOK'
    
    # Find start and end positions
    start_pos = text.find(start_marker)
    end_pos = text.find(end_marker)
    
    # Remove preamble and postamble if found
    if start_pos != -1:
        text = text[start_pos + len(start_marker):]
    if end_pos != -1:
        text = text[:end_pos]
    
    # Remove non-letter characters and convert to uppercase
    cleaned_text = re.sub(r'[^A-Za-z. ]', '', text).upper()
    return cleaned_text.strip()


In [None]:
# Strip any remaining trailing whitespace
text = text.strip()

# Display the first 1000 characters of the cleaned text
print(text[:1000])

In [None]:
# loop through the cleaned text, starting from index 0 and stopping two characters before the end (len(cleaned_text) - 2) to avoid going out of bounds.
# At each iteration, extract a trigram using cleaned_text[i:i+3].
# If the trigram is already in the dictionary (trigram_model), increment its count.
# If it's not in the dictionary, initialize the count to 1.

In [None]:
# Initialize an empty dictionary
trigram_model = {}

# Iterate over the cleaned text to extract trigrams
for i in range(len(cleaned_text) - 2):  # Stop 2 characters before the end
    trigram = cleaned_text[i:i+3]  # Extract a sequence of 3 characters
    
    # Increment the count for this trigram in the dictionary
    if trigram in trigram_model:
        trigram_model[trigram] += 1  # If the trigram is already in the dictionary, increase the count
    else:
        trigram_model[trigram] = 1   # Otherwise, initialize the count at 1

# Display the first 10 trigrams to test
print({k: trigram_model[k] for k in list(trigram_model)[:10]})


In [19]:
def get_next_char(bigram, trigram_model):
    """
    Given a bigram, find all trigrams that start with this bigram
    and use the trigram model to choose the next character based on frequencies.
    """
    # Find trigrams that start with the given bigram
    candidates = {tri: count for tri, count in trigram_model.items() if tri.startswith(bigram)}
    
    if not candidates:
        # If no trigrams are found, return a space
        return ' '
    
    # Extract the third characters and their corresponding counts
    next_chars = [tri[2] for tri in candidates]  # The third character of each trigram
    weights = [count for count in candidates.values()]  # Counts of each trigram
    
    # Randomly choose the next character based on the trigram frequencies
    return random.choices(next_chars, weights=weights, k=1)[0]

In [None]:
# Parameters for text generation
generated_length = 10000  # Total number of characters
seed = "TH"  # Start string

# Initialize the generated text with the seed
generated_text = seed

# Loop to generate characters until we reach the desired length
for _ in range(generated_length - len(seed)):  # Subtract seed length from the target length
    bigram = generated_text[-2:]  # Get the last two characters
    next_char = get_next_char(bigram, trigram_model)  # Get the next character using the trigram model
    generated_text += next_char  # Append the next character to the generated string

# Output the first 1000 characters of the generated text for verification
print(generated_text[:1000])

# Display the total length to verify it is 10,000 characters
print(f"Generated text length: {len(generated_text)}")


In [None]:
# Load the list of valid English words from 'words.txt'
with open('../data/words.txt', 'r') as file:
    valid_words = set(file.read().splitlines())  # Store valid words in a set

# Display the number of valid words loaded
print(f"Number of valid English words: {len(valid_words)}")


In [None]:
# Split the generated text into words
generated_words = generated_text.split()

# Display the first 10 words
print(f"First 10 words in the generated text: {generated_words[:10]}")


In [30]:
# Count the number of valid words in the generated text
valid_word_count = sum(1 for word in generated_words if word in valid_words)

# Calculate the percentage of valid words
total_words = len(generated_words)
valid_word_percentage = (valid_word_count / total_words) * 100

# Display the results
print(f"Total words in generated text: {total_words}")
print(f"Valid English words: {valid_word_count}")
print(f"Percentage of valid English words: {valid_word_percentage:.2f}%")


Total words in generated text: 1715
Valid English words: 353
Percentage of valid English words: 20.58%


In [29]:
# Path for the output JSON file
output_file = '../data/trigrams.json'

# Export the trigram model to a JSON file with proper formatting
with open(output_file, 'w') as file:
    json.dump(trigram_model, file, indent=4, sort_keys=True)

# Confirm the file was saved
print(f"Trigram model exported to {output_file}")

Trigram model exported to ../data/trigrams.json


In [35]:
def main():
    # Load the text
    file_path = os.path.join(data_folder, 'pg1342.txt')
    raw_text = load_text(file_path)
    cleaned_text = clean_text(raw_text)

if __name__ == "__main__":
    main()

THE PROJECT GUTENBERG EBOOK OF PRIDE AND PREJUDICE    THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE IN THE UNITED STATES ANDMOST OTHER PARTS OF THE WORLD AT NO COST AND WITH ALMOST NO RESTRICTIONSWHATSOEVER. YOU MAY COPY IT GIVE IT AWAY OR REUSE IT UNDER THE TERMSOF THE PROJECT GUTENBERG LICENSE INCLUDED WITH THIS EBOOK OR ONLINEAT WWW.GUTENBERG.ORG. IF YOU ARE NOT LOCATED IN THE UNITED STATESYOU WILL HAVE TO CHECK THE LAWS OF THE COUNTRY WHERE YOU ARE LOCATEDBEFORE USING THIS EBOOK.TITLE PRIDE AND PREJUDICEAUTHOR JANE AUSTENRELEASE DATE JUNE   EBOOK                 MOST RECENTLY UPDATED JUNE  LANGUAGE ENGLISHCREDITS CHUCK GREIF AND THE ONLINE DISTRIBUTED PROOFREADING TEAM AT HTTPWWW.PGDP.NET THIS FILE WAS PRODUCED FROM IMAGES AVAILABLE AT THE INTERNET ARCHIVE START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE                             ILLUSTRATION                             GEORGE ALLEN                               PUBLISHER                         CHARING CROSS ROAD         