In [24]:
import os
import re
import random
import json

In [3]:
# Folder that holds the notebook's input files. The path is relative, so it is
# resolved against the current working directory. main() joins it with
# 'pg1342.txt' and 'words.txt'.
data_folder = '../data/'

In [31]:
def load_text(file_path):
    """
    Read a UTF-8 encoded text file in full.

    Args:
        file_path: Location of the file to read.

    Returns:
        str: Everything the file contains, as one string.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()
    return contents

In [34]:
def clean_text(text):
    """
    Strip Project Gutenberg boilerplate and normalise the text for modelling.

    Steps:
      1. Keep only what lies between the '*** START OF THIS/THE PROJECT
         GUTENBERG EBOOK ... ***' line and the '*** END OF THIS/THE PROJECT
         GUTENBERG EBOOK' line. A missing marker simply means that side is
         not trimmed.
      2. Turn every whitespace character (newlines, tabs, ...) into a space
         so that words on adjacent lines are not fused together.
      3. Drop everything except ASCII letters, full stops and spaces.
      4. Collapse runs of spaces, convert to uppercase and trim the ends.

    Args:
        text (str): Raw text, typically a complete Project Gutenberg file.

    Returns:
        str: The cleaned, uppercase text.
    """
    # Both wordings are accepted ("THIS" and "THE"). The optional tail
    # swallows the title and closing asterisks on the marker line so they do
    # not leak into the body.
    start_pattern = re.compile(
        r'\*\*\* ?START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK(?:[^\n*]*\*\*\*)?'
    )
    end_pattern = re.compile(r'\*\*\* ?END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK')

    start_match = start_pattern.search(text)
    body_start = start_match.end() if start_match else 0

    # Search for the end marker in the ORIGINAL string, starting after the
    # preamble, so the offset stays valid for the slice below.
    end_match = end_pattern.search(text, body_start)
    body_end = end_match.start() if end_match else len(text)

    body = text[body_start:body_end]

    # Whitespace becomes a space *before* filtering; otherwise line breaks
    # would be deleted and neighbouring words glued together.
    body = re.sub(r'\s', ' ', body)
    body = re.sub(r'[^A-Za-z. ]', '', body)
    # Removing punctuation/digits can leave several spaces in a row.
    body = re.sub(r' {2,}', ' ', body)

    return body.upper().strip()


In [36]:
def generate_trigrams(cleaned_text):
    """
    Build a frequency table of every three-character window in the text.

    Windows overlap: one starts at each position that still has two
    characters after it. Text shorter than three characters yields an
    empty table.

    Args:
        cleaned_text (str): The text to scan.

    Returns:
        dict: Maps each trigram to the number of times it occurs, in order
        of first appearance.
    """
    counts = {}
    windows = (cleaned_text[pos:pos + 3] for pos in range(len(cleaned_text) - 2))
    for window in windows:
        counts[window] = counts.get(window, 0) + 1
    return counts

In [19]:
def get_next_char(bigram, trigram_model):
    """
    Pick the character that follows `bigram`, weighted by trigram counts.

    Every trigram in the model that begins with `bigram` contributes its
    third character as a candidate, weighted by that trigram's count.

    Args:
        bigram (str): The prefix to extend.
        trigram_model (dict): Maps trigrams to their counts.

    Returns:
        str: A randomly drawn follower character, or a single space when no
        trigram in the model starts with `bigram`.
    """
    followers = []
    frequencies = []
    for trigram, frequency in trigram_model.items():
        if trigram.startswith(bigram):
            followers.append(trigram[2])
            frequencies.append(frequency)

    # Nothing in the model continues this prefix: fall back to a space.
    if len(followers) == 0:
        return ' '

    picked = random.choices(followers, weights=frequencies, k=1)
    return picked[0]

In [40]:
def generate_text(trigram_model, seed="TH", length=10000):
    """
    Extend `seed` one character at a time using the trigram model.

    Each new character is drawn by get_next_char from the last two
    characters produced so far. If `seed` is already at least `length`
    characters long, it is returned unchanged.

    Args:
        trigram_model (dict): Maps trigrams to their counts.
        seed (str): Starting text.
        length (int): Target number of characters, seed included.

    Returns:
        str: The seed followed by the generated characters.
    """
    pieces = [seed]
    tail = seed[-2:]
    for _ in range(length - len(seed)):
        following = get_next_char(tail, trigram_model)
        pieces.append(following)
        # Keep only the last two characters; they form the next lookup prefix.
        tail = (tail + following)[-2:]
    return ''.join(pieces)

In [41]:
def count_valid_words(generated_text, word_list):
    """
    Count how many whitespace-separated words of the text are in `word_list`.

    Full stops attached to the start or end of a token are removed before
    the lookup, so "CAT." is checked as "CAT". Tokens made up only of full
    stops are not words and are left out of both counts. Matching is
    case-sensitive.

    Args:
        generated_text (str): Text to analyse.
        word_list: Container of valid words supporting `in` (a set gives
            constant-time lookups).

    Returns:
        tuple: (number of valid words, total number of words).
    """
    valid_word_count = 0
    total_word_count = 0
    for token in generated_text.split():
        word = token.strip('.')
        if not word:
            # Pure punctuation such as "." or "..." is not a word.
            continue
        total_word_count += 1
        if word in word_list:
            valid_word_count += 1
    return valid_word_count, total_word_count

In [None]:
# Load the list of valid English words from 'words.txt'.
# The path is built from data_folder (as main() does) so the data location is
# configured in one place only.
word_list_path = os.path.join(data_folder, 'words.txt')
with open(word_list_path, 'r') as file:
    valid_words = set(file.read().splitlines())  # Store valid words in a set

# Display the number of valid words loaded
print(f"Number of valid English words: {len(valid_words)}")


In [29]:
# Build the trigram model in this cell: no earlier cell defines
# `trigram_model` at notebook scope (main() only keeps it as a local), so
# without this step the export fails with NameError on a fresh kernel.
source_path = os.path.join(data_folder, 'pg1342.txt')
trigram_model = generate_trigrams(clean_text(load_text(source_path)))

# Path for the output JSON file
output_file = os.path.join(data_folder, 'trigrams.json')

# Export the trigram model to a JSON file with proper formatting
with open(output_file, 'w') as file:
    json.dump(trigram_model, file, indent=4, sort_keys=True)

# Confirm the file was saved
print(f"Trigram model exported to {output_file}")

Trigram model exported to ../data/trigrams.json


In [42]:
def main():
    """
    Run the whole pipeline and print a summary.

    Loads 'pg1342.txt' from data_folder, cleans it, builds the trigram model,
    generates text with the default seed and length, and reports how many of
    the generated words appear in 'words.txt'.
    """
    # Load and clean the text
    file_path = os.path.join(data_folder, 'pg1342.txt')
    raw_text = load_text(file_path)
    cleaned_text = clean_text(raw_text)

    # Quick check: Display the first 1000 characters of the cleaned text
    print(cleaned_text[:1000])

    # Generate the trigram model from the cleaned text
    trigram_model = generate_trigrams(cleaned_text)

    # Print sample of the trigram model to verify
    print({k: trigram_model[k] for k in list(trigram_model)[:10]})

    # Generate a 10,000-character text based on the trigram model
    generated_text = generate_text(trigram_model)
    print(generated_text[:1000])  # Print the first 1000 characters

    # Load the list of valid English words from 'words.txt'
    word_list_path = os.path.join(data_folder, 'words.txt')
    with open(word_list_path, 'r') as file:
        valid_words = set(file.read().splitlines())

    # Count valid words in the generated text
    valid_word_count, total_word_count = count_valid_words(generated_text, valid_words)

    # Calculate the percentage of valid words. Generated text can contain no
    # words at all (e.g. an empty model), so avoid dividing by zero.
    if total_word_count:
        valid_word_percentage = (valid_word_count / total_word_count) * 100
    else:
        valid_word_percentage = 0.0

    # Display the results
    print(f"Total words in generated text: {total_word_count}")
    print(f"Valid English words: {valid_word_count}")
    print(f"Percentage of valid English words: {valid_word_percentage:.2f}%")