In [19]:

import re
import os
import random
import json

In [20]:
# Define the path to the data folder
data_folder = '../data/'

In [22]:
def load_text(file_path):
    """
    Reads the content of a file and returns the text as a string.
        
    Returns:
        str: The raw text from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [23]:
def clean_text(text):
    """
    Cleans the input text by removing non-letter characters, 
    keeping spaces and periods, and converting to uppercase.
        
    Returns:
        str: The cleaned text.
    """
    # Markers to remove preamble and postamble from Project Gutenberg texts
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_marker = '*** END OF THIS PROJECT GUTENBERG EBOOK'
    
    # Find start and end positions
    start_pos = text.find(start_marker)
    end_pos = text.find(end_marker)
    
    # Remove preamble and postamble if found
    if start_pos != -1:
        text = text[start_pos + len(start_marker):]
    if end_pos != -1:
        text = text[:end_pos]
    
    # Remove non-letter characters and convert to uppercase
    cleaned_text = re.sub(r'[^A-Za-z. ]', '', text).upper()
    return cleaned_text.strip()


In [24]:
def generate_trigrams(cleaned_text):
    """
    Generates a trigram model by counting occurrences of trigrams in the text.
        
    Returns:
        dict: A dictionary where keys are trigrams and values are their counts.
    """
    trigram_model = {}
    for i in range(len(cleaned_text) - 2):
        trigram = cleaned_text[i:i + 3]
        if trigram in trigram_model:
            trigram_model[trigram] += 1
        else:
            trigram_model[trigram] = 1
    return trigram_model

In [26]:
def get_next_char(bigram, trigram_model):
    """
    Given a bigram, find all trigrams that start with this bigram
    and use the trigram model to choose the next character based on frequencies.
    """
    # Find trigrams that start with the given bigram
    candidates = {tri: count for tri, count in trigram_model.items() if tri.startswith(bigram)}
    
    if not candidates:
        # If no trigrams are found, return a space
        return ' '
    
    # Extract the third characters and their corresponding counts
    next_chars = [tri[2] for tri in candidates]  # The third character of each trigram
    weights = [count for count in candidates.values()]  # Counts of each trigram
    
    # Randomly choose the next character based on the trigram frequencies
    return random.choices(next_chars, weights=weights, k=1)[0]

In [27]:
def generate_text(trigram_model, seed="TH", length=10000):
    """
    Generates a string of the specified length using the trigram model.
        
    Returns:
        str: The generated text.
    """
    generated_text = seed
    for _ in range(length - len(seed)):
        bigram = generated_text[-2:]
        next_char = get_next_char(bigram, trigram_model)
        generated_text += next_char
    return generated_text

In [28]:
def count_valid_words(generated_text, word_list):
    """
    Counts valid English words in the generated text.
        
    Returns:
        tuple: The count of valid words and total words.
    """
    generated_words = generated_text.split()
    valid_word_count = sum(1 for word in generated_words if word in word_list)
    return valid_word_count, len(generated_words)

In [29]:
def export_trigram_model(trigram_model, output_file):
    """
    Exports the trigram model to a JSON file.
    
    Parameters:
        trigram_model (dict): The trigram model to export.
        output_file (str): The path to the JSON output file.
    """
    with open(output_file, 'w') as file:
        json.dump(trigram_model, file, indent=4, sort_keys=True)
    print(f"Trigram model exported to {output_file}")

In [30]:
def load_text(file_path):
    """Reads the content of a file and returns the text as a string."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            print(f"File {file_path} loaded successfully.")
            return file.read()
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return ""

In [32]:
def main():
    # Define the path to the data folder
    data_folder = 'data'
    combined_trigram_model = {}

    # Loop over each text file in the data folder
    for filename in os.listdir(data_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(data_folder, filename)
            
            # Load and clean the text
            raw_text = load_text(file_path)
            
            if not raw_text:
                print(f"Error: {filename} could not be loaded.")
                continue

            cleaned_text = clean_text(raw_text)
            print(f"First 1000 characters of cleaned text from {filename}:\n{cleaned_text[:1000]}\n")

            # Generate the trigram model for the current text
            trigram_model = generate_trigrams(cleaned_text)

            # Merge the current trigram model into the combined model
            for trigram, count in trigram_model.items():
                if trigram in combined_trigram_model:
                    combined_trigram_model[trigram] += count
                else:
                    combined_trigram_model[trigram] = count

    # Print a sample of the combined trigram model to verify
    print("Sample of combined trigram model:", {k: combined_trigram_model[k] for k in list(combined_trigram_model)[:10]}, "\n")

    # Generate a 10,000-character text based on the combined trigram model
    generated_text = generate_text(combined_trigram_model)
    print(f"First 1000 characters of generated text:\n{generated_text[:1000]}\n")
    
    # Load the list of valid English words from 'words.txt'
    word_list_path = os.path.join(data_folder, 'words.txt')
    with open(word_list_path, 'r') as file:
        valid_words = set(file.read().splitlines())
    
    # Count valid words in the generated text
    valid_word_count, total_word_count = count_valid_words(generated_text, valid_words)
    
    # Calculate the percentage of valid words
    valid_word_percentage = (valid_word_count / total_word_count) * 100
    
    # Display the results
    print(f"Total words in generated text: {total_word_count}")
    print(f"Valid English words: {valid_word_count}")
    print(f"Percentage of valid English words: {valid_word_percentage:.2f}%\n")

if __name__ == "__main__":
    main()

File data/pride-and-prejudice.txt loaded successfully.
First 1000 characters of cleaned text from pride-and-prejudice.txt:
THE PROJECT GUTENBERG EBOOK OF PRIDE AND PREJUDICE    THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE IN THE UNITED STATES ANDMOST OTHER PARTS OF THE WORLD AT NO COST AND WITH ALMOST NO RESTRICTIONSWHATSOEVER. YOU MAY COPY IT GIVE IT AWAY OR REUSE IT UNDER THE TERMSOF THE PROJECT GUTENBERG LICENSE INCLUDED WITH THIS EBOOK OR ONLINEAT WWW.GUTENBERG.ORG. IF YOU ARE NOT LOCATED IN THE UNITED STATESYOU WILL HAVE TO CHECK THE LAWS OF THE COUNTRY WHERE YOU ARE LOCATEDBEFORE USING THIS EBOOK.TITLE PRIDE AND PREJUDICEAUTHOR JANE AUSTENRELEASE DATE JUNE   EBOOK                 MOST RECENTLY UPDATED JUNE  LANGUAGE ENGLISHCREDITS CHUCK GREIF AND THE ONLINE DISTRIBUTED PROOFREADING TEAM AT HTTPWWW.PGDP.NET THIS FILE WAS PRODUCED FROM IMAGES AVAILABLE AT THE INTERNET ARCHIVE START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE                             ILLUSTRATION          

FileNotFoundError: [Errno 2] No such file or directory: 'data/words.txt'