In [28]:
# Emerging Technology Tasks

# Task 1: Building a trigram model from Project Gutenberg books

# This project reads and processes five books from Project Gutenberg to build a trigram model.
# A trigram is a sequence of three characters
# The trigram model counts how often each trigram appears in the text.

import re
from collections import defaultdict

In [29]:
# Locations of the five Project Gutenberg books used to build the model.
# Each entry is a plain-text file path, resolved relative to the working directory.
file_paths = [
    'Book1.txt',
    'Book2.txt',
    'Book3.txt',
    'Book4.txt',
    'Book5.txt',
]

In [30]:
# Function to read text from a file
# This function opens the file, reads the entire content, and returns it as a string
def read_text(file_path):
    """Return the entire contents of ``file_path`` as one string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as book_file:
        contents = book_file.read()
    return contents

In [31]:
# Function for cleaning text up
# This function removes unnecessary parts of the text, like the preamble and postamble
# It also removes all characters except ASCII letters, spaces and full stops
# It also converts the text to uppercase
def clean_text(text):
    """Normalise a Project Gutenberg book for character-trigram counting.

    Steps, in order:
      1. Drop everything up to and including the ``*** START OF ... ***`` marker
         and everything from the ``*** END OF ... ***`` marker onwards. Each
         marker is handled independently, so a file with only one of them is
         still trimmed on that side.
      2. Turn every whitespace character (newlines, tabs, ...) into a space so
         that words on adjacent lines are not fused together.
      3. Remove every character that is not an ASCII letter, a full stop or a space.
      4. Collapse runs of spaces to a single space and trim both ends.
      5. Convert to uppercase.

    Parameters:
        text: the raw book text.

    Returns:
        The cleaned, uppercase string containing only ``A-Z``, ``.`` and single spaces.
    """
    # Both "OF THIS" and "OF THE" wordings are accepted for the markers.
    start = re.search(r'\*\*\* ?START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*\*\*\*', text)
    if start:
        text = text[start.end():]

    # Searched after trimming the head, so the end marker is always found after the start.
    end = re.search(r'\*\*\* ?END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*\*\*\*', text)
    if end:
        text = text[:end.start()]

    # Whitespace must become a space *before* filtering; otherwise line breaks
    # are simply deleted and "the\ncat" turns into "thecat".
    text = re.sub(r'\s', ' ', text)

    # Keep only ASCII letters, full stops and spaces
    text = re.sub(r'[^A-Za-z. ]', '', text)

    # Removing punctuation and line breaks can leave runs of spaces behind
    text = re.sub(r' {2,}', ' ', text).strip()

    # Convert all letters to uppercase for consistency
    return text.upper()

In [32]:
# Function to create a trigram model
# This function takes the cleaned text and creates a trigram model
# It counts how many times each sequence of three characters appears in the text
def create_trigram_model(text):
    """Count every overlapping three-character sequence in ``text``.

    Parameters:
        text: the string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each trigram to the number of times it
        occurs. Text shorter than three characters yields an empty mapping.
    """
    counts = defaultdict(int)

    # Three views of the text, each shifted by one character, line up as the
    # first, second and third character of every trigram.
    for first, second, third in zip(text, text[1:], text[2:]):
        counts[first + second + third] += 1

    return counts

In [33]:
# Load every book in file_paths and clean it.
# Each path is read with read_text and the result is passed through clean_text.
texts = list(map(clean_text, map(read_text, file_paths)))

# Merge the cleaned books into one string.
# A single space is placed between consecutive books.
combined_text = ' '.join(texts)

# Build the trigram model from the merged text.
# The result maps each three-character sequence to its number of occurrences.
trigram_model = create_trigram_model(combined_text)

In [None]:
# Show a small sample of the model.
# Only the first 10 (trigram, count) pairs, in the model's iteration order, are printed.
preview = list(trigram_model.items())[:10]
for trigram, count in preview:
    print(f'{trigram}: {count}')

In [35]:
# Task 2
import random  # Import the random module for random selection of characters

# Function to get the next character based on the last two characters of the current text
def get_next_char(trigram_model, last_two_chars):
    """Pick the next character at random, weighted by trigram frequency.

    Parameters:
        trigram_model: mapping of trigram -> count.
        last_two_chars: the prefix the chosen trigram must start with.

    Returns:
        The third character of one trigram that starts with ``last_two_chars``,
        drawn with probability proportional to that trigram's count, or ``None``
        when no trigram in the model starts with the prefix.
    """
    candidates = []  # third character of every matching trigram, e.g. ['E', 'A', 'I']
    weights = []     # count of the corresponding trigram, used as its sampling weight

    # One pass over the model, keeping only trigrams that begin with the prefix
    for trigram, count in trigram_model.items():
        if trigram.startswith(last_two_chars):
            candidates.append(trigram[2])
            weights.append(count)

    # Nothing in the model continues this prefix
    if not candidates:
        return None

    # More frequent trigrams are proportionally more likely to be selected
    return random.choices(candidates, weights=weights, k=1)[0]

In [36]:
# Function to generate a string of text based on the trigram model
def generate_text(trigram_model, length=10000, seed="TH"):
    """Generate text one character at a time from a character-trigram model.

    Starting from ``seed``, the last two characters of the text so far are used
    to look up every trigram beginning with that pair; the third character of
    one of them is appended, chosen at random with the trigram counts as
    weights (the same weighted choice that ``get_next_char`` performs).

    The model is indexed by two-character prefix once, up front, so each step is
    a dictionary lookup rather than a scan of the whole model.

    Parameters:
        trigram_model: mapping of trigram -> count.
        length: target number of characters (default 10,000). If it is not
            larger than ``len(seed)``, the seed is returned unchanged.
        seed: initial text; must contain at least two characters (default "TH").

    Returns:
        The generated string. It is shorter than ``length`` when generation
        reaches a two-character prefix that no trigram in the model starts with.

    Raises:
        ValueError: if ``seed`` has fewer than two characters.
    """
    if len(seed) < 2:
        raise ValueError("seed must contain at least two characters")

    # prefix -> ([third characters], [counts]); insertion order follows the
    # model's own iteration order.
    successors = {}
    for trigram, count in trigram_model.items():
        if len(trigram) < 3:
            continue  # cannot supply a third character
        chars, weights = successors.setdefault(trigram[:2], ([], []))
        chars.append(trigram[2])
        weights.append(count)

    # Collect pieces in a list and join once at the end instead of growing a
    # string by repeated concatenation.
    pieces = [seed]
    produced = len(seed)
    last_two_chars = seed[-2:]

    while produced < length:
        options = successors.get(last_two_chars)

        # Dead end: nothing in the model continues this prefix
        if options is None:
            break

        next_char = random.choices(options[0], weights=options[1], k=1)[0]
        pieces.append(next_char)
        produced += 1
        last_two_chars = last_two_chars[1] + next_char

    return ''.join(pieces)


In [None]:
# Generate text from 'trigram_model', the dictionary of trigram -> count built above.
generated_text = generate_text(trigram_model, length=10_000)

# Print the whole generated string.
# It holds up to 10,000 characters; generate_text stops early if it reaches a
# pair of characters that no trigram in the model starts with.
print(generated_text)

In [None]:
# Task 3

import re

# Read the list of valid English words from words.txt
def load_english_words(file_path):
    """Load a word list (one word per line) into a set of lowercase words.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so the empty string is never part of the result. The file is
    decoded as UTF-8 explicitly, matching ``read_text``, rather than with the
    platform's default encoding.

    Parameters:
        file_path: path to the word-list file.

    Returns:
        A set of lowercase words, for fast membership tests.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising every line first
        stripped = (line.strip().lower() for line in f)
        return {word for word in stripped if word}

# Extract words from the generated text
def extract_words(text):
    """Return the alphabetic words found in ``text``, lowercased, in order.

    A word is a run of ASCII letters with a word boundary on both sides, so a
    run of letters directly attached to digits or underscores is not returned.
    """
    return [
        match.group(0).lower()  # lowercase for case-insensitive matching
        for match in re.finditer(r'\b[A-Za-z]+\b', text)
    ]

# Calculate the percentage of valid words in the generated text
def calculate_word_percentage(generated_text, valid_words):
    """Return the percentage of words in ``generated_text`` found in ``valid_words``.

    Words are obtained with ``extract_words`` and each occurrence is counted,
    so repeated words contribute more than once.

    Parameters:
        generated_text: the text to evaluate.
        valid_words: container of accepted words, tested with ``in``.

    Returns:
        A value between 0 and 100, or 0 when the text contains no words.
    """
    candidate_words = extract_words(generated_text)

    # Guard first: with no words there is nothing to divide by
    if not candidate_words:
        return 0

    recognised = len([word for word in candidate_words if word in valid_words])
    return (recognised / len(candidate_words)) * 100

# Put the pieces together: load the dictionary, generate text, score it.
valid_words = load_english_words('words.txt')  # set of accepted English words

# NOTE: this generates a fresh string, replacing any 'generated_text' produced
# by an earlier cell, so the percentage below refers to this new sample.
generated_text = generate_text(trigram_model, length=10_000)

# Share of the generated words that appear in the word list
percentage_valid_words = calculate_word_percentage(generated_text, valid_words)

# Report the result to two decimal places
print(f"Percentage of valid English words: {percentage_valid_words:.2f}%")

In [None]:
# Task 4
import json

# Export the trigram model as JSON
def export_trigram_model_to_json(trigram_model, file_path):
    """Write ``trigram_model`` to ``file_path`` as JSON.

    The file is opened in write mode, so any existing content is replaced.
    The output is pretty-printed with a four-space indent.

    Parameters:
        trigram_model: the trigram -> count mapping to serialise.
        file_path: destination path of the JSON file.
    """
    with open(file_path, 'w') as destination:
        json.dump(obj=trigram_model, fp=destination, indent=4)