In [2]:
# Emerging Technology Tasks

# Task 1: Building a trigram model from Project Gutenberg books

# This project reads and processes five books from Project Gutenberg to build a trigram model.
# A trigram is a sequence of three characters
# The trigram model counts how often each trigram appears in the text.

import re
from collections import defaultdict

In [3]:
# Paths to the five text files
# These represent the text files of the books to be processed.
file_paths = ['Book1.txt', 'Book2.txt', 'Book3.txt',  'Book4.txt',  'Book5.txt'   
]

In [4]:
# Function to read text from a file
# This function opens the given file, reads the entire content, and returns it as a string.
def read_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [5]:
# Function to clean the text
# This function removes unnecessary parts of the text, like the Project Gutenberg preamble and postamble.
# It also removes all characters that are not ASCII letters, full stops, or spaces, and converts the text to uppercase.
def clean_text(text):
    # Remove preamble and postamble (e.g., legal information, licensing)
    start = re.search(r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    end = re.search(r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    if start and end:
        text = text[start.end():end.start()]

    # Remove all non-ASCII letters except full stops and spaces
    text = re.sub(r'[^A-Za-z. ]', '', text)

    # Convert all letters to uppercase for consistency
    text = text.upper()

    return text

In [6]:
# Function to create a trigram model
# This function takes the cleaned text and creates a trigram model.
# It counts how many times each sequence of three characters (trigram) appears in the text.
def create_trigram_model(text):
    # Use a defaultdict to store trigram counts. Default value is 0 for any trigram not yet encountered.
    trigram_model = defaultdict(int)
    
    # Slide through the text to create trigrams and count their occurrences
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Extract a sequence of 3 characters (trigram)
        trigram_model[trigram] += 1  # Increment the count for this trigram

    return trigram_model

In [7]:
# Read and clean texts from all files
# This reads and cleans each of the five books, removing unwanted characters and standardizing the format.
texts = [clean_text(read_text(file_path)) for file_path in file_paths]

# Combine all cleaned texts into one
# Joins the texts from all the books into a single large block of text.
combined_text = ' '.join(texts)

# Create the trigram model using the combined text
# This generates the trigram model, counting how often each trigram appears.
trigram_model = create_trigram_model(combined_text)

In [8]:
# Output some of the trigram model
# Here, we print the first 10 trigrams and their counts from the model to see the results.
for trigram, count in list(trigram_model.items())[:10]:
    print(f'{trigram}: {count}')

THE: 27554
HE : 24462
E P: 2554
 PR: 2524
PRO: 1957
ROJ: 456
OJE: 456
JEC: 745
ECT: 2128
CT : 1070


In [None]:
# Task 2
