In [9]:
# Task 1
# This project reads and processes five books from Project Gutenberg to build a trigram model. A trigram is a sequence of three characters, and the model counts how often each trigram appears in the text

import re
from collections import defaultdict

# Paths to the five text files
file_paths = [
    'Book1.txt',  
    'Book2.txt',  
    'Book3.txt',       
    'Book4.txt',     
    'Book5.txt'          
]

def read_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_text(text):
    # Remove preamble and postamble
    start = re.search(r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    end = re.search(r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    if start and end:
        text = text[start.end():end.start()]
    
    # Keep only ASCII letters, full stops, and spaces
    text = re.sub(r'[^A-Za-z. ]', '', text)
    
    # Letters all converted to uppercase
    text = text.upper()
    
    return text


def create_trigram_model(text):
    trigram_model = defaultdict(int)
    for i in range(len(text) - 2):
        trigram = text[i:i+3]
        trigram_model[trigram] += 1
    return trigram_model

# Cleaned texts
texts = [clean_text(read_text(file_path)) for file_path in file_paths]

# All texts combined 
combined_text = ' '.join(texts)

# Create trigram model
trigram_model = create_trigram_model(combined_text)

# Print some of the trigram model
for trigram, count in list(trigram_model.items())[:10]:
    print(f'{trigram}: {count}')



THE: 27554
HE : 24462
E P: 2554
 PR: 2524
PRO: 1957
ROJ: 456
OJE: 456
JEC: 745
ECT: 2128
CT : 1070
