`define function` word_count_summary(file_path, search_terms)

In [6]:
import re

def validate_word(word):
    """Ensure a search term consists solely of ASCII letters, digits and underscores.

    Args:
        word (str): The search term to check.

    Raises:
        ValueError: If ``word`` is empty or contains any character other
            than A-Z, a-z, 0-9 and underscore.
    """
    # An explicit ASCII class is used instead of \w: for str patterns \w is
    # Unicode-aware and would also accept non-ASCII letters and digits,
    # contradicting the rule stated in the error message below.
    if not re.fullmatch(r'[A-Za-z0-9_]+', word):
        raise ValueError(f"The word '{word}' contains invalid characters. Only A-Z, a-z, 0-9, and underscore '_' are allowed.")

def validate_search_terms(search_terms):
    """Normalise ``search_terms`` into a list of validated words.

    A lone string is checked with ``validate_word`` and returned wrapped in a
    one-element list. A list is checked element by element, in order: each
    element must be a ``str`` and is then passed to ``validate_word``; the
    same list object is returned afterwards.

    Raises:
        TypeError: If ``search_terms`` is neither a string nor a list, or if
            a list element is not a string.

    Any exception raised by ``validate_word`` propagates unchanged.
    """
    # Single string: validate it and hand back a list for uniform processing.
    if isinstance(search_terms, str):
        validate_word(search_terms)
        return [search_terms]

    # Anything that is not a list at this point is unsupported.
    if not isinstance(search_terms, list):
        raise TypeError("search_terms should be a string or a list of strings.")

    for term in search_terms:
        if isinstance(term, str):
            validate_word(term)
            continue
        raise TypeError(f"Expected a string, but got {type(term).__name__} for term '{term}'.")
    return search_terms

def is_whole_word(word, text):
    """Return True if ``word`` occurs in ``text`` as a whole word (case-sensitive).

    An occurrence counts as a whole word when the characters directly before
    and after it, if any, are neither alphanumeric nor an underscore.
    """
    def _is_word_char(ch):
        # Alphanumerics and '_' are the characters that can extend a word.
        return ch.isalnum() or ch == '_'

    span = len(word)
    start = text.find(word)
    while start != -1:
        end = start + span
        left_ok = start == 0 or not _is_word_char(text[start - 1])
        right_ok = end == len(text) or not _is_word_char(text[end])
        if left_ok and right_ok:
            return True
        # Resume one character further on so overlapping occurrences are seen.
        start = text.find(word, start + 1)
    return False

def print_single_word_count(word, words):
    """Print how many entries of ``words`` contain ``word`` as a whole word.

    Each entry is tested with ``is_whole_word``; the message uses "time" for
    a count of exactly one and "times" otherwise.
    """
    word_count = 0
    for token in words:
        if is_whole_word(word, token):
            word_count += 1

    if word_count != 1:
        print(f'The word `{word}` appears {word_count} times.')
        return
    print(f'The word `{word}` appears {word_count} time.')

def print_word_count_table(search_terms, counts):
    """Print an aligned two-column table of search terms and their counts.

    The table has a WORD/COUNT header, one row per term and a TOTAL footer
    holding the sum of ``counts``. Only built-in string formatting is used.

    Args:
        search_terms (list of str): Terms to list, one per row.
        counts (list of int): Count for each term, in the same order. Rows
            are paired with ``zip``; the footer sums all of ``counts``.
    """
    total_count = sum(counts)

    # Size each column from everything printed in it, including the TOTAL
    # footer, so short search terms or a wide grand total cannot push the
    # footer row out of alignment with the borders.
    word_cells = list(search_terms) + ["WORD", "TOTAL"]
    count_cells = [str(count) for count in counts] + ["COUNT", str(total_count)]
    word_col_width = max(len(cell) for cell in word_cells) + 2  # Add padding for spaces
    count_col_width = max(len(cell) for cell in count_cells) + 2  # Add padding for spaces

    border = f"|{'-' * word_col_width}|{'-' * count_col_width}|"

    def format_row(label, value):
        # Labels are left-aligned, values right-aligned.
        return f"| {label.ljust(word_col_width - 1)}| {str(value).rjust(count_col_width - 1)}|"

    # Header
    print(border)
    print(format_row("WORD", "COUNT"))
    print(border)

    # One row per search term
    for keyword, count in zip(search_terms, counts):
        print(format_row(keyword, count))

    # Footer with the grand total
    print(border)
    print(format_row("TOTAL", total_count))
    print(border)

def word_count_summary(file_path, search_terms):
    """Print how often each search term occurs as a whole word in a text file.

    Args:
        file_path (str): The path to the file to be read; it is decoded as UTF-8.
        search_terms (str or list): The search term(s) to count. A string is
            treated as ONE term (it is not split); a list must contain only
            strings. Terms are checked by ``validate_search_terms``.

    Returns:
        None. A single term is reported with ``print_single_word_count``;
        any other number of terms is reported with ``print_word_count_table``.
        Errors are caught and reported as printed messages.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from collections import Counter

    try:
        # Validate and normalize search terms
        search_terms = validate_search_terms(search_terms)

        # Read the file; it is closed again before any processing starts.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Split content into tokens, each a maximal run of word characters.
        words = re.findall(r'\b\w+\b', content)

        if len(search_terms) == 1:
            print_single_word_count(search_terms[0], words)
        else:
            # A token consists of word characters only, so a term occurs in it
            # as a whole word exactly when the two are equal. One tally of the
            # tokens therefore yields every term's count in a single pass
            # instead of rescanning all tokens once per term.
            tally = Counter(words)
            counts = [tally[term] for term in search_terms]
            print_word_count_table(search_terms, counts)

    except FileNotFoundError:
        print('Error: File not found.')
    except UnicodeDecodeError as ude:
        # UnicodeDecodeError subclasses ValueError; handle it first so a file
        # that is not valid UTF-8 is not misreported as a validation error.
        print(f'An error occurred: {str(ude)}')
    except ValueError as ve:
        print(f"Validation error: {ve}")
    except TypeError as te:
        print(f"Type error: {te}")
    except Exception as e:
        print(f'An error occurred: {str(e)}')


# Example input: the path of the text to search (a relative path, so it is
# resolved against the current working directory) and the term to count.
path = '../../pride-and-prejudice.txt'
# Alternative example term:
# keyword = 'Lydia'
keyword = 'the'

# Run the search; the result (or an error message) is printed to stdout.
word_count_summary(path, keyword)


The word `the` appears 4060 times.


To address the shortcoming of the above way of processing a list of strings:

 `The current approach is inefficient because the whole text is scanned once for every string in the list.`
 
 Here are some potential improvements:
 1. Possibly cache the search result for each word in the list, and reuse the cache for later strings in the list.
 2. Pre-compute the results and save them; looking a result up in the pre-computed set then saves computing time.

In [100]:
import re

# Compile regex patterns once so every call reuses them.
# Search terms may contain only A-Z, a-z, 0-9 and underscore. An explicit ASCII
# class is used because \w on str patterns is Unicode-aware and would also
# accept non-ASCII letters and digits.
valid_word_pattern = re.compile(r'[A-Za-z0-9_]+')
# Tokenises the text being searched: each match is a maximal run of word characters.
whole_word_pattern = re.compile(r'\b\w+\b')

def validate_word(word):
    """Raise ``ValueError`` unless ``valid_word_pattern`` matches all of ``word``.

    Args:
        word (str): The search term to check.

    Raises:
        ValueError: If the module-level ``valid_word_pattern`` does not
            match the entire string.
    """
    # fullmatch() returns None when the pattern does not cover the whole string.
    if valid_word_pattern.fullmatch(word) is None:
        raise ValueError(f"The word '{word}' contains invalid characters. Only A-Z, a-z, 0-9, and underscore '_' are allowed.")

def validate_search_terms(search_terms):
    """Validate ``search_terms`` and return it as a list of words.

    A string is passed to ``validate_word`` and returned as a one-element
    list. A list is checked in order: every element must be a ``str`` and is
    passed to ``validate_word``; the original list object is then returned.

    Raises:
        TypeError: If ``search_terms`` is neither a string nor a list, or if
            a list element is not a string.

    Any exception raised by ``validate_word`` propagates unchanged.
    """
    # Reject unsupported container types up front.
    if not isinstance(search_terms, (str, list)):
        raise TypeError("search_terms should be a string or a list of strings.")

    # Single string: validate it and wrap it for uniform processing.
    if isinstance(search_terms, str):
        validate_word(search_terms)
        return [search_terms]

    # List: each element must be a string before it is validated as a word.
    for term in search_terms:
        if not isinstance(term, str):
            raise TypeError(f"Expected a string, but got {type(term).__name__} for term '{term}'.")
        validate_word(term)
    return search_terms

def is_whole_word(word, text):
    """Return True if ``word`` occurs in ``text`` as a whole word (case-sensitive).

    An occurrence counts as a whole word when the characters directly before
    and after it, if any, are neither alphanumeric nor an underscore.
    """
    word_len = len(word)
    text_len = len(text)
    pos = text.find(word)
    while pos >= 0:
        stop = pos + word_len
        # An empty string stands in for "no neighbour"; it is neither
        # alphanumeric nor '_', so the edges of the text act as boundaries.
        before = text[pos - 1] if pos > 0 else ''
        after = text[stop] if stop < text_len else ''
        if not (before.isalnum() or before == '_' or after.isalnum() or after == '_'):
            return True
        # Resume one character further on so overlapping occurrences are seen.
        pos = text.find(word, pos + 1)
    return False

def print_single_word_count(word, words):
    """Print how many entries of ``words`` contain ``word`` as a whole word.

    Each entry is tested with ``is_whole_word``; the message uses "time" for
    a count of exactly one and "times" otherwise.
    """
    word_count = len([token for token in words if is_whole_word(word, token)])

    if word_count != 1:
        print(f'The word `{word}` appears {word_count} times.')
        return
    print(f'The word `{word}` appears {word_count} time.')

def print_word_count_table(search_terms, counts):
    """Print an aligned two-column table of search terms and their counts.

    The table has a WORD/COUNT header, one row per term and a TOTAL footer
    holding the sum of ``counts``.

    Args:
        search_terms (list of str): Terms to list, one per row.
        counts (list of int): Count for each term, in the same order. Rows
            are paired with ``zip``; the footer sums all of ``counts``.
    """
    total_count = sum(counts)

    # Size each column from everything printed in it, including the TOTAL
    # footer, so short search terms or a wide grand total cannot push the
    # footer row out of alignment with the borders.
    word_cells = list(search_terms) + ["WORD", "TOTAL"]
    count_cells = [str(count) for count in counts] + ["COUNT", str(total_count)]
    word_col_width = max(len(cell) for cell in word_cells) + 2  # Add padding for spaces
    count_col_width = max(len(cell) for cell in count_cells) + 2  # Add padding for spaces

    border = f"|{'-' * word_col_width}|{'-' * count_col_width}|"

    def format_row(label, value):
        # Labels are left-aligned, values right-aligned.
        return f"| {label.ljust(word_col_width - 1)}| {str(value).rjust(count_col_width - 1)}|"

    # Header
    print(border)
    print(format_row("WORD", "COUNT"))
    print(border)

    # One row per search term
    for keyword, count in zip(search_terms, counts):
        print(format_row(keyword, count))

    # Footer with the grand total
    print(border)
    print(format_row("TOTAL", total_count))
    print(border)

def word_count_summary(file_path, search_terms):
    """Print how often each search term occurs as a whole word in a text file.

    Args:
        file_path (str): Path of the file to read; it is decoded as UTF-8.
        search_terms (str or list): A single term or a list of terms. A
            string is treated as one term and is not split. Terms are
            checked by ``validate_search_terms``.

    Returns:
        None. A single term is reported with ``print_single_word_count``;
        any other number of terms is reported with ``print_word_count_table``.
        Errors are caught and reported as printed messages.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from collections import Counter

    try:
        # Validate and normalize search terms
        search_terms = validate_search_terms(search_terms)

        # Read the file; it is closed again before any processing starts.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Use the compiled regex to split the content into tokens, each a
        # maximal run of word characters.
        words = whole_word_pattern.findall(content)

        if len(search_terms) == 1:
            print_single_word_count(search_terms[0], words)
        else:
            # A token consists of word characters only, so a term occurs in it
            # as a whole word exactly when the two are equal. One tally of the
            # tokens therefore yields every term's count in a single pass
            # instead of rescanning all tokens once per term.
            tally = Counter(words)
            counts = [tally[term] for term in search_terms]
            print_word_count_table(search_terms, counts)

    except FileNotFoundError:
        print('Error: File not found.')
    except UnicodeDecodeError as ude:
        # UnicodeDecodeError subclasses ValueError; handle it first so a file
        # that is not valid UTF-8 is not misreported as a validation error.
        print(f'An error occurred: {str(ude)}')
    except ValueError as ve:
        print(f"Validation error: {ve}")
    except TypeError as te:
        print(f"Type error: {te}")
    except Exception as e:
        print(f'An error occurred: {str(e)}')

# Example input: the path of the text to search and the term to count.
path = '../pride-and-prejudice.txt'
keyword = '_That_'  # NOTE: immediately overwritten by the next assignment
keyword = '_would_'
# Run the search; the result (or an error message) is printed to stdout.
word_count_summary(path, keyword)


The word `Kitty` appears 71 times.
