In [1]:
import requests, re
import pandas as pd
from pathlib import Path
import spacy
import os
from typing import List

nlp = spacy.blank("en")

# Text-related functions to clean up raw data

In [2]:
def get_base_text_url(url: str, timeout: float = 30.0) -> str:
    '''
    Downloads a URL and returns the response body as text.
    
    Parameters
    ----------
    url : str
        The URL to get the text from.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.
        
    Returns
    -------
    str
        The raw text from the URL.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never mistaken for book text.
    requests.RequestException
        On connection problems or when the timeout expires.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of returning the error page as "text".
    response.raise_for_status()
    return response.text

def get_base_text_file(file_path: str) -> str:
    '''
    Loads a UTF-8 encoded text file into memory.
    
    Parameters
    ----------
    file_path : str
        Location of the text file on disk.
        
    Returns
    -------
    str
        Everything the file contains, as one string.
    '''
    return Path(file_path).read_text(encoding='utf-8')


def clean_guttenberg_header(text: str) -> str:
    '''
    Strips the Project Gutenberg preamble from a text.

    Everything from "The Project Gutenberg eBook" up to and including the
    "*** START OF THE PROJECT GUTENBERG EBOOK ... ***" marker is deleted.
    
    Parameters
    ----------
    text : str
        The raw text obtained from Project Gutenberg.
        
    Returns
    -------
    str
        The text without the header; unchanged if no header is found.
    '''
    header_re = re.compile(
        r"The Project Gutenberg eBook.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*",
        re.DOTALL,
    )
    return header_re.sub("", text)

def clean_guttenberg_footer(text: str) -> str:
    '''
    Strips the Project Gutenberg trailer from a text.

    The text is cut at the first "*** END OF THE PROJECT GUTENBERG EBOOK"
    marker; the marker and everything after it are dropped.
    
    Parameters
    ----------
    text : str
        The text to trim (typically already stripped of its header).
        
    Returns
    -------
    str
        The text up to the footer marker; unchanged if no marker is found.
    '''
    marker = re.search(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK", text)
    if marker is None:
        return text
    return text[:marker.start()]

def clean_guttenberg(text: str) -> str:
    '''
    Removes the Project Gutenberg header and then the footer from a text.
    
    Parameters
    ----------
    text : str
        The raw text obtained from Project Gutenberg.
        
    Returns
    -------
    str
        The body of the text with neither header nor footer.
    '''
    # Header first, footer second.
    return clean_guttenberg_footer(clean_guttenberg_header(text))

def normalize_input(text: str) -> str:
    '''
    Strips sentence punctuation, collapses whitespace and lower-cases a text.

    The marks ``. ? ! ,`` are replaced by a single space together with any
    whitespace that follows them (newlines included). Runs of spaces/tabs and
    runs of newlines are then collapsed, whitespace before a newline is
    dropped, and the result is lower-cased. No tokenisation or stop-word
    removal is performed.
    
    Parameters
    ----------
    text : str
        The input text to be normalized.
        
    Returns
    -------
    str
        The normalized text as a single string (not a list of tokens).
    '''
    normalized = text
    # punctuation plus trailing whitespace becomes one space
    normalized = re.sub(r'(\.|\?|\!|,)\s*', ' ', normalized)

    # getting rid of multiple spaces
    normalized = re.sub(r'[ \t]+', ' ', normalized)
    # getting rid of multiple new lines at a time
    normalized = re.sub(r'\n+', '\n', normalized)
    # dropping whitespace that sits directly before a line break
    normalized = re.sub(r'\s*\n', '\n', normalized)
    
    # making lowercase
    normalized = normalized.lower()
    # Other clean-up steps can be added here if needed

    return normalized

# Functions for splitting text into chapters and saving them

In [112]:


# Function to split up text into chapters
def split_text_into_chapters(text: str, chapter_titles: List[str]) -> List[str]:
    '''
    Splits a text into chapters based on the given chapter titles.

    Each case-insensitive occurrence of any title starts a new chapter that
    runs up to the next occurrence (or the end of the text). Anything before
    the first occurrence is discarded.
    
    Parameters
    ----------
    text : str
        The text to be split into chapters.
    chapter_titles : List[str]
        A list of chapter titles used to find chapter breaks. Empty strings
        are ignored.
        
    Returns
    -------
    List[str]
        A list of chapter contents, each stripped of surrounding whitespace.
        Empty when no usable titles are given or none occur in the text.
    '''
    # An empty alternative would match at every offset and yield one
    # "chapter" per character, so empty titles are dropped up front.
    usable_titles = [title for title in chapter_titles if title]
    if not usable_titles:
        return []

    chapter_regex = "|".join(re.escape(title) for title in usable_titles)
    pattern = re.compile(chapter_regex, re.IGNORECASE)

    starts = [m.start() for m in pattern.finditer(text)]
    # Pair each start with the next start; None means "to end of text".
    ends = starts[1:] + [None]

    return [text[start:end].strip() for start, end in zip(starts, ends)]

# Saving chapters to a CSV file
def save_chapters_to_csv(chapters: List[str], file: str):
    '''
    Writes chapters to a two-column CSV ("Chapter", "Content").

    The "Chapter" column holds the labels "Chapter 1", "Chapter 2", ...
    in list order; "Content" holds the matching chapter text.
    
    Parameters
    ----------
    chapters : List[str]
        The list of chapter contents.
    file : str
        Destination path of the CSV file.
        
    Returns
    -------
    None
    '''
    labels = [f'Chapter {number}' for number in range(1, len(chapters) + 1)]
    table = pd.DataFrame({'Chapter': labels, 'Content': chapters})
    # UTF-8 output, no index column.
    table.to_csv(file, index=False, encoding='utf-8')

    

def process_file(file: str, chapter_titles: List[str], remove_table_of_contents: bool = False, table_of_contents: List[str] = None):
    '''
    Processes a single .txt file: optionally removes the table of contents, strips the
    Project Gutenberg header and footer, splits the text into chapters, normalizes each
    chapter, and saves the result as a .csv file named after the original .txt file.

    Parameters
    ----------
    file : str
        The path to the .txt file being processed.
    chapter_titles : List[str]
        A list of chapter titles used to split the text into chapters.
    remove_table_of_contents : bool
        If True, removes the first occurrence (case-insensitive) of each table-of-contents
        title from the raw text before it is cleaned and split.
    table_of_contents : List[str], optional
        A separate list of titles for the table of contents. If not provided, it defaults to chapter titles.

    Returns
    -------
    None
    '''
    csv_name = re.sub(r'\.txt$', '.csv', file, flags=re.IGNORECASE)
    if csv_name == file:
        # No ".txt" suffix to swap: append ".csv" so the CSV never
        # overwrites the source text file.
        csv_name = file + '.csv'

    text = get_base_text_file(file)

    # Use chapter_titles as the default for table_of_contents if not specified
    if table_of_contents is None:
        table_of_contents = chapter_titles

    # Optionally remove the first occurrence of each title (the table of contents entry)
    if remove_table_of_contents:
        for title in table_of_contents:
            text = re.sub(re.escape(title), '', text, count=1, flags=re.IGNORECASE)

    # Strip the Gutenberg header/footer, split into chapters, then normalize each chapter
    book_full = clean_guttenberg(text)
    book_chapters = split_text_into_chapters(book_full, chapter_titles)
    normalized_chapters = [normalize_input(chapter) for chapter in book_chapters]

    # Save the chapters to a CSV file
    save_chapters_to_csv(normalized_chapters, csv_name)

    print(f"Processed {file} and saved as {csv_name}")

# Helper functions used to generate chapter titles for chapter separation

In [170]:
def list_txt_files_in_directory(directory: str):
    '''
    Prints the name of every entry in a directory that ends with ".txt",
    followed by a line with how many were found.
    
    Parameters
    ----------
    directory : str
        The path to the directory containing .txt files.
    '''
    # Keep only the entries with a ".txt" suffix, in os.listdir order.
    found = [entry for entry in os.listdir(directory) if entry.endswith(".txt")]

    for entry in found:
        print(entry)

    print(f"Total number of .txt files: {len(found)}")

def list_txt_path_in_directory(directory: str):
    '''
    Prints one line per ".txt" entry in a directory, in the form
    book = "directory/file.txt"
    followed by a line with how many were found.
    
    Parameters
    ----------
    directory : str
        The path to the directory containing .txt files.
    '''
    # Each path is the directory argument joined with the file name.
    paths = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".txt")
    ]

    for path in paths:
        print(f'book = "{path}"')

    print(f"Total number of .txt files: {len(paths)}")


# Helper function to convert an integer to a Roman numeral
def int_to_roman(n: int) -> str:
    '''
    Converts an integer to its lower-case Roman numeral representation.
    
    Parameters
    ----------
    n : int
        The integer to be converted.
        
    Returns
    -------
    str
        The Roman numeral for ``n`` in lower case (e.g. 14 -> "xiv").
        Values of zero or below produce an empty string.
    '''
    # Largest value first; the subtractive pairs (cm, cd, xc, xl, ix, iv)
    # are listed so the greedy walk below emits them directly.
    numerals = (
        (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
        (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
        (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
    )
    pieces = []

    for value, symbol in numerals:
        if n >= value:
            count, n = divmod(n, value)
            pieces.append(symbol * count)

    return ''.join(pieces)

# Generate an array with chapter titles "chapter i", "chapter ii", ...
def generate_roman_chapters(num_chapters: int) -> List[str]:
    '''
    Builds the titles "chapter i", "chapter ii", ... up to ``num_chapters``.
    
    Parameters
    ----------
    num_chapters : int
        How many chapter titles to produce.
        
    Returns
    -------
    List[str]
        The titles in order, each "chapter " followed by the Roman numeral
        returned by ``int_to_roman``.
    '''
    titles = []
    for number in range(1, num_chapters + 1):
        titles.append("chapter " + int_to_roman(number))
    return titles

def text_to_array(text: str) -> List[str]:
    '''
    Breaks a block of text into its non-blank lines.
    
    Parameters
    ----------
    text : str
        The input text containing multiple lines.
        
    Returns
    -------
    List[str]
        One entry per line, with surrounding whitespace removed; lines that
        are empty after stripping are left out.
    '''
    trimmed_lines = (raw_line.strip() for raw_line in text.split('\n'))
    return [entry for entry in trimmed_lines if entry]

def remove_underscore_numbers(line):
    '''
    Deletes every "_<digits>_" group (underscore, one or more digits,
    underscore) from the line and returns the result.
    '''
    underscored_number = re.compile(r'_\d+_')
    return underscored_number.sub('', line)

def remove_numbers(line):
    '''
    Deletes every run of digits from the line and returns the result.
    '''
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', line)
def format_chapter_titles_with_extra_line(chapters):
    '''
    Rewrites each "NUMBER TITLE" entry as "NUMBER\\n\\nTITLE".

    Every entry is split at its first space; an entry without a space
    raises ValueError from the unpacking.
    '''
    reshaped = []
    for entry in chapters:
        # Everything before the first space is the number, the rest the title.
        leading, remainder = entry.split(' ', 1)
        reshaped.append(f'{leading}\n\n{remainder}')
    return reshaped

Since each book can have its own chapter format, I am manually adding parameters for each book to account for the numerous special cases. If new books are added, they will have to be manually adjusted, though I will try to make helper functions for specific formats to make it easier to add new books.

# Agatha Christie Books to CSV
- Doing all 11 texts we have 

In [45]:
## Agatha Christie
# Each book below is processed with process_file, which writes a CSV next to the
# source .txt file. remove_table_of_contents=True deletes the first occurrence of
# each title before splitting.

# Story titles are used as the chapter breaks.
book = "data/ac/poirot_investigates.txt"
chapter_titles = ['The Adventure of “The Western Star', 'The Tragedy at Marsdon Manor', 'The Adventure of the Cheap Flat', 'The Mystery of Hunter’s Lodge', 'The Million Dollar Bond Robbery', 'The Adventure of the Egyptian Tomb', 'Jewel Robbery at the _Grand Metropolitan_', 'The Kidnapped Prime Minister', 'The Disappearance of Mr. Davenheim', 'The Adventure of the Italian Nobleman', 'The Case of the Missing Will']
process_file(book, chapter_titles, remove_table_of_contents=True)


# Numbered, upper-case chapter titles ("1. TITLE").
book = "data/ac/the_big_four.txt"
chapter_titles = ['1. THE UNEXPECTED GUEST', '2. THE MAN FROM THE ASYLUM', '3. WE HEAR MORE ABOUT LI CHANG YEN', '4. THE IMPORTANCE OF A LEG OF MUTTON', '5. DISAPPEARANCE OF A SCIENTIST', '6. THE WOMAN ON THE STAIRS', '7. THE RADIUM THIEVES', '8. IN THE HOUSE OF THE ENEMY', '9. THE YELLOW JASMINE MYSTERY', '10. WE INVESTIGATE AT CROFTLANDS', '11. A CHESS PROBLEM', '12. THE BAITED TRAP', '13. THE MOUSE WALKS IN', '14. THE PEROXIDE BLONDE', '15. THE TERRIBLE CATASTROPHE', '16. THE DYING CHINAMAN', '17. NUMBER FOUR WINS A TRICK', '18. IN THE FELSENLABYRYNTH']
process_file(book, chapter_titles, remove_table_of_contents=True)


# Split on the asterisk separator line.
# NOTE(review): the titles are joined into one alternation regex, so repeating the
# same separator string does not change where the text is split.
book = "data/ac/the_hunters_lodge_case.txt"
chapter_titles = ['*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *']
process_file(book, chapter_titles, remove_table_of_contents=False)


# "PROLOGUE" followed by "CHAPTER I" .. "CHAPTER XXXVII".
book = "data/ac/the_man_in_the_brown_suit.txt"
chapter_titles = generate_roman_chapters(37)
chapter_titles = [chapter.upper() for chapter in chapter_titles]
chapter_titles.insert(0, 'PROLOGUE')
process_file(book, chapter_titles, remove_table_of_contents=False)


# Split on the asterisk separator line (see the note on repeated separators above).
book = "data/ac/the_missing_will.txt"
chapter_titles = ['*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *']
process_file(book, chapter_titles, remove_table_of_contents=False)


# "CHAPTER I" .. "CHAPTER XXVII".
book = "data/ac/the_murder_of_roger_ackroyd.txt"
chapter_titles = generate_roman_chapters(27)
chapter_titles = [chapter.upper() for chapter in chapter_titles]
process_file(book, chapter_titles, remove_table_of_contents=False)


# Numbered chapter titles without a dot ("1 Title").
book = "data/ac/the_murder_on_the_links.txt"
chapter_titles = ['1 A Fellow Traveller', '2 An Appeal for Help', '3 At the Villa Geneviève', '4 The Letter Signed “Bella”', '5 Mrs. Renauld’s Story', '6 The Scene of the Crime', '7 The Mysterious Madame Daubreuil', '8 An Unexpected Meeting', '9 M. Giraud Finds Some Clues', '10 Gabriel Stonor', '11 Jack Renauld', '12 Poirot Elucidates Certain Points', '13 The Girl with the Anxious Eyes', '14 The Second Body', '15 A Photograph', '16 The Beroldy Case', '17 We Make Further Investigations', '18 Giraud Acts', '19 I Use My Grey Cells', '20 An Amazing Statement', '21 Hercule Poirot on the Case!', '22 I Find Love', '23 Difficulties Ahead', '24 “Save Him!”', '25 An Unexpected Dénouement', '26 I Receive a Letter', '27 Jack Renauld’s Story', '28 Journey’s End']
process_file(book, chapter_titles, remove_table_of_contents=True)


# "CHAPTER I" .. "CHAPTER XIII".
book = "data/ac/the_mysterious_affair_at_styles.txt"
chapter_titles = generate_roman_chapters(13)
chapter_titles = [chapter.upper() for chapter in chapter_titles]
process_file(book, chapter_titles, remove_table_of_contents=True)


# Numbered chapter titles ("1. Title").
book = "data/ac/the_mystery_of_the_blue_train.txt"
chapter_titles = ['1. The Man with the White Hair', '2. M. le Marquis', '3. Heart of Fire', '4. In Curzon Street', '5. A Useful Gentleman', '6. Mirelle', '7. Letters', '8. Lady Tamplin Writes a Letter', '9. An Offer Refused', '10. On the Blue Train', '11. Murder', '12. At the Villa Marguerite', '13. Van Aldin Gets a Telegram', "14. Ada Mason's Story", '15. The Comte de la Roche', '16. Poirot Discusses the Case', '17. An Aristocratic Gentleman', '18. Derek Lunches', '19. An Unexpected Visitor', '20. Katherine Makes a Friend', '21. At the Tennis', '22. M. Papopolous Breakfasts', '23. A New Theory', '24. Poirot Gives Advice', '25. Defiance', '26. A Warning', '27. Interview with Mirelle', '28. Poirot Plays the Squirrel', '29. A Letter From Home', '30. Miss Viner Gives Judgment', '31. Mr. Aarons Lunches', '32. Katherine and Poirot Compare Notes', '33. A New Theory', '34. The Blue Train Again', '35. Explanations', '36. By the Sea']
process_file(book, chapter_titles, remove_table_of_contents=True)


# Split on the asterisk separator line (see the note on repeated separators above).
book = "data/ac/the_plymouth_express_affair.txt"
chapter_titles = ['*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *', '*       *       *       *       *']
process_file(book, chapter_titles, remove_table_of_contents=False)


# "PROLOGUE" followed by "CHAPTER I" .. "CHAPTER XXVIII".
book = "data/ac/the_secret_adversary.txt"
chapter_titles = generate_roman_chapters(28)
chapter_titles = [chapter.upper() for chapter in chapter_titles]
chapter_titles.insert(0, 'PROLOGUE')
process_file(book, chapter_titles, remove_table_of_contents=True)


# The table-of-contents entries ("N TITLE" on one line) differ from the chapter
# headings used for splitting ("N", blank line, "TITLE"), so two lists are passed.
# NOTE(review): entry 27 differs between the lists ("(_contd._)" vs "(contd.)") - presumably
# this mirrors the source text; confirm against the file.
book = "data/ac/the_secret_of_chimneys.txt"
toc = ['1 ANTHONY CADE SIGNS ON', '2 A LADY IN DISTRESS', '3 ANXIETY IN HIGH PLACES', '4 INTRODUCING A VERY CHARMING LADY', '5 FIRST NIGHT IN LONDON', '6 THE GENTLE ART OF BLACKMAIL', '7 MR. MCGRATH REFUSES AN INVITATION', '8 A DEAD MAN', '9 ANTHONY DISPOSES OF A BODY', '10 CHIMNEYS', '11 SUPERINTENDENT BATTLE ARRIVES', '12 ANTHONY TELLS HIS STORY', '13 THE AMERICAN VISITOR', '14 MAINLY POLITICAL AND FINANCIAL', '15 THE FRENCH STRANGER', '16 TEA IN THE SCHOOLROOM', '17 A MIDNIGHT ADVENTURE', '18 SECOND MIDNIGHT ADVENTURE', '19 SECRET HISTORY', '20 BATTLE AND ANTHONY CONFER', '21 MR. ISAACSTEIN’S SUIT-CASE', '22 THE RED SIGNAL', '23 ENCOUNTER IN THE ROSE GARDEN', '24 THE HOUSE AT DOVER', '25 TUESDAY NIGHT AT CHIMNEYS', '26 THE 13TH OF OCTOBER', '27 THE 13TH OF OCTOBER (_contd._)', '28 KING VICTOR', '29 FURTHER EXPLANATIONS', '30 ANTHONY SIGNS ON FOR A NEW JOB', '31 SUNDRY DETAILS']
chapter_titles = ['1\n\nANTHONY CADE SIGNS ON', '2\n\nA LADY IN DISTRESS', '3\n\nANXIETY IN HIGH PLACES', '4\n\nINTRODUCING A VERY CHARMING LADY', '5\n\nFIRST NIGHT IN LONDON', '6\n\nTHE GENTLE ART OF BLACKMAIL', '7\n\nMR. MCGRATH REFUSES AN INVITATION', '8\n\nA DEAD MAN', '9\n\nANTHONY DISPOSES OF A BODY', '10\n\nCHIMNEYS', '11\n\nSUPERINTENDENT BATTLE ARRIVES', '12\n\nANTHONY TELLS HIS STORY', '13\n\nTHE AMERICAN VISITOR', '14\n\nMAINLY POLITICAL AND FINANCIAL', '15\n\nTHE FRENCH STRANGER', '16\n\nTEA IN THE SCHOOLROOM', '17\n\nA MIDNIGHT ADVENTURE', '18\n\nSECOND MIDNIGHT ADVENTURE', '19\n\nSECRET HISTORY', '20\n\nBATTLE AND ANTHONY CONFER', '21\n\nMR. ISAACSTEIN’S SUIT-CASE', '22\n\nTHE RED SIGNAL', '23\n\nENCOUNTER IN THE ROSE GARDEN', '24\n\nTHE HOUSE AT DOVER', '25\n\nTUESDAY NIGHT AT CHIMNEYS', '26\n\nTHE 13TH OF OCTOBER', '27\n\nThe 13th of October (contd.)', '28\n\nKING VICTOR', '29\n\nFURTHER EXPLANATIONS', '30\n\nANTHONY SIGNS ON FOR A NEW JOB', '31\n\nSUNDRY DETAILS']
process_file(book, chapter_titles, remove_table_of_contents=True, table_of_contents=toc)

Processed data/ac/poirot_investigates.txt and saved as data/ac/poirot_investigates.csv


Created new generic functions to try and speed up the process

In [176]:
def split_text_generic(text: str, pattern: str) -> List[str]:
    '''
    Cuts a text into sections at every case-insensitive match of a regex.

    Each section starts where a match starts and runs to the start of the
    next match (or the end of the text). Text before the first match is
    not returned.
    
    Parameters
    ----------
    text : str
        The text to be split.
    pattern : str
        The regex pattern used to find the section breaks.
        
    Returns
    -------
    List[str]
        The sections in order, each stripped of surrounding whitespace.
    '''
    starts = [hit.start() for hit in re.finditer(pattern, text, re.IGNORECASE)]

    # The last section has no successor, so its end is None (= end of text).
    stops = starts[1:] + [None]

    return [text[begin:stop].strip() for begin, stop in zip(starts, stops)]

# Saving chapters to a CSV file
def save_sections_to_csv(chapters: List[str], file: str):
    '''
    Stores section texts in a CSV with the columns "Chapter" and "Content".

    Rows are labelled "Chapter 1", "Chapter 2", ... in list order.
    
    Parameters
    ----------
    chapters : List[str]
        The section contents to store (e.g., chapters).
    file : str
        The path of the CSV file to write.
        
    Returns
    -------
    None
    '''
    section_count = len(chapters)
    columns = {
        'Chapter': [f'Chapter {position + 1}' for position in range(section_count)],
        'Content': chapters,
    }
    # UTF-8 output, no index column.
    pd.DataFrame(columns).to_csv(file, index=False, encoding='utf-8')

def process_file_generic(file: str, pattern: str, remove_contents: bool = False):
    '''
    Processes a single .txt file: strips the Project Gutenberg header and footer, optionally
    removes the first match of the pattern, splits the text at every match of the pattern,
    normalizes each section, and saves the result as a .csv file.

    Parameters
    ----------
    file : str
        The path to the .txt file being processed.
    pattern : str
        The regex pattern used to find the chapter breaks (e.g., "CHAPTER [IVXLCDM]+").
        It is applied case-insensitively.
    remove_contents : bool, optional
        If True, removes the first matching portion of the text based on the same regex pattern.
    
    Returns
    -------
    None
    '''
    # Generate the CSV file name
    csv_name = re.sub(r'\.txt$', '.csv', file, flags=re.IGNORECASE)
    if csv_name == file:
        # No ".txt" suffix to swap: append ".csv" so the CSV never
        # overwrites the source text file.
        csv_name = file + '.csv'

    # Read the content of the .txt file
    text = get_base_text_file(file)

    # Strip the Project Gutenberg header and footer
    book_full = clean_guttenberg(text)

    # Optionally remove the first occurrence of the matching pattern
    if remove_contents:
        book_full = re.sub(pattern, '', book_full, count=1, flags=re.IGNORECASE)

    # Split the text using the single generic pattern
    book_chapters = split_text_generic(book_full, pattern)

    # Normalize each chapter
    normalized_chapters = [normalize_input(chapter) for chapter in book_chapters]

    # Save the chapters to a CSV file
    save_sections_to_csv(normalized_chapters, csv_name)

    print(f"Processed {file} and saved as {csv_name}")


# A. Conan Doyle
- Currently doing 10 books

In [226]:
# Each book is split either with a regex (process_file_generic) or with an explicit
# list of titles (process_file); the leading comment gives the expected section count.

# 14 total chapters
book = "data/cd/a_study_in_scarlet.txt"
pattern = r'CHAPTER [IVXLCDM]+\.\n'
process_file_generic(book, pattern, remove_contents=False)

# 12 total stories
book = "data/cd/adventures_of_sherlock_holmes.txt"
pattern = r'\nAdventure [IVXLCDM]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 17 chapters
book = "data/cd/beyond_the_city.txt"
pattern = r'\nCHAPTER [IVXLCDM]+\. '
process_file_generic(book, pattern, remove_contents=False)

# 10 chapters, could maybe be broken down more
book = "data/cd/danger!_and_other_stories.txt"
pattern = r'\n[IVX]+\. '
process_file_generic(book, pattern, remove_contents=False)

# I think this is a collection of 7 stories
book = "data/cd/his_last_bow_an_epilogue_of_sherlock_holmes.txt"
chapter_titles = [    'The Adventure of Wisteria Lodge\n', 'The Adventure of the Bruce-Partington Plans\n', 'The Adventure of the Devil’s Foot\n', 'The Adventure of the Red Circle\n', 'The Disappearance of Lady Frances Carfax\n', 'The Adventure of the Dying Detective\n', 'His Last Bow: The War Service of Sherlock Holmes\n']
process_file(book, chapter_titles, remove_table_of_contents=True)

# 36 chapters
book = "data/cd/micah_clarke.txt"
pattern = r'\nCHAPTER [IVX]+\. '
process_file_generic(book, pattern, remove_contents=False)

# 2 parts 
book = "data/cd/my_friend_the_murderer.txt"
pattern = r'(By A\. Conan Doyle)|(\*\*\*\*\*)'
process_file_generic(book, pattern, remove_contents=False)

# a bunch of songs, split on their titles
book = "data/cd/songs_of_the_road.txt"
chapter_titles = ["A HYMN OF EMPIRE", "SIR NIGEL'S SONG", "THE ARAB STEED", "A POST-IMPRESSIONIST", "EMPIRE BUILDERS", "THE GROOM'S ENCORE", "THE BAY HORSE", "THE OUTCASTS", "THE END", "1902-1909", "THE WANDERER", "BENDY'S SERMON", "COMPENSATION", "THE BANNER OF PROGRESS", "HOPE", "RELIGIO MEDICI", "MAN'S LIMITATION", "MIND AND MATTER", "DARKNESS", "A WOMAN'S LOVE", "BY THE NORTH SEA", "DECEMBER'S SNOW", "SHAKESPEARE'S EXPOSTULATION", "THE EMPIRE", "A VOYAGE", "THE ORPHANAGE", "SEXAGENARIUS LOQUITUR", "NIGHT VOICES", "THE MESSAGE", "THE ECHO", "ADVICE TO A YOUNG AUTHOR", "A LILT OF THE ROAD"]
process_file(book, chapter_titles, remove_table_of_contents=True)

# Two 6-chapter books (12 titles in total)
book = "data/cd/tales_of_terror_and_mystery.txt"
chapter_titles = ["The Horror of the Heights\n", "The Leather Funnel\n", "The New Catacomb\n", "The Case of Lady Sannox\n", "The Terror of Blue John Gap\n", "The Brazilian Cat\n", "The Lost Special\n", "The Beetle-Hunter\n", "The Man with the Watches\n", "The Japanned Box\n", "The Black Doctor\n", "The Jew's Breastplate\n"]
process_file(book, chapter_titles, remove_table_of_contents=True)

# No clear break in the book
# book = "data/cd/the_adventure_of_the_bruce-partington_plans.txt"
# chapter_titles = []
# process_file(book, chapter_titles, remove_table_of_contents=True)

# No clear break in the book
# book = "data/cd/the_adventure_of_the_cardboard_box.txt"
# chapter_titles = []
# process_file(book, chapter_titles, remove_table_of_contents=True)
# Same for data/cd/the_adventure_of_the_devils_foot.txt
# and data/cd/the_adventure_of_the_dying_detective.txt

# 2 numbered parts
book = "data/cd/the_adventure_of_wisteria_lodge.txt"
chapter_titles = ["1.  The Singular Experience of Mr. John Scott Eccles", "2.  The Tiger of San Pedro"]
process_file(book, chapter_titles, remove_table_of_contents=False)

Processed data/cd/a_study_in_scarlet.txt and saved as data/cd/a_study_in_scarlet.csv
Processed data/cd/adventures_of_sherlock_holmes.txt and saved as data/cd/adventures_of_sherlock_holmes.csv
Processed data/cd/beyond_the_city.txt and saved as data/cd/beyond_the_city.csv
Processed data/cd/danger!_and_other_stories.txt and saved as data/cd/danger!_and_other_stories.csv
Processed data/cd/his_last_bow_an_epilogue_of_sherlock_holmes.txt and saved as data/cd/his_last_bow_an_epilogue_of_sherlock_holmes.csv
Processed data/cd/micah_clarke.txt and saved as data/cd/micah_clarke.csv
Processed data/cd/my_friend_the_murderer.txt and saved as data/cd/my_friend_the_murderer.csv
Processed data/cd/songs_of_the_road.txt and saved as data/cd/songs_of_the_road.csv
Processed data/cd/tales_of_terror_and_mystery.txt and saved as data/cd/tales_of_terror_and_mystery.csv
Processed data/cd/the_adventure_of_wisteria_lodge.txt and saved as data/cd/the_adventure_of_wisteria_lodge.csv


# G. K. Chesterton
- Currently doing 10 books

In [224]:
# Each book is split either with a regex (process_file_generic) or with an explicit
# list of titles (process_file); the leading comment gives the expected section count.

# 18 chapters
book = "data/gc/a_short_history_of_england.txt"
pattern = r'\n[IVX]+\n{1,2}\w+'
process_file_generic(book, pattern, remove_contents=False)

# 35 chapters
# NOTE(review): `titles` is bound to the same list here but is not used in the visible cells.
book = "data/gc/all_things_considered.txt"
chapter_titles = titles = ["\nTHE CASE FOR THE EPHEMERAL\n", "\nCOCKNEYS AND THEIR JOKES\n", "\nTHE FALLACY OF SUCCESS\n", "\nON RUNNING AFTER ONE’S HAT\n", "\nTHE VOTE AND THE HOUSE\n", "\nCONCEIT AND CARICATURE\n", "\nPATRIOTISM AND SPORT\n", "\nAN ESSAY ON TWO CITIES\n", "\nFRENCH AND ENGLISH\n", "\nTHE ZOLA CONTROVERSY\n", "\nOXFORD FROM WITHOUT\n", "\nWOMAN\n", "\nTHE MODERN MARTYR\n", "\nON POLITICAL SECRECY\n", "\nEDWARD VII. AND SCOTLAND\n", "\nTHOUGHTS AROUND KOEPENICK\n", "\nTHE BOY\n", "\nLIMERICKS AND COUNSELS OF PERFECTION\n", "\nANONYMITY AND FURTHER COUNSELS\n", "\nON THE CRYPTIC AND THE ELLIPTIC\n", "\nTHE WORSHIP OF THE WEALTHY\n", "\nSCIENCE AND RELIGION\n", "\nTHE METHUSELAHITE\n", "\nSPIRITUALISM\n", "\nTHE ERROR OF IMPARTIALITY\n", "\nPHONETIC SPELLING\n", "\nHUMANITARIANISM AND STRENGTH\n", "\nWINE WHEN IT IS RED\n", "\nDEMAGOGUES AND MYSTAGOGUES\n", "\nTHE “EATANSWILL GAZETTE”\n", "\nFAIRY TALES\n", "\nTOM JONES AND MORALITY\n", "\nTHE MAID OF ORLEANS\n", "\nA DEAD POET\n", "\nCHRISTMAS\n"]
process_file(book, chapter_titles, remove_table_of_contents=False)

# 17 chapters
book = "data/gc/eugenics_and_other_evils.txt"
chapter_titles = ["\nWHAT IS EUGENICS?\n", "\nTHE FIRST OBSTACLES\n", "\nTHE ANARCHY FROM ABOVE\n", "\nTHE LUNATIC AND THE LAW\n", "\nTHE FLYING AUTHORITY\n", "\nTHE UNANSWERED CHALLENGE\n", "\nTHE ESTABLISHED CHURCH OF DOUBT\n", "\nA SUMMARY OF A FALSE THEORY\n", "\nTHE IMPOTENCE OF IMPENITENCE\n", "\nTRUE HISTORY OF A TRAMP\n", "\nTRUE HISTORY OF A EUGENIST\n", "\nTHE VENGEANCE OF THE FLESH\n", "\nTHE MEANNESS OF THE MOTIVE\n", "\nTHE ECLIPSE OF LIBERTY\n", "\nTHE TRANSFORMATION OF SOCIALISM\n", "\nTHE END OF THE HOUSEHOLD GODS\n", "\nA SHORT CHAPTER\n"]
process_file(book, chapter_titles, remove_table_of_contents=False)

# 20 chapters
book = "data/gc/heretics.txt"
pattern = r'\n[IVX]+\. '
process_file_generic(book, pattern, remove_contents=False)

# Prelude and 2 acts, starting at act 2; kind of hard to tell
book = "data/gc/magic.txt"
pattern = r'(THE PRELUDE|ACT [I]+)'
process_file_generic(book, pattern, remove_contents=False)

# 10 chapters
book = "data/gc/manalive.txt"
chapter_titles = ["How the Great Wind Came to Beacon House", "The Luggage of an Optimist", "The Banner of Beacon", "The Garden of the God", "The Allegorical Practical Joker", "The Eye of Death; or, the Murder Charge", "The Two Curates; or, the Burglary Charge", "The Round Road; or, the Desertion Charge", "The Wild Weddings; or, the Polygamy Charge", "How the Great Wind Went from Beacon House"]
process_file(book, chapter_titles, remove_table_of_contents=True)

# 9 chapters; had to add a "." to the chapter 4 heading in the source text
book = "data/gc/orthodoxy.txt"
pattern = r'CHAPTER [IVX]+\.-'
process_file_generic(book, pattern, remove_contents=False)

# 8 chapters
book = "data/gc/robert_browning.txt"
pattern = r'CHAPTER [IVX]+\n\n'
process_file_generic(book, pattern, remove_contents=False)

# 10 chapters
book = "data/gc/st._francis_of_assisi.txt"
pattern = r'_CHAPTER [IVX]+_'
process_file_generic(book, pattern, remove_contents=False)

# 20 chapters
book = "data/gc/the_ball_and_the_cross.txt"
pattern = r'\n\n[IVX]+\.'
process_file_generic(book, pattern, remove_contents=False)

Processed data/gc/a_short_history_of_england.txt and saved as data/gc/a_short_history_of_england.csv
Processed data/gc/all_things_considered.txt and saved as data/gc/all_things_considered.csv
Processed data/gc/eugenics_and_other_evils.txt and saved as data/gc/eugenics_and_other_evils.csv
Processed data/gc/heretics.txt and saved as data/gc/heretics.csv
Processed data/gc/magic.txt and saved as data/gc/magic.csv
Processed data/gc/manalive.txt and saved as data/gc/manalive.csv
Processed data/gc/orthodoxy.txt and saved as data/gc/orthodoxy.csv
Processed data/gc/robert_browning.txt and saved as data/gc/robert_browning.csv
Processed data/gc/st._francis_of_assisi.txt and saved as data/gc/st._francis_of_assisi.csv
Processed data/gc/the_ball_and_the_cross.txt and saved as data/gc/the_ball_and_the_cross.csv


# Maurice Leblanc
- Currently doing 10 books

In [240]:
# Every book here is split with a regex via process_file_generic; the leading
# comment gives the expected section count.

# 17 chapters including EPILOGUE
book = "data/ml/813.txt"
pattern = r'(CHAPTER [IVXLCDM]+|EPILOGUE)\n\n'
process_file_generic(book, pattern, remove_contents=False)

# 23 chapters
book = "data/ml/arsène_lupin.txt"
pattern = r'CHAPTER [IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 15 chapters including EPILOGUE
# NOTE(review): the trailing \n belongs only to the _EPILOGUE_ alternative;
# the CHAPTER alternative ends at the dot.
book = "data/ml/memoirs_of_arsène_lupin.txt"
pattern = r'(CHAPTER [IVX]+\.)|(_EPILOGUE_)\n'
process_file_generic(book, pattern, remove_contents=False)

# 8 chapters
book = "data/ml/the_blonde_lady.txt"
pattern = r'CHAPTER [IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 10 chapters
book = "data/ml/the_confessions_of_arsène_lupin.txt"
pattern = r'\n[IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 13 chapters
book = "data/ml/the_crystal_stopper.txt"
pattern = r'CHAPTER [IVX]+\.\n'
process_file_generic(book, pattern, remove_contents=False)

# 8 chapters
book = "data/ml/the_eight_strokes_of_the_clock.txt"
pattern = r'\n[IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 11 chapters
book = "data/ml/the_eyes_of_innocence.txt"
pattern = r'\n[IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 20 chapters
book = "data/ml/the_frontier.txt"
pattern = r'CHAPTER [IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

# 19 chapters; had to add "CHAPTER" to the chapter 7 heading in the source text
book = "data/ml/the_golden_triangle_the_return_of_arsène_lupin.txt"
pattern = r'CHAPTER [IVX]+\n'
process_file_generic(book, pattern, remove_contents=False)

Processed data/ml/813.txt and saved as data/ml/813.csv
Processed data/ml/arsène_lupin.txt and saved as data/ml/arsène_lupin.csv
Processed data/ml/memoirs_of_arsène_lupin.txt and saved as data/ml/memoirs_of_arsène_lupin.csv
Processed data/ml/the_blonde_lady.txt and saved as data/ml/the_blonde_lady.csv
Processed data/ml/the_confessions_of_arsène_lupin.txt and saved as data/ml/the_confessions_of_arsène_lupin.csv
Processed data/ml/the_crystal_stopper.txt and saved as data/ml/the_crystal_stopper.csv
Processed data/ml/the_eight_strokes_of_the_clock.txt and saved as data/ml/the_eight_strokes_of_the_clock.csv
Processed data/ml/the_eyes_of_innocence.txt and saved as data/ml/the_eyes_of_innocence.csv
Processed data/ml/the_frontier.txt and saved as data/ml/the_frontier.csv
Processed data/ml/the_golden_triangle_the_return_of_arsène_lupin.txt and saved as data/ml/the_golden_triangle_the_return_of_arsène_lupin.csv


# Shakespeare
- Currently doing 4 books

Added altered functions to split scenes here more quickly and automatically. These were written before the generic functions above, which is why the generic functions are not used here.

In [155]:
def split_text_into_scenes(text: str) -> List[str]:
    '''
    Cuts a play into scenes at each heading of the form "SCENE <roman>.".

    Headings are matched case-insensitively. A scene runs from its heading
    to the next heading (or the end of the text); text before the first
    heading is not returned.
    
    Parameters
    ----------
    text : str
        The text to be split into scenes.
        
    Returns
    -------
    List[str]
        The scenes in order, each stripped of surrounding whitespace.
    '''
    heading = re.compile(r'SCENE [IVXLCDM]+\.', flags=re.IGNORECASE)
    offsets = [found.start() for found in heading.finditer(text)]

    # Appending the text length lets every scene, including the last,
    # be sliced as bounds[k]:bounds[k + 1].
    bounds = offsets + [len(text)]

    return [text[bounds[k]:bounds[k + 1]].strip() for k in range(len(offsets))]


def process_file_scenes(file: str):
    '''
    Processes a single .txt file: strips the Project Gutenberg header and footer,
    splits the text into scenes, normalizes each scene, and saves the result as a .csv file.

    Parameters
    ----------
    file : str
        The path to the .txt file being processed.

    Returns
    -------
    None
    '''
    # Generate the CSV file name
    csv_name = re.sub(r'\.txt$', '.csv', file, flags=re.IGNORECASE)
    if csv_name == file:
        # No ".txt" suffix to swap: append ".csv" so the CSV never
        # overwrites the source text file.
        csv_name = file + '.csv'

    # Read the content of the .txt file
    text = get_base_text_file(file)

    # Strip the Project Gutenberg header and footer
    book_full = clean_guttenberg(text)

    # Split the text into scenes
    book_sections = split_text_into_scenes(book_full)

    # Normalize each scene individually
    normalized_sections = [normalize_input(section) for section in book_sections]

    # Save the scenes to a CSV file
    save_chapters_to_csv(normalized_sections, csv_name)

    print(f"Processed {file} and saved as {csv_name}")

In [157]:
# Split each play into scenes with process_file_scenes, one play at a time.
for book in (
    "data/shakespeare/coriolanus.txt",
    "data/shakespeare/hamlet.txt",
    "data/shakespeare/macbeth.txt",
    "data/shakespeare/othello.txt",
):
    process_file_scenes(book)

Processed data/shakespeare/coriolanus.txt and saved as data/shakespeare/coriolanus.csv
Processed data/shakespeare/hamlet.txt and saved as data/shakespeare/hamlet.csv
Processed data/shakespeare/macbeth.txt and saved as data/shakespeare/macbeth.csv
Processed data/shakespeare/othello.txt and saved as data/shakespeare/othello.csv
