In [1]:
import requests, re
import pandas as pd

In [2]:
def get_base_text_url(URL, timeout=30):
  """Download the document at ``URL`` and return its body as text.

  Args:
    URL: Address of the resource to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` may block indefinitely on an unresponsive host.

  Returns:
    The response body decoded to ``str`` by ``requests``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeouts.
  """
  r = requests.get(URL, timeout=timeout)
  # Fail loudly instead of handing an HTML error page to the text cleaners.
  r.raise_for_status()
  return r.text

# Remove the header that Project Gutenberg adds at the start of the text
def clean_guttenberg_header(text):
  """Remove the Project Gutenberg preamble from ``text``.

  Everything from "The Project Gutenberg eBook" up to and including the
  "*** START OF THE PROJECT GUTENBERG EBOOK ... ***" marker line is dropped.
  A leading byte-order mark (U+FEFF) is removed as well, whether or not a
  header is found.

  Args:
    text: Raw book text as downloaded or read from disk.

  Returns:
    The text without the header; unchanged (apart from the BOM) when no
    header matches.
  """
  # Text decoded as plain UTF-8 keeps the file's byte-order mark as an
  # invisible first character; it sits in front of the header, so the
  # pattern below would leave it behind.
  if text.startswith("\ufeff"):
    text = text[1:]
  ex = r"The Project Gutenberg eBook.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*"
  # DOTALL lets ".*?" run across the many lines between the two markers.
  main_text = re.sub(ex, "", text, flags=re.DOTALL)
  return main_text

# Remove the footer (license and extra info) that Project Gutenberg appends to the text
def clean_guttenberg_footer(text):
  """Cut ``text`` off at the Project Gutenberg end-of-book marker.

  Args:
    text: Book text, possibly followed by the Gutenberg footer.

  Returns:
    Everything before the first "*** END OF THE PROJECT GUTENBERG EBOOK"
    marker, or ``text`` unchanged when the marker is absent.
  """
  ex = r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*"
  footer = re.search(ex, text, flags=re.DOTALL)
  if footer is None:
    return text
  # With DOTALL the match runs to the end of the string, so keeping the
  # part before it is the same as deleting the match.
  return text[:footer.start()]

def clean_guttenberg(text):
  """Strip both the Project Gutenberg header and footer from ``text``.

  Args:
    text: Raw book text.

  Returns:
    The text after the header cleaner and then the footer cleaner have run.
  """
  without_header = clean_guttenberg_header(text)
  return clean_guttenberg_footer(without_header)

In [3]:
def get_base_text_file(file_path):
    """Read a local text file and return its full contents as a string.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading BOM if
    # one is present, so the text does not start with an invisible U+FEFF.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()

In [4]:
# Example usage with "The Murder of Roger Ackroyd" by Agatha Christie,
# downloaded as plain text from Project Gutenberg.
base_text = get_base_text_url("https://www.gutenberg.org/cache/epub/69087/pg69087.txt")
# Alternative: read a local copy from disk instead of downloading, e.g.
# base_text = get_base_text_file("hamlet.txt")


In [5]:
# Strip the Project Gutenberg header and footer from the downloaded text.
book_full = clean_guttenberg(base_text)
# print(book_full)  # debug: uncomment to inspect the cleaned text

In [6]:
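# Takes a string and normalizes its whitespace and case: runs of spaces/tabs and blank lines are collapsed, then everything is lowercased. (Removing articles and punctuation and returning comma-separated words is not implemented yet.) This is done after the sections of text that we want are found so no information is lost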
def NormalizeInput(text):
    """Collapse whitespace in ``text`` and convert it to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        A lowercase copy in which runs of spaces/tabs are a single space,
        consecutive newlines are merged, and whitespace directly before a
        newline is removed. A single space left at the start of a line is
        kept.
    """
    # Applied strictly in this order; each step works on the previous result.
    substitutions = (
        (r'[ \t]+', ' '),    # runs of spaces/tabs -> one space
        (r'\n+', '\n'),      # runs of newlines -> one newline
        (r'\s*\n', '\n'),    # drop whitespace that precedes a newline
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Further clean-up steps can be added to the table above if needed.
    return cleaned.lower()

In [7]:
book_norm = NormalizeInput(book_full)
# Show only a short preview: printing the whole book floods the notebook output.
print(book_norm[:1000])

﻿
 the murder of
 roger ackroyd
 by
 agatha christie
 author of
 the secret of chimneys,
 the murder on the links, etc.
 [illustration]
 grosset & dunlap
 publishers new york
 copyright, 1926,
 by dodd, mead and company, inc.
 to punkie,
 who likes an orthodox detective
 story, murder, inquest, and suspicion
 falling on every one in turn!
 contents
 chapter page
 i dr. sheppard at the breakfast table 1
 ii who’s who in king’s abbot 7
 iii the man who grew vegetable marrows 17
 iv dinner at fernly 31
 v murder 49
 vi the tunisian dagger 65
 vii i learn my neighbor’s profession 75
 viii inspector raglan is confident 92
 ix the goldfish pond 106
 x the parlormaid 118
 xi poirot pays a call 136
 xii round the table 145
 xiii the goose quill 156
 xiv mrs. ackroyd 165
 xv geoffrey raymond 178
 xvi an evening at mah jong 190
 xvii parker 202
 xviii charles kent 218
 xix flora ackroyd 226
 xx miss russell 238
 xxi the paragraph in the paper 251
 xxii ursula’s story 260
 xxiii poirot’s little r

In [8]:
# helper function to convert an integer to a Roman numeral
def int_to_roman(n):
  """Convert a positive integer to a lowercase Roman numeral.

  Args:
    n: The number to convert.

  Returns:
    The numeral as a lowercase string (e.g. ``14`` -> ``'xiv'``). Values
    of zero or less yield an empty string.
  """
  # Ordered from largest to smallest so the greedy subtraction below works
  # without sorting on every call. The entries for 400 and above keep large
  # numbers well-formed ('cd' rather than 'cccc').
  numeral_table = (
    (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
    (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
    (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
  )
  result = ''
  for value, symbol in numeral_table:
    while n >= value:
      result += symbol
      n -= value
  return result

# Generate an array with chapter titles "chapter i", "chapter ii", ...
def generate_roman_chapters(num_chapters):
  """Build the list of headings "chapter i", "chapter ii", ...

  Args:
    num_chapters: How many headings to produce.

  Returns:
    A list of ``num_chapters`` strings, numbered from 1 upward with
    numerals from ``int_to_roman``.
  """
  titles = []
  for number in range(1, num_chapters + 1):
    titles.append(f"chapter {int_to_roman(number)}")
  return titles

# Function to split up text into chapters
def split_text_into_chapters(text, chapter_titles):
  """Split ``text`` into chunks that each start at a chapter heading.

  Headings are matched case-insensitively and only as whole words, so
  "chapter i" does not match inside "chapter is" or "subchapter i".
  Anything before the first heading is discarded.

  Args:
    text: The full book text.
    chapter_titles: Heading strings that mark the start of a chapter.

  Returns:
    A list of stripped strings, one per heading occurrence, in order of
    appearance. Empty when there are no titles or none of them occur.
  """
  def _whole_word(title):
    # Guard an edge only when the title itself has a word character there,
    # so titles that begin or end with punctuation still match as before.
    prefix = r"(?<!\w)" if re.match(r"\w", title) else ""
    suffix = r"(?!\w)" if re.search(r"\w\Z", title) else ""
    return prefix + re.escape(title) + suffix

  # Empty titles (or no titles at all) would produce a pattern that matches
  # at every position of the text.
  titles = [title for title in chapter_titles if title]
  if not titles:
    return []

  pattern = re.compile("|".join(_whole_word(title) for title in titles), re.IGNORECASE)
  starts = [m.start() for m in pattern.finditer(text)]

  # Each chapter runs up to the next heading; the last one runs to the end
  # (a slice bound of None means "to the end of the string").
  ends = starts[1:] + [None]
  return [text[start:end].strip() for start, end in zip(starts, ends)]

# Save chapters to a CSV file. This will probably change to account for other changes
def save_chapters_to_csv(chapters, file):
  """Write the chapters to a UTF-8 CSV file with one row per chapter.

  Args:
    chapters: Sequence of chapter texts, in order.
    file: Destination passed to ``DataFrame.to_csv``.

  The file has two columns: ``Chapter`` ("Chapter 1", "Chapter 2", ...)
  and ``Content`` (the chapter text). No index column is written.
  """
  labels = [f'Chapter {number}' for number in range(1, len(chapters) + 1)]
  table = pd.DataFrame({'Chapter': labels, 'Content': chapters})
  table.to_csv(file, index=False, encoding='utf-8')


In [9]:
# Example use of the helper function with the book selected above. For some books
# the headings may have to be set manually, or another helper function may be needed.
# 27 is the number of chapter headings to generate for this book.
chapter_titles = generate_roman_chapters(27)
print(chapter_titles)
# Hard-coded copy of the same list that generate_roman_chapters(27) prints above.
# NOTE(review): chapter_array is not used by any other cell visible here - confirm before removing.
chapter_array = ['chapter i', 'chapter ii', 'chapter iii', 'chapter iv', 'chapter v', 'chapter vi', 'chapter vii', 'chapter viii', 'chapter ix', 'chapter x', 'chapter xi', 'chapter xii', 'chapter xiii', 'chapter xiv', 'chapter xv', 'chapter xvi', 'chapter xvii', 'chapter xviii', 'chapter xix', 'chapter xx', 'chapter xxi', 'chapter xxii', 'chapter xxiii', 'chapter xxiv', 'chapter xxv', 'chapter xxvi', 'chapter xxvii']

['chapter i', 'chapter ii', 'chapter iii', 'chapter iv', 'chapter v', 'chapter vi', 'chapter vii', 'chapter viii', 'chapter ix', 'chapter x', 'chapter xi', 'chapter xii', 'chapter xiii', 'chapter xiv', 'chapter xv', 'chapter xvi', 'chapter xvii', 'chapter xviii', 'chapter xix', 'chapter xx', 'chapter xxi', 'chapter xxii', 'chapter xxiii', 'chapter xxiv', 'chapter xxv', 'chapter xxvi', 'chapter xxvii']


In [10]:
# Split the normalized book into one string per chapter, using the generated headings as delimiters.
book_chapters = split_text_into_chapters(book_norm, chapter_titles)

In [11]:
# Sanity check: show the first chapter found (raises IndexError if no heading was matched).
print(book_chapters[0])

chapter i
 dr. sheppard at the breakfast table
mrs. ferrars died on the night of the 16th–17th september—a thursday. i
was sent for at eight o’clock on the morning of friday the 17th. there
was nothing to be done. she had been dead some hours.
it was just a few minutes after nine when i reached home once more. i
opened the front door with my latch-key, and purposely delayed a few
moments in the hall, hanging up my hat and the light overcoat that
i had deemed a wise precaution against the chill of an early autumn
morning. to tell the truth, i was considerably upset and worried. i am
not going to pretend that at that moment i foresaw the events of the
next few weeks. i emphatically did not do so. but my instinct told me
that there were stirring times ahead.
from the dining-room on my left there came the rattle of tea-cups and
the short, dry cough of my sister caroline.
“is that you, james?” she called.
an unnecessary question, since who else could it be? to tell the
truth, it was precise

In [12]:
# Persist the chapters to test.csv, one row per chapter.
save_chapters_to_csv(book_chapters, "test.csv")