In [51]:
import re

In [52]:
# Relative path to the Kindle clippings text file that the cells below read and parse.
FILEPATH = "../data/My Clippings.txt"

In [None]:
# Read the whole file in one go; the "utf-8-sig" codec discards a leading
# byte-order mark if one is present.
with open(FILEPATH, encoding="utf-8-sig") as file:
    content = file.read()

# Each clipping is terminated by a row of ten "=" characters.
lines = content.split("==========")

In [None]:
def get_title_author(split_line):
    """
    Return the combined "title (author)" row of a Kindle clipping.

    The first element of the clipping is returned with surrounding
    whitespace removed. If the clipping has no elements, an error is
    printed and ``None`` is returned.

    :param split_line: Clipping split on linebreak
    :type split_line: list[str]
    :return: Stripped first row, or ``None`` when it is missing
    """
    try:
        first_row = split_line[0]
    except IndexError as e:
        print(f"Error {e} on line {split_line}")
        return None
    return first_row.strip()

In [None]:
def get_page(split_line):
    """
    Get page from Kindle clipping

    The second row of a clipping is split on ``|``. When that yields two
    segments the clipping is treated as having no page (location and date
    only) and ``None`` is returned. Otherwise the first run of digits in
    the first segment is returned as the page number.

    :param split_line: Clipping split on linebreak
    :type split_line: list[str]
    :return: Page number, or ``None`` when the clipping has no metadata
        row, no page segment, or no digits in the page segment
    """
    try:
        page_location_date = split_line[1].split("|")
    except IndexError as e:
        # The clipping has no metadata row at all.
        print(f"Error {e} on line {split_line}")
        return None
    # Length of 2 means Kindle text does not have pages, only Kindle locations
    if len(page_location_date) == 2:
        return None
    match = re.search(r'\d+', page_location_date[0])
    if match is None:
        # re.search returns None when there are no digits (for example a
        # non-numeric page label); report it instead of raising
        # AttributeError on ``None.group()``.
        print(f"Error no page number found on line {split_line}")
        return None
    return int(match.group())

In [99]:
def get_start_location(location):
    """
    Return the first number found in a Kindle location segment.

    :param location: Location segment of the clipping's metadata row
        (the row split on ``|``)
    :type location: str
    :return: First run of digits in ``location`` as an integer
    :raises IndexError: If ``location`` contains no digits
    """
    return int(re.findall(r'\d+', location)[0])

In [100]:
def get_end_location(location):
    """
    Return the second number found in a Kindle location segment.

    :param location: Location segment of the clipping's metadata row
        (the row split on ``|``)
    :type location: str
    :return: Second run of digits in ``location`` as an integer, or
        ``None`` when the segment holds exactly one number
    :raises IndexError: If ``location`` contains no digits
    """
    numbers = re.findall(r'\d+', location)
    # A single number means the clipping only has a starting location.
    return None if len(numbers) == 1 else int(numbers[1])

In [None]:
def get_locations(split_line):
    """
    Get Kindle start and end location from Kindle clipping

    The second row of the clipping is split on ``|`` and the segment that
    holds the location is chosen by the number of segments. The two cases
    are mutually exclusive, so they must be an ``if``/``elif`` chain: with
    two independent ``if`` statements the trailing ``else`` also runs for
    location-only clippings and discards the values just parsed.

    :param split_line: Clipping split on linebreak
    :type split_line: list[str]
    :return: ``(start_location, end_location)``; ``end_location`` is
        ``None`` when only one location number is present, and both are
        ``None`` when the row has neither 2 nor 3 segments
    :raises IndexError: If ``split_line`` has no second row, or the chosen
        segment contains no digits
    """
    page_location_date = split_line[1].split("|")
    if len(page_location_date) == 2:
        # Length of 2 means Kindle text does not have pages, only Kindle locations
        location = page_location_date[0]
    elif len(page_location_date) == 3:
        # Length of 3 means Kindle text has pages and locations
        location = page_location_date[1]
    else:
        return None, None
    return get_start_location(location), get_end_location(location)

In [105]:
def get_date_added(split_line):
    """
    Get date clipping was made from Kindle clipping

    The date is the last ``|``-separated segment of the clipping's second
    row. A leading ``Added on`` label is removed as a prefix.
    ``str.strip(" Added on ")`` is not used because it strips any of the
    characters ``" "``, ``A``, ``d``, ``e``, ``o``, ``n`` from both ends
    and can therefore remove characters belonging to the date itself.

    :param split_line: Clipping split on linebreak
    :type split_line: list[str]
    :return: Date text without the ``Added on`` label, or ``None`` when
        the clipping has no second row
    """
    try:
        page_location_date = split_line[1].split("|")
    except IndexError as e:
        print(f"Error {e} on line {split_line}")
        return None
    # Date is always last item on this line
    date = page_location_date[-1].strip()
    label = "Added on"
    if date.startswith(label):
        date = date[len(label):]
    return date.strip()

In [109]:
def get_text(split_line):
    """
    Return the highlighted text row of a Kindle clipping.

    The third element of the clipping is returned with surrounding
    whitespace removed. If the clipping has fewer than three elements,
    an error is printed and ``None`` is returned.

    :param split_line: Clipping split on linebreak
    :type split_line: list[str]
    :return: Stripped third row, or ``None`` when it is missing
    """
    try:
        highlight = split_line[2]
    except IndexError as e:
        print(f"Error {e} on line {split_line}")
        return None
    return highlight.strip()

In [110]:
def parse_line(line):
    """
    Parse one raw clipping into a flat record.

    :param line: Raw text of a single clipping
    :type line: str
    :return: ``[title_author, page, location_start, location_end,
        date_added, text]``
    """
    # Drop the empty strings produced by blank rows in the clipping.
    rows = [row for row in line.split("\n") if row]
    # The list display is evaluated left to right, so the helpers run in
    # the same order as the fields appear in the record.
    return [
        get_title_author(rows),
        get_page(rows),
        *get_locations(rows),
        get_date_added(rows),
        get_text(rows),
    ]


In [111]:
# Parse every clipping, skipping chunks that are just a newline and
# reporting (rather than aborting on) any clipping that fails to parse.
clippings = []
for line in lines:
    if line != "\n":
        try:
            parsed_line = parse_line(line)
        except Exception as e:
            print(f'Problem {e} with {line}')
        else:
            clippings.append(parsed_line)

Error list index out of range on line ['The Civil War: A Narrative: Volume 2: Fredericksburg to Meridian (Vintage Civil War Library) (Foote, Shelby)', '- Your Bookmark on page 218 | Location 4478 | Added on Saturday, March 15, 2025 11:32:17 AM']


In [112]:
# Display the first five parsed clippings as a quick sanity check.
clippings[0:5]

[['A Murder of Quality: A George Smiley Novel (George Smiley Novels Book 2) (le Carré, John)',
  39,
  673,
  673,
  'Saturday, July 15, 2023 8:14:37 AM',
  'we’ve got to rely on laboratories, tracker dogs, and nation'],
 ['The Looking Glass War: A George Smiley Novel (George Smiley Novels Book 4) (le Carré, John)',
  221,
  3115,
  3115,
  'Saturday, July 29, 2023 10:33:14 PM',
  'understand why he was so hungry. Perhaps it was the exercise. Yes, it must be the exercise. He would eat, but not in a'],
 ['Tinker, Tailor, Soldier, Spy: A George Smiley Novel (George Smiley Novels Book 5) (le Carré, John)',
  259,
  3185,
  3185,
  'Sunday, August 6, 2023 10:34:40 PM',
  'Mary Masterman; could Sam swing it?'],
 ['The Honourable Schoolboy: A George Smiley Novel (George Smiley Novels Book 6) (le Carré, John)',
  344,
  5075,
  5075,
  'Wednesday, August 30, 2023 9:47:18 PM',
  'and even what they intend toward him. We could take'],
 ['The Secret Pilgrim (George Smiley Novels) (le Carré, John