# Apple Books Highlights

> **Goal:** Building on [ibooks_highlights](https://github.com/shrsv), we want to access our Apple Books highlights and generate a database for managing them.<br>
> **Author:** Rahim Hashim, May 2024 <br>
> **Contact:** rh2898[at]columbia[dot]edu <br>

***

## Scrape Books

In [11]:
# This script extracts notes from the exported markdown files in a directory
# and collects them into a pandas DataFrame (one row per note)
import os
import re
import sys
import tqdm
import string
import pprint
import pandas as pd
pd.set_option('max_colwidth', 600)

class Book:
	"""A book identified by title and author, with a growing list of notes."""

	def __init__(self, title, author):
		# A new book always starts with an empty list of notes.
		self.title, self.author = title, author
		self.notes = list()

	def add_note(self, note):
		"""Append ``note`` to the end of this book's notes."""
		notes = self.notes
		notes.append(note)

	def __str__(self):
		"""Return the book formatted as '<title> by <author>'."""
		return f'{self.title} by {self.author}'

def generate_books_df(book_dir, verbose=False):
	"""Collect the highlights of every markdown book export in ``book_dir``.

	Each ``.md`` file must contain a ``# <title>`` line, a ``By <author>``
	line and an ``## iBooks notes <a name="ibooks_notes_dont_delete"></a>``
	section. Notes under that section are split per ``### <chapter>``
	heading when such headings exist, otherwise on blank lines.

	Args:
		book_dir: Directory to scan; files not ending in ``.md`` are skipped.
		verbose: When True, print the title, author and note count per book.

	Returns:
		A DataFrame with the columns ``title``, ``author``, ``chapter`` and
		``note``; ``chapter`` is None for books without chapter headings.

	Raises:
		AttributeError: If a file lacks the title, author or iBooks notes
			pattern (``re.search`` returns None for it).
	"""
	print(f"Searching for books in dir: \'{book_dir}\'...")
	columns = ['title', 'author', 'chapter', 'note']
	# Rows are gathered in a list and turned into a DataFrame once at the end,
	# which avoids growing the DataFrame one row at a time.
	rows = []
	total_highlights_count = 0
	for book in tqdm.tqdm(os.listdir(book_dir)):
		if not book.endswith('.md'):
			continue
		# Read as UTF-8 explicitly: the notes contain non-ASCII punctuation and
		# the platform default encoding is not guaranteed to handle it.
		with open(os.path.join(book_dir, book), 'r', encoding='utf-8') as f:
			book_content = f.read()
		# title is # <title>
		title = re.search(r'# (.+)', book_content).group(1)
		# author is By <author>
		author_results = re.search(r'By (.+)', book_content).group(1)
		# all notes after ## iBooks notes
		book_notes = re.search(r'## iBooks notes <a name="ibooks_notes_dont_delete"></a>(.+)', book_content, re.DOTALL).group(1)
		# chapter titles start with ### <chapter_title>
		chapters = re.findall(r'### (.+)', book_notes)
		if chapters:
			# some notes are split in chapters: the text between two chapter
			# headings becomes one row
			book_notes_count = len(chapters)
			chapter_notes = re.split(r'### .+', book_notes)[1:]
			for chapter, note in zip(chapters, chapter_notes):
				rows.append({'title': title, 'author': author_results, 'chapter': chapter, 'note': note.strip()})
		else:
			# others are only split by '\n\n'
			plain_notes = book_notes.strip().split('\n\n')
			book_notes_count = len(plain_notes)
			for note in plain_notes:
				rows.append({'title': title, 'author': author_results, 'chapter': None, 'note': note})
		total_highlights_count += book_notes_count
		if verbose:
			print(f'{title} | {author_results}')
			# book_notes_count is already an integer count
			print(f'  Notes: {book_notes_count}')
	book_df = pd.DataFrame(rows, columns=columns)
	print('Done.')
	print('_'*50)
	# Computed outside the f-string so no quotes are nested inside it
	# (nested same-type quotes are only valid from Python 3.12 on).
	total_book_count = len(book_df['title'].unique())
	print(f'Total Book Count: {total_book_count}')
	print(f'Total Notes: {total_highlights_count}')
	return book_df


In [12]:
# Alternative absolute location of the exported highlights (kept for reference):
# book_dir = '/Users/rahimhashim/Library/CloudStorage/GoogleDrive-rh2898@columbia.edu/.shortcut-targets-by-id/0B-7bnvTD8Ld2Ynd5T1V0X2l5RWc/Projects/ibooks-highlights/ibooks-highlights'
# Directory scanned for '.md' book exports, relative to the working directory
book_dir = 'books/'
# Build the notes table; per-book printing is switched off
books_df = generate_books_df(book_dir, verbose=False)

Searching for books in dir: 'books/'...


100%|██████████| 61/61 [00:00<00:00, 518.19it/s]

Done.
__________________________________________________
Total Book Count: 61
Total Notes: 340





In [13]:
def get_book_author(books_df, book):
	"""Return the author stored on the first row whose title equals ``book``.

	Raises IndexError when no row carries that title.
	"""
	title_matches = books_df['title'] == book
	matching_rows = books_df[title_matches]
	return matching_rows['author'].iloc[0]

def get_book_info(books_df):
	"""Return the sorted unique titles and the matching author of each.

	The author of a title is the one on the first row carrying that title.
	It is found in a single pass over the rows rather than by filtering the
	whole DataFrame once per title.

	Args:
		books_df: DataFrame with at least the columns ``title`` and ``author``.

	Returns:
		A tuple ``(book_titles, authors)`` of two equally long lists, with
		``book_titles`` sorted ascending and ``authors[i]`` belonging to
		``book_titles[i]``.
	"""
	first_author_by_title = {}
	for title, author in zip(books_df['title'], books_df['author']):
		# setdefault keeps the first author seen for a title
		first_author_by_title.setdefault(title, author)
	book_titles = sorted(first_author_by_title)
	authors = [first_author_by_title[title] for title in book_titles]
	return book_titles, authors

def print_titles(books_df):
	"""Print one '<index>: <title> - <author>' line per book.

	The index is the position of the title in the list returned by
	get_book_info.
	"""
	print('Book Index | Titles:')
	book_titles, authors = get_book_info(books_df)
	# Titles and authors are parallel lists, so they are walked together.
	for bix, (book, author) in enumerate(zip(book_titles, authors)):
		print(f'  {bix:>3}: {book} - {author}')

def print_authors(books_df):
	"""Print, per author in sorted order, the book count and quote count.

	The book count is the number of distinct titles on the author's rows and
	the quote count is the number of those rows.
	"""
	print('Book Authors:')
	_, authors = get_book_info(books_df)
	for author in sorted(set(authors)):
		# Select the author's rows once and derive both counts from them.
		rows_by_author = books_df[books_df['author'] == author]
		author_num_books = len(rows_by_author['title'].unique())
		author_quotes = len(rows_by_author)
		print(f' {author:>25} : {author_num_books:>2} book(s) | {author_quotes:>3} quote(s)')

def get_book(books_df, title=None, author=None, index=None):
	"""Return the rows of one book, selected by title and/or index.

	Args:
		books_df: DataFrame with at least the columns ``title`` and ``author``.
		title: Exact title of the book; None or an empty string means
			"not given".
		author: Accepted for call compatibility; not used for the lookup.
		index: Position of the book in the sorted title list returned by
			get_book_info; None means "not given". Index 0 is valid.

	Returns:
		The subset of ``books_df`` whose title equals the selected book.

	Raises:
		SystemExit: If neither title nor index is given, or if both are
			given but refer to different books.
		ValueError: If only a title is given and it is not a known title.
		IndexError: If ``index`` is outside the title list.
	"""
	print('Looking for specified book...')
	book_titles, _ = get_book_info(books_df)
	has_title = bool(title)
	# Compared against None on purpose: a truthiness test would reject
	# index 0, which is the first book in the list.
	has_index = index is not None
	if has_title and not has_index:
		print(f' Using input: {title}')
		if title not in book_titles:
			raise ValueError(f'{title!r} is not in the list of book titles')
		book = title
	elif has_index and not has_title:
		print(f' Using index: {index}')
		book = book_titles[index]
	elif has_title and has_index:
		# Both were given: they must point at the same book.
		if title != book_titles[index]:
			print('  Misaligned title/index matching.')
			print(f'    title: {title}')
			print(f'    index: {book_titles[index]}')
			sys.exit()
		book = book_titles[index]
	else:
		print('  Missing title and index input.')
		sys.exit()
	book_df = books_df[books_df['title'] == book]
	author = book_df['author'].unique()[0]
	print(' Book found.')
	print(f'{book} | {author}')
	print(f'  Number of quotes: {len(book_df)}')
	return book_df

# List every book with its position in the sorted title list
print_titles(books_df)

Book Index | Titles:
    0: 1004 - Ben Lerner
    1: 2001 A Space Odyssey - Arthur C Clarke
    2: A Brief History of Everyone Who Ever Lived - Adam Rutherford
    3: A Foray into the Worlds of Animals and Humans: With A Theory of Meaning - Jakob von Uexküll
    4: A Game of Thrones 4Book Bundle - George R R Martin
    5: Adaptive Markets - Andrew W Lo
    6: All of the Marvels - Douglas Wolk
    7: An Immense World - Ed Yong
    8: Babel - R F Kuang
    9: Barack Obama  A Promised Land - Obama Barack
   10: Behave - Robert M Sapolsky
   11: Darwins Camera - Phillip Prodger
   12: Do Androids Dream of Electric Sheep - Philip K Dick
   13: Dune - Frank Herbert
   14: Dune Messiah - Frank Herbert
   15: Eichmann in Jerusalem  A Report on the Banality of Evil - Hannah Arendt
   16: Exhalation - Ted Chiang
   17: Feminism for the 99 - Cinzia Arruzza
   18: Fire  Blood A Song of Ice and Fire - George R R Martin
   19: Here is New York - E B White
   20: How Emotions Are Made the Secret Life

In [14]:
# Summarise the number of books and quotes per author
print_authors(books_df)

Book Authors:
           Adam Rutherford :  1 book(s) |   3 quote(s)
             Aldous Huxley :  1 book(s) |   3 quote(s)
               Andrew W Lo :  1 book(s) |   1 quote(s)
          Anthony Bourdain :  1 book(s) |   1 quote(s)
           Arthur C Clarke :  1 book(s) |   5 quote(s)
            Bassem Youssef :  1 book(s) |   1 quote(s)
                Ben Lerner :  1 book(s) |   5 quote(s)
          Benjamin Ehrlich :  1 book(s) |   3 quote(s)
          Blythe Grossberg :  1 book(s) |   4 quote(s)
             ByungChul Han :  1 book(s) |   1 quote(s)
                Carl Sagan :  1 book(s) |   4 quote(s)
              Carl W Ernst :  1 book(s) |   2 quote(s)
            Charles Darwin :  1 book(s) |  12 quote(s)
            Cinzia Arruzza :  1 book(s) |   2 quote(s)
            Daron Acemoglu :  1 book(s) |  31 quote(s)
      David Foster Wallace :  1 book(s) |   2 quote(s)
              Douglas Wolk :  1 book(s) |   8 quote(s)
                 E B White :  1 book(s) |   3 quote

In [15]:
# Display the full notes table as the cell result
books_df

Unnamed: 0,title,author,chapter,note
0,Dune,Frank Herbert,Chapter 01,"“I must not fear. Fear is the mind-killer. Fear is the little-death that brings total obliteration. I will face my fear. I will permit it to pass over me and through me. And when it has gone past I will turn the inner eye to see its path. Where the fear has gone there will be nothing. Only I will remain.”\n\nPaul looked down at the hand that had known pain, then up to the Reverend Mother. The sound of her voice had contained a difference then from any other voice in his experience. The words were outlined in brilliance. There was an edge to them. He felt that any question he might ask her ..."
1,Dune,Frank Herbert,Chapter 04,Hawat looked at the boy. “I was thinking we’ll all be out of here soon and likely never see the place again.”\n“Does that make you sad?”\n“Sad? Nonsense! Parting with friends is a sadness. A place is only a place.” He glanced at the charts on the table. “And Arrakis is just another place.”
2,Dune,Frank Herbert,Chapter 07,"“You must teach me someday how you do that,” he said, “the way you thrust your worries aside and turn to practical matters. It must be a Bene Gesserit thing.”\n“It’s a female thing,” she said.\n\nWhat was it St. Augustine said? she asked herself. “The mind commands the body and it obeys. The mind orders itself and meets resistance.” Yes—I am meeting more resistance lately. I could use a quiet retreat by myself."
3,Dune,Frank Herbert,Chapter 16,"Greatness is a transitory experience. It is never consistent. It depends in part upon the myth-making imagination of humankind. The person who experiences greatness must have a feeling for the myth he is in. He must reflect what is projected upon him. And he must have a strong sense of the sardonic. This is what uncouples him from belief in his own pretensions. The sardonic is all that permits him to move within himself. Without this quality, even occasional greatness will destroy a man."
4,Dune,Frank Herbert,Chapter 22,"Remembering the letter, Paul re-experienced the distress of that moment—a thing sharp and strange that seemed to happen outside his new mental alertness. He had read that his father was dead, known the truth of the words, but had felt them as no more than another datum to be entered in his mind and used.\nI loved my father, Paul thought, and knew this for truth. I should mourn him. I should feel something.\nBut he felt nothing except: Here’s an important fact.\nIt was one with all the other facts.\nAll the while his mind was adding sense impressions, extrapolating, computing.\nHalleck’s wo..."
...,...,...,...,...
335,The Idea of the Brain,Matthew Cobb,9. Control,"McCulloch had been thinking about this approach to biology for over fifteen years.17 His key insight came when he realised that the all-or-none nature of an action potential was the equivalent of a proposition in logic–a statement that is either true or false. The neuron either fires or it does not. This was an example of what McCulloch called a ‘psychon’–a basic mental ‘atom’, which combined with others to form more complex phenomena. He now understood that it should be possible to describe the activity of a series of neurons–what he called a ‘nervous net’–in terms of a series of proposit..."
336,The Idea of the Brain,Matthew Cobb,10. Memory,"In 1971 O’Keefe, together with his student Jonathan Dostrovsky, reported data from eight cells in the hippocampus that were each activated when the rat was in specific locations in the cage. But it was not only location that was significant: the strongest response came from a cell that fired when the rat was in a particular place, was being held by an experimenter, and the lights were on. If any one of those factors was absent, the cell stopped firing, indicating that it required a very specific set of stimuli to be activated.\n\nO’Keefe’s research revealed that as well as the ability to e..."
337,The Idea of the Brain,Matthew Cobb,14. Localisation,"At the 1958 symposium where Selfridge unveiled his Pandemonium program, Gregory argued that identifying function by ablating or lesioning a particular structure was not only logically flawed but also failed to provide real insight–you might be focusing on the output of a damaged, misfiring system. To properly understand the role of the component, you need a theoretical model of how the system works. And therein lies the difficulty, argued Gregory: ‘The biologist has no “Maker’s Manuals” or any clear idea of what many of the “devices” he studies may be. He must guess the purpose, and put up..."
338,The Idea of the Brain,Matthew Cobb,15. Consciousness,"A radical challenge to our everyday experience of consciousness appeared in a series of studies by the veteran neuroscientist Benjamin Libet, contributing to the philosophical excitement that began in the 1980s and 1990s.64 Libet’s work is generally taken to undermine the notion of free will–our feeling that we can choose how to behave. In a very complicated experiment that has since been replicated many times in various forms, Libet found that EEG traces which revealed subjects’ intentions to move a finger slightly preceded their conscious decision to do so. For many scientists and some p..."


In [16]:
# Select the rows of a single book by its exact title
book_title = 'How the World Works'
book_df = get_book(books_df, title=book_title)

Looking for specified book...
 Using input: How the World Works
 Book found.
How the World Works | Noam Chomsky David Barsamian Arthur Naiman
  Number of quotes: 24


In [12]:
# Pretty-print every note of the selected book, wrapped at 100 columns
pprint.pprint(
	book_df['note'].tolist(), 
	width=100
)

['headed the State Department planning staff until 1950, when he was replaced by Nitze—Kennan’s '
 'office, incidentally, was responsible for the Gehlen network.\n'
 'Kennan was one of the most intelligent and lucid of US planners, and a major figure in shaping '
 'the postwar world. His writings are an extremely interesting illustration of the dovish '
 'position. One document to look at if you want to understand your country is Policy Planning '
 'Study 23, written by Kennan for the State Department planning staff in 1948. Here’s some of what '
 'it says:We have about 50% of the world’s wealth but only 6.3% of its population....In this '
 'situation, we cannot fail to be the object of envy and resentment. Our real task in the coming '
 'period is to devise a pattern of relationships which will  permit us to maintain this position '
 'of disparity....To do so, we will have to dispense with all sentimentality and daydreaming; and '
 'our attention will have to be concentrated everywher