## Search for a word's context in the chapter where its Kindle highlight is
As opposed to searching for the first instance of the word in the whole text.

In [21]:
import csv
import re
import matplotlib.pyplot as plt
import numpy as np

### Load the data

In [7]:
# Words, Chapters, Locations: one list per data row of the TSV file.
wcl = []
with open('words-chapters-locations.tsv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', dialect='excel-tab')
    # The first row is read separately and is not added to wcl.
    row1 = next(reader)
    # Consume every remaining row in one go.
    wcl.extend(reader)

In [3]:
# Show the rows read from the TSV file (the first row was read separately above).
wcl

[['restriñimientos.', 'I', 'Location 591'],
 ['sucedáneos.', 'I', 'Location 598'],
 ['remordimientos', 'I', 'Location 602'],
 ['mentecato.', 'I', 'Location 609'],
 ['veleidoso,', 'I', 'Location 611'],
 ['aturdirse,', 'I', 'Location 611'],
 ['desvencijada,', 'I', 'Location 626'],
 ['apretando', 'I', 'Location 631'],
 ['entablaba', 'I', 'Location 638'],
 ['a lo sumo,', 'I', 'Location 640'],
 ['culebras', 'I', 'Location 651'],
 ['despreocupada,', 'I', 'Location 652'],
 ['se puso colorado', 'I', 'Location 663'],
 ['fulminó', 'I', 'Location 681'],
 ['pátina;', 'I', 'Location 684'],
 ['arrimaron', 'I', 'Location 692'],
 ['carcajadas', 'I', 'Location 692'],
 ['El Magistral le iba a la mano', 'I', 'Location 709'],
 ['Era menester que', 'I', 'Location 716'],
 ['flamantes', 'I', 'Location 734'],
 ['hacer aspavientos.', 'I', 'Location 734'],
 ['pellizco', 'I', 'Location 741'],
 ['calzón', 'I', 'Location 746'],
 ['estrépito', 'I', 'Location 754'],
 ['enaguas;', 'I', 'Location 755'],
 ['disimulaba'

In [6]:
# Read the whole text file into memory as a single string.
with open('regenta-tomo-I.txt', mode='r', encoding='utf-8') as tomo_file:
    tomo1 = tomo_file.read()

In [8]:
# Get the location number out of each 'Location NNN' string and store it as an
# int in the fourth slot of the row.
for sublist in wcl:
    location_str = sublist[2]
    location_number = int(location_str.removeprefix('Location '))
    # Slice assignment instead of append: a row with three elements gains a
    # fourth, and a row that already has one (because this cell was run before)
    # gets it replaced, so re-running the cell never makes the rows grow.
    sublist[3:] = [location_number]

In [9]:
# Show the rows again, now with the integer location as a fourth element.
wcl

[['restriñimientos.', 'I', 'Location 591', 591],
 ['sucedáneos.', 'I', 'Location 598', 598],
 ['remordimientos', 'I', 'Location 602', 602],
 ['mentecato.', 'I', 'Location 609', 609],
 ['veleidoso,', 'I', 'Location 611', 611],
 ['aturdirse,', 'I', 'Location 611', 611],
 ['desvencijada,', 'I', 'Location 626', 626],
 ['apretando', 'I', 'Location 631', 631],
 ['entablaba', 'I', 'Location 638', 638],
 ['a lo sumo,', 'I', 'Location 640', 640],
 ['culebras', 'I', 'Location 651', 651],
 ['despreocupada,', 'I', 'Location 652', 652],
 ['se puso colorado', 'I', 'Location 663', 663],
 ['fulminó', 'I', 'Location 681', 681],
 ['pátina;', 'I', 'Location 684', 684],
 ['arrimaron', 'I', 'Location 692', 692],
 ['carcajadas', 'I', 'Location 692', 692],
 ['El Magistral le iba a la mano', 'I', 'Location 709', 709],
 ['Era menester que', 'I', 'Location 716', 716],
 ['flamantes', 'I', 'Location 734', 734],
 ['hacer aspavientos.', 'I', 'Location 734', 734],
 ['pellizco', 'I', 'Location 741', 741],
 ['calzón',

### Mark where the chapters are in the text

In [16]:
# Distinct chapter numerals, taken from the second field of every row.
chapter_numerals = set(row[1] for row in wcl)
chapter_numerals

{'I',
 'II',
 'III',
 'IV',
 'IX',
 'V',
 'VI',
 'VII',
 'VIII',
 'X',
 'XI',
 'XII',
 'XIII',
 'XIV',
 'XV'}

In [17]:
# Wrap every numeral in em dashes to get the heading strings to look for.
chapter_headings = list(map('—{}—'.format, chapter_numerals))
chapter_headings

['—XV—',
 '—IX—',
 '—V—',
 '—X—',
 '—XII—',
 '—XIII—',
 '—II—',
 '—IV—',
 '—VI—',
 '—XI—',
 '—VII—',
 '—XIV—',
 '—III—',
 '—VIII—',
 '—I—']

In [55]:
# Map each chapter numeral to the character offset of its '—N—' heading in the text.
chapters_idxs = {n.strip('—'): tomo1.find(n) for n in chapter_headings}

# str.find returns -1 when a heading is absent. These offsets are used further
# down as the starting point of a search, where -1 would silently restrict the
# search to the last character of the text, so fail loudly here instead.
missing_chapters = sorted(ch for ch, idx in chapters_idxs.items() if idx == -1)
assert not missing_chapters, f'chapter headings not found in text: {missing_chapters}'

In [56]:
# Show the offset found for each chapter heading (str.find gives -1 when a heading is absent).
chapters_idxs

{'XV': 758315,
 'IX': 403884,
 'V': 196396,
 'X': 456052,
 'XII': 553257,
 'XIII': 642061,
 'II': 73051,
 'IV': 151666,
 'VI': 261158,
 'XI': 492454,
 'VII': 301088,
 'XIV': 722539,
 'III': 112178,
 'VIII': 342027,
 'I': 0}

### Search for the words starting at a specific chapter

In [58]:
def get_context(word, text, n=300, start_idx=0):
    """Return the text surrounding the first WORD found at or after START_IDX.

    Parameters
    ----------
    word : str
        The word or phrase to look for.
    text : str
        The text to search in.
    n : int
        Number of characters of context on each side. The returned slice runs
        from N characters before the start of WORD to N characters after the
        start of WORD, so the characters of WORD itself count towards the
        trailing N.
    start_idx : int
        Index in TEXT at which the search begins.

    Returns
    -------
    str
        The context slice, shortened where it would run past either end of
        TEXT, or an empty string when WORD is not found.
    """
    word_idx = text.find(word, start_idx)
    if word_idx == -1:
        # Not found: without this guard the -1 would be used as a slice offset.
        return ''
    # Clamp at 0: a negative start would make the slice count from the END of
    # the text and return the wrong (typically empty) context.
    context_start = max(0, word_idx - n)
    return text[context_start : word_idx + n]

# Build [word, context, chapter, location string] rows. The search for each word
# starts at the offset of its chapter heading; it is not bounded by the end of
# that chapter.
wccl = []
for sublist in wcl:
    word, ch, location = sublist[:3]
    ch_context = get_context(word, tomo1, n=400, start_idx=chapters_idxs[ch])
    wccl.append([word, ch_context, ch, location])

In [59]:
# Show the [word, context, chapter, location] rows.
wccl

[['restriñimientos.',
  'a en que le tomaron por clérigo, se dejaba la barba, de un negro de tinta china, pero la recortaba como el boj de su huerto. Tenía la boca muy grande, y al sonreír con propósito de agradar, los labios iban de oreja a oreja. No se sabe por qué entonces era cuando mejor se conocía que Bermúdez no se quejaba de vicio al quejarse del pícaro estómago, de digestiones difíciles y sobre todo de perpetuos restriñimientos. Era una sonrisa llena de arrugas, que equivalía a una mueca provocada por un dolor intestinal, aquella con que Bermúdez quería pasar por el hombre más espiritual de Vetusta, y el más capaz de comprender una pasión profunda y alambicada. Pues debe advertirse que sus lecturas serias de cronicones y otros libros viejos alternaban en su ambicioso espíritu con las novelas más finas y p',
  'I',
  'Location 591'],
 ['sucedáneos.',
  'nda don Saturno y quedar convertida en sotana era todo uno. Siempre parecía que iba de luto, aunque no fuera. Sin embargo, poc

### Export to pipe-separated .csv
Pipes separate the values because we want to be able to inject HTML and CSS styling into the fields. This cell requires the `wccl` variable built a few cells above.

In [60]:
# Write a header row followed by one row per entry of wccl, pipe-delimited.
with open('word_ch-context_chapter_kindle-location.csv', 'w', newline='', encoding='utf-8') as psvfile:
    # 'excel' is the csv module's default dialect; only the delimiter is overridden.
    psvwriter = csv.writer(psvfile, delimiter='|', dialect='excel')
    first_row = ['word', 'ch-context', 'chapter', 'kindle-location']
    psvwriter.writerows([first_row, *wccl])