## Parse & Annotate

In [2]:
import numpy as np
import pandas as pd
import configparser
config = configparser.ConfigParser()

In [3]:
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
config.read("../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [5]:
!ls -l {data_home}

total 89664
-rw-r--r--@ 1 Samantha  staff   1752309 Jan 27 17:42 austen-persuasion.csv
drwxr-xr-x@ 9 Samantha  staff       288 Jan 16 08:52 [34mgutenberg[m[m
-rw-r--r--@ 1 Samantha  staff  43681675 Feb 28 14:01 novels-CORPUS.csv
-rw-r--r--@ 1 Samantha  staff       421 Feb 28 14:00 novels-LIB.csv
-rw-r--r--@ 1 Samantha  staff    465627 Jan 20 15:27 pg42324.txt
drwxr-xr-x@ 6 Samantha  staff       192 Apr  7 20:49 [34mwoolf[m[m


In [6]:
data_directory = f"{data_home}/woolf"
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

#### Get lines

In [8]:
# The Voyage Out
voyage = f"{data_directory}/voyage.txt"
LINES_voyage = pd.DataFrame(open(voyage, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_voyage.index.name = 'line_num'
LINES_voyage.line_str = LINES_voyage.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Night and Day
nightday = f"{data_directory}/nightday.txt"
LINES_nightday = pd.DataFrame(open(nightday, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_nightday.index.name = 'line_num'
LINES_nightday.line_str = LINES_nightday.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Jacob's Room
jacob = f"{data_directory}/jacobsroom.txt"
LINES_jacob = pd.DataFrame(open(jacob, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_jacob.index.name = 'line_num'
LINES_jacob.line_str = LINES_jacob.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Mrs. Dalloway
dalloway = f"{data_directory}/dalloway.txt"
LINES_dalloway = pd.DataFrame(open(dalloway, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_dalloway.index.name = 'line_num'
LINES_dalloway.line_str = LINES_dalloway.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

#### Clean up

In [10]:
books = [LINES_voyage, LINES_nightday, LINES_jacob, LINES_dalloway]

def clip(book):
    clip_pats = [
        r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
        r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
    ]
    
    pat_a = book.line_str.str.match(clip_pats[0])
    pat_b = book.line_str.str.match(clip_pats[1])

    line_a = book.loc[pat_a].index[0] + 1
    line_b = book.loc[pat_b].index[0] - 1

    return book.loc[line_a : line_b]

LINES_voyage = clip(LINES_voyage)
LINES_nightday = clip(LINES_nightday)
LINES_jacob = clip(LINES_jacob)
LINES_dalloway = clip(LINES_dalloway)

#### Chunk by chapter (sections for Mrs. Dalloway)

voyage:

In [13]:
# First get rid of table of contents
LINES_voyage = LINES_voyage.iloc[38:]

chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_voyage = LINES_voyage.line_str.str.match(chap_pat, case=False)

In [14]:
LINES_voyage.loc[chap_lines_voyage, 'chap_num'] = [i+1 for i in range(LINES_voyage.loc[chap_lines_voyage].shape[0])]

In [15]:
LINES_voyage.chap_num = LINES_voyage.chap_num.ffill()

In [16]:
LINES_voyage = LINES_voyage.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_voyage = LINES_voyage.loc[~chap_lines_voyage] # Remove chapter heading lines; their work is done
LINES_voyage.chap_num = LINES_voyage.chap_num.astype('int') # Convert chap_num from float to int

In [17]:
LINES_voyage.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
7650,because for the moment he could not remember w...,14
13738,"“Now, Nurse,” he whispered, “please tell me yo...",25
7388,"and stepped into the radius of the light, and ...",14


In [18]:
# Make CHAPS table
CHAPS_voyage = LINES_voyage.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_voyage['chap_str'] = CHAPS_voyage.chap_str.str.strip()

In [245]:
CHAPS_voyage

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,As the streets that lead from the Strand to th...
2,"Uncomfortable as the night, with its rocking m..."
3,Early next morning there was a sound as of cha...
4,Next morning Clarissa was up before anyone els...
5,She was not able to follow up her observations...
6,“That’s the tragedy of life—as I always say!” ...
7,From a distance the _Euphrosyne_ looked very s...
8,"The next few months passed away, as many years..."
9,"An hour passed, and the downstairs rooms at th..."
10,Among the promises which Mrs. Ambrose had made...


nightday:

In [20]:
LINES_nightday = LINES_nightday.iloc[56:]

In [21]:
chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_nightday = LINES_nightday.line_str.str.match(chap_pat, case=False)

In [22]:
LINES_nightday.loc[chap_lines_nightday, 'chap_num'] = [i+1 for i in range(LINES_nightday.loc[chap_lines_nightday].shape[0])]

In [23]:
LINES_nightday.chap_num = LINES_nightday.chap_num.ffill()

In [24]:
LINES_nightday = LINES_nightday.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_nightday = LINES_nightday.loc[~chap_lines_nightday] # Remove chapter heading lines
LINES_nightday.chap_num = LINES_nightday.chap_num.astype('int') # Convert chap_num from float to int

In [25]:
LINES_nightday.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
9517,"Directly Katharine moved she felt, inarticulat...",21
14061,“At the Zoo?” she asked.,29
13388,,27


In [26]:
# Make CHAPS table
CHAPS_nightday = LINES_nightday.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_nightday['chap_str'] = CHAPS_nightday.chap_str.str.strip()

In [243]:
CHAPS_nightday

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"It was a Sunday evening in October, and in com..."
2,The young man shut the door with a sharper sla...
3,Denham had accused Katharine Hilbery of belong...
4,"At about nine o’clock at night, on every alter..."
5,Denham had no conscious intention of following...
6,Of all the hours of an ordinary working week-d...
7,"“And little Augustus Pelham said to me, ‘It’s ..."
8,"She took her letters up to her room with her, ..."
9,Katharine disliked telling her mother about Cy...
10,"Messrs. Grateley and Hooper, the solicitors in..."


jacob:

In [28]:
LINES_jacob = LINES_jacob.iloc[20:]

In [29]:
chap_pat = r"^\s*CHAPTER\s+[A-Z]+\s*$"
chap_lines_jacob = LINES_jacob.line_str.str.match(chap_pat, case=False)

In [30]:
LINES_jacob.loc[chap_lines_jacob, 'chap_num'] = [i+1 for i in range(LINES_jacob.loc[chap_lines_jacob].shape[0])]

In [31]:
LINES_jacob.chap_num = LINES_jacob.chap_num.ffill()

In [32]:
LINES_jacob = LINES_jacob.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_jacob = LINES_jacob.loc[~chap_lines_jacob] # Remove chapter heading lines
LINES_jacob.chap_num = LINES_jacob.chap_num.astype('int') # Convert chap_num from float to int

In [33]:
LINES_jacob.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
3273,that the mind keeps beneath the windows of oth...,8
3827,"The magnificent world--the live, sane, vigorou...",9
3734,"Plato and Shakespeare, and then are buried at ...",9


In [34]:
# Make CHAPS table
CHAPS_jacob = LINES_jacob.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_jacob['chap_str'] = CHAPS_jacob.chap_str.str.strip()

In [241]:
CHAPS_jacob

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"""So of course,"" wrote Betty Flanders, pressing..."
2,"""MRS. FLANDERS""--""Poor Betty Flanders""--""Dear ..."
3,"""This is not a smoking-carriage,"" Mrs. Norman ..."
4,"What's the use of trying to read Shakespeare, ..."
5,"""I rather think,"" said Jacob, taking his pipe ..."
6,"The flames had fairly caught.\n\n""There's St. ..."
7,About this time a firm of merchants having dea...
8,"About half-past nine Jacob left the house, his..."
9,The Countess of Rocksbier sat at the head of t...
10,Through the disused graveyard in the parish of...


dalloway:

In [214]:
LINES_dalloway = LINES_dalloway.iloc[68:]

In [216]:
fake_break = pd.DataFrame({'line_str': ['* * * * *']})
LINES_dalloway = pd.concat([fake_break, LINES_dalloway], ignore_index=True)

LINES_dalloway.reset_index(drop=True, inplace=True)
LINES_dalloway.index.name = 'line_num'

In [218]:
chap_pat = r"^\s*(\*\s*){3,}\s*$" # mrs dalloway doesn't have chapters, but i am treating sections as chapters
chap_lines_dalloway = LINES_dalloway.line_str.str.match(chap_pat, case=False)

In [226]:
len(LINES_dalloway.loc[chap_lines_dalloway]) # 9 sections

9

In [228]:
LINES_dalloway.loc[chap_lines_dalloway, 'chap_num'] = [i+1 for i in range(LINES_dalloway.loc[chap_lines_dalloway].shape[0])]

In [230]:
LINES_dalloway.chap_num = LINES_dalloway.chap_num.ffill()

In [232]:
LINES_dalloway = LINES_dalloway.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_dalloway = LINES_dalloway.loc[~chap_lines_dalloway] # Remove chapter heading lines
LINES_dalloway.chap_num = LINES_dalloway.chap_num.astype('int') # Convert chap_num from float to int

In [234]:
LINES_dalloway.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1782,dispense with a dark flutter of the leaves cha...,5
6220,,8
1463,"Now it was time to move, and, as a woman gathe...",3


In [236]:
# Make CHAPS table
CHAPS_dalloway = LINES_dalloway.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_dalloway['chap_str'] = CHAPS_dalloway.chap_str.str.strip()

In [249]:
# Get rid of "Transcriber's note" (chap_num 9)
CHAPS_dalloway = CHAPS_dalloway.drop(9)

In [255]:
CHAPS_dalloway

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,Mrs. Dalloway said she would buy the flowers h...
2,The violent explosion which made Mrs. Dalloway...
3,“What are they looking at?” said Clarissa Dall...
4,"Remember my party, remember my party, said Pet..."
5,The grey nurse resumed her knitting as Peter W...
6,So the elderly nurse knitted over the sleeping...
7,"It was awful, he cried, awful, awful!\n\nStill..."
8,"Lucy came running full tilt downstairs, having..."


#### Chunk by paragraph

In [260]:
para_pat = r'\n\n+'

In [262]:
def get_para(CHAPS, para_pat):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
    
    PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack().to_frame('para_str').sort_index()
    PARAS.index.names = OHCO[:2]

    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
    PARAS['para_str'] = PARAS['para_str'].str.strip()
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

    return PARAS

In [264]:
PARAS_voyage = get_para(CHAPS_voyage, para_pat)
PARAS_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,As the streets that lead from the Strand to th...
1,1,One afternoon in the beginning of October when...
1,2,The embankment juts out in angles here and the...
1,3,"Although Mrs. Ambrose stood quite still, much ..."
1,4,Lars Porsena of Clusium By the nine Gods he sw...


In [266]:
PARAS_nightday = get_para(CHAPS_nightday, para_pat)
PARAS_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"It was a Sunday evening in October, and in com..."
1,1,Considering that the little party had been sea...
1,2,"That fact was perceptible to Mr. Denham also, ..."
1,3,"“Now, what would you do if you were married to..."
1,4,"“Surely she could learn Persian,” broke in a t..."


In [268]:
PARAS_jacob = get_para(CHAPS_jacob, para_pat)
PARAS_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"""So of course,"" wrote Betty Flanders, pressing..."
1,1,"Slowly welling from the point of her gold nib,..."
1,2,"""... nothing for it but to leave,"" she read."
1,3,"""Well, if Jacob doesn't want to play"" (the sha..."
1,4,"""Where IS that tiresome little boy?"" she said...."


In [270]:
PARAS_dalloway = get_para(CHAPS_dalloway, para_pat)
PARAS_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,Mrs. Dalloway said she would buy the flowers h...
1,1,For Lucy had her work cut out for her. The doo...
1,2,What a lark! What a plunge! For so it had alwa...
1,3,"She stiffened a little on the kerb, waiting fo..."
1,4,For having lived in Westminster--how many year...


#### Chunk by sentence

In [288]:
import re

def mask_abbreviations(text):
    # Replace known abbreviations with safe versions (no periods)
    abbr_map = {
        'Mr.': 'Mr',
        'Mrs.': 'Mrs',
        'Ms.': 'Ms',
        'Dr.': 'Dr',
        'Prof.': 'Prof',
        'St.': 'St',
        'Jr.': 'Jr',
        'Sr.': 'Sr',
        'etc.': 'etc',
        'e.g.': 'eg',
        'i.e.': 'ie',
    }
    for k, v in abbr_map.items():
        text = text.replace(k, v)
    return text

def get_sent(PARAS):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

    # Mask abbreviations to avoid splitting on them
    PARA_MASKED = PARAS['para_str'].apply(mask_abbreviations)

    # Now split on punctuation followed by whitespace
    SENTS = PARA_MASKED.str.split(r'(?<=[.?!])\s+', expand=True).stack().to_frame('sent_str')
    SENTS.index.names = OHCO[:3]

    # Clean up
    SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')]
    SENTS['sent_str'] = SENTS['sent_str'].str.strip()

    return SENTS

In [292]:
SENTS_voyage = get_sent(PARAS_voyage)
SENTS_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,As the streets that lead from the Strand to th...
1,0,1,"If you persist, lawyers’ clerks will have to m..."
1,0,2,In the streets of London where beauty goes unr...
1,1,0,One afternoon in the beginning of October when...
1,1,1,Angry glances struck upon their backs.


In [294]:
SENTS_nightday = get_sent(PARAS_nightday)
SENTS_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,"It was a Sunday evening in October, and in com..."
1,0,1,Perhaps a fifth part of her mind was thus occu...
1,0,2,"But although she was silent, she was evidently..."
1,0,3,A single glance was enough to show that Mrs Hi...
1,1,0,Considering that the little party had been sea...


In [296]:
SENTS_jacob = get_sent(PARAS_jacob)
SENTS_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,"""So of course,"" wrote Betty Flanders, pressing..."
1,1,0,"Slowly welling from the point of her gold nib,..."
1,1,1,The entire bay quivered; the lighthouse wobble...
1,1,2,She winked quickly.
1,1,3,Accidents were awful things.


In [298]:
SENTS_dalloway = get_sent(PARAS_dalloway)
SENTS_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,Mrs Dalloway said she would buy the flowers he...
1,1,0,For Lucy had her work cut out for her.
1,1,1,The doors would be taken off their hinges; Rum...
1,1,2,"And then, thought Clarissa Dalloway, what a mo..."
1,2,0,What a lark!


#### Chunk by token (get corpus tables!)

In [301]:
def get_tokens(SENTS):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

    token_pat = r"[\s',-]+"
    TOKENS = SENTS['sent_str'].str.split(token_pat, expand=True).stack().to_frame('token_str')

    TOKENS.index.names = OHCO[:4]

    return TOKENS

In [303]:
TOKEN_voyage = get_tokens(SENTS_voyage)
TOKEN_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,As
1,0,0,1,the
1,0,0,2,streets
1,0,0,3,that
1,0,0,4,lead


In [305]:
TOKEN_nightday = get_tokens(SENTS_nightday)
TOKEN_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,It
1,0,0,1,was
1,0,0,2,a
1,0,0,3,Sunday
1,0,0,4,evening


In [308]:
TOKEN_jacob = get_tokens(SENTS_jacob)
TOKEN_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,"""So"
1,0,0,1,of
1,0,0,2,course
1,0,0,3,""""
1,0,0,4,wrote


In [310]:
TOKEN_dalloway = get_tokens(SENTS_dalloway)
TOKEN_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,Mrs
1,0,0,1,Dalloway
1,0,0,2,said
1,0,0,3,she
1,0,0,4,would


In [314]:
len(TOKEN_dalloway) + len(TOKEN_jacob) + len(TOKEN_nightday) + len(TOKEN_voyage)

427340