## Parse & Annotate

In [174]:
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
# Standard library
import configparser

# Third-party
import numpy as np
import pandas as pd

# Empty parser; it is populated from env.ini in the next cell.
config = configparser.ConfigParser()

In [6]:
# Load machine-specific settings from env.ini (the path is relative to the
# notebook's working directory).
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so that list is checked here; otherwise a
# missing env.ini only surfaces as a confusing KeyError on 'data_home'.
_env_ini = "../../env.ini"
if not config.read(_env_ini):
    raise FileNotFoundError(f"Could not read config file {_env_ini!r}")

# Root directory of the input data (listed and read in the following cells).
data_home = config['DEFAULT']['data_home']
# Presumably the directory results are written to; not used in this part of
# the notebook -- confirm.
output_dir = config['DEFAULT']['output_dir']

In [8]:
# List the contents of the data directory. This is an IPython shell escape;
# `{data_home}` is interpolated from the Python variable of that name.
!ls -l {data_home}

total 89664
-rw-r--r--@ 1 Samantha  staff   1752309 Jan 27 17:42 austen-persuasion.csv
drwxr-xr-x@ 9 Samantha  staff       288 Jan 16 08:52 gutenberg
-rw-r--r--@ 1 Samantha  staff  43681675 Feb 28 14:01 novels-CORPUS.csv
-rw-r--r--@ 1 Samantha  staff       421 Feb 28 14:00 novels-LIB.csv
-rw-r--r--@ 1 Samantha  staff    465627 Jan 20 15:27 pg42324.txt
drwxr-xr-x@ 6 Samantha  staff       192 Apr  7 20:49 woolf


In [81]:
# Directory holding the Woolf texts that the following cells read.
data_directory = f"{data_home}/woolf"
# Column names for the levels of the text hierarchy, presumably ordered from
# coarsest to finest. OHCO[:1] (chapter level) is the groupby key used when
# the CHAPS tables are built.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

#### Get lines

In [95]:
def _read_lines(path):
    """Read a text file into a one-column table of stripped lines.

    Each physical line of the file becomes one row of ``line_str``: newline
    characters are replaced by a space and the result is stripped of
    surrounding whitespace. The index is named ``line_num``.

    The file is decoded with the ``utf-8-sig`` codec and opened in a ``with``
    block so the handle is closed as soon as the lines have been read.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with a single ``line_str`` column.
    """
    with open(path, 'r', encoding='utf-8-sig') as text_file:
        raw_lines = text_file.readlines()

    lines = pd.DataFrame(raw_lines, columns=['line_str'])
    lines.index.name = 'line_num'
    lines['line_str'] = lines['line_str'].str.replace(r'\n+', ' ', regex=True).str.strip()
    return lines


# The Voyage Out
voyage = f"{data_directory}/voyage.txt"
LINES_voyage = _read_lines(voyage)

# Night and Day
nightday = f"{data_directory}/nightday.txt"
LINES_nightday = _read_lines(nightday)

# Jacob's Room
jacob = f"{data_directory}/jacobsroom.txt"
LINES_jacob = _read_lines(jacob)

# Mrs. Dalloway
dalloway = f"{data_directory}/dalloway.txt"
LINES_dalloway = _read_lines(dalloway)

#### Clean up

In [97]:
# Collect the four freshly loaded line tables.
# NOTE(review): this list holds the unclipped frames. The clip() calls in this
# cell rebind the LINES_* names to new objects, so `books` is not updated by
# them -- confirm this is intended wherever `books` is used.
books = [LINES_voyage, LINES_nightday, LINES_jacob, LINES_dalloway]

def clip(book):
    """Return the body of a Project Gutenberg text, without header and footer.

    Parameters
    ----------
    book : pandas.DataFrame
        Line table with a ``line_str`` column. The index is assumed to hold
        consecutive integers, because the cut points are computed as
        ``label + 1`` / ``label - 1`` and then used as ``.loc`` labels.

    Returns
    -------
    pandas.DataFrame
        The rows strictly between the first line that starts with
        ``*** START OF THE/THIS PROJECT`` and the first line that starts with
        ``*** END OF THE/THIS PROJECT``. The result is an independent copy,
        so later column assignments neither touch ``book`` nor trigger
        pandas' ``SettingWithCopyWarning``.

    Raises
    ------
    IndexError
        If either marker line is missing; the message names which one.
    """
    start_pat = r"\*\*\*\s*START OF (?:THE|THIS) PROJECT"
    end_pat = r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"

    # str.match anchors at the start of each line; na=False keeps the mask
    # strictly boolean even if a row holds a missing value.
    start_hits = book.loc[book.line_str.str.match(start_pat, na=False)].index
    end_hits = book.loc[book.line_str.str.match(end_pat, na=False)].index

    if len(start_hits) == 0:
        raise IndexError("clip: no '*** START OF THE/THIS PROJECT' marker line found")
    if len(end_hits) == 0:
        raise IndexError("clip: no '*** END OF THE/THIS PROJECT' marker line found")

    # Keep everything after the START marker and before the END marker.
    first_body_line = start_hits[0] + 1
    last_body_line = end_hits[0] - 1

    # .loc slicing is inclusive on both ends; copy() detaches the result.
    return book.loc[first_body_line:last_body_line].copy()

# Strip the Project Gutenberg header and footer from each of the four books,
# rebinding every LINES_* name to its clipped table.
LINES_voyage, LINES_nightday, LINES_jacob, LINES_dalloway = (
    clip(line_table)
    for line_table in (LINES_voyage, LINES_nightday, LINES_jacob, LINES_dalloway)
)

#### Chunk by chapter (sections for Mrs. Dalloway)

voyage:

In [99]:
# Drop the first 38 rows of the clipped text (the table of contents).
# NOTE: this rebinds LINES_voyage to a slice of itself, so re-running the cell
# drops another 38 rows.
LINES_voyage = LINES_voyage.iloc[38:]

# A chapter heading is a line holding only "CHAPTER" followed by a Roman
# numeral; the match is case-insensitive.
chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_voyage = LINES_voyage['line_str'].str.match(chap_pat, case=False)

In [105]:
# Number the heading rows 1..N in order of appearance. Rows that are not
# headings are filled in by the next cell.
LINES_voyage.loc[chap_lines_voyage, 'chap_num'] = list(range(1, int(chap_lines_voyage.sum()) + 1))

In [111]:
# Carry each heading's chapter number down to the lines that follow it.
LINES_voyage['chap_num'] = LINES_voyage['chap_num'].ffill()

In [119]:
LINES_voyage = (
    LINES_voyage
    .dropna(subset=['chap_num'])      # rows ahead of the first heading have no chapter
    .loc[~chap_lines_voyage]          # the heading rows themselves are no longer needed
    .astype({'chap_num': 'int'})      # ffill left floats behind; store whole numbers
)

In [141]:
# Spot-check three randomly chosen rows (no seed is set, so the rows shown
# change on every run).
LINES_voyage.sample(3)

line_num,line_str,chap_num
8851,going to describe the kind of parties I once w...,16
4795,or absolutely terrifying. I don’t know which a...,9
204,for Night Schools.,1


In [143]:
# Chapter-level table: one row per chap_num whose chap_str is that chapter's
# lines joined with newlines, with surrounding whitespace trimmed.
CHAPS_voyage = (
    LINES_voyage
    .groupby(OHCO[:1])['line_str']
    .apply(lambda chapter_lines: '\n'.join(chapter_lines))
    .str.strip()
    .to_frame('chap_str')
)

nightday:

In [160]:
# Drop the first 56 rows of the clipped text -- presumably front matter / table
# of contents, as noted for the voyage text; confirm against the file.
# NOTE: the name is rebound to a slice of itself, so re-running this cell
# drops another 56 rows.
LINES_nightday = LINES_nightday.iloc[56:]

In [164]:
# A chapter heading is a line holding only "CHAPTER" followed by a Roman
# numeral; the match is case-insensitive.
chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_nightday = LINES_nightday['line_str'].str.match(chap_pat, case=False)

In [168]:
# Number the heading rows 1..N in order of appearance. Rows that are not
# headings are filled in by the next cell.
LINES_nightday.loc[chap_lines_nightday, 'chap_num'] = list(range(1, int(chap_lines_nightday.sum()) + 1))

In [176]:
# Carry each heading's chapter number down to the lines that follow it.
LINES_nightday['chap_num'] = LINES_nightday['chap_num'].ffill()

In [180]:
LINES_nightday = (
    LINES_nightday
    .dropna(subset=['chap_num'])      # rows ahead of the first heading have no chapter
    .loc[~chap_lines_nightday]        # the heading rows themselves are no longer needed
    .astype({'chap_num': 'int'})      # ffill left floats behind; store whole numbers
)

In [184]:
# Spot-check three randomly chosen rows (no seed is set, so the rows shown
# change on every run).
LINES_nightday.sample(3)

line_num,line_str,chap_num
12194,that Katharine was outside the community in so...,26
9427,"Katharine, who spoke sincerely, could envy her.",21
18040,"unfinished, the unfulfilled, the unwritten, th...",34


In [186]:
# Chapter-level table: one row per chap_num whose chap_str is that chapter's
# lines joined with newlines, with surrounding whitespace trimmed.
CHAPS_nightday = (
    LINES_nightday
    .groupby(OHCO[:1])['line_str']
    .apply(lambda chapter_lines: '\n'.join(chapter_lines))
    .str.strip()
    .to_frame('chap_str')
)

jacob:

In [199]:
# Drop the first 20 rows of the clipped text -- presumably front matter / table
# of contents, as noted for the voyage text; confirm against the file.
# NOTE: the name is rebound to a slice of itself, so re-running this cell
# drops another 20 rows.
LINES_jacob = LINES_jacob.iloc[20:]

In [201]:
# Heading pattern for this book: a line holding only "CHAPTER" followed by a
# run of letters (any letters, not just Roman-numeral characters), matched
# case-insensitively. Presumably this edition spells its chapter numbers out
# as words -- confirm against the text.
chap_pat = r"^\s*CHAPTER\s+[A-Z]+\s*$"
chap_lines_jacob = LINES_jacob['line_str'].str.match(chap_pat, case=False)

In [203]:
# Number the heading rows 1..N in order of appearance. Rows that are not
# headings are filled in by the next cell.
LINES_jacob.loc[chap_lines_jacob, 'chap_num'] = list(range(1, int(chap_lines_jacob.sum()) + 1))

In [205]:
# Carry each heading's chapter number down to the lines that follow it.
LINES_jacob['chap_num'] = LINES_jacob['chap_num'].ffill()

In [209]:
LINES_jacob = (
    LINES_jacob
    .dropna(subset=['chap_num'])      # rows ahead of the first heading have no chapter
    .loc[~chap_lines_jacob]           # the heading rows themselves are no longer needed
    .astype({'chap_num': 'int'})      # ffill left floats behind; store whole numbers
)

In [215]:
# Spot-check three randomly chosen rows (no seed is set, so the rows shown
# change on every run).
LINES_jacob.sample(3)

line_num,line_str,chap_num
2251,by the industrious pen. Innumerable overcoats ...,5
4047,barked down in the hollow. The liquid shadows ...,10
2866,"Messrs. Mackie's dye-works, suffering in winte...",7


In [217]:
# Chapter-level table: one row per chap_num whose chap_str is that chapter's
# lines joined with newlines, with surrounding whitespace trimmed.
CHAPS_jacob = (
    LINES_jacob
    .groupby(OHCO[:1])['line_str']
    .apply(lambda chapter_lines: '\n'.join(chapter_lines))
    .str.strip()
    .to_frame('chap_str')
)

dalloway: