## Parse & Annotate

In [2]:
import numpy as np
import pandas as pd
import configparser
config = configparser.ConfigParser()

In [3]:
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
config.read("../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [5]:
!ls -l {data_home}

total 89664
-rw-r--r--@ 1 Samantha  staff   1752309 Jan 27 17:42 austen-persuasion.csv
drwxr-xr-x@ 9 Samantha  staff       288 Jan 16 08:52 [34mgutenberg[m[m
-rw-r--r--@ 1 Samantha  staff  43681675 Feb 28 14:01 novels-CORPUS.csv
-rw-r--r--@ 1 Samantha  staff       421 Feb 28 14:00 novels-LIB.csv
-rw-r--r--@ 1 Samantha  staff    465627 Jan 20 15:27 pg42324.txt
drwxr-xr-x@ 7 Samantha  staff       224 May  1 22:04 [34mwoolf[m[m


In [6]:
data_directory = f"{data_home}/woolf"
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

#### Get lines

In [8]:
# The Voyage Out
voyage = f"{data_directory}/voyage.txt"
LINES_voyage = pd.DataFrame(open(voyage, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_voyage.index.name = 'line_num'
LINES_voyage.line_str = LINES_voyage.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Night and Day
nightday = f"{data_directory}/nightday.txt"
LINES_nightday = pd.DataFrame(open(nightday, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_nightday.index.name = 'line_num'
LINES_nightday.line_str = LINES_nightday.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Jacob's Room
jacob = f"{data_directory}/jacobsroom.txt"
LINES_jacob = pd.DataFrame(open(jacob, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_jacob.index.name = 'line_num'
LINES_jacob.line_str = LINES_jacob.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

# Mrs. Dalloway
dalloway = f"{data_directory}/dalloway.txt"
LINES_dalloway = pd.DataFrame(open(dalloway, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES_dalloway.index.name = 'line_num'
LINES_dalloway.line_str = LINES_dalloway.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

#### Clean up

In [10]:
books = [LINES_voyage, LINES_nightday, LINES_jacob, LINES_dalloway]

def clip(book):
    clip_pats = [
        r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
        r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
    ]
    
    pat_a = book.line_str.str.match(clip_pats[0])
    pat_b = book.line_str.str.match(clip_pats[1])

    line_a = book.loc[pat_a].index[0] + 1
    line_b = book.loc[pat_b].index[0] - 1

    return book.loc[line_a : line_b]

LINES_voyage = clip(LINES_voyage)
LINES_nightday = clip(LINES_nightday)
LINES_jacob = clip(LINES_jacob)
LINES_dalloway = clip(LINES_dalloway)

#### Chunk by chapter (sections for Mrs. Dalloway)

voyage:

In [13]:
# First get rid of table of contents
LINES_voyage = LINES_voyage.iloc[38:]

chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_voyage = LINES_voyage.line_str.str.match(chap_pat, case=False)

In [14]:
LINES_voyage.loc[chap_lines_voyage, 'chap_num'] = [i+1 for i in range(LINES_voyage.loc[chap_lines_voyage].shape[0])]

In [15]:
LINES_voyage.chap_num = LINES_voyage.chap_num.ffill()

In [16]:
LINES_voyage = LINES_voyage.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_voyage = LINES_voyage.loc[~chap_lines_voyage] # Remove chapter heading lines; their work is done
LINES_voyage.chap_num = LINES_voyage.chap_num.astype('int') # Convert chap_num from float to int

In [17]:
LINES_voyage.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
13620,"directions, looked over at Rachel, but did not...",25
13499,,25
1681,of her person lacked its proper instrument. Th...,3


In [18]:
# Make CHAPS table
CHAPS_voyage = LINES_voyage.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_voyage['chap_str'] = CHAPS_voyage.chap_str.str.strip()

In [19]:
CHAPS_voyage

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,As the streets that lead from the Strand to th...
2,"Uncomfortable as the night, with its rocking m..."
3,Early next morning there was a sound as of cha...
4,Next morning Clarissa was up before anyone els...
5,She was not able to follow up her observations...
6,“That’s the tragedy of life—as I always say!” ...
7,From a distance the _Euphrosyne_ looked very s...
8,"The next few months passed away, as many years..."
9,"An hour passed, and the downstairs rooms at th..."
10,Among the promises which Mrs. Ambrose had made...


nightday:

In [21]:
LINES_nightday = LINES_nightday.iloc[56:]

In [22]:
chap_pat = r"^\s*CHAPTER\s+[IVXLCDM]+\s*$"
chap_lines_nightday = LINES_nightday.line_str.str.match(chap_pat, case=False)

In [23]:
LINES_nightday.loc[chap_lines_nightday, 'chap_num'] = [i+1 for i in range(LINES_nightday.loc[chap_lines_nightday].shape[0])]

In [24]:
LINES_nightday.chap_num = LINES_nightday.chap_num.ffill()

In [25]:
LINES_nightday = LINES_nightday.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_nightday = LINES_nightday.loc[~chap_lines_nightday] # Remove chapter heading lines
LINES_nightday.chap_num = LINES_nightday.chap_num.astype('int') # Convert chap_num from float to int

In [26]:
LINES_nightday.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1917,exclaimed. “I hope you don’t sleep in this roo...,4
3180,,6
15466,like children who had been caught prying. They...,31


In [27]:
# Make CHAPS table
CHAPS_nightday = LINES_nightday.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_nightday['chap_str'] = CHAPS_nightday.chap_str.str.strip()

In [28]:
CHAPS_nightday

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"It was a Sunday evening in October, and in com..."
2,The young man shut the door with a sharper sla...
3,Denham had accused Katharine Hilbery of belong...
4,"At about nine o’clock at night, on every alter..."
5,Denham had no conscious intention of following...
6,Of all the hours of an ordinary working week-d...
7,"“And little Augustus Pelham said to me, ‘It’s ..."
8,"She took her letters up to her room with her, ..."
9,Katharine disliked telling her mother about Cy...
10,"Messrs. Grateley and Hooper, the solicitors in..."


jacob:

In [30]:
LINES_jacob = LINES_jacob.iloc[20:]

In [31]:
chap_pat = r"^\s*CHAPTER\s+[A-Z]+\s*$"
chap_lines_jacob = LINES_jacob.line_str.str.match(chap_pat, case=False)

In [32]:
LINES_jacob.loc[chap_lines_jacob, 'chap_num'] = [i+1 for i in range(LINES_jacob.loc[chap_lines_jacob].shape[0])]

In [33]:
LINES_jacob.chap_num = LINES_jacob.chap_num.ffill()

In [34]:
LINES_jacob = LINES_jacob.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_jacob = LINES_jacob.loc[~chap_lines_jacob] # Remove chapter heading lines
LINES_jacob.chap_num = LINES_jacob.chap_num.astype('int') # Convert chap_num from float to int

In [35]:
LINES_jacob.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
3980,,10
4712,"and moderns, with some pretty sharp hits at Mr...",12
1796,vegetable plot in front of the cottage. Mrs. P...,4


In [36]:
# Make CHAPS table
CHAPS_jacob = LINES_jacob.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_jacob['chap_str'] = CHAPS_jacob.chap_str.str.strip()

In [37]:
CHAPS_jacob

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"""So of course,"" wrote Betty Flanders, pressing..."
2,"""MRS. FLANDERS""--""Poor Betty Flanders""--""Dear ..."
3,"""This is not a smoking-carriage,"" Mrs. Norman ..."
4,"What's the use of trying to read Shakespeare, ..."
5,"""I rather think,"" said Jacob, taking his pipe ..."
6,"The flames had fairly caught.\n\n""There's St. ..."
7,About this time a firm of merchants having dea...
8,"About half-past nine Jacob left the house, his..."
9,The Countess of Rocksbier sat at the head of t...
10,Through the disused graveyard in the parish of...


dalloway:

In [39]:
LINES_dalloway = LINES_dalloway.iloc[68:]

In [40]:
fake_break = pd.DataFrame({'line_str': ['* * * * *']})
LINES_dalloway = pd.concat([fake_break, LINES_dalloway], ignore_index=True)

LINES_dalloway.reset_index(drop=True, inplace=True)
LINES_dalloway.index.name = 'line_num'

In [41]:
chap_pat = r"^\s*(\*\s*){3,}\s*$" # mrs dalloway doesn't have chapters, but i am treating sections as chapters
chap_lines_dalloway = LINES_dalloway.line_str.str.match(chap_pat, case=False)

In [42]:
len(LINES_dalloway.loc[chap_lines_dalloway]) # 9 sections

9

In [43]:
LINES_dalloway.loc[chap_lines_dalloway, 'chap_num'] = [i+1 for i in range(LINES_dalloway.loc[chap_lines_dalloway].shape[0])]

In [44]:
LINES_dalloway.chap_num = LINES_dalloway.chap_num.ffill()

In [45]:
LINES_dalloway = LINES_dalloway.dropna(subset=['chap_num']) # Remove everything before Chapter 1
LINES_dalloway = LINES_dalloway.loc[~chap_lines_dalloway] # Remove chapter heading lines
LINES_dalloway.chap_num = LINES_dalloway.chap_num.astype('int') # Convert chap_num from float to int

In [46]:
LINES_dalloway.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
2749,alive to them.,7
5802,might have been better if Richard had married ...,8
4313,lady’s disorder; her hair down; her parcel on ...,7


In [47]:
# Make CHAPS table
CHAPS_dalloway = LINES_dalloway.groupby(OHCO[:1])\
    .line_str.apply(lambda x: '\n'.join(x))\
    .to_frame('chap_str')

CHAPS_dalloway['chap_str'] = CHAPS_dalloway.chap_str.str.strip()

In [48]:
# Get rid of "Transcriber's note" (chap_num 9)
CHAPS_dalloway = CHAPS_dalloway.drop(9)

In [49]:
CHAPS_dalloway

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,Mrs. Dalloway said she would buy the flowers h...
2,The violent explosion which made Mrs. Dalloway...
3,“What are they looking at?” said Clarissa Dall...
4,"Remember my party, remember my party, said Pet..."
5,The grey nurse resumed her knitting as Peter W...
6,So the elderly nurse knitted over the sleeping...
7,"It was awful, he cried, awful, awful!\n\nStill..."
8,"Lucy came running full tilt downstairs, having..."


#### Chunk by paragraph

In [51]:
para_pat = r'\n\n+'

In [52]:
def get_para(CHAPS, para_pat):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
    
    PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack().to_frame('para_str').sort_index()
    PARAS.index.names = OHCO[:2]

    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
    PARAS['para_str'] = PARAS['para_str'].str.strip()
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

    return PARAS

In [53]:
PARAS_voyage = get_para(CHAPS_voyage, para_pat)
PARAS_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,As the streets that lead from the Strand to th...
1,1,One afternoon in the beginning of October when...
1,2,The embankment juts out in angles here and the...
1,3,"Although Mrs. Ambrose stood quite still, much ..."
1,4,Lars Porsena of Clusium By the nine Gods he sw...


In [54]:
PARAS_nightday = get_para(CHAPS_nightday, para_pat)
PARAS_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"It was a Sunday evening in October, and in com..."
1,1,Considering that the little party had been sea...
1,2,"That fact was perceptible to Mr. Denham also, ..."
1,3,"“Now, what would you do if you were married to..."
1,4,"“Surely she could learn Persian,” broke in a t..."


In [55]:
PARAS_jacob = get_para(CHAPS_jacob, para_pat)
PARAS_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"""So of course,"" wrote Betty Flanders, pressing..."
1,1,"Slowly welling from the point of her gold nib,..."
1,2,"""... nothing for it but to leave,"" she read."
1,3,"""Well, if Jacob doesn't want to play"" (the sha..."
1,4,"""Where IS that tiresome little boy?"" she said...."


In [56]:
PARAS_dalloway = get_para(CHAPS_dalloway, para_pat)
PARAS_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,Mrs. Dalloway said she would buy the flowers h...
1,1,For Lucy had her work cut out for her. The doo...
1,2,What a lark! What a plunge! For so it had alwa...
1,3,"She stiffened a little on the kerb, waiting fo..."
1,4,For having lived in Westminster--how many year...


#### Chunk by sentence

In [58]:
import re

def mask_abbreviations(text):
    # Replace known abbreviations with safe versions (no periods)
    abbr_map = {
        'Mr.': 'Mr',
        'Mrs.': 'Mrs',
        'Ms.': 'Ms',
        'Dr.': 'Dr',
        'Prof.': 'Prof',
        'St.': 'St',
        'Jr.': 'Jr',
        'Sr.': 'Sr',
        'etc.': 'etc',
        'e.g.': 'eg',
        'i.e.': 'ie',
    }
    for k, v in abbr_map.items():
        text = text.replace(k, v)
    return text

def get_sent(PARAS):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

    # Mask abbreviations to avoid splitting on them
    PARA_MASKED = PARAS['para_str'].apply(mask_abbreviations)

    # Now split on punctuation followed by whitespace
    SENTS = PARA_MASKED.str.split(r'(?<=[.?!])\s+', expand=True).stack().to_frame('sent_str')
    SENTS.index.names = OHCO[:3]

    # Clean up
    SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')]
    SENTS['sent_str'] = SENTS['sent_str'].str.strip()

    return SENTS

In [59]:
SENTS_voyage = get_sent(PARAS_voyage)
SENTS_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,As the streets that lead from the Strand to th...
1,0,1,"If you persist, lawyers’ clerks will have to m..."
1,0,2,In the streets of London where beauty goes unr...
1,1,0,One afternoon in the beginning of October when...
1,1,1,Angry glances struck upon their backs.


In [60]:
SENTS_nightday = get_sent(PARAS_nightday)
SENTS_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,"It was a Sunday evening in October, and in com..."
1,0,1,Perhaps a fifth part of her mind was thus occu...
1,0,2,"But although she was silent, she was evidently..."
1,0,3,A single glance was enough to show that Mrs Hi...
1,1,0,Considering that the little party had been sea...


In [61]:
SENTS_jacob = get_sent(PARAS_jacob)
SENTS_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,"""So of course,"" wrote Betty Flanders, pressing..."
1,1,0,"Slowly welling from the point of her gold nib,..."
1,1,1,The entire bay quivered; the lighthouse wobble...
1,1,2,She winked quickly.
1,1,3,Accidents were awful things.


In [62]:
SENTS_dalloway = get_sent(PARAS_dalloway)
SENTS_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,Mrs Dalloway said she would buy the flowers he...
1,1,0,For Lucy had her work cut out for her.
1,1,1,The doors would be taken off their hinges; Rum...
1,1,2,"And then, thought Clarissa Dalloway, what a mo..."
1,2,0,What a lark!


#### Chunk by token (get corpus tables!)

In [64]:
def get_tokens(SENTS):
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

    token_pat = r"[\s',-]+"
    TOKENS = SENTS['sent_str'].str.split(token_pat, expand=True).stack().to_frame('token_str')

    TOKENS.index.names = OHCO[:4]

    return TOKENS

In [65]:
TOKEN_voyage = get_tokens(SENTS_voyage)
TOKEN_voyage.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,As
1,0,0,1,the
1,0,0,2,streets
1,0,0,3,that
1,0,0,4,lead


In [66]:
TOKEN_nightday = get_tokens(SENTS_nightday)
TOKEN_nightday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,It
1,0,0,1,was
1,0,0,2,a
1,0,0,3,Sunday
1,0,0,4,evening


In [67]:
TOKEN_jacob = get_tokens(SENTS_jacob)
TOKEN_jacob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,"""So"
1,0,0,1,of
1,0,0,2,course
1,0,0,3,""""
1,0,0,4,wrote


In [68]:
TOKEN_dalloway = get_tokens(SENTS_dalloway)
TOKEN_dalloway.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,Mrs
1,0,0,1,Dalloway
1,0,0,2,said
1,0,0,3,she
1,0,0,4,would


In [69]:
len(TOKEN_dalloway) + len(TOKEN_jacob) + len(TOKEN_nightday) + len(TOKEN_voyage)

427340

In [70]:
# Combine into one TOKEN (corpus) table

TOKEN_voyage['book_id'] = 1
TOKEN_nightday['book_id'] = 2
TOKEN_jacob['book_id'] = 3
TOKEN_dalloway['book_id'] = 4

In [71]:
TOKEN_voyage = TOKEN_voyage.set_index("book_id", append=True)
TOKEN_voyage = TOKEN_voyage.reorder_levels(["book_id", "chap_num", "para_num", "sent_num", "token_num"])

In [72]:
TOKEN_nightday = TOKEN_nightday.set_index("book_id", append=True)
TOKEN_nightday = TOKEN_nightday.reorder_levels(["book_id", "chap_num", "para_num", "sent_num", "token_num"])

In [73]:
TOKEN_jacob = TOKEN_jacob.set_index("book_id", append=True)
TOKEN_jacob = TOKEN_jacob.reorder_levels(["book_id", "chap_num", "para_num", "sent_num", "token_num"])

In [74]:
TOKEN_dalloway = TOKEN_dalloway.set_index("book_id", append=True)
TOKEN_dalloway = TOKEN_dalloway.reorder_levels(["book_id", "chap_num", "para_num", "sent_num", "token_num"])

In [75]:
TOKEN = pd.concat([TOKEN_voyage,TOKEN_nightday,TOKEN_jacob,TOKEN_dalloway])
TOKEN.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
2,27,14,3,31,for
3,3,48,1,72,he
2,29,145,0,1,have


In [76]:
TOKEN['term_str'] = TOKEN.token_str.replace(r'[\W_]+', '', regex=True).str.lower()

In [77]:
TOKEN.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
4,7,345,19,0,Holmes,holmes
2,10,3,3,82,it.,it
2,18,140,1,28,attempt,attempt


In [78]:
# Add pos and pos_group

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('universal_tagset')

def pos_tag_sentence(group):
    tokens = group['token_str'].tolist()
    tags = pos_tag(tokens)
    group['pos'] = [tag for _, tag in tags]
    return group

TOKEN = TOKEN.groupby(['book_id', 'chap_num', 'para_num', 'sent_num']).apply(pos_tag_sentence)

def coarse_pos(tag):
    if tag.startswith('NN'):
        return 'NN'
    elif tag.startswith('VB'):
        return 'VB'
    elif tag.startswith('JJ'):
        return 'JJ'
    elif tag.startswith('RB'):
        return 'RB'
    elif tag.startswith('PR'):
        return 'PR'
    elif tag.startswith('WP'):
        return 'WP'
    elif tag in ['.', ',', ':', '(', ')', "''", '``']:
        return 'PUNCT'
    else:
        return tag

TOKEN['pos_group'] = TOKEN['pos'].apply(coarse_pos)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Samantha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/Samantha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/Samantha/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  TOKEN = TOKEN.groupby(['book_id', 'chap_num', 'para_num', 'sent_num']).apply(pos_tag_sentence)


In [79]:
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos,pos_group
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,As,as,IN,IN
1,1,0,0,1,the,the,DT,DT
1,1,0,0,2,streets,streets,NNS,NN
1,1,0,0,3,that,that,WDT,WDT
1,1,0,0,4,lead,lead,VBP,VB


#### Get VOCAB table from TOKEN table

In [81]:
VOCAB = TOKEN.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

In [82]:
VOCAB.sample(5)

Unnamed: 0_level_0,n,n_chars,p,i
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
shirtings,1,9,2e-06,18.705025
tailor,2,6,5e-06,17.705025
rows,9,4,2.1e-05,15.5351
modern,21,6,4.9e-05,14.312707
sympathies,2,10,5e-06,17.705025


In [83]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [84]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [85]:
VOCAB[VOCAB.stop == 1].sample(3)

Unnamed: 0_level_0,n,n_chars,p,i,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
isn,2,3,5e-06,17.705025,1
you,2616,3,0.006122,7.351878,1
is,1365,2,0.003194,8.29034,1


In [86]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

In [87]:
VOCAB['max_pos'] = TOKEN[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = TOKEN[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [88]:
VOCAB['n_pos_group'] = TOKEN[['term_str','pos_group']].value_counts().unstack().count(1)
VOCAB['cat_pos_group'] = TOKEN[['term_str','pos_group']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos_group.apply(lambda x: set(x))

In [89]:
VOCAB['n_pos'] = TOKEN[['term_str','pos']].value_counts().unstack().count(1)
VOCAB['cat_pos'] = TOKEN[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))

In [90]:
VOCAB = VOCAB[1:]

In [91]:
VOCAB.sample(3).T

term_str,expanse,psychological,manningsand
n,1,1,1
n_chars,7,13,11
p,0.000002,0.000002,0.000002
i,18.705025,18.705025,18.705025
stop,0,0,0
stem_porter,expans,psycholog,manningsand
max_pos,NN,JJ,NNP
max_pos_group,NN,JJ,NN
n_pos_group,1,1,1
cat_pos_group,{NN},{JJ},{NN}


In [92]:
# Compute df and idf to get VOCAB['dfidf']

## To do this I have to first create the BOW table, and then the DTCM, then compute DF and IDF.
## I'm going to use chapter as the bag. 

In [93]:
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
bags = dict(
    SENTS = OHCO[:4],
    PARAS = OHCO[:3],
    CHAPS = OHCO[:2],
    BOOKS = OHCO[:1]
)
bag = 'CHAPS'

In [94]:
BOW = TOKEN.groupby(bags[bag]+['term_str']).term_str.count().to_frame('n')

In [95]:
DTCM = BOW.n.unstack(fill_value=0)

In [96]:
DF = DTCM.astype('bool').sum()

In [97]:
N = DTCM.shape[0]
idf = {
    'standard': np.log2(N / DF),
    'max': np.log2(DF.max() / DF),
    'smooth': np.log2((1 + N) / (1 + DF)) + 1
}
idf_method = 'standard'
IDF = idf[idf_method]

In [98]:
VOCAB['df'] = DF
VOCAB['idf'] = IDF

In [99]:
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf

In [100]:
VOCAB.sample(3).T

term_str,inserted,cripples,wealthy
n,1,2,1
n_chars,8,8,7
p,0.000002,0.000005,0.000002
i,18.705025,17.705025,18.705025
stop,0,0,0
stem_porter,insert,crippl,wealthi
max_pos,VBD,NNS,NN
max_pos_group,VB,NN,NN
n_pos_group,1,1,1
cat_pos_group,{VB},{NN},{NN}


#### Make LIB table

In [102]:
source_file_list = ['/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/voyage.txt', # book_id 1
                    '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/nightday.txt', # 2
                    '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/jacobsroom.txt', # 3
                    '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/dalloway.txt' # 4
]

In [103]:
book_data = []
book_id = 1
for source_file_path in source_file_list:
    if book_id == 1:
        book_title = 'THE VOYAGE OUT'
    elif book_id == 2:
        book_title = 'NIGHT AND DAY'
    elif book_id == 3:
        book_title = 'JACOBS ROOM'
    elif book_id == 4:
        book_title = 'MRS DALLOWAY'
        
    book_data.append((book_id, source_file_path, book_title))
    book_id += 1

In [104]:
book_data

[(1,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/voyage.txt',
  'THE VOYAGE OUT'),
 (2,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/nightday.txt',
  'NIGHT AND DAY'),
 (3,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/jacobsroom.txt',
  'JACOBS ROOM'),
 (4,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/woolf/dalloway.txt',
  'MRS DALLOWAY')]

In [105]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()

In [106]:
LIB['book_len'] = TOKEN.groupby('book_id').term_str.count() # number of words

In [107]:
LIB['n_chaps'] = TOKEN.reset_index()[['book_id','chap_num']]\
    .drop_duplicates()\
    .groupby('book_id').chap_num.count()

In [108]:
dates = [1915,
         1919,
         1922,
         1925]
LIB['date'] = dates

In [109]:
TOKEN['n_chars'] = TOKEN['token_str'].str.len()
chars_per_book = TOKEN.groupby('book_id')['n_chars'].sum()

In [110]:
chars_per_book

book_id
1    612493
2    760337
3    250699
4    287737
Name: n_chars, dtype: int64

In [111]:
n_chars = [612493,
          760337,
          250699,
          287737]

In [112]:
LIB['n_chars'] = n_chars

In [113]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title,book_len,n_chaps,date,n_chars
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,THE VOYAGE OUT,138107,27,1915,612493
2,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,NIGHT AND DAY,168196,34,1919,760337
3,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,JACOBS ROOM,56548,14,1922,250699
4,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,MRS DALLOWAY,64489,8,1925,287737


In [114]:
ages = [33,37,40,43]
LIB['woolf_age'] = ages

In [115]:
sexes = ['f','f','m','f']
LIB['prot_sex'] = sexes

In [116]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title,book_len,n_chaps,date,n_chars,woolf_age,prot_sex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,THE VOYAGE OUT,138107,27,1915,612493,33,f
2,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,NIGHT AND DAY,168196,34,1919,760337,37,f
3,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,JACOBS ROOM,56548,14,1922,250699,40,m
4,/Users/Samantha/Desktop/MSDS/DS5001/data/woolf...,MRS DALLOWAY,64489,8,1925,287737,43,f


### SAVE MAIN TABLES TO FILES

In [221]:
TOKEN.to_csv('woolf-CORPUS.csv', index=True, header=True)
VOCAB.to_csv('woolf-VOCAB.csv', index=True, header=True)
LIB.to_csv('woolf-LIB.csv', index=True, header=True)

#### Finish BOW, DTCM, TFIDF, TFIDF_L2

In [226]:
tf = {
    'sum': (DTCM.T / DTCM.T.sum()).T,
    'max': (DTCM.T / DTCM.T.max()).T,
    'log': (np.log2(1 + DTCM.T)).T,
    'raw':  DTCM,
    'double_norm': (DTCM.T / DTCM.T.max()).T,
    'binary': DTCM.T.astype('bool').astype('int').T
}

In [228]:
TF = tf['max']

In [230]:
TFIDF = TF * IDF

In [232]:
TFIDF.sample(3)

Unnamed: 0_level_0,term_str,Unnamed: 2_level_0,112,1215counted,1580,1660,1697,1780,1852,1853,1860,...,περῶν,πολιοῦ,πολλὰ,πόντου,τοῦτο,τὰ,χειμερίῳ,χωρεῖ,ἀν,ὑπ
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,11,0.008867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.001204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,0.003859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [234]:
BOW['tf'] = TF.stack()
BOW['tfidf'] = TFIDF.stack()

In [236]:
BOW.sort_values('tfidf', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tf,tfidf
book_id,chap_num,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,7,salvin,13,0.154762,0.986613
3,11,cruttendon,33,0.172775,0.928672
3,14,bonamy,6,0.272727,0.920465
2,29,cassandra,57,0.333333,0.855895
2,29,katharine,97,0.567251,0.779993


In [238]:
DTCM.sample(3)

Unnamed: 0_level_0,term_str,Unnamed: 2_level_0,112,1215counted,1580,1660,1697,1780,1852,1853,1860,...,περῶν,πολιοῦ,πολλὰ,πόντου,τοῦτο,τὰ,χειμερίῳ,χωρεῖ,ἀν,ὑπ
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,22,64,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [240]:
## Get L2 norm of TFIDF

# Compute the L2 norm for each document vector (row)
l2_norms = np.sqrt((TFIDF ** 2).sum(axis=1))

# Divide each row by its L2 norm
TFIDF_L2 = TFIDF.div(l2_norms, axis=0)

In [242]:
# Now reduce to top N terms with highest mean tfidf
# Note that I haven't filtered out proper nouns so many top tfidf terms are names
N_terms = 2000

# Compute average TFIDF across documents for each term
mean_tfidf = TFIDF_L2.mean(axis=0)

# Select top N terms
top_terms = mean_tfidf.sort_values(ascending=False).head(N_terms).index

# Reduce the TFIDF_L2 matrix
TFIDF_L2 = TFIDF_L2[top_terms]

In [244]:
TFIDF_L2.T.sample(10)

book_id,1,1,1,1,1,1,1,1,1,1,...,3,3,4,4,4,4,4,4,4,4
chap_num,1,2,3,4,5,6,7,8,9,10,...,13,14,1,2,3,4,5,6,7,8
term_str,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
had,0.003961,0.005394,0.003296,0.002665,0.001863,0.004406,0.002693,0.004346,0.003361,0.004052,...,0.002154,0.0,0.005805,0.003956,0.006739,0.004544,0.00027,0.006339,0.008344,0.008047
hesitation,0.028847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.019855,0.0,0.0,0.0,0.0
believed,0.0,0.0,0.0,0.0,0.0,0.012463,0.0,0.030732,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015689,0.009281,0.003905
puzzled,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
relationship,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002547,0.0
stone,0.0,0.006686,0.0,0.005368,0.0,0.0,0.027405,0.010004,0.004166,0.013746,...,0.0,0.0,0.0,0.009621,0.009546,0.0,0.0,0.0,0.00846,0.0
wound,0.010692,0.011761,0.009344,0.0,0.0,0.028548,0.016069,0.0,0.007328,0.01209,...,0.0,0.0,0.0,0.008462,0.0,0.0,0.0,0.0,0.004252,0.0
barfoot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rise,0.0,0.01308,0.0,0.0,0.0,0.015875,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009338,0.0,0.068086,0.0,0.007093,0.0
sorts,0.0,0.0,0.021561,0.0,0.0,0.016469,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015188,0.009763,0.019375,0.0,0.0,0.0,0.004906,0.0


### SAVE DERIVED TABLES TO FILES

In [246]:
BOW.to_csv(f'woolf-BOW-{bag}.csv', index=True, header=True)
DTCM.to_csv(f'woolf-DTCM-{bag}.csv', index=True, header=True)
TFIDF.to_csv(f'woolf-TFIDF-{bag}.csv', index=True, header=True)
TFIDF_L2.to_csv(f'woolf-TFIDF_L2-{bag}.csv', index=True, header=True)

#### Compute metrics/outputs for final project notebook

In [248]:
# 6.1 Average length of each document in characters
LIB.n_chars.mean()

477816.5

In [250]:
# 6.3 List the top 20 significant words in the corpus by DFIDF
VOCAB.sort_values('dfidf', ascending=False).head(20).index

Index(['pages', 'wants', 'considerable', 'god', 'happiness', 'save', 'pink',
       'single', 'john', 'bedroom', 'agree', 'gentlemen', 'interrupted',
       'burst', 'explained', 'comes', 'compared', 'hat', 'anyhow', 'force'],
      dtype='object', name='term_str')

In [252]:
# 7.1 Number of observations
len(BOW)

113953