# HW04

```yaml
Course:   DS 5001
Module:   04 Homework
Topic:    NLP and the Pipeline
Author:   Ryan Lipps
Date:     10 February 2023
```

## Setup

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px
import configparser
import sys
from textparser import TextParser

In [2]:
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [3]:
source_files = f'{data_home}/gutenberg/eliot-set'
data_prefix = 'eliot-set'

In [4]:
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [5]:
sys.path.append(local_lib)

In [6]:
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# All are 'chap'and 'm'
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (507,   rf"^\s*Chapter\s+{roman}\s*$"),
    (145,   rf"^[A-Z]+\s?{roman}\.\s*$"),
    (6688,  rf"^\s*Chapter\s+{roman}\.\s*$")
]

In [7]:
source_file_list = sorted(glob(f"{source_files}/*.*"))

In [8]:
source_file_list

['/Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt',
 '/Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt',
 '/Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt']

In [9]:
book_data = []
for source_file_path in source_file_list:
    book_id = int(source_file_path.split('-')[-1].split('.')[0].replace('pg',''))
    book_title = source_file_path.split('/')[-1].split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

## Build LIB frame

In [10]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()
LIB

Unnamed: 0_level_0,source_file_path,raw_title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
145,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE MIDDLEMARCH
507,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE ADAM BEDE
6688,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE THE MILL ON THE FLOSS


## Add chapter regex

In [11]:
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))
LIB

Unnamed: 0_level_0,source_file_path,raw_title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
145,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE MIDDLEMARCH,^[A-Z]+\s?[IVXLCM]+\.\s*$
507,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE ADAM BEDE,^\s*Chapter\s+[IVXLCM]+\s*$
6688,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.\s*$


## Tokenize Corpus

In [12]:
def tokenize_collection(LIB):

    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    books = []
    for book_id in LIB.index:

        # Announce
        print("Tokenizing", book_id, LIB.loc[book_id].raw_title)

        # Define vars
        chap_regex = LIB.loc[book_id].chap_regex
        ohco_pats = [('chap', chap_regex, 'm')]
        src_file_path = LIB.loc[book_id].source_file_path

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens();

        # Name things
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)
        
    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    # Clean up
    del(books)
    del(text)
        
    print("Done")
        
    return CORPUS

In [13]:
CORPUS = tokenize_collection(LIB)

Tokenizing 145 ELIOT GEORGE MIDDLEMARCH
Importing  /Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^[A-Z]+\s?[IVXLCM]+\.\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 507 ELIOT GEORGE ADAM BEDE
Importing  /Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*Chapter\s+[IVXLCM]+\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 6688 ELIOT GEORGE THE MILL ON THE FLOSS
Importing  /Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE

In [14]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
145,1,0,0,0,"(Since, IN)",IN,Since,since
145,1,0,0,1,"(I, PRP)",PRP,I,i
145,1,0,0,2,"(can, MD)",MD,can,can
145,1,0,0,3,"(do, VB)",VB,do,do
145,1,0,0,4,"(no, DT)",DT,no,no
...,...,...,...,...,...,...,...,...
6688,58,69,0,2,"(death, NN)",NN,death,death
6688,58,69,0,3,"(they, PRP)",PRP,they,they
6688,58,69,0,4,"(were, VBD)",VBD,were,were
6688,58,69,0,5,"(not, RB)",RB,not,not


## Add columns to LIB

In [15]:
LIB['book_len'] = CORPUS.groupby('book_id').term_str.count()
LIB

Unnamed: 0_level_0,source_file_path,raw_title,chap_regex,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
145,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE MIDDLEMARCH,^[A-Z]+\s?[IVXLCM]+\.\s*$,317305
507,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE ADAM BEDE,^\s*Chapter\s+[IVXLCM]+\s*$,215404
6688,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.\s*$,207461


In [16]:
LIB['n_chaps'] = CORPUS.reset_index()[['book_id', 'chap_id']]\
    .drop_duplicates()\
    .groupby('book_id').chap_id.count()

In [17]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
145,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE MIDDLEMARCH,^[A-Z]+\s?[IVXLCM]+\.\s*$,317305,86
507,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE ADAM BEDE,^\s*Chapter\s+[IVXLCM]+\s*$,215404,55
6688,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,ELIOT GEORGE THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.\s*$,207461,58


## Extract vocab

### Look for weirdness

In [18]:
CORPUS[CORPUS.term_str == '']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
145,4,46,1,0,"(…, NN)",NN,…,
145,5,0,0,39,"(…, NNP)",NNP,…,
145,29,33,5,0,"():, VB)",VB,):,
145,38,55,4,8,"(…, NNP)",NNP,…,
145,42,9,2,17,"(&, CC)",CC,&,
145,58,21,3,0,"();, NN)",NN,);,
145,58,53,1,0,"();, IN)",IN,);,
145,71,3,1,0,"(;”, NNS)",NNS,;”,
145,86,59,0,26,"(&, CC)",CC,&,
507,21,11,2,68,"((&), NNP)",NNP,(&),


### Drop the weirdness

In [19]:
CORPUS = CORPUS[CORPUS.term_str != '']
CORPUS[CORPUS.term_str == '']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


### Chunk POS to first two letters to make POS group

In [20]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]
CORPUS.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CORPUS['pos_group'] = CORPUS.pos.str[:2]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
145,1,0,0,0,"(Since, IN)",IN,Since,since,IN
145,1,0,0,1,"(I, PRP)",PRP,I,i,PR
145,1,0,0,2,"(can, MD)",MD,can,can,MD
145,1,0,0,3,"(do, VB)",VB,do,do,VB
145,1,0,0,4,"(no, DT)",DT,no,no,DT


In [28]:
CORPUS.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6688,58,69,0,2,"(death, NN)",NN,death,death,NN
6688,58,69,0,3,"(they, PRP)",PRP,they,they,PR
6688,58,69,0,4,"(were, VBD)",VBD,were,were,VB
6688,58,69,0,5,"(not, RB)",RB,not,not,RB
6688,58,69,0,6,"(divided.”, JJ)",JJ,divided.”,divided,JJ


### Convert to VOCAB table format

In [21]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()    
VOCAB['p'] = VOCAB.n/VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,1e-06,19.497458
1790,1,4,1e-06,19.497458
1799,2,4,3e-06,18.497458
1801more,1,8,1e-06,19.497458
1807,1,4,1e-06,19.497458


## Annotate VOCAB

### Get max POS

In [22]:
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,1e-06,19.497458,CD
1790,1,4,1e-06,19.497458,CD
1799,2,4,3e-06,18.497458,CD
1801more,1,8,1e-06,19.497458,CD
1807,1,4,1e-06,19.497458,CD


In [23]:
VOCAB['max_pos_group'] = CORPUS[['term_str', 'pos_group']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,1e-06,19.497458,CD,CD
1790,1,4,1e-06,19.497458,CD,CD
1799,2,4,3e-06,18.497458,CD,CD
1801more,1,8,1e-06,19.497458,CD,CD
1807,1,4,1e-06,19.497458,CD,CD


### Compute POS ambiguity

In [24]:
VOCAB['n_pos'] = CORPUS[['term_str', 'pos']].value_counts().unstack().count(1)
VOCAB['cat_pos'] = CORPUS[['term_str', 'pos']].value_counts().to_frame('n')\
    .reset_index()\
    .groupby('term_str').pos\
    .apply(lambda x: set(x))
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,1e-06,19.497458,CD,CD,1,{CD}
1790,1,4,1e-06,19.497458,CD,CD,1,{CD}
1799,2,4,3e-06,18.497458,CD,CD,1,{CD}
1801more,1,8,1e-06,19.497458,CD,CD,1,{CD}
1807,1,4,1e-06,19.497458,CD,CD,1,{CD}


In [25]:
VOCAB.sort_values('n_pos', ascending=False).head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
you,7663,3,0.010353,6.593765,PRP,PR,20,"{RBR, CC, NNS, VB, NNP, RB, JJR, WP, VBP, FW, ..."
youre,217,5,0.000293,11.735907,NN,NN,18,"{NN, RBR, MD, RB, CC, NNS, VB, VBN, PRP$, JJR,..."
now,1573,3,0.002125,8.878155,RB,RB,18,"{NN, MD, RB, RP, CC, VBD, NNS, VB, VBN, PDT, I..."
em,350,2,0.000473,11.046247,NN,NN,18,"{NN, RP, RB, CD, VBD, NNS, VB, VBN, JJR, CC, V..."
wont,174,4,0.000235,12.054515,VBP,VB,18,"{NN, MD, RB, CC, VBD, NNS, VB, JJR, WP, VBN, V..."


In [26]:
VOCAB['n_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack().count(1)
VOCAB['cat_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos_group.apply(lambda x: set(x))

In [27]:
VOCAB.sort_values('n_pos', ascending=False).head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,n_pos_group,cat_pos_group
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
you,7663,3,0.010353,6.593765,PRP,PR,20,"{RBR, CC, NNS, VB, NNP, RB, JJR, WP, VBP, FW, ...",12,"{NN, RP, RB, PR, CC, CD, MD, VB, WP, IN, JJ, FW}"
youre,217,5,0.000293,11.735907,NN,NN,18,"{NN, RBR, MD, RB, CC, NNS, VB, VBN, PRP$, JJR,...",10,"{NN, MD, PR, RB, CC, VB, IN, JJ, PD, FW}"
now,1573,3,0.002125,8.878155,RB,RB,18,"{NN, MD, RB, RP, CC, VBD, NNS, VB, VBN, PDT, I...",12,"{NN, MD, RB, PR, CC, RP, VB, IN, JJ, DT, PD, FW}"
em,350,2,0.000473,11.046247,NN,NN,18,"{NN, RP, RB, CD, VBD, NNS, VB, VBN, JJR, CC, V...",10,"{NN, RP, RB, PR, CD, CC, VB, IN, JJ, PD}"
wont,174,4,0.000235,12.054515,VBP,VB,18,"{NN, MD, RB, CC, VBD, NNS, VB, JJR, WP, VBN, V...",10,"{NN, MD, RB, CC, VB, WP, IN, WR, JJ, WD}"


## Add stopwords

In [29]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [31]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,n_pos_group,cat_pos_group,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,1,1e-06,19.497458,CD,CD,1,{CD},1,{CD},0
1790,1,4,1e-06,19.497458,CD,CD,1,{CD},1,{CD},0
1799,2,4,3e-06,18.497458,CD,CD,1,{CD},1,{CD},0
1801more,1,8,1e-06,19.497458,CD,CD,1,{CD},1,{CD},0
1807,1,4,1e-06,19.497458,CD,CD,1,{CD},1,{CD},0


In [33]:
VOCAB[VOCAB.stop == 1].sample(10)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,n_pos_group,cat_pos_group,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
we,1331,2,0.001798,9.119163,PRP,PR,9,"{NN, RB, NNS, VB, VBN, IN, JJ, NNP, PRP}",6,"{NN, PR, RB, VB, IN, JJ}",1
ourselves,56,9,7.6e-05,13.690103,PRP,NN,9,"{NN, RP, MD, NNS, VB, IN, JJ, VBZ, PRP}",7,"{NN, MD, PR, RP, VB, IN, JJ}",1
once,445,4,0.000601,10.699797,RB,RB,10,"{NN, RP, RB, NNS, VB, JJR, IN, JJ, VBZ, VBP}",6,"{NN, RP, RB, VB, IN, JJ}",1
same,414,4,0.000559,10.803971,JJ,JJ,2,"{NN, JJ}",2,"{NN, JJ}",1
ma,9,2,1.2e-05,16.327533,NNP,NN,5,"{NN, VBD, VB, NNP, FW}",3,"{NN, VB, FW}",1
then,1159,4,0.001566,9.318793,RB,RB,14,"{NN, RP, RB, VBD, NNS, VB, VBN, PDT, IN, JJ, V...",8,"{NN, RP, RB, VB, IN, JJ, WD, PD}",1
they,1980,4,0.002675,8.546173,PRP,PR,9,"{NN, RB, NNS, VB, VBN, IN, JJ, NNP, PRP}",6,"{NN, PR, RB, VB, IN, JJ}",1
out,1780,3,0.002405,8.699797,RP,RP,13,"{NN, RP, RB, CD, NNS, VB, VBN, IN, JJ, VBZ, VB...",8,"{NN, RP, RB, PR, CD, VB, IN, JJ}",1
here,567,4,0.000766,10.350253,RB,RB,13,"{NN, RB, VBD, EX, VB, NNS, VBN, IN, JJ, VBZ, V...",7,"{NN, RB, PR, EX, VB, IN, JJ}",1
than,1751,4,0.002366,8.723495,IN,IN,2,"{IN, NNP}",2,"{IN, NN}",1


# DOES IT MAKE SENSE THAT THERE ARE DIFFERENT MAX_POS AND MAX_POS_GROUP? I THINK SO

## Add stems

In [38]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)

In [39]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,n_pos_group,cat_pos_group,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,1,0.000001,19.497458,CD,CD,1,{CD},1,{CD},0,1,1,1
1790,1,4,0.000001,19.497458,CD,CD,1,{CD},1,{CD},0,1790,1790,1790
1799,2,4,0.000003,18.497458,CD,CD,1,{CD},1,{CD},0,1799,1799,1799
1801more,1,8,0.000001,19.497458,CD,CD,1,{CD},1,{CD},0,1801more,1801more,1801more
1807,1,4,0.000001,19.497458,CD,CD,1,{CD},1,{CD},0,1807,1807,1807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
œdipus,2,6,0.000003,18.497458,NN,NN,1,{NN},1,{NN},0,œdipu,œdipus,œdip
μέγεθος,1,7,0.000001,19.497458,NNP,NN,1,{NNP},1,{NN},0,μέγεθος,μέγεθος,μέγεθος
τι,1,2,0.000001,19.497458,NNP,NN,1,{NNP},1,{NN},0,τι,τι,τι
ἀπέρωτος,1,8,0.000001,19.497458,JJ,JJ,1,{JJ},1,{JJ},0,ἀπέρωτος,ἀπέρωτος,ἀπέρωτος


## Question 1:
What regular expre

### Answer 1

In [56]:
LIB[LIB.raw_title == 'ELIOT GEORGE MIDDLEMARCH'].chap_regex.values[0]

'^[A-Z]+\\s?[IVXLCM]+\\.\\s*$'

## Question 2:
What is the title of the book that has the most tokens? 

### Answer 2

In [65]:
LIB.loc[LIB.book_len.idxmax()].raw_title

'ELIOT GEORGE MIDDLEMARCH'

## Question 3:
How many chapter level chunks are there in this novel?

### Answer 3:

In [67]:
LIB.loc[LIB.book_len.idxmax()].n_chaps

86

## Question 4:
Among the three stemming algorithms -- Porter, Lancaster, and Snowball --  which is the most aggressive, in terms of the number of words associated with each stem?

### Answer 4:

In [85]:
print(f"Number of unique Porter stems: {VOCAB.groupby('stem_porter').count().shape[0]}")
print(f"Number of unique Lancaster stems: {VOCAB.groupby('stem_lancaster').count().shape[0]}")
print(f"Number of unique Snowball stems: {VOCAB.groupby('stem_snowball').count().shape[0]}")

Number of unique Porter stems: 17540
Number of unique Lancaster stems: 14612
Number of unique Snowball stems: 17203


**Since Lancaster has the fewest rows after grouping by stems, it is the most aggressive.**

## Question 5:
Using the most aggressive stemmer from the previous question, what is the stem with the most associated terms?

### Answer 5:

In [105]:
VOCAB.stem_lancaster.value_counts().iloc[0:1]

stem_lancaster
cont    34
Name: count, dtype: int64