# M04 Homework
- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub: https://github.com/sqr8ap/DS5001-2025-01-R/blob/m04/lessons/M04_NLP/M04_HW.ipynb

In [3]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px
import configparser

In [4]:
# Read machine-specific paths from the shared env.ini.
# ConfigParser.read() silently skips files it cannot open, so check its
# return value instead of failing later with an opaque KeyError.
env_file = "../../../env.ini"
config = configparser.ConfigParser()
if not config.read(env_file):
    raise FileNotFoundError(f"Could not read configuration file: {env_file}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [5]:
# Make the course's local library directory (from env.ini) importable,
# then load the TextParser class from it.
import sys
sys.path.append(local_lib)
from textparser import TextParser

In [6]:
# Directory holding the Gutenberg plain-text files for this assignment
source_files = f'{data_home}/gutenberg/eliot-set'
# Short label for this collection (not referenced again in the visible cells)
data_prefix = 'eliot'

In [7]:
# Intended OHCO index levels, from book down to token.
# NOTE(review): the parsed corpus labels the chapter level 'chap_id' (see the
# n_chaps cell), and tokenize_collection uses the parser's own OHCO list, so
# this constant is not what indexes CORPUS -- confirm before relying on it.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [8]:
# Regexes matching the Project Gutenberg "*** START OF" / "*** END OF" marker lines
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# Chapter-heading regex for each book id. Every pattern is later passed to the
# parser as a 'chap'-level milestone ('m') pattern.
roman = '[IVXLCM]+'
# Character class for all-caps headings; not referenced by the patterns below
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (507,   rf"^\s*Chapter\s+{roman}\s*$"),
    (145,   rf"^\s*CHAPTER\s+{roman}\s*\.$"),
    (6688,   rf"^\s*Chapter\s+{roman}\.\s*$")
]

In [9]:
# Collect every file with an extension in the source directory, in sorted order
file_glob = f"{source_files}/*.*"
source_file_list = sorted(glob(file_glob))
source_file_list

['/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt',
 '/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt',
 '/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt']

In [10]:
def _parse_source_path(path):
    """Split a '<AUTHOR_TITLE>-pg<ID>.<ext>' file path into LIB fields.

    Returns a (book_id, path, raw_title) tuple. The file name is isolated on
    either '/' or '\\' so the parse also works with Windows separators, and
    the title is everything before the final '-pg' marker, so a hyphen inside
    a title is not truncated.
    """
    file_name = re.split(r'[\\/]', path)[-1]
    stem = file_name.rsplit('.', 1)[0]
    raw_title, _, pg_number = stem.rpartition('-pg')
    return int(pg_number), path, raw_title.replace('_', ' ')


# One (book_id, source_file_path, raw_title) tuple per source file
book_data = []
for source_file_path in source_file_list:
    book_data.append(_parse_source_path(source_file_path))

In [11]:
# Inspect the (book_id, source_file_path, raw_title) tuples
book_data

[(507,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt',
  'ELIOT GEORGE ADAM BEDE'),
 (145,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt',
  'ELIOT GEORGE MIDDLEMARCH'),
 (6688,
  '/Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt',
  'ELIOT GEORGE THE MILL ON THE FLOSS')]

In [12]:
# Library table: one row per book, keyed and ordered by Gutenberg book id
lib_columns = ['book_id', 'source_file_path', 'raw_title']
LIB = (
    pd.DataFrame(book_data, columns=lib_columns)
    .set_index('book_id')
    .sort_index()
)

In [13]:
# Attach each book's chapter-heading regex by looking up its book id
chap_regex_by_id = dict(ohco_pat_list)
LIB['chap_regex'] = LIB.index.map(chap_regex_by_id)

In [14]:
# Tokenize corpus

def tokenize_collection(LIB, clip_pats=None):
    """Parse every book in LIB with TextParser and stack the token tables.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book id, with the columns
        `source_file_path`, `raw_title` and `chap_regex`.
    clip_pats : list of str, optional
        Regexes marking the start and end of the text to keep. Defaults to
        the Project Gutenberg "*** START OF" / "*** END OF" markers.

    Returns
    -------
    pandas.DataFrame
        The parser's TOKENS tables for all books, concatenated and sorted,
        indexed by `book_id` followed by the parser's OHCO levels.

    Raises
    ------
    ValueError
        If no book could be tokenized (empty LIB, or every book failed).

    Notes
    -----
    A book whose parsing raises is reported and skipped rather than aborting
    the whole run, so the result may cover fewer books than LIB.
    """
    if clip_pats is None:
        clip_pats = [
            r"\*\*\*\s*START OF",
            r"\*\*\*\s*END OF"
        ]

    books = []
    skipped = []
    for book_id in LIB.index:

        # Look the row up once instead of once per field
        book = LIB.loc[book_id]

        # Announce
        print("Tokenizing", book_id, book.raw_title)

        # Define vars: a single chapter-level milestone pattern per book
        ohco_pats = [('chap', book.chap_regex, 'm')]
        src_file_path = book.source_file_path

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse; keep going with the remaining books if this one fails
        try:
            text.import_source().parse_tokens()
        except Exception as e:
            print(f"Error parsing book {book_id}: {e}")  # Debugging print
            skipped.append(book_id)
            continue

        ### Debug: Print the TOKENS structure after parsing
        print(f"Tokens for {book_id}:", text.TOKENS.head())

        # Name things: prepend book_id to the parser's own index levels
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)

    if skipped:
        print("Skipped books:", skipped)

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say what happened
    if not books:
        raise ValueError("No books were tokenized: LIB is empty or every book failed to parse")

    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    print("Done")

    return CORPUS

In [15]:
# Confirm the chapter regex stored for Middlemarch (book id 145)
LIB.at[145, 'chap_regex']

'^\\s*CHAPTER\\s+[IVXLCM]+\\s*\\.$'

In [16]:
# Fetch the NLTK resources used below (POS tagger model, stopword list, tagset help).
# The final call is the cell's last expression, so its return value is displayed.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Samantha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Samantha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/Samantha/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [17]:
# Parse every book listed in LIB into one token table (progress is printed per book)
CORPUS = tokenize_collection(LIB)

Tokenizing 145 ELIOT GEORGE MIDDLEMARCH
Importing  /Users/Samantha/Desktop/MSDS/DS5001/data/gutenberg/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s+[IVXLCM]+\s*\.$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokens for 145:                                        pos_tuple  pos token_str term_str
chap_id para_num sent_num token_num                                     
1       0        0        0          (Since, IN)   IN     Since    since
                          1             (I, PRP)  PRP         I        i
                          2            (can, MD)   MD       can      can
                          3             (do, VB)   VB        do       do
                          4             (no, DT)   DT        no       no
Tokenizing 507 ELIOT GEORGE ADAM BEDE
Importing 

In [18]:
# Tokens per book: count the term_str entries under each book_id index level
tokens_per_book = CORPUS['term_str'].groupby(level='book_id').count()
LIB['book_len'] = tokens_per_book

In [19]:
# Chapters per book: number of distinct chap_id values under each book_id
chapter_keys = CORPUS.index.to_frame(index=False)[['book_id', 'chap_id']]
LIB['n_chaps'] = chapter_keys.groupby('book_id')['chap_id'].nunique()

### FINAL LIB TABLE:

In [21]:
# Show LIB transposed so each book is a column and the wide fields stay readable
LIB.T

book_id,145,507,6688
source_file_path,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...
raw_title,ELIOT GEORGE MIDDLEMARCH,ELIOT GEORGE ADAM BEDE,ELIOT GEORGE THE MILL ON THE FLOSS
chap_regex,^\s*CHAPTER\s+[IVXLCM]+\s*\.$,^\s*Chapter\s+[IVXLCM]+\s*$,^\s*Chapter\s+[IVXLCM]+\.\s*$
book_len,317305,215404,207461
n_chaps,86,55,58


In [22]:
# Spot-check five random token rows (unseeded, so the rows differ on every run)
CORPUS.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
507,2,1,0,24,"(little, JJ)",JJ,little,little
145,17,23,0,5,"(generally, RB)",RB,generally,generally
145,46,44,1,45,"(about;, NNS)",NNS,about;,about
145,64,18,0,43,"(to, TO)",TO,to,to
145,48,16,2,47,"(with, IN)",IN,with,with


In [23]:
# Which raw tokens normalised to an empty term, and how often?
blank_term = CORPUS['term_str'].eq('')
CORPUS.loc[blank_term, 'token_str'].value_counts()

token_str
&      10
…       3
);      2
),      2
):      1
;”      1
(&)     1
Name: count, dtype: int64

In [24]:
# Drop tokens whose normalised term is empty. Take an explicit copy so that
# adding columns to CORPUS afterwards does not raise SettingWithCopyWarning.
CORPUS = CORPUS[CORPUS.term_str != ''].copy()

In [25]:
# Coarse POS group = first two characters of the tag (e.g. NNP -> NN).
# assign() builds a new frame, so this is safe even though CORPUS was produced
# by boolean filtering (in-place assignment there raises SettingWithCopyWarning).
CORPUS = CORPUS.assign(pos_group=CORPUS.pos.str[:2])

### FINAL CORPUS TABLE:

In [27]:
# Five random tokens, transposed so each token is a column (unseeded sample)
CORPUS.sample(5).T

book_id,145,6688,507,6688,6688
chap_id,62,5,15,52,13
para_num,4,73,11,61,9
sent_num,0,1,13,12,0
token_num,0,19,39,44,33
pos_tuple,"(If, IN)","(although, IN)","(for, IN)","(approached,, NN)","(Tulliver,, NNP)"
pos,IN,IN,IN,NN,NNP
token_str,If,although,for,"approached,","Tulliver,"
term_str,if,although,for,approached,tulliver
pos_group,IN,IN,IN,NN,NN


In [28]:
# Vocabulary table: one row per distinct term, in alphabetical order
term_counts = CORPUS['term_str'].value_counts()
VOCAB = term_counts.to_frame('n').sort_index().rename_axis('term_str')

# Term length, relative frequency, and self-information in bits
VOCAB = VOCAB.assign(
    n_chars=VOCAB.index.str.len(),
    p=VOCAB['n'] / VOCAB['n'].sum(),
)
VOCAB['i'] = -np.log2(VOCAB['p'])

In [29]:
# Spot-check five random vocabulary rows (unseeded sample)
VOCAB.sample(5)

Unnamed: 0_level_0,n,n_chars,p,i
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cataract,1,8,1e-06,19.497458
tackled,1,7,1e-06,19.497458
protective,1,10,1e-06,19.497458
starchy,1,7,1e-06,19.497458
zest,6,4,8e-06,16.912496


In [30]:
# Stopword lookup table: index = NLTK English stopwords, single flag column set to 1
stopword_index = pd.Index(nltk.corpus.stopwords.words('english'), name='term_str')
sw = pd.DataFrame({'dummy': 1}, index=stopword_index)

In [31]:
# Flag stopwords: 1 where the term is in the stopword table, 0 otherwise
stop_flag = sw['dummy'].reindex(VOCAB.index, fill_value=0)
VOCAB['stop'] = stop_flag.astype('int')

In [32]:
# Ten random vocabulary rows that were flagged as stopwords
VOCAB.query('stop == 1').sample(10)

Unnamed: 0_level_0,n,n_chars,p,i,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
each,294,4,0.000397,11.297786,1
where,684,5,0.000924,10.079606,1
after,844,5,0.00114,9.776359,1
herself,520,7,0.000703,10.47509,1
itself,180,6,0.000243,12.005605,1
having,387,6,0.000523,10.901268,1
all,2605,3,0.00352,8.15039,1
yourself,158,8,0.000213,12.193677,1
in,11491,2,0.015525,6.009241,1
during,71,6,9.6e-05,13.347711,1


In [33]:
from nltk.stem.porter import PorterStemmer

# Porter stem of every vocabulary term (the term is the row label)
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = [stemmer1.stem(term) for term in VOCAB.index]

In [34]:
def _most_frequent_tag(tag_col):
    """Return, for each term, the value of `tag_col` it is tagged with most often."""
    tag_counts = CORPUS[['term_str', tag_col]].value_counts()
    return tag_counts.unstack(fill_value=0).idxmax(1)


VOCAB['max_pos'] = _most_frequent_tag('pos')
VOCAB['max_pos_group'] = _most_frequent_tag('pos_group')

In [35]:
# Per term: how many distinct POS groups it appears with, and which ones
pos_group_pairs = CORPUS[['term_str', 'pos_group']].value_counts()
VOCAB['n_pos_group'] = pos_group_pairs.unstack().count(1)
VOCAB['cat_pos_group'] = (
    pos_group_pairs.to_frame('n')
    .reset_index()
    .groupby('term_str')['pos_group']
    .apply(set)
)

In [36]:
# Per term: how many distinct full POS tags it appears with, and which ones
pos_pairs = CORPUS[['term_str', 'pos']].value_counts()
VOCAB['n_pos'] = pos_pairs.unstack().count(1)
VOCAB['cat_pos'] = (
    pos_pairs.to_frame('n')
    .reset_index()
    .groupby('term_str')['pos']
    .apply(set)
)

### FINAL VOCAB TABLE:

In [38]:
# Five random vocabulary terms, transposed so each term is a column (unseeded sample)
VOCAB.sample(5).T

term_str,close,champagne,shying,fellowmen,beliefs
n,186,1,1,1,5
n_chars,5,9,6,9,7
p,0.000251,0.000001,0.000001,0.000001,0.000007
i,11.958299,19.497458,19.497458,19.497458,17.17553
stop,0,0,0,0,0
stem_porter,close,champagn,shi,fellowmen,belief
max_pos,JJ,NN,JJ,NN,NNS
max_pos_group,JJ,NN,JJ,NN,NN
n_pos_group,4,1,1,1,1
cat_pos_group,"{JJ, RB, VB, NN}",{NN},{JJ},{NN},{NN}


## Questions

#### 1. What regular expression did you use to chunk _Middlemarch_ into chapters?

In [41]:
# Chapter regex stored for Middlemarch, selected by its raw title
middlemarch_row = LIB.query("raw_title == 'ELIOT GEORGE MIDDLEMARCH'")
middlemarch_row['chap_regex'].iloc[0]

'^\\s*CHAPTER\\s+[IVXLCM]+\\s*\\.$'

#### 2. What is the title of the book that has the most tokens? 

In [43]:
# Title of the book with the most tokens. idxmax() ignores missing lengths,
# and joining everything after the two author words ("ELIOT GEORGE") keeps
# multi-word titles whole instead of returning only their first word.
longest_raw_title = LIB.loc[LIB['book_len'].idxmax(), 'raw_title']
' '.join(longest_raw_title.split()[2:])

'MIDDLEMARCH'

#### 3. How many chapter level chunks are there in this novel?

In [45]:
# Number of chapter-level chunks found in Middlemarch
middlemarch_chapters = LIB.query("raw_title == 'ELIOT GEORGE MIDDLEMARCH'")['n_chaps']
middlemarch_chapters.iloc[0]

86

#### 4. Among the three stemming algorithms -- Porter, Lancaster, and Snowball --  which is the most aggressive, in terms of the number of words associated with each stem?

In [47]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

stemmer2 = SnowballStemmer("english")
stemmer3 = LancasterStemmer()

# Add one stem column per additional stemmer, stemming each vocabulary term
for stem_col, stemmer in (('stem_snowball', stemmer2), ('stem_lancaster', stemmer3)):
    VOCAB[stem_col] = [stemmer.stem(term) for term in VOCAB.index]

In [48]:
# Number of vocabulary terms that collapse onto each stem, per stemmer
porter = VOCAB.groupby('stem_porter').size().reset_index(name='porter_count')
snowball = VOCAB.groupby('stem_snowball').size().reset_index(name='snowball_count')
lancaster = VOCAB.groupby('stem_lancaster').size().reset_index(name='lancaster_count')

# Merge the counts together (outer joins keep stems that only one stemmer produces)
counts = porter.merge(snowball, left_on='stem_porter', right_on='stem_snowball', how='outer')\
                            .merge(lancaster, left_on='stem_porter', right_on='stem_lancaster', how='outer')

# A stem a given stemmer never produced has zero terms. Fill only the count
# columns so the string stem columns are not overwritten with the number 0.
count_cols = ['porter_count', 'snowball_count', 'lancaster_count']
counts[count_cols] = counts[count_cols].fillna(0)

In [49]:
# Largest number of terms sharing a single stem, per stemmer
stemmer_columns = [
    ('Porter', 'porter_count'),
    ('Snowball', 'snowball_count'),
    ('Lancaster', 'lancaster_count'),
]
for label, count_col in stemmer_columns:
    print(f"{label}: {counts[count_col].max()}")

Porter: 11.0
Snowball: 11.0
Lancaster: 34.0


In [50]:
# Mean terms per stem, per stemmer, over the stems that stemmer actually produced
for label, count_col in (
    ('Porter', 'porter_count'),
    ('Snowball', 'snowball_count'),
    ('Lancaster', 'lancaster_count'),
):
    produced = counts[count_col] > 0
    print(f"{label}: {counts.loc[produced, count_col].mean()}")

Porter: 1.501539338654504
Snowball: 1.5309539033889439
Lancaster: 1.8024226663016698


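The Lancaster stemming algorithm is the most aggressive: it maps the most terms onto each stem, both on average (about 1.80 terms per stem, versus 1.50 for Porter and 1.53 for Snowball) and at the extreme (34 terms on its largest stem, versus 11 for the other two).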

#### 5. Using the most aggressive stemmer from the previous question, what is the stem with the most associated terms?

In [53]:
# Lancaster stem with the largest number of associated terms
top_row = counts['lancaster_count'].idxmax()
counts.at[top_row, 'stem_lancaster']

'cont'