# M02 Homework

- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub:

In [3]:
import numpy as np
import pandas as pd
import configparser
config = configparser.ConfigParser()

In [4]:
# Load machine-specific paths from the shared env.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not found;
# fail here with a clear message instead of a later KeyError on 'data_home'.
env_file = "../../../env.ini"
if not config.read(env_file):
    raise FileNotFoundError(f"Could not read config file: {env_file}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [5]:
!ls -l {data_home}

total 4336
-rw-r--r--@ 1 Samantha  staff  1752309 Jan 27 17:42 austen-persuasion.csv
drwxr-xr-x@ 8 Samantha  staff      256 Jan 16 08:52 gutenberg
-rw-r--r--@ 1 Samantha  staff   465627 Jan 20 15:27 pg42324.txt


In [6]:
# Raw Project Gutenberg text to parse (read from the gutenberg/ subfolder of data_home)
text_file = f"{data_home}/gutenberg/pg161.txt"
# NOTE(review): csv_file is not written anywhere in the cells shown — confirm the export step exists.
csv_file  = f"{output_dir}/austen-sense.csv" # The file we will create

In [7]:
# Index level names for the text hierarchy, coarsest to finest; sliced
# (OHCO[:1], OHCO[:2], ...) to name the index at each stage of chunking.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

### Import file into a dataframe

In [9]:
# Read the raw text into a frame with one row per physical line.
# 'utf-8-sig' drops a leading byte-order mark if present; the context manager
# guarantees the file handle is closed as soon as the lines have been read.
with open(text_file, 'r', encoding='utf-8-sig') as text_handle:
    LINES = pd.DataFrame(text_handle.readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
# Replace newline characters and trim whitespace, so blank lines become empty strings
LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

In [10]:
LINES.sample(3)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12429,"revealed, and he was listened to with unexpect..."
8780,"suppose by my arguments, and Fanny's entreatie..."
7371,


### Extract title

In [12]:
# The first line of the file carries the title after a fixed Gutenberg prefix; strip the prefix
title = LINES.at[0, 'line_str'].replace('The Project Gutenberg EBook of ', '')

In [13]:
print(title)

Sense and Sensibility, by Jane Austen


### Get rid of front and back matter

In [15]:
# Regexes for the Project Gutenberg boilerplate markers that bracket the book text;
# each accepts either "THE" or "THIS" in the marker wording.
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",  # start-of-book marker
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"  # end-of-book marker
]

In [16]:
# Boolean masks over LINES flagging the START / END marker lines
# (str.match anchors the pattern at the beginning of each line).
pat_a = LINES.line_str.str.match(clip_pats[0])
pat_b = LINES.line_str.str.match(clip_pats[1])

In [17]:
# Body boundaries: one line after the first START marker and one line before
# the first END marker, so the marker lines themselves are excluded.
# index[0] raises IndexError if a marker is not found in the file.
# NOTE(review): the tail() output further down still shows a line beginning
# "End of the Project Gutenberg EBook of Sense an..." at 12665, i.e. one line of
# back matter survives this clip and is carried into the last chapter — confirm
# whether that is acceptable for the token counts.
line_a = LINES.loc[pat_a].index[0] + 1
line_b = LINES.loc[pat_b].index[0] - 1

In [18]:
line_a, line_b

(20, 12666)

In [19]:
# Clip to the body text (.loc label slices include both end points).
# .copy() gives LINES its own data so the later column assignments (chap_num)
# operate on an independent frame rather than on a slice of the original.
LINES = LINES.loc[line_a : line_b].copy()

In [20]:
LINES.head(5)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
20,
21,
22,
23,
24,


In [21]:
LINES.tail(5)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12662,
12663,
12664,
12665,End of the Project Gutenberg EBook of Sense an...
12666,


### Chunk by chapter

In [23]:
# A chapter heading line: optional leading whitespace, the word "chapter" or
# "letter", whitespace, then a number (matched case-insensitively where it is used).
chap_pat = r"^\s*(?:chapter|letter)\s+\d+"

In [24]:
chap_lines = LINES.line_str.str.match(chap_pat, case=False)

In [25]:
LINES.loc[chap_lines].head()

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
561,CHAPTER 4
756,CHAPTER 5


In [26]:
# Number the chapter heading rows consecutively from 1, in document order
LINES.loc[chap_lines, 'chap_num'] = list(range(1, int(chap_lines.sum()) + 1))

In [27]:
LINES.loc[chap_lines].head()

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,1.0
196,CHAPTER 2,2.0
399,CHAPTER 3,3.0
561,CHAPTER 4,4.0
756,CHAPTER 5,5.0


In [28]:
# Propagate each heading's chapter number down to the lines that follow it
LINES['chap_num'] = LINES['chap_num'].ffill()

In [29]:
LINES.sample(5)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
9678,"the moment, without any material loss of happi...",40.0
3698,"place that is so pretty I suppose.""",20.0
3135,"Edward started--""Reserved! Am I reserved, Mari...",17.0
6167,I have been told that you were asked to be of the,29.0
6688,"passionate violence--a reproach, however, so e...",31.0


In [30]:
LINES.head(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
20,,
21,,
22,,


In [31]:
# Keep only body text that belongs to a chapter:
#   1. drop the chapter heading rows themselves; their work is done. The mask is
#      applied first, while chap_lines still has exactly the same index as LINES,
#      instead of relying on index alignment after rows have been removed;
#   2. drop everything before Chapter 1 (chap_num is still NaN there);
#   3. convert chap_num from float to int now that no NaNs remain.
# Chaining returns a fresh frame rather than assigning into a filtered one.
LINES = (
    LINES
    .loc[~chap_lines]
    .dropna(subset=['chap_num'])
    .astype({'chap_num': 'int'})
)

In [32]:
LINES.sample(3)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
4216,,22
11829,"persuasion she had been unjust, inattentive, n...",47
5962,"thing, I can tell you, for it has been known a...",29


In [33]:
OHCO[:1]

['chap_num']

In [34]:
# Collapse each chapter's lines into one newline-delimited string
CHAPS = (
    LINES
    .groupby(OHCO[:1])['line_str']
    .apply('\n'.join)
    .to_frame('chap_str')
)

In [35]:
CHAPS.head(3)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...


In [36]:
# Trim leading/trailing whitespace: each chapter string starts with "\n\n"
# (see the head() output), which would otherwise yield an empty first
# paragraph when the chapter is split on blank lines.
CHAPS['chap_str'] = CHAPS.chap_str.str.strip()

### Split chapters into paragraphs

In [38]:
# Paragraph separator: two or more consecutive newlines (i.e. one or more blank lines)
para_pat = r'\n\n+'

In [39]:
# One row per paragraph: split every chapter on the paragraph separator, then
# stack the resulting columns into a two-level (chapter, paragraph) index.
PARAS = (
    CHAPS['chap_str']
    .str.split(para_pat, expand=True)
    .stack()
    .to_frame('para_str')
    .sort_index()
)
PARAS.index.names = OHCO[:2]

In [40]:
# Put each paragraph on a single line and trim surrounding whitespace
PARAS['para_str'] = (
    PARAS['para_str']
    .str.replace(r'\n', ' ', regex=True)
    .str.strip()
)
# Discard paragraphs that are blank after cleaning
PARAS = PARAS.loc[lambda frame: ~frame['para_str'].str.match(r'^\s*$')]

PARAS.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."


### Split paragraphs into sentences

In [42]:
# Sentence boundary: any run of . ? ! ; or :
sent_pat = r'[.?!;:]+'
# One row per sentence fragment, indexed by (chapter, paragraph, sentence)
SENTS = (
    PARAS['para_str']
    .str.split(sent_pat, expand=True)
    .stack()
    .to_frame('sent_str')
)
SENTS.index.names = OHCO[:3]

In [43]:
# Remove empty sentences (fragments that are blank or whitespace-only).
# .copy() makes the filtered frame independent, so the assignment below writes
# to SENTS itself rather than to a filtered view of the previous frame.
# NOTE(review): fragments consisting only of a quotation mark (visible in the
# sample output below) are not blank and are therefore kept.
SENTS = SENTS.loc[~SENTS['sent_str'].str.match(r'^\s*$')].copy()
SENTS['sent_str'] = SENTS['sent_str'].str.strip() # CRUCIAL TO REMOVE BLANK TOKENS

In [44]:
SENTS.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
21,24,2,""""
30,29,6,""""
28,2,1,"She soon caught his eye, and he immediately bo..."


### Split sentences into tokens

In [46]:
# Token separator: any run of whitespace, apostrophes, commas or hyphens
token_pat = r"[\s',-]+"
# One row per token; the split columns are stacked into a fourth index level
TOKENS = (
    SENTS['sent_str']
    .str.split(token_pat, expand=True)
    .stack()
    .to_frame('token_str')
)

In [47]:
TOKENS.index.names = OHCO[:4]

In [48]:
TOKENS.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
3,5,10,3,mean
23,6,8,27,and
16,22,0,7,great


### Import persuasion file

In [50]:
# Load the Persuasion token table from data_home; per the head() output below it
# has one row per token with the OHCO columns plus token_str and term_str.
pers = f"{data_home}/austen-persuasion.csv"
persuasion_tokens = pd.read_csv(pers)

In [51]:
persuasion_tokens['book_num'] = 1
persuasion_tokens.head(3)

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str,book_num
0,1,0,0,0,Sir,sir,1
1,1,0,0,1,Walter,walter,1
2,1,0,0,2,Elliot,elliot,1


In [52]:
# Index Persuasion by book number followed by the shared OHCO levels
persuasion_tokens = persuasion_tokens.set_index(['book_num', *OHCO])
persuasion_tokens.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
1,1,0,0,2,Elliot,elliot


### Combine into one dataframe of tokens

In [102]:
TOKENS['book_num'] = 2

In [104]:
TOKENS = TOKENS.set_index("book_num", append=True)

In [108]:
# Move book_num to the front so the index levels line up with persuasion_tokens
TOKENS = TOKENS.reorder_levels(['book_num', *OHCO])

In [112]:
TOKENS.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
2,1,0,0,0,The
2,1,0,0,1,family
2,1,0,0,2,of


In [148]:
combined = pd.concat([persuasion_tokens, TOKENS])
combined.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
2,47,0,2,10,his,
1,5,15,2,6,have,have
1,5,19,0,25,appearance,appearance
2,22,10,0,0,"""I",
2,31,1,4,18,seclude,
1,12,28,9,6,rest,rest
2,39,0,4,47,longer,
2,24,36,0,1,not,
2,6,0,2,0,It,
1,21,57,0,3,replied,replied


In [150]:
# Going to reset the term_str column before extracting a vocabulary

combined = combined.drop(columns=['term_str'])

In [152]:
# Normalize tokens into terms: delete runs of non-word characters and
# underscores, then lowercase.
# NOTE(review): a token made up entirely of punctuation (e.g. a lone quotation
# mark) normalizes to the empty string, which would then be counted as a term
# in VOCAB — confirm this is intended.
combined['term_str'] = (
    combined['token_str']
    .replace(r'[\W_]+', '', regex=True)
    .str.lower()
)
# Term frequency table: one row per distinct term with its count in column 'n'
VOCAB = (
    combined['term_str']
    .value_counts()
    .to_frame('n')
    .reset_index()
    .rename(columns={'index':'term_str'})
)
VOCAB.index.name = 'term_id'

In [154]:
# Record each term's character count, then key the vocabulary by the term itself
VOCAB = (
    VOCAB
    .assign(length=VOCAB['term_str'].str.len())
    .set_index('term_str')
)

In [159]:
VOCAB.sample(3)

Unnamed: 0_level_0,n,length
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
protege,2,7
highest,13,7
lasted,8,6


## QUESTIONS

#### 1. How many raw tokens are in the combined data frame?

In [166]:
len(combined)

207896

There are 207,896 raw tokens in the combined dataframe. 

#### 2. How many distinct terms are there in the combined data frame (i.e. how big is the vocabulary)?

In [172]:
len(VOCAB)

8239

There are 8,239 distinct terms in the combined dataframe. 

#### 3. How many more terms does the vocabulary of Sense and Sensibility have than that of Persuasion?

In [186]:
combined.groupby('book_num')['term_str'].nunique()

book_num
1    5760
2    6280
Name: term_str, dtype: int64

In [191]:
# Derive the difference from the data instead of retyping the two counts shown
# above, so the answer stays correct if the tokenization changes.
vocab_sizes = combined.groupby('book_num')['term_str'].nunique()
int(vocab_sizes.loc[2] - vocab_sizes.loc[1])  # Sense and Sensibility (book 2) minus Persuasion (book 1)

520

The vocabulary of Sense and Sensibility has 520 more terms than that of Persuasion. 

#### 4. What is the average number of tokens, rounded to an integer, per chapter in the corpus?

In [198]:
combined.groupby(['book_num','chap_num']).size().mean()

2809.4054054054054

The average number of tokens per chapter in the corpus is approximately 2,809.

#### 5. What is the average number of tokens, rounded to an integer, per paragraph in the corpus?

In [201]:
combined.groupby(['book_num','chap_num','para_num']).size().mean()

73.74813763746009

The average number of tokens per paragraph in the corpus is approximately 74.