# Homework 2

```yaml
Course:   DS 5001 
Module:   02 Text Models
Topic:    Text into Data Challenge
Author:   Ryan Lipps
Date:     14 October 2022 (revised)
```

In [1]:
import pandas as pd

### Import Config

In [2]:
import configparser

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check it to fail with a clear message here
# rather than with a KeyError on the lookups below.
if not config.read("../../../env.ini"):
    raise FileNotFoundError("Could not read config file '../../../env.ini'")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [3]:
# Source text lives under the configured data folder; the CSV path points into
# the configured output folder.
text_file = f"{data_home}/gutenberg/pg161.txt"
csv_file = f"{output_dir}/austen-sense-and-sensibility.csv" # The file we will create

In [4]:
# Index level names, ordered from the coarsest unit (chapter) to the finest
# (token); slices OHCO[:k] are used below to name the index of each table.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

## Import file into a dataframe

In [5]:
# Read the text one line per row. The context manager guarantees the file
# handle is closed; 'utf-8-sig' drops a leading byte-order mark if present.
with open(text_file, 'r', encoding='utf-8-sig') as text_handle:
    LINES = pd.DataFrame(text_handle.readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
# Remove newline characters, then trim surrounding whitespace from each line.
LINES.line_str = LINES.line_str.str.replace(r'\n', '', regex=True).str.strip()

In [6]:
LINES.sample(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
10086,openness and heartiness of her manner more tha...
8768,
8092,"""I was engaged elsewhere."""
930,house and garden in which theirs might at pres...
2868,"dislike of Edward; and it ended, as every feel..."
9273,"himself an escape from it;--and if so, she had..."
1945,pressed to say something more.
11769,
10904,letter-writing?--delicate--tender--truly femin...
7791,


## Extract Title 

In [7]:
# The first line of the file carries the title after a fixed Gutenberg prefix.
first_line = LINES.at[0, 'line_str']
title = first_line.replace('The Project Gutenberg EBook of ', '')
title

'Sense and Sensibility, by Jane Austen'

## Clip Cruft

In [8]:
# Regexes for the marker lines that bracket the body of the book:
# [0] matches the "*** START OF THE/THIS PROJECT" line, [1] the matching END line.
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
]

In [9]:
# Boolean masks over LINES: pat_a flags the START marker line, pat_b the END marker line.
pat_a, pat_b = (LINES['line_str'].str.match(marker) for marker in clip_pats)

In [10]:
start_hits = LINES.loc[pat_a].index
end_hits = LINES.loc[pat_b].index
# Fail with an explanatory message instead of a bare IndexError when the
# marker regexes do not match this particular source file.
if start_hits.empty or end_hits.empty:
    raise ValueError("START/END marker line not found; check clip_pats against the source text")
line_a = start_hits[0] + 1   # first line after the START marker
line_b = end_hits[0] - 1     # last line before the END marker

In [11]:
# Keep only the lines between the markers (label-based, both ends inclusive).
# .copy() makes LINES an independent frame so that later column assignments
# do not raise SettingWithCopyWarning.
LINES = LINES.loc[line_a:line_b].copy()

In [12]:
LINES.head(30)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,


In [13]:
LINES.tail(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12647,which strong family affection would naturally ...
12648,merits and the happiness of Elinor and Mariann...
12649,"as the least considerable, that though sisters..."
12650,"within sight of each other, they could live wi..."
12651,"between themselves, or producing coolness betw..."
12652,
12653,
12654,
12655,THE END
12656,


## Chunk by chapter

### Find all chapter headers

The regex will depend on the source text. You need to investigate the source text to figure this out.

In [14]:
# Heading line: optional leading whitespace, the word "chapter" or "letter",
# whitespace, then one or more digits.
chap_pat = r"^\s*(?:chapter|letter)\s+\d+"

In [15]:
# Boolean mask of heading lines; case is ignored when matching.
chap_lines = LINES['line_str'].str.match(chap_pat, case=False)

In [16]:
# Change to title case for formatting consistency
#LINES.loc[chap_lines,'line_str'] = LINES.loc[chap_lines,'line_str'].apply(lambda x: x.title())

In [17]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
561,CHAPTER 4
756,CHAPTER 5
858,CHAPTER 6
986,CHAPTER 7
1112,CHAPTER 8
1244,CHAPTER 9
1448,CHAPTER 10


### Assign numbers to chapters

In [18]:
# Number the heading lines 1..N in order of appearance.
n_chaps = LINES.loc[chap_lines].shape[0]
LINES.loc[chap_lines, 'chap_num'] = list(range(1, n_chaps + 1))
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,1.0
196,CHAPTER 2,2.0
399,CHAPTER 3,3.0
561,CHAPTER 4,4.0
756,CHAPTER 5,5.0
858,CHAPTER 6,6.0
986,CHAPTER 7,7.0
1112,CHAPTER 8,8.0
1244,CHAPTER 9,9.0
1448,CHAPTER 10,10.0


### Forward-fill chapter numbers to following text lines

`ffill()` will replace null values with the previous non-null value.

In [19]:
# Carry each heading's number down to the body lines that follow it.
LINES['chap_num'] = LINES['chap_num'].ffill()

### Clean up

In [20]:
LINES = (
    LINES
    .dropna(subset=['chap_num'])      # drop everything before chapter 1
    .loc[~chap_lines]                 # drop the chapter heading lines themselves
    .astype({'chap_num': 'int'})      # chapter numbers as integers
)

In [21]:
LINES.head()

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
43,,1
44,,1
45,The family of Dashwood had long been settled i...,1
46,"was large, and their residence was at Norland ...",1
47,"their property, where, for many generations, t...",1


### Group lines into chapters

In [22]:
OHCO[:1]

['chap_num']

In [23]:
# Collapse every chapter's lines into a single newline-joined string.
lines_by_chapter = LINES.groupby(OHCO[:1])['line_str']
CHAPS = lines_by_chapter.agg('\n'.join).to_frame('chap_str')

In [24]:
CHAPS.head(5)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."


In [25]:
# Trim the leading/trailing whitespace left over from the joined lines.
CHAPS = CHAPS.assign(chap_str=CHAPS['chap_str'].str.strip())

In [26]:
CHAPS.head()

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,The family of Dashwood had long been settled i...
2,Mrs. John Dashwood now installed herself mistr...
3,Mrs. Dashwood remained at Norland several mont...
4,"""What a pity it is, Elinor,"" said Marianne, ""t..."
5,"No sooner was her answer dispatched, than Mrs...."


## Split chapters into paragraphs 

We use pandas' convenient `.str.split()` method with `expand=True`, which puts each piece in its own column, followed by `.stack()`, which turns those columns into an inner index level.
Note that this creates zero-based indexes.

In [27]:
# Paragraph break: two or more consecutive newline characters.
para_pat = r'\n\n+'

In [28]:
# One column per paragraph after the split; stack() turns those columns into
# an inner index level (the paragraph number) and drops the empty cells.
para_columns = CHAPS['chap_str'].str.split(para_pat, expand=True)
PARAS = para_columns.stack().to_frame('para_str').sort_index()
PARAS.index.names = OHCO[:2]

In [29]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


In [30]:
# Unwrap hard line breaks inside each paragraph, then trim.
unwrapped = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
PARAS['para_str'] = unwrapped.str.strip()
# Discard paragraphs that are empty or whitespace-only.
blank_paras = PARAS['para_str'].str.match(r'^\s*$')
PARAS = PARAS[~blank_paras]

In [31]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


## Split paragraphs into sentences

In [32]:
# Sentence boundary: any run of terminal or clause punctuation.
sent_pat = r'[.?!;:]+'
sent_columns = PARAS['para_str'].str.split(sent_pat, expand=True)
SENTS = sent_columns.stack().to_frame('sent_str')
SENTS.index.names = OHCO[:3]

In [33]:
# Remove empty / whitespace-only sentences. .copy() gives an independent frame
# so the column assignment below does not raise SettingWithCopyWarning.
SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')].copy()
# Trim each sentence: leading/trailing whitespace would otherwise turn into
# blank tokens when the sentence is split on whitespace.
SENTS['sent_str'] = SENTS['sent_str'].str.strip()
SENTS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,The family of Dashwood had long been settled i...
1,0,1,"Their estate was large, and their residence wa..."
1,0,2,The late owner of this estate was a single man...
1,0,3,"But her death, which happened ten years before..."
1,0,4,"for to supply her loss, he invited and receive..."


## Split sentences into tokens

In [34]:
# Token boundary: any run of whitespace, apostrophes, commas or hyphens.
token_pat = r"[\s',-]+"
token_columns = SENTS['sent_str'].str.split(token_pat, expand=True)
TOKENS = token_columns.stack().to_frame('token_str')

In [35]:
# Label all four index levels with the OHCO names.
TOKENS.index = TOKENS.index.set_names(OHCO[:4])

In [36]:
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,The
1,0,0,1,family
1,0,0,2,of
1,0,0,3,Dashwood
1,0,0,4,had
...,...,...,...,...
50,22,0,8,and
50,22,0,9,Sensibility
50,22,0,10,by
50,22,0,11,Jane


## Extract Vocabulary

In [37]:
# Term = token with every non-word character and underscore removed, lowercased.
bare_tokens = TOKENS['token_str'].replace(r'[\W_]+', '', regex=True)
TOKENS['term_str'] = bare_tokens.str.lower()

In [38]:
# Frequency table of terms, most frequent first, with columns term_str and n.
term_counts = TOKENS['term_str'].value_counts()
VOCAB = term_counts.rename_axis('term_str').reset_index(name='n')
VOCAB.index.name = 'term_id'

In [39]:
VOCAB

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,to,4115
1,the,4105
2,of,3574
3,and,3490
4,her,2543
...,...,...
6275,prefer,1
6276,dissolving,1
6277,beset,1
6278,effectually,1


## Gathering by Content Object

In [40]:
def gather(ohco_level, tokens=None, ohco=None):
    """Re-assemble tokens into one space-joined string per content object.

    Parameters
    ----------
    ohco_level : int
        1-based depth into the OHCO list: 1 groups by the first level name,
        2 by the first two, and so on.
    tokens : pandas.DataFrame, optional
        Token table with a ``token_str`` column that can be grouped by the
        OHCO level names. Defaults to the notebook-level ``TOKENS``.
    ohco : list of str, optional
        Ordered level names. Defaults to the notebook-level ``OHCO``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with a single column named ``<level>_str`` where
        ``<level>`` is the part of the deepest level name before the first
        underscore (e.g. ``chap_num`` -> ``chap_str``).

    Raises
    ------
    ValueError
        If ``ohco_level`` is not between 1 and ``len(ohco)``.
    """
    # Only read access is needed, so no `global` declaration is required.
    token_table = TOKENS if tokens is None else tokens
    level_names = OHCO if ohco is None else ohco
    if not 1 <= ohco_level <= len(level_names):
        raise ValueError(
            f"ohco_level must be between 1 and {len(level_names)}, got {ohco_level!r}"
        )
    level_name = level_names[ohco_level - 1].split('_')[0]
    return (
        token_table.groupby(level_names[:ohco_level])
        .token_str.apply(lambda x: x.str.cat(sep=' '))
        .to_frame(f"{level_name}_str")
    )

In [41]:
gather(1)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,The family of Dashwood had long been settled i...
2,Mrs John Dashwood now installed herself mistre...
3,Mrs Dashwood remained at Norland several month...
4,"""What a pity it is Elinor "" said Marianne ""tha..."
5,No sooner was her answer dispatched than Mrs D...
6,The first part of their journey was performed ...
7,Barton Park was about half a mile from the cot...
8,Mrs Jennings was a widow with an ample jointur...
9,The Dashwoods were now settled at Barton with ...
10,Marianne s preserver as Margaret with more ele...


In [42]:
gather(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,By a former marriage Mr Henry Dashwood had one...
1,2,The old gentleman died his will was read and l...
1,3,Mr Dashwood s disappointment was at first seve...
1,4,His son was sent for as soon as his danger was...
...,...,...
50,18,For Marianne however in spite of his incivilit...
50,19,Mrs Dashwood was prudent enough to remain at t...
50,20,Between Barton and Delaford there was that con...
50,21,THE END


In [43]:
gather(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,The family of Dashwood had long been settled i...
1,0,1,Their estate was large and their residence was...
1,0,2,The late owner of this estate was a single man...
1,0,3,But her death which happened ten years before ...
1,0,4,for to supply her loss he invited and received...
...,...,...,...
50,19,3,Jennings when Marianne was taken from them Mar...
50,20,0,Between Barton and Delaford there was that con...
50,20,1,and among the merits and the happiness of Eli...
50,21,0,THE END


In [44]:
# At the full OHCO depth every group holds exactly one token, so gather(4)
# would only reproduce TOKENS.token_str after a slow per-group apply.
# Select the column directly; sort_index() mirrors the key order groupby returns.
sns_frame = TOKENS[['token_str']].sort_index()

In [45]:
sns_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,The
1,0,0,1,family
1,0,0,2,of
1,0,0,3,Dashwood
1,0,0,4,had


In [46]:
# Normalize the same way as the other term_str columns in this notebook:
# strip non-word characters and underscores, then lowercase.
sns_frame['term_str'] = sns_frame['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
sns_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,The,the
1,0,0,1,family,family
1,0,0,2,of,of
1,0,0,3,Dashwood,dashwood
1,0,0,4,had,had
...,...,...,...,...,...
50,22,0,8,and,and
50,22,0,9,Sensibility,sensibility
50,22,0,10,by,by
50,22,0,11,Jane,jane


### Add book number index

In [47]:
# Prepend a book_num index level; Sense and Sensibility is book 0.
sns_frame = pd.concat({0: sns_frame}, names=['book_num'])
sns_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,0,0,The,the
0,1,0,0,1,family,family
0,1,0,0,2,of,of
0,1,0,0,3,Dashwood,dashwood
0,1,0,0,4,had,had
0,...,...,...,...,...,...
0,50,22,0,8,and,and
0,50,22,0,9,Sensibility,sensibility
0,50,22,0,10,by,by
0,50,22,0,11,Jane,jane


## Import Persuasion

In [48]:
# Read the Persuasion token table from CSV in the gutenberg data folder.
pers_file = f"{data_home}/gutenberg/austen-persuasion.csv"
pers_frame = pd.read_csv(pers_file)
pers_frame

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str
0,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
2,1,0,0,2,Elliot,elliot
3,1,0,0,3,of,of
4,1,0,0,4,Kellynch,kellynch
...,...,...,...,...,...,...
85009,24,13,0,6,of,of
85010,24,13,0,7,Persuasion,persuasion
85011,24,13,0,8,by,by
85012,24,13,0,9,Jane,jane


### Set multi-index

In [49]:
# Move the four OHCO columns into the index.
pers_frame = pers_frame.set_index(list(OHCO))
pers_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,Sir,sir
1,0,0,1,Walter,walter
1,0,0,2,Elliot,elliot
1,0,0,3,of,of
1,0,0,4,Kellynch,kellynch


In [50]:
# Prepend a book_num index level; Persuasion is book 1.
pers_frame = pd.concat({1: pers_frame}, names=['book_num'])

### Add book number index

In [51]:
pers_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
1,1,0,0,2,Elliot,elliot
1,1,0,0,3,of,of
1,1,0,0,4,Kellynch,kellynch
1,...,...,...,...,...,...
1,24,13,0,6,of,of
1,24,13,0,7,Persuasion,persuasion
1,24,13,0,8,by,by
1,24,13,0,9,Jane,jane


In [52]:
# Stack the two per-book token tables into one corpus table.
corp_frame = pd.concat(objs=(sns_frame, pers_frame))
corp_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,0,0,The,the
0,1,0,0,1,family,family
0,1,0,0,2,of,of
0,1,0,0,3,Dashwood,dashwood
0,1,0,0,4,had,had
...,...,...,...,...,...,...
1,24,13,0,6,of,of
1,24,13,0,7,Persuasion,persuasion
1,24,13,0,8,by,by
1,24,13,0,9,Jane,jane


In [53]:
# Recompute term_str for the whole corpus: strip non-word characters and
# underscores from each token, then lowercase.
corpus_bare_tokens = corp_frame['token_str'].replace(r'[\W_]+', '', regex=True)
corp_frame['term_str'] = corpus_bare_tokens.str.lower()

In [54]:
# Corpus-wide term frequencies, indexed by term.
corpus_term_counts = corp_frame['term_str'].value_counts()
CORP_VOCAB = corpus_term_counts.to_frame()
CORP_VOCAB

Unnamed: 0_level_0,count
term_str,Unnamed: 1_level_1
the,7435
to,6923
and,6290
of,6146
her,3747
...,...
unconquerable,1
outgrown,1
prosperously,1
nominal,1


In [55]:
# value_counts().to_frame() names the count column 'count' on pandas >= 2.0 but
# after the source Series ('term_str') on older releases. Map both names to 'n'
# so the rename cannot silently do nothing.
CORP_VOCAB = CORP_VOCAB.rename(columns={'count': 'n', 'term_str': 'n'})

In [56]:
# Character length of each term (the terms are the index).
term_lengths = CORP_VOCAB.index.str.len()
CORP_VOCAB = CORP_VOCAB.assign(n_chars=term_lengths)
CORP_VOCAB

Unnamed: 0_level_0,n,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7435,3
to,6923,2
and,6290,3
of,6146,2
her,3747,3
...,...,...
unconquerable,1,13
outgrown,1,8
prosperously,1,12
nominal,1,7


## Question 1:
How many raw tokens are in the combined data frame?

### Answer 1:
**There are 207,896 raw tokens in the combined frame**

In [57]:
corp_frame.shape

(207896, 2)

## Question 2:
How many distinct terms are there in the combined data frame (i.e. how big is the vocabulary)?

### Answer 2:
**There are 8,239 distinct terms in the combined vocabulary**

In [58]:
CORP_VOCAB.shape

(8239, 2)

## Question 3:
How many more terms does the vocabulary of Sense and Sensibility have than that of Persuasion?

### Answer 3:
**The vocabulary of Sense and Sensibility has 520 more terms than that of Persuasion**

In [59]:
# Sense and Sensibility
corp_frame.query('book_num == 0').term_str.value_counts()

term_str
to             4115
the            4105
of             3574
and            3490
her            2543
               ... 
prefer            1
dissolving        1
beset             1
effectually       1
austen            1
Name: count, Length: 6280, dtype: int64

In [60]:
# Persuasion
corp_frame.query('book_num == 1').term_str.value_counts()

term_str
the            3330
to             2808
and            2800
of             2572
a              1595
               ... 
reins             1
judiciously       1
rut               1
dung              1
austen            1
Name: count, Length: 5760, dtype: int64

In [65]:
# Compute the difference from the data instead of retyping printed lengths, so
# the answer stays correct when the notebook is re-run.
# book_num 0 = Sense and Sensibility, book_num 1 = Persuasion.
vocab_sizes = corp_frame.groupby('book_num')['term_str'].nunique()
int(vocab_sizes.loc[0] - vocab_sizes.loc[1])

520

## Question 4:
What is the average number of tokens, rounded to an integer, per chapter in the corpus?

### Answer 4:
**2,808**

In [77]:
# Non-null values per (book, chapter), averaged over all chapters in the corpus.
chapter_counts = corp_frame.groupby(level=['book_num', 'chap_num']).count()
chapter_counts.mean()

token_str    2807.918919
term_str     2807.918919
dtype: float64

## Question 5:
What is the average number of tokens, rounded to an integer, per paragraph in the corpus?

### Answer 5:
**74**

In [79]:
# Non-null values per (book, chapter, paragraph), averaged over all paragraphs.
paragraph_counts = corp_frame.groupby(level=['book_num', 'chap_num', 'para_num']).count()
paragraph_counts.mean()

token_str    73.709117
term_str     73.709117
dtype: float64