# HW05

```yaml
course:   DS 5001 
module:   Module 05 HW
topic:    BOW and TFIDF
author:   Ryan Lipps
date:     17 February 2024
```

# Set Up

## Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px

In [2]:
sns.set()

## Config

Change this to match the location of your data files.

In [3]:
import configparser

# Path to the shared settings file, relative to this notebook.
config_path = "../../../env.ini"

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed; an empty list means the settings file was not found.
# Fail here with a clear message instead of a KeyError on the lookups below.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")

data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

# File-name prefix shared by the LIB and CORPUS tables loaded below.
data_prefix = 'austen-melville'

In [4]:
data_home

'/Users/ryanlipps/Documents/MSDS/DS5001/data'

In [5]:
# Index levels of the token table, ordered from coarsest (book) to finest (token).
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

# Each bag type is identified by the leading `depth` levels of OHCO.
bags = {
    bag_name: OHCO[:depth]
    for bag_name, depth in [('SENTS', 4), ('PARAS', 3), ('CHAPS', 2), ('BOOKS', 1)]
}

In [6]:
bag = 'CHAPS'
# bag = 'BOOKS'

## Import LIB and CORPUS tables

In [7]:
# Both tables are read from `<output_dir>/<data_prefix>-<TABLE>.csv` and are
# indexed by `book_id` on load.
lib_csv = f"{output_dir}/{data_prefix}-LIB.csv"
corpus_csv = f'{output_dir}/{data_prefix}-CORPUS.csv'

LIB = pd.read_csv(lib_csv).set_index('book_id')
CORPUS = pd.read_csv(corpus_csv).set_index('book_id')

In [8]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$,83624,24
121,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$,77601,31
141,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$,160378,48
158,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$,160926,55
161,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$,119873,50
946,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",LADY SUSAN,^\s*[IVXLCM]+\s*$,23116,41
1212,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",LOVE AND FREINDSHIP SIC,^\s*LETTER .* to .*$,33265,24
1342,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",PRIDE AND PREJUDICE,^Chapter\s+\d+$,122126,61
1900,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"MELVILLE, HERMAN",TYPEE A ROMANCE OF THE SOUTH SEAS,^CHAPTER,108021,34
2701,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"MELVILLE, HERMAN",MOBY DICK OR THE WHALE,^(?:ETYMOLOGY|EXTRACTS|CHAPTER),215504,138


In [9]:
CORPUS.head()

Unnamed: 0_level_0,chap_id,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,pos_group
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
105,1,1,0,0,"('Sir', 'NNP')",NNP,Sir,sir,NN
105,1,1,0,1,"('Walter', 'NNP')",NNP,Walter,walter,NN
105,1,1,0,2,"('Elliot,', 'NNP')",NNP,"Elliot,",elliot,NN
105,1,1,0,3,"('of', 'IN')",IN,of,of,IN
105,1,1,0,4,"('Kellynch', 'NNP')",NNP,Kellynch,kellynch,NN


### Set CORPUS index

In [10]:
# Re-index CORPUS by the full OHCO (defined in the Config section) rather than
# repeating the level names here, then drop every row that has a missing value
# in any column. The last line displays the number of remaining token rows.
CORPUS = (
    CORPUS
    .reset_index()
    .set_index(OHCO)
    .dropna()
)
CORPUS.shape[0]

2058943

## Question 1:
Show functions

### Answer 1:

## BOW Function

In [11]:
def get_BOW(corpus:pd.DataFrame, level:str):
    '''
    Build a bag-of-words table from a token table

    Any token table works as long as `level` is one of the names in the multi-index of `corpus` and the table has a `term_str` column

    PARAMETERS:

    `corpus` - pandas DataFrame of tokens, multi-indexed by an OHCO and containing a `term_str` column

    `level` - name of the OHCO index level that defines a bag; every index level up to and including it is used as the bag key

    OUTPUTS:

    pandas DataFrame indexed by the bag levels plus `term_str`, with a single column `n` holding the number of non-null `term_str` values in each group

    RAISES:

    KeyError if `level` is not an index level of `corpus`

    EXAMPLE:

    `BOW = get_BOW(CORPUS, 'chap_id')`
    '''
    ohco = list(corpus.index.names)

    # Guard clause: the requested bag level has to be part of the index
    if level not in ohco:
        raise KeyError (f'{level} not found in corpus OHCO')

    # Keep the index levels from the top of the hierarchy down to `level`
    bag_levels = ohco[:ohco.index(level) + 1]
    group_keys = bag_levels + ['term_str']

    # Count term occurrences within each bag
    term_counts = corpus.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

In [12]:
BOW = get_BOW(CORPUS, 'chap_id')
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
105,1,1,2
105,1,15,1
105,1,16,1
105,1,1760,1
105,1,1784,1


## TFIDF Function

In [14]:
def get_TFIDF(bow:pd.DataFrame, tf_type:str, double_norm_k:float=0.0):
    '''
    Function to compute TFIDF for a given bag of words DataFrame

    PARAMETERS:

    `bow` - pandas DataFrame representation of bag of words. It must have a count column `n`; the innermost index level (the term, e.g. `term_str` as produced by `get_BOW`) is unstacked into columns and the remaining levels identify the bags

    `tf_type` - string of term frequency type to use. Options are:
                `sum` (n / sum of n in the bag), `max` (n / max n in the bag),
                `log` (log2(1 + n)), `raw` (n), `binary` (1 if n > 0 else 0) and
                `double_norm` (`double_norm_k + (1 - double_norm_k) * n / max n`
                for terms present in the bag, 0 for absent terms)

    `double_norm_k` - smoothing constant used only when `tf_type` is `double_norm`. The default of 0.0 makes `double_norm` identical to `max`

    OUTPUTS:

    pandas DataFrame of TFIDF weights with one row per bag and one column per term

    RAISES:

    ValueError if `tf_type` is not one of the supported options

    EXAMPLE:
    `TFIDF = get_TFIDF(BOW, 'max')`
    '''
    # Reject unknown options up front, before building the (potentially large)
    # dense matrix, instead of failing later on an unassigned variable
    tf_types = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    if tf_type not in tf_types:
        raise ValueError(f"Unknown tf_type {tf_type!r}; expected one of {tf_types}")

    # Document-term count matrix: bags as rows, terms as columns, 0 where a term is absent
    DTCM = bow.n.unstack(fill_value=0)

    # Term frequency. `div(..., axis=0)` divides each row (bag) by that bag's own total/max.
    if tf_type == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)

    elif tf_type == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)

    elif tf_type == 'log':
        TF = np.log2(1 + DTCM)

    elif tf_type == 'raw':
        TF = DTCM

    elif tf_type == 'double_norm':
        max_scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        # Only terms that occur in a bag receive the K offset; absent terms stay at 0
        TF = (double_norm_k + (1 - double_norm_k) * max_scaled).where(DTCM > 0, 0.0)

    else:  # 'binary'
        TF = DTCM.astype('bool').astype('int')

    # Document frequency: number of bags in which each term occurs
    DF = DTCM.astype('bool').sum()

    # Number of bags (documents)
    N = DTCM.shape[0]

    # Inverse document frequency; a term found in every bag gets weight 0
    IDF = np.log2(N / DF)

    # Broadcasts IDF across rows by matching term column labels
    return TF * IDF

In [13]:
# # OG WORK
# def get_TFIDF(bow:pd.DataFrame, tf_type:str, double_norm=0.5):
#     '''
#     DOCSTRING GOES HERE
#     '''
#     # Term frequency calculation dictionary
#     tf_dict = {
#         'sum':lambda x: x.n / x.n.sum(),
#         'max':lambda x: x.n / x.n.max(), 
#         'log':lambda x: np.log2(1 + x.n), 
#         'raw':lambda x: x.n, 
#         'double_norm':lambda x: (double_norm + (1 - double_norm) * (x.n / x.n.max())),
#         'binary':lambda x: x.n.astype('bool').astype('int'), 
#     }

#     # Dynamically find levels to drop from bow format
#     # This is because we have to group by levels of bow to get TF, but we don't want to repeat those levels in the output
#     bow_level_drop = [x for x in range(len(list(BOW.index.names))-1)]

#     # Calculate term frequency
#     # Assuming bow is indexed by bag-level OHCO, group by bag level
#     # Apply parameterized tf computation
#     # Cast as frame
#     # Drop redundant levels from groupby
#     # Rename column
#     tf = bow.groupby(list(bow.index.names)[:-1])\
#         .apply(tf_dict.get(tf_type))\
#         .to_frame()\
#         .droplevel(bow_level_drop)\
#         .rename(columns={'n':'tf'})
    
#     # Calculate document frequency
#     df = tf.tf.unstack(fill_value=0)\
#         .astype('bool')\
#         .sum()
    
#     # Calculate number of documents
#     N = tf.groupby(list(bow.index.names)[:-1])\
#         .count()\
#         .shape[0]

#     # idf calculations
#     idf = np.log2(N / df)
    
#     return tf #* idf
        

## Create VOCAB table from CORPUS

In [15]:
# One row per distinct term, with its total count `n` over the whole corpus.
VOCAB = (
    CORPUS
    .term_str
    .value_counts()
    .to_frame('n')
    .sort_index()
)
# The index is named through `index.name`; assigning to `VOCAB.index_name`
# would only attach an unused attribute to the frame.
VOCAB.index.name = 'term_str'

# Term length in characters.
VOCAB['n_chars'] = VOCAB.index.str.len()
# Relative frequency of the term in the corpus.
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
# Negative base-2 log of the relative frequency.
VOCAB['i'] = -np.log2(VOCAB.p)
# Count (term, pos) pairs, pivot to a term x pos table, and keep the tag with
# the highest count for each term.
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [16]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,1,9.713722e-07,19.973472,CD
1,23,1,1.117078e-05,16.449911,CD
10,6,2,2.914117e-06,18.38851,CD
100,2,3,9.713722e-07,19.973472,CD
1000,2,4,9.713722e-07,19.973472,CD


### Add TFIDF means to VOCAB

In [17]:
# For each (bag level, TF method) pair, average every term's TFIDF over all
# bags and store the result as a new VOCAB column.
for vocab_col, bag_level, tf_method in [
    ('tfidf_book_max_mean', 'book_id', 'max'),
    ('tfidf_chap_sum_mean', 'chap_id', 'sum'),
]:
    VOCAB[vocab_col] = get_TFIDF(get_BOW(CORPUS, bag_level), tf_method).mean()

In [18]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2,1,9.713722e-07,19.973472,CD,0.000341,2e-06
1,23,1,1.117078e-05,16.449911,CD,0.000268,4.7e-05
10,6,2,2.914117e-06,18.38851,CD,0.000221,2e-05
100,2,3,9.713722e-07,19.973472,CD,4.3e-05,1.3e-05
1000,2,4,9.713722e-07,19.973472,CD,4e-05,1e-05


## Question 2:
What are the top 20 words in the corpus by TFIDF mean using the `max` count method and `book` as the bag?

### Answer 2:

In [19]:
q2 = VOCAB.sort_values('tfidf_book_max_mean', ascending=False).head(20)
q2

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
elinor,623,6,0.000303,11.690384,NNP,0.03384,0.001016
pierre,1526,6,0.000741,10.397933,NNP,0.030911,0.003317
vernon,104,6,5.1e-05,14.273033,NNP,0.02598,0.000779
marianne,499,8,0.000242,12.010576,NNP,0.021347,0.000843
emma,788,4,0.000383,11.351421,NNP,0.021164,0.000996
darcy,374,5,0.000182,12.426578,NNP,0.019302,0.000689
reginald,74,8,3.6e-05,14.764019,NNP,0.018486,0.000678
babbalanja,547,10,0.000266,11.878075,NNP,0.018252,0.001429
catherine,557,9,0.000271,11.851939,NNP,0.018238,0.000874
frederica,72,9,3.5e-05,14.803547,NNP,0.017986,0.00044


## Question 3:
What are the top 20 words in the corpus by TFIDF mean if you use the `sum` count method and `chapter` (rather than `paragraph`) as the bag? Note that, because of the greater number of bags, this will take longer to compute.

### Answer 3:

In [20]:
q3 = VOCAB.sort_values('tfidf_chap_sum_mean', ascending=False).head(20)
q3

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
her,16927,3,0.008221,6.926434,PRP$,0.0,0.004327
she,12059,3,0.005857,7.41565,PRP,0.0,0.00415
cosmopolitan,101,12,4.9e-05,14.315261,NN,0.003489,0.003485
pierre,1526,6,0.000741,10.397933,NNP,0.030911,0.003317
communion,9,9,4e-06,17.803547,NN,0.000107,0.003004
i,27280,1,0.01325,6.237916,PRP,0.0,0.002771
sailors,617,7,0.0003,11.704346,NNS,0.002783,0.002668
you,14347,3,0.006968,7.165011,PRP,0.0,0.00262
hypothetical,3,12,1e-06,19.38851,NNP,0.000104,0.002437
mr,3388,2,0.001646,9.247254,NNP,0.006662,0.002084


## Question 4:
Characterize the general difference between the words in Question 3 and those in Question 2 in terms of part-of-speech.

### Answer 4:

In [21]:
q2.max_pos.value_counts()

max_pos
NNP    20
Name: count, dtype: int64

In [22]:
q3.max_pos.value_counts()

max_pos
NN      8
NNP     4
PRP     3
PRP$    1
NNS     1
CC      1
JJ      1
DT      1
Name: count, dtype: int64

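**The top 20 words for Question 2 are all proper nouns (NNP), whereas the top 20 words for Question 3 span a mix of parts of speech: mostly common nouns (NN, NNS), plus some proper nouns, pronouns (PRP, PRP$), and a conjunction, adjective, and determiner.**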

## Question 5:
Compute mean `TFIDF` for vocabularies conditioned on individual author, using *chapter* as the bag and `max` as the `TF` count method. Among the two authors, whose work has the most significant adjective?

### Answer 5:

### Create author-specific corpora and vocabs

In [23]:
# Look up Austen's book ids in LIB, then keep only the CORPUS rows for those books.
austen_rows = LIB.query('author == "AUSTEN, JANE"')
austen_books = austen_rows.index.tolist()
AUSTEN = CORPUS.query(f'book_id in {austen_books}')
AUSTEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
105,1,1,0,0,"('Sir', 'NNP')",NNP,Sir,sir,NN
105,1,1,0,1,"('Walter', 'NNP')",NNP,Walter,walter,NN
105,1,1,0,2,"('Elliot,', 'NNP')",NNP,"Elliot,",elliot,NN
105,1,1,0,3,"('of', 'IN')",IN,of,of,IN
105,1,1,0,4,"('Kellynch', 'NNP')",NNP,Kellynch,kellynch,NN
...,...,...,...,...,...,...,...,...,...
1342,61,18,0,8,"('and', 'CC')",CC,and,and,CC
1342,61,18,0,9,"('Prejudice,', 'NNP')",NNP,"Prejudice,",prejudice,NN
1342,61,18,0,10,"('by', 'IN')",IN,by,by,IN
1342,61,18,0,11,"('Jane', 'NNP')",NNP,Jane,jane,NN


In [24]:
# Look up Melville's book ids in LIB, then keep only the CORPUS rows for those books.
melville_rows = LIB.query('author == "MELVILLE, HERMAN"')
melville = melville_rows.index.tolist()
MELVILLE = CORPUS.query(f'book_id in {melville}')
MELVILLE

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1900,1,0,0,0,"('THE', 'DT')",DT,THE,the,DT
1900,1,0,0,1,"('SEA', 'NNP')",NNP,SEA,sea,NN
1900,1,0,0,2,"('LONGINGS', 'NNP')",NNP,LONGINGS,longings,NN
1900,1,0,0,3,"('FOR', 'NNP')",NNP,FOR,for,NN
1900,1,0,0,4,"('SHORE', 'NNP')",NNP,SHORE,shore,NN
...,...,...,...,...,...,...,...,...,...
34970,114,24,0,6,"('The', 'DT')",DT,The,the,DT
34970,114,24,0,7,"('Ambiguities,', 'NNP')",NNP,"Ambiguities,",ambiguities,NN
34970,114,24,0,8,"('by', 'IN')",IN,by,by,IN
34970,114,24,0,9,"('Herman', 'NNP')",NNP,Herman,herman,NN


In [25]:
# One row per distinct term in Austen's books, with its total count `n`.
AUSTEN_VOCAB = (
    AUSTEN
    .term_str
    .value_counts()
    .to_frame('n')
    .sort_index()
)
# The index is named through `index.name`; assigning to `AUSTEN_VOCAB.index_name`
# would only attach an unused attribute to the frame.
AUSTEN_VOCAB.index.name = 'term_str'

# Term length in characters.
AUSTEN_VOCAB['n_chars'] = AUSTEN_VOCAB.index.str.len()
# Relative frequency of the term within Austen's books.
AUSTEN_VOCAB['p'] = AUSTEN_VOCAB.n / AUSTEN_VOCAB.n.sum()
# Negative base-2 log of the relative frequency.
AUSTEN_VOCAB['i'] = -np.log2(AUSTEN_VOCAB.p)
# Count (term, pos) pairs, pivot to a term x pos table, and keep the tag with
# the highest count for each term.
AUSTEN_VOCAB['max_pos'] = AUSTEN[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [26]:
# One row per distinct term in Melville's books, with its total count `n`.
MELVILLE_VOCAB = (
    MELVILLE
    .term_str
    .value_counts()
    .to_frame('n')
    .sort_index()
)
# The index is named through `index.name`; assigning to `MELVILLE_VOCAB.index_name`
# would only attach an unused attribute to the frame.
MELVILLE_VOCAB.index.name = 'term_str'

# Term length in characters.
MELVILLE_VOCAB['n_chars'] = MELVILLE_VOCAB.index.str.len()
# Relative frequency of the term within Melville's books.
MELVILLE_VOCAB['p'] = MELVILLE_VOCAB.n / MELVILLE_VOCAB.n.sum()
# Negative base-2 log of the relative frequency.
MELVILLE_VOCAB['i'] = -np.log2(MELVILLE_VOCAB.p)
# Count (term, pos) pairs, pivot to a term x pos table, and keep the tag with
# the highest count for each term.
MELVILLE_VOCAB['max_pos'] = MELVILLE[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

### Compute chapter-bag max mean TFIDF

In [27]:
# Same computation for both authors: chapter-level bags, `max` TF, then the
# mean TFIDF of each term across that author's chapters.
for author_corpus, author_vocab in ((AUSTEN, AUSTEN_VOCAB), (MELVILLE, MELVILLE_VOCAB)):
    author_vocab['tfidf_chap_max_mean'] = get_TFIDF(get_BOW(author_corpus, 'chap_id'), 'max').mean()

### Return most significant adjectives

In [28]:
AUSTEN_VOCAB.sort_values('tfidf_chap_max_mean', ascending=False).query('max_pos=="JJ"').head(1)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_chap_max_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sure,778,4,0.000996,9.970878,JJ,0.013167


In [29]:
MELVILLE_VOCAB.sort_values('tfidf_chap_max_mean', ascending=False).query('max_pos=="JJ"').head(1)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_chap_max_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
thy,594,3,0.000465,11.071353,JJ,0.028653


**Melville has the most significant adjective, 'thy' (mean TFIDF 0.028653, versus 0.013167 for Austen's top adjective, 'sure').**