# HW05

```yaml
course:   DS 5001 
module:   Module 05 HW
topic:    BOW and TFIDF
author:   Ryan Lipps
date:     17 February 2024
```

# Set Up

## Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px

In [2]:
sns.set()

## Config

Change this to match the location of your data files.

In [3]:
import configparser
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home'] 
output_dir = config['DEFAULT']['output_dir']
data_prefix = 'austen-melville'

In [4]:
data_home

'/Users/ryanlipps/Documents/MSDS/DS5001/data'

In [5]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
bags = dict(
    SENTS = OHCO[:4],
    PARAS = OHCO[:3],
    CHAPS = OHCO[:2],
    BOOKS = OHCO[:1]
)

In [6]:
bag = 'CHAPS'
# bag = 'BOOKS'

## Import LIB and CORPUS tables

In [7]:
LIB = pd.read_csv(f"{output_dir}/{data_prefix}-LIB.csv").set_index('book_id')
CORPUS = pd.read_csv(f'{output_dir}/{data_prefix}-CORPUS.csv').set_index('book_id')

In [8]:
LIB.head()

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$,83624,24
121,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$,77601,31
141,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$,160378,48
158,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$,160926,55
161,/Users/ryanlipps/Documents/MSDS/DS5001/data/gu...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$,119873,50


In [9]:
CORPUS.head()

Unnamed: 0_level_0,chap_id,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,pos_group
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
105,1,1,0,0,"('Sir', 'NNP')",NNP,Sir,sir,NN
105,1,1,0,1,"('Walter', 'NNP')",NNP,Walter,walter,NN
105,1,1,0,2,"('Elliot,', 'NNP')",NNP,"Elliot,",elliot,NN
105,1,1,0,3,"('of', 'IN')",IN,of,of,IN
105,1,1,0,4,"('Kellynch', 'NNP')",NNP,Kellynch,kellynch,NN


### Set CORPUS index

In [33]:
CORPUS = CORPUS.reset_index()
CORPUS = CORPUS.set_index(['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num'])
CORPUS.shape

(2059272, 5)

## BOW Function

In [11]:
def get_BOW(corpus:pd.DataFrame, level:str):
    '''
    Function to get bag of words from a corpus

    Corpus here is loosely defined, as this function will work provided the `level` parameter is in the multi-index of `corpus`

    PARAMETERS:

    `corpus` - pandas DataFrame of body of work. It must be multi-indexed by an OHCO

    `level` - string of OHCO to group by for bags

    OUTPUTS:

    pandas DataFrame of bag of words grouiped by `level`

    EXAMPLE:

    `BOW = get_BOW(CORPUS, 'chap_id')`
    '''
    # Get multi-index from `corpus` df
    idx = list(corpus.index.names)

    # Check to see that `level` exists in `corpus` OHCO
    # Raise error if not
    if (level not in idx):
        raise KeyError (f'{level} not found in corpus OHCO')

    # Split-apply-combine to generate BOW grouped by `level`
    return corpus.groupby(idx[:idx.index(level)+1]+['term_str'])\
        .term_str\
        .count()\
        .to_frame('n')

In [12]:
BOW = get_BOW(CORPUS, 'chap_id')
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
105,1,1,2
105,1,15,1
105,1,16,1
105,1,1760,1
105,1,1784,1


## TFIDF Function

In [13]:
# # OG WORK
# def get_TFIDF(bow:pd.DataFrame, tf_type:str, double_norm=0.5):
#     '''
#     DOCSTRING GOES HERE
#     '''
#     # Term frequency calculation dictionary
#     tf_dict = {
#         'sum':lambda x: x.n / x.n.sum(),
#         'max':lambda x: x.n / x.n.max(), 
#         'log':lambda x: np.log2(1 + x.n), 
#         'raw':lambda x: x.n, 
#         'double_norm':lambda x: (double_norm + (1 - double_norm) * (x.n / x.n.max())),
#         'binary':lambda x: x.n.astype('bool').astype('int'), 
#     }

#     # Dynamically find levels to drop from bow format
#     # This is because we have to group by levels of bow to get TF, but we don't want to repeat those levels in the output
#     bow_level_drop = [x for x in range(len(list(BOW.index.names))-1)]

#     # Calculate term frequency
#     # Assuming bow is indexed by bag-level OHCO, group by bag level
#     # Apply parameterized tf computation
#     # Cast as frame
#     # Drop redundant levels from groupby
#     # Rename column
#     tf = bow.groupby(list(bow.index.names)[:-1])\
#         .apply(tf_dict.get(tf_type))\
#         .to_frame()\
#         .droplevel(bow_level_drop)\
#         .rename(columns={'n':'tf'})
    
#     # Calculate document frequency
#     df = tf.tf.unstack(fill_value=0)\
#         .astype('bool')\
#         .sum()
    
#     # Calculate number of documents
#     N = tf.groupby(list(bow.index.names)[:-1])\
#         .count()\
#         .shape[0]

#     # idf calculations
#     idf = np.log2(N / df)
    
#     return tf * idf.T
        

In [14]:
def get_TFIDF(bow:pd.DataFrame, tf_type:str, double_norm=0.5):
    '''
    DOCSTRING GOES HERE
    '''

    DTCM = BOW.n.unstack(fill_value=0)

    # Term frequency calculation dictionary
    if tf_type == 'sum':
        TF = DTCM.T / DTCM.T.sum()

    elif tf_type == 'max':
        TF = DTCM.T / DTCM.T.max()
        
    elif tf_type == 'log':
        TF = np.log2(1 + DTCM.T)
        
    elif tf_type == 'raw':
        TF = DTCM.T
        
    elif tf_type == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        
    elif tf_type == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    
    # Calculate document frequency
    DF = DTCM.astype('bool').sum()
    
    # Calculate number of documents
    N = DTCM.shape[0]

    # idf calculations
    IDF = np.log2(N / DF)
    
    TFIDF = (TF.T)*IDF
    return TFIDF

In [32]:
book_max = get_TFIDF(get_BOW(CORPUS, 'book_id'), 'max')\
    .mean()\
    .to_frame()\
    .sort_values(0, ascending=False)
book_max.head(20)

Unnamed: 0_level_0,0
term_str,Unnamed: 1_level_1
she,0.099678
her,0.098774
pierre,0.063955
i,0.058736
you,0.057449
mr,0.051118
mrs,0.043763
and,0.038344
thou,0.034672
my,0.034633


## Create VOCAB table from CORPUS

In [15]:
VOCAB = CORPUS\
    .term_str\
        .value_counts()\
        .to_frame('n')\
        .sort_index()
VOCAB.index_name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [16]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,1,9.713661e-07,19.973482,CD
1,23,1,1.117071e-05,16.44992,CD
10,6,2,2.914098e-06,18.388519,CD
100,2,3,9.713661e-07,19.973482,CD
1000,2,4,9.713661e-07,19.973482,CD


### Add TFIDF means to VOCAB

In [17]:
VOCAB['tfidf_book_max_mean'] = get_TFIDF(get_BOW(CORPUS, 'book_id'), 'max').mean()
VOCAB['tfidf_chap_sum_mean'] = get_TFIDF(get_BOW(CORPUS, 'chap_id'), 'sum').mean()

In [18]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2,1,9.713661e-07,19.973482,CD,4.9e-05,2e-06
1,23,1,1.117071e-05,16.44992,CD,0.000719,4.7e-05
10,6,2,2.914098e-06,18.388519,CD,0.000395,2e-05
100,2,3,9.713661e-07,19.973482,CD,0.000249,1.3e-05
1000,2,4,9.713661e-07,19.973482,CD,0.000123,1e-05


In [19]:
q2 = VOCAB.sort_values('tfidf_book_max_mean', ascending=False).head(20)
q2

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
she,12059,3,0.005857,7.415659,PRP,0.099678,0.00415
her,16927,3,0.008221,6.926443,PRP$,0.098774,0.004327
pierre,1526,6,0.000741,10.397942,NNP,0.063955,0.003317
i,27280,1,0.013249,6.237926,PRP,0.058736,0.002771
you,14347,3,0.006968,7.16502,PRP,0.057449,0.00262
mr,3388,2,0.001645,9.247263,NNP,0.051118,0.002084
mrs,2658,3,0.001291,9.597356,NNP,0.043763,0.001747
and,62954,3,0.030576,5.031471,CC,0.038344,0.002054
thou,912,4,0.000443,11.140592,NN,0.034672,0.001696
my,10237,2,0.004972,7.651976,PRP$,0.034633,0.001686


In [20]:
q3 = VOCAB.sort_values('tfidf_chap_sum_mean', ascending=False).head(20)
q3

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_book_max_mean,tfidf_chap_sum_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
her,16927,3,0.008221,6.926443,PRP$,0.098774,0.004327
she,12059,3,0.005857,7.415659,PRP,0.099678,0.00415
cosmopolitan,101,12,4.9e-05,14.31527,NN,0.024418,0.003485
pierre,1526,6,0.000741,10.397942,NNP,0.063955,0.003317
communion,9,9,4e-06,17.803557,NN,0.006523,0.003004
i,27280,1,0.013249,6.237926,PRP,0.058736,0.002771
sailors,617,7,0.0003,11.704355,NNS,0.013025,0.002668
you,14347,3,0.006968,7.16502,PRP,0.057449,0.00262
hypothetical,3,12,1e-06,19.388519,NNP,0.007525,0.002437
mr,3388,2,0.001645,9.247263,NNP,0.051118,0.002084


In [21]:
q2.max_pos.value_counts()

max_pos
NNP     7
PRP     4
PRP$    3
NN      2
CC      1
DT      1
TO      1
VBP     1
Name: count, dtype: int64

In [22]:
q3.max_pos.value_counts()

max_pos
NN      8
NNP     4
PRP     3
PRP$    1
NNS     1
CC      1
JJ      1
DT      1
Name: count, dtype: int64

In [23]:
austen_books = list(LIB.query('author == "AUSTEN, JANE"').index)
AUSTEN = CORPUS.query(f'book_id in {austen_books}')
AUSTEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
105,1,1,0,0,"('Sir', 'NNP')",NNP,Sir,sir,NN
105,1,1,0,1,"('Walter', 'NNP')",NNP,Walter,walter,NN
105,1,1,0,2,"('Elliot,', 'NNP')",NNP,"Elliot,",elliot,NN
105,1,1,0,3,"('of', 'IN')",IN,of,of,IN
105,1,1,0,4,"('Kellynch', 'NNP')",NNP,Kellynch,kellynch,NN
...,...,...,...,...,...,...,...,...,...
1342,61,18,0,8,"('and', 'CC')",CC,and,and,CC
1342,61,18,0,9,"('Prejudice,', 'NNP')",NNP,"Prejudice,",prejudice,NN
1342,61,18,0,10,"('by', 'IN')",IN,by,by,IN
1342,61,18,0,11,"('Jane', 'NNP')",NNP,Jane,jane,NN


In [24]:
melville = list(LIB.query('author == "MELVILLE, HERMAN"').index)
MELVILLE = CORPUS.query(f'book_id in {melville}')
MELVILLE

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1900,1,0,0,0,"('THE', 'DT')",DT,THE,the,DT
1900,1,0,0,1,"('SEA', 'NNP')",NNP,SEA,sea,NN
1900,1,0,0,2,"('LONGINGS', 'NNP')",NNP,LONGINGS,longings,NN
1900,1,0,0,3,"('FOR', 'NNP')",NNP,FOR,for,NN
1900,1,0,0,4,"('SHORE', 'NNP')",NNP,SHORE,shore,NN
...,...,...,...,...,...,...,...,...,...
34970,114,24,0,6,"('The', 'DT')",DT,The,the,DT
34970,114,24,0,7,"('Ambiguities,', 'NNP')",NNP,"Ambiguities,",ambiguities,NN
34970,114,24,0,8,"('by', 'IN')",IN,by,by,IN
34970,114,24,0,9,"('Herman', 'NNP')",NNP,Herman,herman,NN


In [25]:
AUSTEN_VOCAB = AUSTEN\
    .term_str\
        .value_counts()\
        .to_frame('n')\
        .sort_index()
AUSTEN_VOCAB.index_name = 'term_str'
AUSTEN_VOCAB['n_chars'] = AUSTEN_VOCAB.index.str.len()
AUSTEN_VOCAB['p'] = AUSTEN_VOCAB.n / AUSTEN_VOCAB.n.sum()
AUSTEN_VOCAB['i'] = -np.log2(AUSTEN_VOCAB.p)
AUSTEN_VOCAB['max_pos'] = AUSTEN[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [26]:
MELVILLE_VOCAB = MELVILLE\
    .term_str\
        .value_counts()\
        .to_frame('n')\
        .sort_index()
MELVILLE_VOCAB.index_name = 'term_str'
MELVILLE_VOCAB['n_chars'] = MELVILLE_VOCAB.index.str.len()
MELVILLE_VOCAB['p'] = MELVILLE_VOCAB.n / MELVILLE_VOCAB.n.sum()
MELVILLE_VOCAB['i'] = -np.log2(MELVILLE_VOCAB.p)
MELVILLE_VOCAB['max_pos'] = MELVILLE[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [27]:
AUSTEN_VOCAB['tfidf_chap_max_mean'] = get_TFIDF(get_BOW(AUSTEN, 'chap_id'), 'max').mean()
MELVILLE_VOCAB['tfidf_chap_max_mean'] = get_TFIDF(get_BOW(MELVILLE, 'chap_id'), 'max').mean()

In [28]:
AUSTEN_VOCAB.sort_values('tfidf_chap_max_mean', ascending=False).query('max_pos=="JJ"')

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_chap_max_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dear,960,4,0.001230,9.667620,JJ,0.021552
old,306,3,0.000392,11.317123,JJ,0.021171
much,2047,4,0.002622,8.575215,JJ,0.016099
such,2299,4,0.002945,8.407720,JJ,0.016078
good,1446,4,0.001852,9.076659,JJ,0.015268
...,...,...,...,...,...,...
unparalelled,1,12,0.000001,19.574510,JJ,0.000024
lawfull,1,7,0.000001,19.574510,JJ,0.000024
ungratefull,1,11,0.000001,19.574510,JJ,0.000024
involantary,1,11,0.000001,19.574510,JJ,0.000024


In [29]:
MELVILLE_VOCAB.sort_values('tfidf_chap_max_mean', ascending=False).query('max_pos=="JJ"')

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,tfidf_chap_max_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
thy,594,3,4.647156e-04,11.071364,JJ,0.024668
dear,248,4,1.940227e-04,12.331487,JJ,0.021552
old,2664,3,2.084179e-03,8.906305,JJ,0.021171
much,1434,4,1.121889e-03,9.799854,JJ,0.016099
such,1837,4,1.437176e-03,9.442547,JJ,0.016078
...,...,...,...,...,...,...
unrailed,1,8,7.823496e-07,20.285683,JJ,0.000002
unpoetic,1,8,7.823496e-07,20.285683,JJ,0.000002
unpleasurable,1,13,7.823496e-07,20.285683,JJ,0.000002
unpierced,1,9,7.823496e-07,20.285683,JJ,0.000002
