# M05 Homework
- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub: https://github.com/sqr8ap/DS5001-2025-01-R/blob/m05/lessons/M05_VectorSpaceModels/M05_HW.ipynb

In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly_express as px

In [41]:
sns.set()

In [31]:
import configparser
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home'] 
output_dir = config['DEFAULT']['output_dir']
data_prefix = 'austen-melville'

In [33]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
bags = dict(
    SENTS = OHCO[:4],
    PARAS = OHCO[:3],
    CHAPS = OHCO[:2],
    BOOKS = OHCO[:1]
)

In [73]:
bags

{'SENTS': ['book_id', 'chap_id', 'para_num', 'sent_num'],
 'PARAS': ['book_id', 'chap_id', 'para_num'],
 'CHAPS': ['book_id', 'chap_id'],
 'BOOKS': ['book_id']}

In [35]:
bag = 'CHAPS'
# bag = 'BOOKS'

In [43]:
LIB = pd.read_csv(f"{output_dir}/{data_prefix}-LIB.csv").set_index('book_id')
TOKEN = pd.read_csv(f'{output_dir}/{data_prefix}-CORPUS.csv').set_index(OHCO).dropna()

In [46]:
TOKEN.reset_index().book_id.value_counts().sort_index()

book_id
105       83613
121       77586
141      160366
158      160884
161      119858
946       23115
1212      33241
1342     122089
1900     108015
2701     215461
4045     102347
8118     119230
10712    143251
13720     96874
13721    102078
15422     65510
15859     75232
21816     95169
34970    155024
Name: count, dtype: int64

## Write a function to create BOW from TOKEN

In [100]:
def gen_bow(TOKENS, OHCO_LEVEL='CHAPS'):
    '''
    This function takes a tokens table and a choice of bag and returns a BOW representation in the form of a document-term count matrix. 

    Parameters
    TOKENS: tokens table; a dataframe
    OCHO_LEVEL: choice of bag; a string (either 'BOOKS', 'CHAPS', 'PARAS' or 'SENTS'); defaults to 'CHAPS'

    Returns
    DTCM: document-term count matrix
    '''
    
    bags = dict(
        SENTS = OHCO[:4],
        PARAS = OHCO[:3],
        CHAPS = OHCO[:2],
        BOOKS = OHCO[:1])
    
    BOW = TOKENS.groupby(bags[OHCO_LEVEL]+['term_str']).term_str.count().to_frame('n')
    DTCM = BOW.n.unstack(fill_value=0)

    return DTCM

In [134]:
my_dtcm = gen_bow(TOKEN, 'CHAPS')

## Write a function to return TFIDF values for a given BOW

In [178]:
def gen_tfidf(DTCM, TF_METHOD='sum'):
    '''
    This function takes a BOW table (DTCM) and type of tf metric and returns the TFIDF values for the BOW. 

    Parameters
    DTCM: BOW table; a dataframe
    TF_METHOD: a string; either 'sum', 'max', 'log', 'raw', 'double_norm' or 'binary'; defaults to 'sum'

    Returns
    TFIDF: a dataframe
    '''

    tf_norm_k = 0.5
    idf_method = 'standard'
    gradient_cmap = 'YlGnBu'
    tf = {
        'sum': (DTCM.T / DTCM.T.sum()).T,
        'max': (DTCM.T / DTCM.T.max()).T,
        'log': (np.log2(1 + DTCM.T)).T,
        'raw':  DTCM,
        'double_norm': (DTCM.T / DTCM.T.max()).T,
        'binary': DTCM.T.astype('bool').astype('int').T}

    TF = tf[TF_METHOD]

    DF = DTCM.astype('bool').sum() 

    N = DTCM.shape[0]   
    
    IDF = np.log2(N / DF)

    TFIDF = TF * IDF
    
    return TFIDF

In [180]:
gen_tfidf(my_dtcm).head()

Unnamed: 0_level_0,term_str,0,1,10,100,1000,10000,10000000,10440,10800,10th,...,zoroaster,zozo,zuma,zur,à,æneas,æniad,æson,æsops,ł20000
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,0.0,0.005048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Questions

#### 1. Show the function you created.

In [148]:
print(gen_bow.__doc__)


    This function takes a tokens table and a choice of bag and returns a BOW representation in the form of a document-term count matrix. 

    Parameters
    TOKENS: tokens table; a dataframe
    OCHO_LEVEL: choice of bag; a string (either 'BOOKS', 'CHAPS', 'PARAS' or 'SENTS'); defaults to 'CHAPS'

    Returns
    DTCM: document-term count matrix
    


In [150]:
print(gen_tfidf.__doc__)


    This function takes a BOW table (DTCM) and type of tf metric and returns the TFIDF values for the BOW. 

    Parameters
    DTCM: BOW table; a dataframe
    TF_METHOD: a string; either 'sum', 'max', 'log', 'raw', 'double_norm' or 'binary'; defaults to 'sum'

    Returns
    TFIDF: a dataframe
    


See above for implementations of both functions. 

#### 2. What are the top 20 words in the corpus by TFIDF mean using the `max` count method and `book` as the bag?

In [241]:
dtcm_books = gen_bow(TOKEN, 'BOOKS')
tfidf_books = gen_tfidf(dtcm_books, TF_METHOD='max')


tfidf_mean = tfidf_books.mean(axis=0)  # Mean TF-IDF per term
top20_book = pd.DataFrame(tfidf_mean.sort_values(ascending=False).head(20), columns = ['tfidf'])
top20_book

Unnamed: 0_level_0,tfidf
term_str,Unnamed: 1_level_1
elinor,0.03384
pierre,0.030911
vernon,0.02598
marianne,0.021347
emma,0.021164
darcy,0.019302
reginald,0.018486
babbalanja,0.018252
catherine,0.018238
frederica,0.017986


#### 3. What are the top 20 words in the corpus by TFIDF mean, if you using the `sum` count method and `chapter` as the bag? Note, because of the greater number of bags, this will take longer to compute.

In [248]:
dtcm_chap = gen_bow(TOKEN, 'CHAPS')
tfidf_chap = gen_tfidf(dtcm_chap, TF_METHOD='sum')


tfidf_mean = tfidf_chap.mean(axis=0)  # Mean TF-IDF per term
top20_chap = pd.DataFrame(tfidf_mean.sort_values(ascending=False).head(20), columns = ['tfidf'])
top20_chap

Unnamed: 0_level_0,tfidf
term_str,Unnamed: 1_level_1
her,0.004327
she,0.00415
cosmopolitan,0.003485
pierre,0.003317
communion,0.003004
i,0.002771
sailors,0.002668
you,0.00262
hypothetical,0.002437
mr,0.002084


#### 4. Characterize the general difference between the words in Question 3 and those in Question 2 in terms of part-of-speech.

In [209]:
VOCAB = TOKEN.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = TOKEN[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = TOKEN[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [243]:
pd.merge(top20_book, VOCAB[['max_pos']], left_index=True, right_index=True, how='left')

Unnamed: 0_level_0,tfidf,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
elinor,0.03384,NNP
pierre,0.030911,NNP
vernon,0.02598,NNP
marianne,0.021347,NNP
emma,0.021164,NNP
darcy,0.019302,NNP
reginald,0.018486,NNP
babbalanja,0.018252,NNP
catherine,0.018238,NNP
frederica,0.017986,NNP


In [250]:
pd.merge(top20_chap, VOCAB[['max_pos']], left_index=True, right_index=True, how='left')

Unnamed: 0_level_0,tfidf,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
her,0.004327,PRP$
she,0.00415,PRP
cosmopolitan,0.003485,NN
pierre,0.003317,NNP
communion,0.003004,NN
i,0.002771,PRP
sailors,0.002668,NNS
you,0.00262,PRP
hypothetical,0.002437,NNP
mr,0.002084,NNP


The top 20 words when book is used as the bag are all proper nouns, whereas the top 20 words with chapter as the bag include multiple parts of speech, such as pronouns, nouns, adjectives, determiners and conjunctions. The top 20 words with chapter as the bag still largely consist of nouns, but we see other parts of speech here as well. 

#### 5. Compute mean `TFIDF` for vocabularies conditioned on individual author, using *chapter* as the bag and `max` as the `TF` count method. Among the two authors, whose work has the most significant adjective?

In [263]:
LIB = LIB.reset_index()

In [275]:
# Get author information
TOKEN2 = TOKEN.merge(LIB[['book_id', 'author']], on='book_id', how='left').set_index(TOKEN.index)
TOKEN2.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,book_id,pos_tuple,pos,token_str,term_str,pos_group,author
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15859,1,12,2,51,15859,"('a', 'DT')",DT,a,a,DT,"MELVILLE, HERMAN"
2701,136,31,0,39,2701,"('the""', 'NN')",NN,"the""",the,NN,"MELVILLE, HERMAN"
105,10,47,1,1,105,"('could', 'MD')",MD,could,could,MD,"AUSTEN, JANE"
13720,57,4,2,29,13720,"('universe.', 'NN')",NN,universe.,universe,NN,"MELVILLE, HERMAN"
105,2,6,5,9,105,"('a', 'DT')",DT,a,a,DT,"AUSTEN, JANE"


In [307]:
TOKEN_AUSTEN = TOKEN2[TOKEN2['author'] == 'AUSTEN, JANE'].drop('book_id',axis=1)
TOKEN_HERMAN = TOKEN2[TOKEN2['author'] == 'MELVILLE, HERMAN'].drop('book_id',axis=1)

In [311]:
dtcm_austen = gen_bow(TOKEN_AUSTEN, 'CHAPS')
dtcm_herman = gen_bow(TOKEN_HERMAN, 'CHAPS')

tfidf_austen = gen_tfidf(dtcm_austen, TF_METHOD='max')
tfidf_herman = gen_tfidf(dtcm_herman, TF_METHOD='max')

tfidf_austen_mean = tfidf_austen.mean(axis=0)
tfidf_herman_mean = tfidf_herman.mean(axis=0)

In [326]:
tfidf_austen_mean = pd.DataFrame(tfidf_austen_mean)

In [330]:
tfidf_herman_mean = pd.DataFrame(tfidf_herman_mean)

In [318]:
## I'm a little confused about what the question is looking for, but I'm going to 
##  compare mean tfidfs for adjectives only across authors, find the most
##  significant adjective for each, and then report which author's most significant
##  adjective is more significant. I'll also average all tfidfs across adjectives
##  and report which author has a higher average adjective signifiance. 

In [332]:
# Merge vocab with each author's tfidf table
tfidf_vocab_austen = tfidf_austen_mean.merge(VOCAB[['max_pos']], left_index=True, right_index=True)
tfidf_vocab_herman = tfidf_herman_mean.merge(VOCAB[['max_pos']], left_index=True, right_index=True)

In [342]:
# Filter so that it's only adjectives
tfidf_vocab_austen_adj = tfidf_vocab_austen[tfidf_vocab_austen['max_pos'].isin(['JJ', 'JJR', 'JJS'])]
tfidf_vocab_herman_adj = tfidf_vocab_herman[tfidf_vocab_herman['max_pos'].isin(['JJ', 'JJR', 'JJS'])]

In [354]:
tfidf_vocab_austen_adj.columns = ['tfidf', 'max_pos']
tfidf_vocab_herman_adj.columns = ['tfidf', 'max_pos']

In [356]:
# Find the most significant adjective for each author
tfidf_vocab_austen_adj.sort_values(by='tfidf',ascending=False).head(5)

Unnamed: 0_level_0,tfidf,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
sure,0.013167,JJ
dear,0.012992,JJ
poor,0.012213,JJ
upper,0.011347,JJ
old,0.011327,JJ


In [358]:
tfidf_vocab_herman_adj.sort_values(by='tfidf',ascending=False).head(5)

Unnamed: 0_level_0,tfidf,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
thy,0.028653,JJ
old,0.021042,JJ
ugh,0.015733,JJ
little,0.014585,JJ
good,0.014173,JJ


According to this method, Herman's most significant adjective is more significant than Austen's most significant adjective. Thus, the most significant adjective across these works (conditional on author) is 'thy.' Let's try the other method. 

In [374]:
tfidf_vocab_austen_adj.tfidf.mean()

0.0015357254418560016

In [368]:
tfidf_vocab_herman_adj.tfidf.mean()

0.0007973502092473931

According to this method, Herman's adjectives have lower tfidfs, on average, than Austen's adjectives. 