# M06 Homework

- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub: https://github.com/sqr8ap/DS5001-2025-01-R/blob/m06/lessons/M06_ClusteringSimilarity/M06_HW.ipynb

In [3]:
import configparser

# Read local paths from the shared env.ini three directories up.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it parsed, so an empty result means the ini file was not found or not readable.
if not config.read("../../../env.ini"):
    raise FileNotFoundError("Could not read config file: ../../../env.ini")
data_home = config['DEFAULT']['data_home']
data_hone = data_home  # original misspelled name, kept so existing references still work
output_dir = config['DEFAULT']['output_dir']

In [17]:
# File-name prefix of the corpus tables read from output_dir
data_prefix = 'austen-melville'

# Index levels of the token table; OHCO keeps only the book and chapter levels
OHCO_token = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
OHCO = OHCO_token[:2]

# Remaining notebook-level settings
bag = 'CHAPS'
colors = "YlGnBu"
tf_agg = 'sum'

In [7]:
import pandas as pd
import numpy as np
import plotly_express as px
import seaborn as sns; sns.set()
from numpy.linalg import norm
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

## Pre-Question Tasks

### Import & Process Data

In [19]:
# Locations of the library and corpus tables
lib_path = f"{output_dir}/{data_prefix}-LIB.csv"
corpus_path = f'{output_dir}/{data_prefix}-CORPUS.csv'

# Library table, one row per book
LIB = pd.read_csv(lib_path)
LIB = LIB.set_index('book_id')

# Token table, indexed down to the token level; rows with missing values are dropped
TOKEN = pd.read_csv(corpus_path)
TOKEN = TOKEN.set_index(OHCO_token)
TOKEN = TOKEN.dropna()

In [29]:
# Preview the first five rows of the library table
LIB.head(n=5)

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$,83624,24
121,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$,77601,31
141,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$,160378,48
158,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$,160926,55
161,/Users/Samantha/Desktop/MSDS/DS5001/data/guten...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$,119873,50


In [39]:
# Add a 'year' feature to LIB: a label made of the publication year followed by
# the title. Titles that are not listed here get NaN.
year_labels = [
    ('EMMA', '1815 EMMA'),
    ('LADY SUSAN', '1794 LADY SUSAN'),
    ('LOVE AND FREINDSHIP SIC', '1790 LOVE AND FREINDSHIP'),
    ('MANSFIELD PARK', '1814 MANSFIELD PARK'),
    ('NORTHANGER ABBEY', '1803 NORTHANGER ABBEY'),
    ('PERSUASION', '1818 PERSUASION'),
    ('PRIDE AND PREJUDICE', '1813 PRIDE AND PREJUDICE'),
    ('SENSE AND SENSIBILITY', '1811 SENSE AND SENSIBILITY'),
]
title_to_year = dict(year_labels)
LIB['year'] = LIB['title'].map(title_to_year)

In [46]:
# Bring in functions from last homework, modify TFIDF to include DFIDF
def gen_bow(TOKENS, OHCO_LEVEL='CHAPS'):
    '''
    This function takes a tokens table and a choice of bag and returns a BOW representation in the form of a document-term count matrix.

    Parameters
    TOKENS: tokens table; a dataframe with a `term_str` column, carrying the levels
        `book_id`, `chap_id`, `para_num` and `sent_num` (as index levels or columns)
        that the chosen bag needs
    OHCO_LEVEL: choice of bag; a string (either 'BOOKS', 'CHAPS', 'PARAS' or 'SENTS'); defaults to 'CHAPS'

    Returns
    DTCM: document-term count matrix; one row per bag, one column per term, 0 where a term is absent

    Raises
    KeyError: if OHCO_LEVEL is not one of the four bag names
    '''

    # The hierarchy is spelled out here instead of slicing the module-level OHCO list:
    # that list stops at chap_id, so OHCO[:3] and OHCO[:4] could never reach the
    # paragraph or sentence level and 'PARAS'/'SENTS' silently produced chapter bags.
    hierarchy = ['book_id', 'chap_id', 'para_num', 'sent_num']
    bags = dict(
        SENTS=hierarchy[:4],
        PARAS=hierarchy[:3],
        CHAPS=hierarchy[:2],
        BOOKS=hierarchy[:1])

    # Count occurrences of each term within each bag ...
    group_keys = bags[OHCO_LEVEL] + ['term_str']
    BOW = TOKENS.groupby(group_keys)['term_str'].count().to_frame('n')
    # ... then pivot terms into columns, filling absent terms with 0
    DTCM = BOW.n.unstack(fill_value=0)

    return DTCM

In [48]:
def gen_tfidf(DTCM, TF_METHOD='sum', TF_NORM_K=0.5):
    '''
    This function takes a BOW table (DTCM) and type of tf metric and returns the TFIDF and DFIDF values for the BOW.

    Parameters
    DTCM: BOW table; a dataframe with one row per document (bag) and one column per term
    TF_METHOD: a string; either 'sum', 'max', 'log', 'raw', 'double_norm' or 'binary'; defaults to 'sum'
    TF_NORM_K: smoothing constant K used only by 'double_norm'
        (TF = K + (1 - K) * count / max count in the document); defaults to 0.5

    Returns
    TFIDF: a dataframe shaped like DTCM holding TF * IDF
    DFIDF: a series indexed by term holding DF * IDF
    (returned together as the tuple (TFIDF, DFIDF))

    Raises
    KeyError: if TF_METHOD is not one of the listed names
    '''

    def _double_norm():
        # K + (1 - K) * f / max(f), applied only to terms that occur in the document;
        # absent terms stay at 0 rather than receiving the K floor.
        scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        return (TF_NORM_K + (1 - TF_NORM_K) * scaled).where(DTCM > 0, 0.0)

    # Each entry is a callable so only the requested variant is computed.
    tf = {
        'sum': lambda: DTCM.div(DTCM.sum(axis=1), axis=0),
        'max': lambda: DTCM.div(DTCM.max(axis=1), axis=0),
        'log': lambda: np.log2(1 + DTCM),
        'raw': lambda: DTCM,
        'double_norm': _double_norm,
        'binary': lambda: DTCM.astype('bool').astype('int')}

    TF = tf[TF_METHOD]()

    # Document frequency: number of documents in which each term occurs
    DF = DTCM.astype('bool').sum()

    # Number of documents
    N = DTCM.shape[0]

    IDF = np.log2(N / DF)

    # IDF is indexed by term, so it aligns with the columns of TF
    TFIDF = TF * IDF
    DFIDF = DF * IDF

    return TFIDF, DFIDF

In [78]:
# Apply to Austen's works only using chapters as bags and max as the tf method
is_austen = LIB['author'] == 'AUSTEN, JANE'
LIB = LIB.loc[is_austen]

# Keep only the tokens whose book_id is one of the listed books
austen_book_ids = [158, 946, 1212, 141, 121, 105, 1342, 161]
token_book_ids = TOKEN.index.get_level_values('book_id')
TOKEN = TOKEN.loc[token_book_ids.isin(austen_book_ids)]

# Chapter-level count matrix, then TFIDF / DFIDF with max-normalized term frequency
my_dtcm = gen_bow(TOKEN, OHCO_LEVEL='CHAPS')
idfs = gen_tfidf(my_dtcm, TF_METHOD='max')
TFIDF, DFIDF = idfs

In [102]:
# Reduce number of features in TFIDF matrix

## First generate VOCAB table: one row per term with its count and derived stats
VOCAB = TOKEN['term_str'].value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB['n'] / VOCAB['n'].sum()
VOCAB['i'] = -np.log2(VOCAB['p'])

## For each term, record the tag it is most often assigned (max_pos, max_pos_group)
for tag_col in ('pos', 'pos_group'):
    tag_counts = TOKEN[['term_str', tag_col]].value_counts().unstack(fill_value=0)
    VOCAB[f'max_{tag_col}'] = tag_counts.idxmax(axis=1)

## Filter and reduce TFIDF matrix
# Tags to keep (noun, verb, adjective and adverb tags; proper-noun tags are not listed)
pos = {"NN", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS", "RB", "RBR", "RBS"}
has_kept_pos = VOCAB['max_pos'].isin(pos)
pos_match = VOCAB.loc[has_kept_pos].index

# Of the terms with a kept tag, take the 1000 with the highest DFIDF
filtered_dfidf = DFIDF.loc[DFIDF.index.isin(pos_match)]
top_terms = filtered_dfidf.nlargest(1000).index
TFIDF = TFIDF.loc[:, top_terms]

In [104]:
# Display the reduced TFIDF matrix (columns limited to the selected top terms)
TFIDF

Unnamed: 0_level_0,term_str,forward,greatest,respect,stay,thinking,assure,fortune,marriage,believed,entered,...,number,picture,powers,scene,sensations,stairs,tired,truly,unable,till
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,0.000000,0.000000,0.023245,0.000000,0.023245,0.000000,0.023435,0.035152,0.000000,0.000000,...,0.0000,0.0000,0.0,0.023321,0.046643,0.000000,0.0,0.000000,0.000000,0.000000
105,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015457,0.030415,0.000000,...,0.0000,0.0000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
105,3,0.000000,0.011911,0.011911,0.000000,0.011911,0.024016,0.048032,0.000000,0.000000,0.000000,...,0.0239,0.0239,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.004817
105,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.057354,0.000000,0.037619,0.000000,...,0.0000,0.0000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
105,5,0.010443,0.000000,0.000000,0.041774,0.010443,0.042115,0.000000,0.021057,0.000000,0.000000,...,0.0000,0.0000,0.0,0.020955,0.000000,0.000000,0.0,0.000000,0.020955,0.016893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,57,0.023626,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.142915,0.000000,0.000000,...,0.0000,0.0000,0.0,0.000000,0.000000,0.000000,0.0,0.047407,0.000000,0.009554
1342,58,0.000000,0.000000,0.014858,0.000000,0.000000,0.029958,0.000000,0.000000,0.044212,0.000000,...,0.0000,0.0000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.012017
1342,59,0.000000,0.019742,0.019742,0.019742,0.000000,0.000000,0.019904,0.039807,0.000000,0.039165,...,0.0000,0.0000,0.0,0.000000,0.000000,0.039614,0.0,0.039614,0.079229,0.015967
1342,60,0.018243,0.000000,0.018243,0.000000,0.018243,0.000000,0.000000,0.018392,0.000000,0.000000,...,0.0000,0.0000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.014755
