# M08a Homework
- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub:

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

import configparser

# Local data/output locations are read from an INI file rather than hard-coded.
config = configparser.ConfigParser()
env_path = '../../../env.ini'
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the INI file was not found.
# Fail here with a clear message instead of a later, unexplained KeyError.
if not config.read(env_path):
    raise FileNotFoundError(f"Could not read configuration file: {env_path}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [19]:
# Prefix of the corpus files to load (e.g. "<data_prefix>-CORPUS.csv").
data_prefix = 'novels'

# Column names that make up the token table's index.
OHCO = [
    'book_id',
    'chap_id',
    'para_num',
    'sent_num',
    'token_num',
]

# Leading slices of OHCO used as grouping keys: the first 1, 2 and 3 levels.
BOOK, CHAP, PARA = (OHCO[:depth] for depth in (1, 2, 3))

In [48]:
# Load the token table from "<data_home>/<data_prefix>-CORPUS.csv" and index it
# by the full OHCO key.
TOKENS = pd.read_csv(f'{data_home}/{data_prefix}-CORPUS.csv').set_index(OHCO)
# Display three randomly chosen rows as a quick sanity check.
TOKENS.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
moonstone,68,6,5,11,IN,in
styles,9,52,0,2,NNP,lawrence
christmascarole,5,47,0,39,CC,and


In [50]:
# Load the library table from "<data_home>/<data_prefix>-LIB.csv", keyed by book_id.
LIB = pd.read_csv(f'{data_home}/{data_prefix}-LIB.csv').set_index('book_id')
# Display three randomly chosen rows as a quick sanity check.
LIB.sample(3)

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
usher,g,poe
christmascarole,g,dickens
reddeath,g,poe


In [52]:
# --- CountVectorizer settings ---
max_features = 4_000      # upper bound on vocabulary size
stop_words = 'english'    # name of the stop-word list to remove

# --- LatentDirichletAllocation settings ---
n_components = 20         # number of topics
max_iter = 5
learning_offset = 50.0
random_state = 0          # fixed seed

# --- Reporting ---
n_words = 7               # number of top-weighted terms used to label each topic

### Topic model 1: paragraph as bag

In [155]:
# Use the paragraph-level key as the bag (document unit) for the first model.
BAG = PARA
# Echo the grouping keys.
BAG

['book_id', 'chap_id', 'para_num']

In [157]:
# Build one document string per bag: keep tokens whose POS tag is NN or NNS,
# group them by BAG, and join each group's terms with single spaces.
DOCS_para = (
    TOKENS.loc[TOKENS.pos.str.match(r'^NNS?$')]
    .groupby(BAG)
    .term_str
    .apply(lambda terms: ' '.join(str(term) for term in terms))
    .to_frame('doc_str')
)
DOCS_para.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_str
book_id,chap_id,para_num,Unnamed: 3_level_1
secretadversary,17,99,door opened faint light outside gas it first followed threshold picture force head crash glass minute door key lock door inside volley curses
monk,2,84,eer lines world days remorse conscience here
secretadversary,24,47,head


In [159]:
# Build the paragraph-level document-term matrix.
# stop_words is taken from the parameter cell instead of repeating the literal
# 'english', so changing the setting there actually takes effect here.
count_engine = CountVectorizer(max_features=max_features, stop_words=stop_words)
count_model = count_engine.fit_transform(DOCS_para.doc_str)
TERMS_para = count_engine.get_feature_names_out()

# Vocabulary table keyed by term; per-term statistics are attached afterwards.
VOCAB_para = pd.DataFrame(index=TERMS_para)
VOCAB_para.index.name = 'term_str'

# Dense document-term matrix: one row per bag, one column per retained term.
DTM_para = pd.DataFrame(count_model.toarray(), index=DOCS_para.index, columns=TERMS_para)

In [161]:
# Document frequency: number of bags in which each term occurs at least once.
VOCAB_para['doc_count'] = DTM_para.astype('bool').astype('int').sum(axis=0)
# Bag length: total count of retained vocabulary terms in each bag.
DOCS_para['term_count'] = DTM_para.sum(axis=1)

In [163]:
# Display three randomly chosen vocabulary rows.
VOCAB_para.sample(3)

Unnamed: 0_level_0,doc_count
term_str,Unnamed: 1_level_1
police,182
range,14
endeavour,17


In [165]:
# Configure the LDA estimator for the paragraph model from the shared parameters.
lda_engine = LDA(
    n_components=n_components,
    max_iter=max_iter,
    learning_offset=learning_offset,
    random_state=random_state,
)

In [167]:
# Topic labels "T<k>", with k zero-padded to the number of digits in n_components.
TNAMES_para = [
    f"T{str(topic_num).zfill(len(str(n_components)))}"
    for topic_num in range(n_components)
]

In [169]:
## Theta

# Fit LDA on the paragraph counts; fit_transform returns the document-topic matrix.
lda_model = lda_engine.fit_transform(count_model)

# Column labels are supplied at construction and the axis is named afterwards.
# Naming the axis first and then assigning `columns = ...` replaces the Index
# and silently drops the 'topic_id' name.
THETA_para = pd.DataFrame(lda_model, index=DOCS_para.index, columns=TNAMES_para)
THETA_para.columns.name = 'topic_id'

In [173]:
# Heatmap of ten random bags (columns) against topics (rows); axis=None scales
# the colour gradient over the whole table rather than per row or column.
THETA_para.sample(10).T.style.background_gradient(cmap="YlGnBu", axis=None)

book_id,udolpho,udolpho,moonstone,dracula,udolpho,udolpho,secretadversary,secretadversary,scarlet,monk
chap_id,18,25,77,1,34,20,21,19,7,7
para_num,70,46,22,1,18,104,52,62,43,27
T00,0.025,0.001724,0.0125,0.016667,0.142258,0.00625,0.267843,0.01,0.406512,0.002632
T01,0.025,0.001724,0.0125,0.016667,0.001724,0.338692,0.0125,0.01,0.265943,0.002632
T02,0.025,0.001724,0.0125,0.016667,0.001724,0.00625,0.0125,0.01,0.0025,0.002632
T03,0.025,0.001724,0.0125,0.016667,0.001724,0.00625,0.0125,0.01,0.0025,0.002632
T04,0.025,0.001724,0.0125,0.016667,0.001724,0.00625,0.0125,0.668467,0.0025,0.002632
T05,0.025,0.001724,0.0125,0.016667,0.300716,0.548808,0.0125,0.01,0.0025,0.362824
T06,0.025,0.001724,0.0125,0.35,0.001724,0.00625,0.0125,0.01,0.0025,0.002632
T07,0.025,0.001724,0.0125,0.016667,0.001724,0.00625,0.0125,0.01,0.0025,0.002632
T08,0.025,0.001724,0.7625,0.016667,0.001724,0.00625,0.0125,0.01,0.0025,0.002632
T09,0.025,0.001724,0.0125,0.016667,0.001724,0.00625,0.0125,0.01,0.0025,0.002632


In [175]:
## Phi

# Topic-term weights from the fitted estimator: one row per topic, one column per term.
PHI_para = pd.DataFrame(
    lda_engine.components_,
    index=TNAMES_para,
    columns=TERMS_para,
).rename_axis(index='topic_id', columns='term_str')

In [179]:
# Heatmap of ten random terms (rows) against all topics (columns), with the
# colour gradient scaled over the whole table (axis=None).
PHI_para.T.sample(10).style.background_gradient(cmap="YlGnBu", axis=None)

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
look,33.865934,63.203583,12.34912,29.696811,0.064484,56.671679,3.049927,13.58603,2.726025,4.635823,11.533026,2.236096,8.572692,3.262962,8.242366,52.639352,128.585504,16.57971,7.005439,9.493436
clung,0.05,0.078132,0.05,3.021868,0.05,0.05,5.217958,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1.212569,0.05,0.05,0.05,1.719473,0.05
streets,0.05,0.081643,0.05,9.086383,0.05,0.05,0.728946,0.162312,5.161197,6.212605,0.05,0.057477,42.670372,0.050433,0.050199,0.050049,0.05,0.05,12.288327,0.050056
habit,7.191909,20.120166,1.476443,1.747944,5.233403,43.140901,4.761565,2.401394,7.902001,1.503321,3.753748,4.874863,3.593097,0.355806,0.721456,10.067931,0.050002,2.3018,5.011161,0.79109
pause,0.05,0.084734,0.05,1.725974,0.402096,66.96009,0.05,1.15984,1.637789,0.05,3.338539,0.05,0.569472,2.386707,11.162682,8.385899,10.601576,10.886187,0.053259,0.395157
restless,0.05,3.05,2.05,0.05,0.05,0.05,4.972255,0.05,0.05,1.552626,0.05,0.05,0.05,3.937383,0.05,0.05,6.000581,3.787155,0.05,0.05
expressive,0.05,0.05,0.05,0.05,0.05,5.229241,0.05,0.66943,0.05,0.05,0.05,0.05,0.05,0.05,2.12968,0.05,0.05,0.05,0.05,2.17165
edmund,0.05,12.170854,0.05,0.050329,11.039525,0.05,0.05,0.05,0.280823,0.055453,0.059427,21.793127,0.05,1.031487,0.05,0.373721,3.51078,0.05,0.05,4.184474
musing,0.05,1.05,0.05,0.05,0.065514,0.05,2.274947,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,6.809539,0.05,0.05
rocks,0.050083,5.95455,1.526565,2.296597,0.050082,0.05,0.120707,0.050207,0.05,3.73475,0.903271,0.07633,1.328502,0.05,24.307736,0.05,0.069701,0.066152,57.607331,0.657435


In [185]:
## Topics

# Label every topic with its n_words highest-weighted terms, joined by spaces.
TOPICS_para = (
    PHI_para.stack()
    .groupby('topic_id')
    .apply(
        lambda weights: ' '.join(
            weights.sort_values(ascending=False)
            .head(n_words)
            .index.get_level_values('term_str')
        )
    )
    .to_frame('top_terms')
)

In [187]:
# Render the paragraph-model topic labels through the pandas Styler.
TOPICS_para.style

Unnamed: 0_level_0,top_terms
topic_id,Unnamed: 1_level_1
T00,time mother house girl lady family matter
T01,man lady wife sort room help way
T02,words ah things time hand note sure
T03,house door night friend body servants gentleman
T04,opinion power man affair eyes manner way
T05,heart moment eyes life countenance tears world
T06,night dont business mind work time dinner
T07,day letter time dear friends friend way
T08,morning house time place way thing mr
T09,oh man good head end castle moment


### Topic model 2: chapter as bag

In [189]:
# Switch the bag (document unit) to the chapter-level key for the second model.
BAG = CHAP
# Echo the grouping keys.
BAG

['book_id', 'chap_id']

In [191]:
# Build one document string per bag: keep tokens whose POS tag is NN or NNS,
# group them by BAG, and join each group's terms with single spaces.
DOCS_chap = (
    TOKENS.loc[TOKENS.pos.str.match(r'^NNS?$')]
    .groupby(BAG)
    .term_str
    .apply(lambda terms: ' '.join(str(term) for term in terms))
    .to_frame('doc_str')
)

In [193]:
# Build the chapter-level document-term matrix.
# stop_words is taken from the parameter cell instead of repeating the literal
# 'english', so changing the setting there actually takes effect here.
count_engine = CountVectorizer(max_features=max_features, stop_words=stop_words)
count_model = count_engine.fit_transform(DOCS_chap.doc_str)
TERMS_chap = count_engine.get_feature_names_out()

# Vocabulary table keyed by term; per-term statistics are attached afterwards.
VOCAB_chap = pd.DataFrame(index=TERMS_chap)
VOCAB_chap.index.name = 'term_str'

# Dense document-term matrix: one row per bag, one column per retained term.
DTM_chap = pd.DataFrame(count_model.toarray(), index=DOCS_chap.index, columns=TERMS_chap)

In [195]:
# Document frequency: number of bags in which each term occurs at least once.
VOCAB_chap['doc_count'] = DTM_chap.astype('bool').astype('int').sum(axis=0)
# Bag length: total count of retained vocabulary terms in each bag.
DOCS_chap['term_count'] = DTM_chap.sum(axis=1)

In [197]:
# Display three randomly chosen vocabulary rows.
VOCAB_chap.sample(3)

Unnamed: 0_level_0,doc_count
term_str,Unnamed: 1_level_1
environs,13
safe,18
peril,13


In [199]:
# Fresh LDA estimator for the chapter model, configured from the shared parameters.
lda_engine = LDA(
    n_components=n_components,
    max_iter=max_iter,
    learning_offset=learning_offset,
    random_state=random_state,
)

In [201]:
# Topic labels "T<k>", with k zero-padded to the number of digits in n_components.
TNAMES_chap = [
    f"T{str(topic_num).zfill(len(str(n_components)))}"
    for topic_num in range(n_components)
]

In [203]:
## Theta

# Fit LDA on the chapter counts; fit_transform returns the document-topic matrix.
lda_model = lda_engine.fit_transform(count_model)

# Column labels are supplied at construction and the axis is named afterwards.
# Naming the axis first and then assigning `columns = ...` replaces the Index
# and silently drops the 'topic_id' name.
THETA_chap = pd.DataFrame(lda_model, index=DOCS_chap.index, columns=TNAMES_chap)
THETA_chap.columns.name = 'topic_id'

In [205]:
# Heatmap of ten random bags (columns) against topics (rows); axis=None scales
# the colour gradient over the whole table rather than per row or column.
THETA_chap.sample(10).T.style.background_gradient(cmap="YlGnBu", axis=None)

book_id,northangerabbey,moonstone,secretadversary,northangerabbey,frankenstein,udolpho,northangerabbey,styles,northangerabbey,moonstone
chap_id,28,71,14,4,38,54,27,9,26,78
T00,9.7e-05,8.8e-05,0.000172,0.000258,0.000114,5.5e-05,0.000357,9.3e-05,0.000137,0.208333
T01,9.7e-05,8.8e-05,0.796979,0.000258,0.000114,5.5e-05,0.000357,0.887709,0.000137,0.122527
T02,9.7e-05,8.8e-05,0.000172,0.000258,0.000114,5.5e-05,0.000357,9.3e-05,0.000137,8.5e-05
T03,9.7e-05,0.057187,0.000172,0.000258,0.000114,5.5e-05,0.000357,0.049432,0.000137,8.5e-05
T04,9.7e-05,8.8e-05,0.000172,0.000258,0.001868,5.5e-05,0.000357,9.3e-05,0.000137,8.5e-05
T05,0.372473,8.8e-05,0.000172,0.290034,0.303782,0.618033,0.000357,9.3e-05,0.000137,8.5e-05
T06,9.7e-05,8.8e-05,0.000172,0.000258,0.000114,5.5e-05,0.000357,9.3e-05,0.000137,0.033388
T07,0.527404,0.941234,0.157898,0.455023,0.232546,5.5e-05,0.497971,9.3e-05,0.622583,0.603751
T08,9.7e-05,8.8e-05,0.000172,0.000258,0.000114,5.5e-05,0.000357,9.3e-05,0.000137,8.5e-05
T09,9.7e-05,8.8e-05,0.000172,0.000258,0.000114,5.5e-05,0.000357,9.3e-05,0.000137,8.5e-05


In [207]:
## Phi

# Topic-term weights from the fitted estimator: one row per topic, one column per term.
PHI_chap = pd.DataFrame(
    lda_engine.components_,
    index=TNAMES_chap,
    columns=TERMS_chap,
).rename_axis(index='topic_id', columns='term_str')

In [209]:
# Heatmap of ten random terms (rows) against all topics (columns), with the
# colour gradient scaled over the whole table (axis=None).
PHI_chap.T.sample(10).style.background_gradient(cmap="YlGnBu", axis=None)

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
identity,1.093618,15.361059,0.05,0.052559,0.05,0.178992,12.443326,0.945756,0.05,0.05,0.050002,0.05,0.05,0.05,0.092877,0.050004,0.05,0.05,1.231807,0.05
signal,0.05,10.95339,0.05,0.05,0.05,5.830583,1.061554,0.135245,1.220334,4.346551,0.050005,0.05,0.05,0.05,7.871996,1.44621,0.100861,3.325064,2.258208,0.05
commencement,0.05,0.05,0.05,0.05,0.05,0.050069,2.357567,0.05,0.05,0.085443,0.05,0.05,0.05,0.05,1.118133,2.073308,0.05,6.600704,0.064775,0.05
exertion,0.05,0.05,0.05,0.05,0.05,7.759057,0.137018,2.710342,2.267408,0.05,3.408,0.05,0.05,0.05,0.081592,1.190523,0.963863,5.059432,1.728278,2.244488
illusion,0.05,3.476117,1.05,0.05,0.05,2.521688,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.661545,0.05,0.05,2.004491,0.05,7.586159
conjectures,0.05,1.660889,0.05,0.05,0.05,5.861677,0.05,1.897934,0.05,1.220263,0.05,3.05,0.05,0.05,1.43067,0.05,0.05,1.228567,0.05,0.05
sentence,0.05,13.418836,0.05,0.05,0.05,19.497461,2.535357,13.939468,0.05,5.821109,0.05,9.27219,0.05,4.893405,0.050024,4.370074,1.45426,1.121501,0.230049,4.046268
waistcoat,2.502937,6.813978,0.05,5.446087,0.05,0.05,0.05,1.492795,0.05,0.05,0.05,0.05,0.05,0.05,0.05,7.496339,0.05,0.05,3.547864,0.05
shall,0.050017,25.884795,1.067199,4.232454,2.799157,9.90845,0.05,10.649716,0.05,0.264055,2.466336,0.05,0.05,0.05,0.050009,0.050009,0.05,0.505551,6.73495,1.037303
rod,0.05,0.139321,0.05,0.05,0.05,0.05,0.05,0.05,12.853088,1.053378,0.05,0.05,0.05,0.05,1.154213,0.05,0.05,0.05,0.05,0.05


In [211]:
## Topics

# Label every topic with its n_words highest-weighted terms, joined by spaces.
TOPICS_chap = (
    PHI_chap.stack()
    .groupby('topic_id')
    .apply(
        lambda weights: ' '.join(
            weights.sort_values(ascending=False)
            .head(n_words)
            .index.get_level_values('term_str')
        )
    )
    .to_frame('top_terms')
)

In [213]:
# Render the chapter-model topic labels through the pandas Styler.
TOPICS_chap.style

Unnamed: 0_level_0,top_terms
topic_id,Unnamed: 1_level_1
T00,lady time house way birthday man father
T01,man time room yes way door sir
T02,sea rocks time day man way letter
T03,night time door room way day things
T04,man heart creator rage creature brother companion
T05,heart eyes time hand moment night voice
T06,body corpse evidence murder river period water
T07,time house way room man day mind
T08,voice house door head room man words
T09,moor man night hound death house eyes


## Questions

#### 1. Use the PHI table from each model to compute the entropy H of the distribution over topics. Which bag generates a lower entropy distribution? Hint: To get H work with the L1 normalized vector of word weight sums by topic in the PHI table.

#### 2. Sort the topics in each model's PHI table by topic entropy in descending order. Are the first topics in the two models about the same? In other words, do they yield similar interpretations?

#### 3. What topic from each model is most strongly associated with each genre? Note that your answer has four parts.

#### 4. Using the THETA table from the Chapters model, get the mean topic weights for each book. Which book is most strongly associated with the gothic genre g, based on the weight of that genre's most representative topic (as discovered in the previous question)?

#### 5. How would you characterize the subject matter of the two genres based on their topic models? Consider the words associated with the dominant topics from each model, but also the models overall.