# Homework 08

```yaml
Course:    DS 5001 
Module:    08 Homework
Topic:     LDA with SciKit Learn
Author:    Ryan Lipps
Date:      23 March 2023
```

## Setup

### Packages

In [1]:
import pandas as pd
import numpy as np
import configparser
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

### Config

In [2]:
# Load machine-specific paths from the shared env.ini file.
# NOTE: `ConfigParser.read` silently skips a missing file, in which case the
# lookups below raise KeyError.
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home, output_dir, local_lib = (
    config['DEFAULT'][key] for key in ('data_home', 'output_dir', 'local_lib')
)

### Read data

In [3]:
# Token table (one row per token) and library table (one row per book, keyed by book_id).
CORPUS = pd.read_csv(f'{data_home}/novels/novels-CORPUS.csv')
LIB = pd.read_csv(f'{data_home}/novels/novels-LIB.csv', index_col='book_id')

In [4]:
# Peek at the raw token table before it is indexed.
CORPUS.head(n=5)

Unnamed: 0,book_id,chap_id,para_num,sent_num,token_num,pos,term_str
0,secretadversary,1,0,1,0,DT,the
1,secretadversary,1,0,1,1,NNP,young
2,secretadversary,1,0,1,2,NNP,adventurers
3,secretadversary,1,0,1,3,NNP,ltd
4,secretadversary,1,1,0,0,JJ,tommy


In [5]:
# Index the tokens by their OHCO position, then drop rows with a missing value.
CORPUS = CORPUS.set_index(['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num'])
CORPUS = CORPUS.dropna()
CORPUS.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


In [6]:
# Peek at the per-book metadata (genre and author).
LIB.head(n=5)

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
secretadversary,d,christie
styles,d,christie
moonstone,d,collins
adventures,d,doyle
baskervilles,d,doyle


## Topic Model Class

In [7]:
class TopicModel():
    '''
    LDA topic model built from "bag" documents of a tokenized corpus

    Tokens in `corpus` are filtered by part of speech and joined into one document string per bag.
    The remaining methods store their results as attributes on the instance.

    A valid call order is:

    `count_vectorize` -> `generate_vocab` -> `generate_dtm` -> `generate_lda_model` -> `generate_phi`
    -> `generate_theta` -> `generate_topics` -> `append_metadata_to_topics`

    ATTRIBUTES (set by the method named in parentheses):

    `docs` - one row per bag with a `doc_str` column (`__init__`); `generate_dtm` adds `term_count`

    `count_engine`, `count_model`, `terms` - fitted CountVectorizer, its count matrix and its feature names (`count_vectorize`)

    `lda_engine`, `lda_model`, `topic_names` - fitted LDA estimator, its document-topic matrix and the topic labels (`generate_lda_model`)

    `vocab`, `dtm`, `theta`, `phi`, `topics` - pandas DataFrames (`generate_*` methods)
    '''
    def __init__(self, corpus:pd.DataFrame, lib:pd.DataFrame, bag:list=None, pos_filter:str=None):
        '''
        Store the inputs and build the bag documents

        PARAMETERS:

        `corpus` - pandas DataFrame of tokens with `pos` and `term_str` columns, multi-indexed by an OHCO

        `lib` - pandas DataFrame of per-document metadata; joined to THETA in `append_metadata_to_topics`

        `bag` - name, or list of names, of the OHCO levels that define one document

        `pos_filter` - part-of-speech pattern; see `__create_docs` for how it is matched

        RAISES:

        TypeError if `bag` or `pos_filter` is not supplied

        KeyError if a name in `bag` is not found in `corpus`
        '''
        # Both arguments are needed to build the documents, so fail early with
        # a clear message instead of producing an empty or meaningless grouping.
        if bag is None or pos_filter is None:
            raise TypeError('TopicModel requires both `bag` and `pos_filter`')

        self.corpus = corpus
        self.lib = lib
        self.bag = bag
        self.pos_filter = pos_filter
        self.docs = self.__create_docs(pos_filter)


    def __create_docs(self, pos_filter:str):
        '''
        Function to generate docs for the bag `self.bag` from `self.corpus`

        Tokens are kept when their `pos` value fully matches the regular expression `^{pos_filter}?$`.
        The trailing `?` makes the last character of `pos_filter` optional, so `'NNS'` keeps both `NN` and `NNS`.
        Tokens with a missing `pos` value are dropped.

        PARAMETERS:

        `pos_filter` - string used to build the part-of-speech pattern

        OUTPUTS:

        pandas DataFrame indexed by `self.bag` with one column, `doc_str`, holding the space-joined terms of each bag

        RAISES:

        KeyError if a name in `self.bag` is neither an index level nor a column of `self.corpus`
        '''
        # Only string keys can be checked by name; anything else that groupby
        # accepts is passed through untouched.
        if isinstance(self.bag, str):
            keys = [self.bag]
        elif isinstance(self.bag, (list, tuple)):
            keys = list(self.bag)
        else:
            keys = []
        known = set(self.corpus.index.names) | set(self.corpus.columns)
        missing = [key for key in keys if isinstance(key, str) and key not in known]
        if missing:
            raise KeyError(f'{missing} not found in corpus OHCO')

        # Filter corpus for pos; na=False keeps the mask boolean when a tag is missing
        is_kept = self.corpus.pos.str.match(fr'^{pos_filter}?$', na=False)

        # Split-apply-combine to generate DOCs grouped by the bag
        return self.corpus[is_kept]\
            .groupby(self.bag).term_str\
            .apply(' '.join)\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})


    def count_vectorize(self, max_features=4000, ngram_range = (1,1), stop_words='english'):
        '''
        Fit a CountVectorizer on the bag documents

        PARAMETERS:

        `max_features`, `ngram_range`, `stop_words` - passed unchanged to `CountVectorizer`

        OUTPUTS:

        None. Sets `count_engine` (the vectorizer), `count_model` (the fitted count matrix) and `terms` (the feature names)
        '''
        self.count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
        self.count_model = self.count_engine.fit_transform(self.docs.doc_str)
        self.terms = self.count_engine.get_feature_names_out()


    def generate_lda_model(self, random_state:int, n_topics:int=20, max_iter:int=5, learning_offset:float=50.):
        '''
        Fit LDA on the count matrix produced by `count_vectorize`

        PARAMETERS:

        `random_state` - seed passed to the LDA estimator

        `n_topics` - number of topics (passed as `n_components`)

        `max_iter`, `learning_offset` - passed unchanged to the LDA estimator

        OUTPUTS:

        None. Sets `lda_engine` (the estimator), `lda_model` (document-topic matrix) and `topic_names`
        '''
        self.lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
        self.lda_model = self.lda_engine.fit_transform(self.count_model)
        # Zero-pad the topic number to the width of `n_topics`, e.g. T00..T19 for 20 topics
        pad = len(str(n_topics))
        self.topic_names = [f"T{str(x).zfill(pad)}" for x in range(n_topics)]


    def generate_vocab(self):
        '''
        Create the VOCAB table: an empty DataFrame indexed by `term_str`, one row per term in `terms`

        OUTPUTS:

        None. Sets `vocab`
        '''
        self.vocab = pd.DataFrame(index=self.terms)
        self.vocab.index.name = 'term_str'


    def generate_dtm(self):
        '''
        Create the document-term matrix as a DataFrame (docs x terms)

        Also adds `doc_count` (number of docs containing the term) to `vocab` and
        `term_count` (sum of term counts in the doc) to `docs`.

        OUTPUTS:

        None. Sets `dtm`; updates `vocab` and `docs`
        '''
        # The doc counts are written to VOCAB, so create it if it does not exist yet
        if not hasattr(self, 'vocab'):
            self.generate_vocab()

        self.dtm = pd.DataFrame(self.count_model.toarray(), index=self.docs.index, columns=self.terms)
        self.vocab['doc_count'] = self.dtm.astype('bool').astype('int').sum()
        self.docs['term_count'] = self.dtm.sum(1)


    def generate_theta(self):
        '''
        Create the THETA table (docs x topics) from the LDA document-topic matrix

        OUTPUTS:

        None. Sets `theta`, indexed like `docs` with columns `topic_names`; the columns axis is named `topic_id`
        '''
        self.theta = pd.DataFrame(self.lda_model, index=self.docs.index)
        # Assign the labels first: replacing the columns creates a new Index,
        # which would discard a name set beforehand.
        self.theta.columns = self.topic_names
        self.theta.columns.name = 'topic_id'


    def generate_phi(self):
        '''
        Create the PHI table (topics x terms) from the LDA estimator's `components_`

        OUTPUTS:

        None. Sets `phi`, with index `topic_id` and columns `term_str`
        '''
        self.phi = pd.DataFrame(self.lda_engine.components_, columns=self.terms, index=self.topic_names)
        self.phi.index.name = 'topic_id'
        self.phi.columns.name = 'term_str'


    def generate_topics(self, n_top_terms:int=7):
        '''
        Create the TOPICS summary table, one row per topic

        PARAMETERS:

        `n_top_terms` - number of highest-weighted PHI terms to list for each topic

        OUTPUTS:

        None. Sets `topics` with columns:

        `top_terms` - the top terms joined by spaces, highest weight first

        `doc_weight_sum` - sum of the topic's THETA weights over all docs

        `term_freq` - the topic's PHI row sum divided by the sum of all PHI row sums
        '''
        self.topics = self.phi\
            .apply(lambda weights: ' '.join(weights.sort_values(ascending=False).head(n_top_terms).index), axis=1)\
            .to_frame('top_terms')
        self.topics['doc_weight_sum'] = self.theta.sum()
        phi_row_sums = self.phi.sum(1)
        self.topics['term_freq'] = phi_row_sums / phi_row_sums.sum()


    def append_metadata_to_topics(self, metadata:str, join_on:str):
        '''
        Add mean topic weights per metadata value to the TOPICS table

        THETA is joined to `lib` and averaged within each value of `metadata`. One column per value is
        added to `topics`, plus a column named `metadata` holding the value with the highest mean weight.

        PARAMETERS:

        `metadata` - name of the `lib` column to group by

        `join_on` - THETA index level (or column) name matched against the index of `lib`

        OUTPUTS:

        None. Updates `topics`
        '''
        mean_weights = self.theta.join(self.lib, on=join_on)\
            .groupby(metadata)[self.topic_names]\
            .mean().T

        # Take the labels from the grouped result rather than from `lib`, so a
        # value that never occurs in the corpus cannot cause a column mismatch.
        groups = mean_weights.columns.to_list()
        for group in groups:
            self.topics[group] = mean_weights[group]
        self.topics[metadata] = self.topics[groups].idxmax(axis=1)

In [8]:
# Paragraph-level model: one document per (book, chapter, paragraph), nouns only.
paragraph_model = TopicModel(
    CORPUS,
    LIB,
    bag=['book_id', 'chap_id', 'para_num'],
    pos_filter='NNS',
)
paragraph_model.count_vectorize()
paragraph_model.generate_vocab()
paragraph_model.generate_dtm()
paragraph_model.generate_lda_model(random_state=0)
paragraph_model.generate_phi()
paragraph_model.generate_theta()
paragraph_model.generate_topics()
# Attach mean topic weights per genre, then per author.
for _metadata in ('genre_id', 'author_id'):
    paragraph_model.append_metadata_to_topics(_metadata, 'book_id')

In [9]:
# Chapter-level model: one document per (book, chapter), nouns only.
chapter_model = TopicModel(
    CORPUS,
    LIB,
    bag=['book_id', 'chap_id'],
    pos_filter='NNS',
)
chapter_model.count_vectorize()
chapter_model.generate_vocab()
chapter_model.generate_dtm()
chapter_model.generate_lda_model(random_state=0)
chapter_model.generate_phi()
chapter_model.generate_theta()
chapter_model.generate_topics()
# Attach mean topic weights per genre, then per author.
for _metadata in ('genre_id', 'author_id'):
    chapter_model.append_metadata_to_topics(_metadata, 'book_id')

In [10]:
# PHI (topics x terms) for the paragraph model.
paragraph_model.phi.head(n=5)

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.05,1.392494,1.097508,0.05,5.556778,0.05,5.531952,0.05,16.093926,0.960684,...,0.05,0.05,0.05,0.05,0.05,0.05,3.598732,0.05,0.05,2.321462
T01,1.389969,0.05,0.05,0.05,1.05,0.05,0.05,0.05,2.151322,4.05,...,2.846534,5.585872,3.05,0.05,1.836272,2.599044,0.731493,0.05,0.05,0.05
T02,0.05,0.05,0.05,0.05,5.045453,0.05,0.05,0.05,1.460901,0.05,...,0.05,0.05,18.417949,0.05,0.05,31.982578,0.086155,1.05,0.064635,1.047011
T03,0.051125,0.05,0.05,0.05,0.05,0.05,0.05,1.184665,5.130487,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.244053,0.05,26.009869,0.091742
T04,0.05,0.1113,0.05,3.570093,0.05,9.544803,0.05,0.05,1.19821,0.756529,...,0.05,0.05,0.05,0.05,0.505932,0.05,3.277104,2.120083,0.051028,0.05


In [11]:
# THETA (docs x topics) for the paragraph model.
paragraph_model.theta.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
book_id,chap_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
adventures,1,2,0.001563,0.001563,0.174495,0.001563,0.202608,0.457993,0.001563,0.06939,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.072077,0.001563,0.001563,0.001563,0.001563
adventures,1,3,0.001087,0.001087,0.001087,0.001087,0.001087,0.091133,0.001087,0.001087,0.582047,0.001087,0.001087,0.001087,0.061502,0.167385,0.001087,0.001087,0.001087,0.001087,0.001087,0.081629
adventures,1,4,0.001471,0.001471,0.001471,0.001471,0.001471,0.001471,0.261646,0.001471,0.001471,0.001471,0.388717,0.001471,0.001471,0.001471,0.324637,0.001471,0.001471,0.001471,0.001471,0.001471
adventures,1,5,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.460783,0.004167,0.299014,0.16937,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167


In [35]:
# PHI (topics x terms) for the chapter model.
chapter_model.phi.head(n=5)

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.05,0.05,0.05,0.05,0.050011,0.05,1.425109,0.05,0.703066,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.702322,0.05,0.05,0.05
T01,0.05,0.05,0.732067,1.381887,2.463208,1.428486,2.968166,5.892598,17.19947,1.056784,...,0.05,8.754761,21.900551,19.614846,5.881342,49.740533,8.642145,0.05,21.015508,4.510612
T02,0.05,7.035474,0.749836,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
T03,0.05,0.064885,0.05,0.05,0.05,0.05,0.05,0.05,0.086202,0.05,...,0.05,0.05,1.199449,1.59606,4.477756,3.35327,0.055813,0.05,4.119849,0.05
T04,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,4.339374,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [13]:
# THETA (docs x topics) for the chapter model.
chapter_model.theta.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adventures,1,4.2e-05,0.629197,4.2e-05,4.2e-05,4.2e-05,0.015826,4.2e-05,0.026276,4.2e-05,4.2e-05,4.2e-05,0.007424,4.2e-05,4.2e-05,4.2e-05,0.320648,4.2e-05,4.2e-05,4.2e-05,4.2e-05
adventures,2,3.9e-05,0.427198,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,0.572104,3.9e-05,3.9e-05,3.9e-05,3.9e-05
adventures,3,5.3e-05,0.749236,5.3e-05,5.3e-05,5.3e-05,0.029199,5.3e-05,0.058224,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,0.162498,5.3e-05,5.3e-05,5.3e-05,5.3e-05
adventures,4,3.5e-05,0.456002,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,0.00983,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,0.058243,3.5e-05,3.5e-05,0.475366,3.5e-05
adventures,5,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,0.999148,4.5e-05,4.5e-05,4.5e-05,4.5e-05


## Question 1:
Use the PHI table from each model to compute the entropy H of the distribution over topics. Which bag generates a lower entropy distribution? Hint: To get H work with the L1 normalized vector of word weight sums by topic in the PHI table.

### Answer 1:
**The chapter model has a lower entropy distribution**

In [48]:
# L1-normalise the per-topic word-weight sums, then take H = sum(p * log2(1/p)).
para_p = paragraph_model.phi.sum(axis=1)
para_p = para_p / para_p.sum()
(para_p * np.log2(1 / para_p)).sum()


4.252625756822118

In [47]:
# L1-normalise the per-topic word-weight sums, then take H = sum(p * log2(1/p)).
chap_p = chapter_model.phi.sum(axis=1)
chap_p = chap_p / chap_p.sum()
(chap_p * np.log2(1 / chap_p)).sum()

3.691437805520074

## Question 2:
Sort the topics in each model's PHI table by topic entropy in descending order. Are the first topics in the two models about the same? In other words, do they yield similar interpretations?

### Answer 2:
**In comparing the first topic from the paragraph model to the first topic from the chapter model, the topics yield different interpretations.**

**In comparing the top 5 topics within each model, the top topics within a model have similar interpretations.**

In [56]:
# Rank topics by their contribution p * log2(1/p) to the topic entropy.
# `assign` returns a copy, so the model's PHI table keeps term columns only;
# writing `p`/`entropy` into PHI itself would inflate any later
# `phi.sum(axis=1)` (e.g. when the entropy cell or `generate_topics` is re-run).
paragraph_model.phi\
    .assign(p=para_p.values, entropy=(para_p * np.log2(1 / para_p)).values)\
    .sort_values('entropy', ascending=False)\
    .head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,youd,youll,young,youre,youth,youths,youve,zeal,p,entropy
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T05,0.07403,0.050186,14.832808,2.713785,0.05,8.808527,0.05,1.068624,10.233638,3.992138,...,0.05,0.05,0.05,0.05,50.099111,0.172917,0.05,2.933685,0.099513,0.331277
T07,0.069005,16.741903,0.05,1.05,0.078702,0.05,0.240719,0.05,42.216149,1.417465,...,0.05,0.05,0.05,0.05,7.175745,0.052731,0.05,0.05,0.065637,0.25791
T18,0.05,0.592585,0.05,0.396563,1.167733,0.05,0.05,0.05,2.344729,0.055257,...,0.05,0.05,14.836089,0.115114,2.656137,0.05,0.05,0.0532,0.06459,0.255294
T17,32.980654,0.05,0.054853,0.05,0.05,3.00129,0.05,0.05,24.769163,6.491141,...,0.05,0.05,0.053799,0.05,12.508823,0.05,0.05,0.05,0.063722,0.253108
T14,0.05,3.46667,0.05,0.05,0.05,0.336767,0.05,0.05,0.05,0.05,...,0.05,0.058115,0.05,0.05,0.901473,0.051108,0.05,0.05,0.062923,0.251079


In [60]:
# Summary rows for the five topics with the largest entropy contribution.
# `term_freq` is the L1-normalised PHI row sum, i.e. the same p used for the
# ranking, so the topic ids are derived here rather than copied by hand.
_para_topic_p = paragraph_model.topics['term_freq']
paragraph_model.topics.loc[
    (_para_topic_p * np.log2(1 / _para_topic_p)).sort_values(ascending=False).index[:5]
]

Unnamed: 0_level_0,top_terms,doc_weight_sum,term_freq,d,g,genre_id,austen,christie,collins,dickens,doyle,lewis,poe,radcliffe,reeve,shelley,stoker,walpole,author_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T05,heart moment eyes life countenance tears world,1779.931438,0.099513,0.031558,0.112456,g,0.060057,0.034214,0.035265,0.044468,0.026239,0.231468,0.071381,0.116044,0.077624,0.185503,0.04461,0.106025,lewis
T07,day letter time dear friends friend way,1580.127698,0.065637,0.0555,0.064961,g,0.173456,0.048806,0.076781,0.043419,0.049359,0.047996,0.029437,0.055815,0.073169,0.058657,0.059865,0.033667,austen
T18,day country sun home sea mountains place,1186.278037,0.06459,0.034613,0.057719,g,0.044241,0.031171,0.031032,0.045731,0.040042,0.025806,0.040079,0.08102,0.032935,0.132585,0.058514,0.031009,shelley
T17,heart man mind love time evening friend,1275.016572,0.063722,0.02666,0.075412,g,0.064283,0.028777,0.023392,0.031951,0.026679,0.076002,0.032426,0.113387,0.058914,0.118792,0.024855,0.078381,shelley
T14,sir voice light sound steps door distance,1330.316374,0.062923,0.040668,0.062377,g,0.03248,0.043844,0.033459,0.044936,0.041631,0.063996,0.07581,0.103899,0.037154,0.048001,0.034828,0.049594,radcliffe


In [59]:
# Rank topics by their contribution p * log2(1/p) to the topic entropy.
# `assign` returns a copy, so the model's PHI table keeps term columns only;
# writing `p`/`entropy` into PHI itself would inflate any later
# `phi.sum(axis=1)` (e.g. when the entropy cell or `generate_topics` is re-run).
chapter_model.phi\
    .assign(p=chap_p.values, entropy=(chap_p * np.log2(1 / chap_p)).values)\
    .sort_values('entropy', ascending=False)\
    .head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,youd,youll,young,youre,youth,youths,youve,zeal,p,entropy
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T01,0.05,0.05,0.732067,1.381887,2.463208,1.428486,2.968166,5.892598,17.19947,1.056784,...,21.900551,19.614846,5.881342,49.740533,8.642145,0.05,21.015508,4.510612,0.174892,0.439935
T05,3.08805,1.268309,8.114823,7.234996,2.178292,19.12973,0.821483,3.658566,44.854167,8.799061,...,0.05,0.05,5.4457,0.05,53.549357,2.570966,0.05,6.946125,0.157768,0.420314
T07,0.05,10.478463,0.05,0.05,5.311726,2.08187,4.426664,0.05,13.061299,0.05,...,0.05,0.05,2.566685,0.05,3.191737,0.192755,0.060049,4.155831,0.109744,0.349841
T17,0.066299,0.05,5.369975,0.05,1.612951,1.25179,0.050026,0.05,5.834247,3.304101,...,0.05,0.05,0.05,0.05,7.209968,0.05,0.05,1.331515,0.098331,0.329036
T19,60.995651,0.05,0.05,0.05,0.050026,0.05,0.706235,0.05,10.479349,0.05,...,0.05,0.05,0.05,0.05,17.215743,0.05,0.05,0.503554,0.061076,0.246335


In [62]:
# Summary rows for the five topics with the largest entropy contribution.
# `term_freq` is the L1-normalised PHI row sum, i.e. the same p used for the
# ranking, so the topic ids are derived here rather than copied by hand.
_chap_topic_p = chapter_model.topics['term_freq']
chapter_model.topics.loc[
    (_chap_topic_p * np.log2(1 / _chap_topic_p)).sort_values(ascending=False).index[:5]
]

Unnamed: 0_level_0,top_terms,doc_weight_sum,term_freq,d,g,genre_id,austen,christie,collins,dickens,doyle,lewis,poe,radcliffe,reeve,shelley,stoker,walpole,author_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T01,man time room yes way door sir,67.923332,0.174892,0.467562,0.008593,d,0.000163,0.943067,0.12744,0.229655,0.412561,0.001713,5e-05,0.000762,0.003529,0.000158,0.010079,4.1e-05,christie
T05,heart eyes time hand moment night voice,38.742588,0.157768,0.007177,0.211929,g,0.133702,0.008957,0.007403,7.1e-05,0.005625,0.977318,0.074825,0.157503,0.400547,0.28431,0.015929,0.023148,lewis
T07,time house way room man day mind,46.961651,0.109744,0.220831,0.087661,d,0.351083,0.012362,0.621086,0.020979,0.043042,0.00756,5e-05,0.016003,0.196985,0.025802,0.034058,4.1e-05,collins
T17,castle night mind chamber door voice room,25.905096,0.098331,0.004452,0.141982,g,0.003637,0.002609,0.000338,7.1e-05,0.009616,0.002916,0.001265,0.336,5.5e-05,0.195686,0.018019,4.1e-05,radcliffe
T19,chateau mind tears woods evening heart counten...,20.609669,0.061076,0.000787,0.115157,g,0.02932,0.002483,0.000114,7.1e-05,8.9e-05,3.4e-05,0.007166,0.333553,5.5e-05,0.019194,6.2e-05,4.1e-05,radcliffe


## Question 3:
Which topic from each model is most strongly associated with each genre? Note that your answer should have four parts.

### Answer 3:
**Paragraph Model**

* **Detective is most strongly associated with topic 8**
* **Gothic is most strongly associated with topic 5**

**Chapter Model**

* **Detective is most strongly associated with topic 1**
* **Gothic is most strongly associated with topic 5**


In [25]:
# Paragraph model: topic with the highest mean weight in detective (d) books.
(
    paragraph_model.topics
    .loc[:, ['top_terms', 'd']]
    .sort_values(by='d', ascending=False)
    .head(1)
)

Unnamed: 0_level_0,top_terms,d
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T08,morning house time place way thing mr,0.072432


In [26]:
# Paragraph model: topic with the highest mean weight in gothic (g) books.
(
    paragraph_model.topics
    .loc[:, ['top_terms', 'g']]
    .sort_values(by='g', ascending=False)
    .head(1)
)

Unnamed: 0_level_0,top_terms,g
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T05,heart moment eyes life countenance tears world,0.112456


In [27]:
# Chapter model: topic with the highest mean weight in detective (d) books.
(
    chapter_model.topics
    .loc[:, ['top_terms', 'd']]
    .sort_values(by='d', ascending=False)
    .head(1)
)

Unnamed: 0_level_0,top_terms,d
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T01,man time room yes way door sir,0.467562


In [28]:
# Chapter model: topic with the highest mean weight in gothic (g) books.
(
    chapter_model.topics
    .loc[:, ['top_terms', 'g']]
    .sort_values(by='g', ascending=False)
    .head(1)
)

Unnamed: 0_level_0,top_terms,g
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T05,heart eyes time hand moment night voice,0.211929


## Question 4:
Using the THETA table from the Chapters model, get the mean topic weights for each book. Which book is most strongly associated with the gothic genre g, based on the weight of that genre's most representative topic (as discovered in the previous question)?

### Answer 4:
**The Monk (`monk`), whose mean weight on the gothic topic T05 is 0.977.**

In [29]:
# Mean topic weights per book, ordered by the gothic genre's most representative
# topic. The topic id is looked up from the `g` column of TOPICS instead of being
# hard-coded, so the cell stays correct if the model is refit.
_gothic_topic = chapter_model.topics['g'].idxmax()
chapter_model.theta.groupby('book_id')\
    .mean()\
    .sort_values(_gothic_topic, ascending=False)

Unnamed: 0_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
monk,3.4e-05,0.001713,3.4e-05,0.00186,3.4e-05,0.977318,3.4e-05,0.00756,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,0.001467,3.4e-05,3.4e-05,0.002916,0.006724,3.4e-05
oldenglishbaron,5.5e-05,0.003529,5.5e-05,0.013446,5.5e-05,0.400547,0.001634,0.196985,5.5e-05,5.5e-05,5.5e-05,0.242466,5.5e-05,0.099948,0.009078,0.012164,0.01233,5.5e-05,0.007379,5.5e-05
usher,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,0.292059,0.027631,4.3e-05,0.180845,0.11597,0.025197,4.3e-05,4.3e-05,4.3e-05,0.156455,4.3e-05,0.159632,0.006116,4.3e-05,0.035622
frankenstein,0.003804,0.000158,0.000158,0.00666,0.062474,0.28431,0.006261,0.025802,0.000158,0.002357,0.016381,0.005853,0.000158,0.001606,0.152014,0.036595,0.000158,0.195686,0.180214,0.019194
udolpho,0.005035,0.000762,0.000828,0.00251,0.000579,0.157503,0.000796,0.016003,8.7e-05,0.000306,8.7e-05,0.005993,8.7e-05,8.7e-05,0.103448,0.001391,0.031048,0.336,0.003899,0.333553
northangerabbey,0.00085,0.000163,0.032283,0.000163,0.032263,0.133702,0.000163,0.351083,0.007458,0.000163,0.013114,0.000163,0.000163,0.000163,0.000163,0.002167,0.082044,0.003637,0.310777,0.02932
reddeath,0.000121,0.000121,0.000121,0.000121,0.000121,0.068769,0.000121,0.000121,0.000121,0.057754,0.000121,0.000121,0.0745,0.000121,0.491069,0.000121,0.306092,0.000121,0.000121,0.000121
castleofotranto,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05,0.023148,4.1e-05,4.1e-05,0.000587,4.1e-05,4.1e-05,0.811165,4.1e-05,0.164443,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05
dracula,6.2e-05,0.010079,0.040029,0.541501,6.2e-05,0.015929,6.2e-05,0.034058,6.2e-05,0.001156,0.028031,6.2e-05,6.2e-05,6.2e-05,0.01582,0.170249,0.017614,0.018019,0.107016,6.2e-05
styles,8.7e-05,0.9356,8.7e-05,0.020793,8.7e-05,0.014734,8.7e-05,8.7e-05,8.7e-05,8.7e-05,0.00481,8.7e-05,8.7e-05,8.7e-05,8.7e-05,8.7e-05,0.004922,8.7e-05,0.010485,0.007526


## Question 5:
How would you characterize the subject matter of the two genres based on their topic models? Consider the words associated with the dominant topics from each model, but also the models overall.

### Answer 5:
**Overall, the two genres look fairly similar under these topic models: both are characterized by attention to setting, time, and people. The words that distinguish the gothic genre are more concerned with emotion, ethereal concepts, and broad setting, whereas detective novels are associated with more immediate and definite people and places. I think this is reflected in the assignment of topics to genres: more topics have their maximum association with detective novels at the paragraph level, whereas more topics have their maximum association with gothic novels at the chapter level. This suggests that gothic novels convey a broader sense of the world, developing the narrative across longer stretches of text, whereas detective novels are more narrowly focused on specific events and things, and can develop themes in a more concise unit such as the paragraph.**

In [72]:
# Paragraph model: topics grouped by dominant genre, strongest gothic weight first.
(
    paragraph_model.topics
    .loc[:, ['top_terms', 'd', 'g', 'genre_id']]
    .sort_values(by=['genre_id', 'g'], ascending=False)
)

Unnamed: 0_level_0,top_terms,d,g,genre_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T05,heart moment eyes life countenance tears world,0.031558,0.112456,g
T17,heart man mind love time evening friend,0.02666,0.075412,g
T07,day letter time dear friends friend way,0.0555,0.064961,g
T14,sir voice light sound steps door distance,0.040668,0.062377,g
T18,day country sun home sea mountains place,0.034613,0.057719,g
T16,door room chamber hand apartment look moment,0.047446,0.054972,g
T19,subject person tuppence chateau lord night course,0.035546,0.05198,g
T15,face eyes hand hands night man lips,0.066446,0.050724,d
T00,time mother house girl lady family matter,0.069709,0.048663,d
T11,yes father son box matter place way,0.055947,0.047285,d


In [70]:
# Chapter model: topics grouped by dominant genre, strongest gothic weight first.
(
    chapter_model.topics
    .loc[:, ['top_terms', 'd', 'g', 'genre_id']]
    .sort_values(by=['genre_id', 'g'], ascending=False)
)

Unnamed: 0_level_0,top_terms,d,g,genre_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T05,heart eyes time hand moment night voice,0.007177,0.211929,g
T17,castle night mind chamber door voice room,0.004452,0.141982,g
T19,chateau mind tears woods evening heart counten...,0.000787,0.115157,g
T18,man time father brother day men way,0.030585,0.114263,g
T03,night time door room way day things,0.013777,0.08667,g
T14,men time party way light night man,0.027221,0.06652,g
T11,son man castle father thou lord youth,0.000544,0.039335,g
T16,room door bed chamber rooms lamp apartment,0.004643,0.030339,g
T04,man heart creator rage creature brother companion,0.000745,0.015651,g
T02,sea rocks time day man way letter,0.003621,0.011994,g
