# Homework 08

```yaml
Course:    DS 5001 
Module:    08 Homework
Topic:     LDA with SciKit Learn
Author:    Ryan Lipps
Date:      23 March 2023
```

## Setup

### Packages

In [1]:
import pandas as pd
import numpy as np
import configparser
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

### Config

In [2]:
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

### Read data

In [3]:
CORPUS = pd.read_csv(f'{data_home}/novels/novels-CORPUS.csv')
LIB = pd.read_csv(f'{data_home}/novels/novels-LIB.csv')

In [4]:
CORPUS.head()

Unnamed: 0,book_id,chap_id,para_num,sent_num,token_num,pos,term_str
0,secretadversary,1,0,1,0,DT,the
1,secretadversary,1,0,1,1,NNP,young
2,secretadversary,1,0,1,2,NNP,adventurers
3,secretadversary,1,0,1,3,NNP,ltd
4,secretadversary,1,1,0,0,JJ,tommy


In [5]:
CORPUS = CORPUS.set_index(['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']).dropna()
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


## Topic Model Class

In [6]:
class TopicModel():
    '''
    DOCSTRING GOES HERE
    '''
    def __init__(self, corpus:pd.DataFrame, lib:pd.DataFrame, bag=str, pos_filter=str):
        '''
        DOCSTRING GOES HERE
        '''
        self.corpus = corpus
        self.lib = lib
        self.bag = bag
        self.pos_filter = pos_filter
        self.docs = self.__create_docs(pos_filter)


    def __create_docs(self, pos_filter:list):
        '''
        Function to generate docs by a specified bag from a corpus

        Corpus here is loosely defined, as this function will work provided the `level` parameter is in the multi-index of `corpus`

        PARAMETERS:

        `corpus` - pandas DataFrame of body of work. It must be multi-indexed by an OHCO

        `level` - string of OHCO to group by for bags

        OUTPUTS:

        pandas DataFrame of bag documents grouped by `level`

        EXAMPLE:

        `DOC = get_doc(CORPUS, 'chap_id')`
        '''
        # Get multi-index from `corpus` df
        idx = list(self.corpus.index.names)

        # Check to see that `level` exists in `corpus` OHCO
        # Raise error if not
        if (self.bag not in idx):
            raise KeyError (f'{self.bag} not found in corpus OHCO')

        # Filter corpus for pos
        # Split-apply-combine to generate DOCs grouped by `level`
        return self.corpus[self.corpus.pos.str.match(fr'^{pos_filter}?$')]\
            .groupby(self.bag).term_str\
            .apply(lambda x: ' '.join(x))\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})


    def count_vectorize(self, max_features=4000, ngram_range = (1,1), stop_words='english'):
        '''
        DOCSTRING GOES HERE
        '''
        self.count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
        self.count_model = self.count_engine.fit_transform(self.docs.doc_str)
        self.terms = self.count_engine.get_feature_names_out()

        return None
    

    def generate_lda_model(self, random_state:int, n_topics:int=20, max_iter:int=5, learning_offset:float=50.):
        '''
        DOCSTRING GOES HERE
        '''
        self.lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
        self.lda_model = self.lda_engine.fit_transform(self.count_model)
        self.topic_names = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
        return None

    def generate_vocab(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.vocab = pd.DataFrame(index=self.terms)
        self.vocab.index.name = 'term_str'
        return None
    

    def generate_dtm(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.dtm = pd.DataFrame(self.count_model.toarray(), index=self.docs.index, columns=self.terms)
        self.vocab['doc_count'] = self.dtm.astype('bool').astype('int').sum()
        self.docs['term_count'] = self.dtm.sum(1)
        return None
    
    
    def generate_theta(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.theta = pd.DataFrame(self.lda_model, index=self.docs.index)
        self.theta.columns.name = 'topic_id'
        self.theta.columns = self.topic_names
        return None
    

    def generate_phi(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.phi = pd.DataFrame(self.lda_engine.components_, columns=self.terms, index=self.topic_names)
        self.phi.index.name = 'topic_id'
        self.phi.columns.name = 'term_str'
        return None
    

    def generate_topics(self, n_top_terms:int=7):
        '''
        DOCSTRING GOES HERE
        '''
        self.topics = self.phi.stack().groupby('topic_id')\
            .apply(lambda x: ' '.join(x.sort_values(ascending=False)\
                                      .head(n_top_terms)\
                                        .reset_index().term_str))\
            .to_frame('top_terms')
        self.topics['doc_weight_sum'] = self.theta.sum()
        self.topics['term_freq'] = self.phi.sum(1) / self.phi.sum(1).sum()
        return None


In [7]:
paragraph_model = TopicModel(CORPUS, LIB, 'para_num', 'NNS')
paragraph_model.count_vectorize()
paragraph_model.generate_vocab()
paragraph_model.generate_dtm()
paragraph_model.generate_lda_model(random_state=0)
paragraph_model.generate_phi()
paragraph_model.generate_theta()
paragraph_model.generate_topics()

In [8]:
chapter_model = TopicModel(CORPUS, LIB, 'chap_id', 'NNS')
chapter_model.count_vectorize()
chapter_model.generate_vocab()
chapter_model.generate_dtm()
chapter_model.generate_lda_model(random_state=0)
chapter_model.generate_phi()
chapter_model.generate_theta()
chapter_model.generate_topics()

In [9]:
paragraph_model.phi.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
T01,0.05,0.05,1.05,0.05,0.05,0.05,0.05,0.05,1.076186,1.05,...,0.05,1.05,0.05,0.05,0.05,0.05,3.138981,0.05,0.05,0.05
T02,0.05,0.05,0.05,0.05,0.054389,0.05,0.05,0.05,1.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.412343,0.05,0.426918,0.05,1.05
T03,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,2.05,1.05,0.05,0.05,0.05,1.05,1.870255,0.05,0.05
T04,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.050836,0.05,0.05,1.334526,0.05,0.05,0.05


In [10]:
chapter_model.phi.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,21.726215,0.05,1.213645,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,2.56685,0.05,0.05,0.05
T01,0.05,0.05,2.042846,0.961227,0.810405,0.05,1.034973,0.05,0.051406,0.05,...,0.05,0.05,0.05,1.053415,1.050881,3.245727,0.498672,0.05,0.05,0.05
T02,0.05,0.05,0.05,0.05,0.05,1.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,2.05,0.05,0.05,0.05,0.05,0.05,0.05
T03,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,3.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
T04,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


## Question 1:
Use the PHI table from each model to compute the entropy H of the distribution over topics. Which bag generates a lower entropy distribution? Hint: To get H work with the L1 normalized vector of word weight sums by topic in the PHI table.

### Answer 1:

In [14]:
L1_para = paragraph_model.phi.apply(lambda x: x/x.sum(), 1)

In [32]:
L1_para['entropy'] = np.sum(L1_para.apply(lambda x: -1*x*np.log2(x)))

In [38]:
np.sum(L1_para.iloc[0]*np.log2(1/L1_para.iloc[0]))

10.158467263875188