# LDA

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

OHCO = ['album_id', 'song_num', 'stanza_num', 'line_num', 'token_num']
colors = 'YlGnBu'

## Topic Model Class

In [2]:
class TopicModel():
    '''
    DOCSTRING GOES HERE
    '''
    def __init__(self, corpus:pd.DataFrame, lib:pd.DataFrame, bag=list, pos_filter=str):
        '''
        DOCSTRING GOES HERE
        '''
        self.corpus = corpus
        self.lib = lib
        self.bag = bag
        self.pos_filter = pos_filter
        self.docs = self.__create_docs(pos_filter)


    def __create_docs(self, pos_filter:list):
        '''
        Function to generate docs by a specified bag from a corpus

        Corpus here is loosely defined, as this function will work provided the `level` parameter is in the multi-index of `corpus`

        PARAMETERS:

        `corpus` - pandas DataFrame of body of work. It must be multi-indexed by an OHCO

        `level` - string of OHCO to group by for bags

        OUTPUTS:

        pandas DataFrame of bag documents grouped by `level`

        EXAMPLE:

        `DOC = get_doc(CORPUS, 'chap_id')`
        '''
        # Get multi-index from `corpus` df
        idx = list(self.corpus.index.names)

        # Check to see that `level` exists in `corpus` OHCO
        # Raise error if not
        #if (self.bag not in idx):
            #raise KeyError (f'{self.bag} not found in corpus OHCO')

        # Filter corpus for pos
        # Split-apply-combine to generate DOCs grouped by `level`
        return self.corpus[self.corpus.pos.str.match(fr'^{pos_filter}?$')]\
            .groupby(self.bag).term_str\
            .apply(lambda x: ' '.join(x))\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})


    def count_vectorize(self, max_features=4000, ngram_range = (1,1), stop_words='english'):
        '''
        DOCSTRING GOES HERE
        '''
        self.count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
        self.count_model = self.count_engine.fit_transform(self.docs.doc_str)
        self.terms = self.count_engine.get_feature_names_out()

        return None
    

    def generate_lda_model(self, random_state:int, n_topics:int=20, max_iter:int=10, learning_offset:float=50.):
        '''
        DOCSTRING GOES HERE
        '''
        self.lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
        self.lda_model = self.lda_engine.fit_transform(self.count_model)
        self.topic_names = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
        return None

    def generate_vocab(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.vocab = pd.DataFrame(index=self.terms)
        self.vocab.index.name = 'term_str'
        return None
    

    def generate_dtm(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.dtm = pd.DataFrame(self.count_model.toarray(), index=self.docs.index, columns=self.terms)
        self.vocab['doc_count'] = self.dtm.astype('bool').astype('int').sum()
        self.docs['term_count'] = self.dtm.sum(1)
        return None
    
    
    def generate_theta(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.theta = pd.DataFrame(self.lda_model, index=self.docs.index)
        self.theta.columns.name = 'topic_id'
        self.theta.columns = self.topic_names
        return None
    

    def generate_phi(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.phi = pd.DataFrame(self.lda_engine.components_, columns=self.terms, index=self.topic_names)
        self.phi.index.name = 'topic_id'
        self.phi.columns.name = 'term_str'
        return None
    

    def generate_topics(self, n_top_terms:int=7):
        '''
        DOCSTRING GOES HERE
        '''
        self.topics = self.phi.stack().groupby('topic_id')\
            .apply(lambda x: ' '.join(x.sort_values(ascending=False)\
                                      .head(n_top_terms)\
                                        .reset_index().term_str))\
            .to_frame('top_terms')
        self.topics['doc_weight_sum'] = self.theta.sum()
        self.topics['term_freq'] = self.phi.sum(1) / self.phi.sum(1).sum()
        return None
    
    def append_metadata_to_topics(self, metadata:str, join_on:str):
        '''
        DOCSTRING GOES HERE
        '''
        _md_frame = sorted(self.lib[metadata].value_counts().index.to_list())
        self.topics[_md_frame] = self.theta.join(self.lib, on=join_on).groupby(metadata)[self.topic_names].mean().T
        self.topics[metadata] = self.topics[_md_frame].idxmax(1)
        return None

## Read Data

In [3]:
LIB = pd.read_csv('../tables/LIB.csv', sep='|').set_index('album_id')
SONG_LIB = pd.read_csv('../tables/SONG_LIB', sep='|').set_index(['album_id', 'song_num'])
CORPUS = pd.read_csv('../tables/CORPUS.csv', sep='|').set_index(OHCO)
VOCAB = pd.read_csv('../tables/VOCAB.csv', sep='|').set_index('term_str')
BOW_SONG = pd.read_csv('../tables/BOW_SONG.csv', sep='|').set_index(['album_id', 'song_num', 'term_str'])
DTCM_SONG = pd.read_csv('../tables/DTCM_SONG.csv', sep='|').set_index(OHCO[:2])
TFIDF_SONG = pd.read_csv('../tables/TFIDF_SONG.csv', sep='|').set_index(OHCO[:2])
TFIDF_L2_SONG = pd.read_csv('../tables/TFIDF_L2_SONG.csv', sep='|').set_index(['album_id', 'song_num'])
BOW_ALBUM = pd.read_csv('../tables/BOW_ALBUM.csv', sep='|').set_index(['album_id', 'term_str'])
DTCM_ALBUM = pd.read_csv('../tables/DTCM_ALBUM.csv', sep='|').set_index(OHCO[:1])
TFIDF_ALBUM = pd.read_csv('../tables/TFIDF_ALBUM.csv', sep='|').set_index(OHCO[:1])
TFIDF_L2_ALBUM = pd.read_csv('../tables/TFIDF_L2_ALBUM.csv', sep='|').set_index(['album_id'])

## Setup

In [4]:
ngram_range = (1, 2)
n_terms = 4000
n_topics = 10
max_iter = 20
n_top_terms = 5

In [5]:
import warnings
warnings.filterwarnings('ignore')

### Get stopwords from `VOCAB`

In [6]:
stops = list(VOCAB[VOCAB.stop == 1])

## Generate Topic Model info with Album as bag

In [7]:
album_model = TopicModel(CORPUS, LIB, ['album_id'], 'NNS')
album_model.count_vectorize(stop_words=stops)
album_model.generate_vocab()
album_model.generate_dtm()
album_model.generate_lda_model(random_state=0)
album_model.generate_phi()
album_model.generate_theta()
album_model.generate_topics()
album_model.append_metadata_to_topics('genre', 'album_id')
album_model.append_metadata_to_topics('artist', 'album_id')

In [8]:
album_model.topics

Unnamed: 0_level_0,top_terms,doc_weight_sum,term_freq,alternative dance,alternative rock,art pop,brooklyn indie,canadian indie,chamber pop,chillwave,...,LCD Soundsystem,MGMT,Metric,Mr. Twin Sister,Pure Bathing Culture,Radiohead,St. Vincent,The National,Yeah Yeah Yeahs,artist
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,minds everything first memories arms blood man,1.88823,0.026271,0.000131,0.050072,0.000149,0.000213,9.6e-05,0.17548,0.000203,...,0.000152,0.000111,9.6e-05,0.000268,0.00016,0.110879,0.000112,0.17548,0.000111,The National
T01,time yeah dark spinnin duh today its,6.706822,0.069286,0.157655,0.049286,0.076059,0.000213,0.22409,8.3e-05,0.000203,...,0.115791,0.005589,0.22409,0.000268,0.00016,0.10609,0.094468,8.3e-05,0.199519,Metric
T02,love war me arms eye roses everything,3.183366,0.045567,0.000131,0.050024,0.052724,0.000213,0.092072,8.3e-05,0.000203,...,0.000152,0.000111,0.092072,0.000268,0.00016,0.110772,0.000112,8.3e-05,0.000111,Big Thief
T03,way day time love la night light,3.307127,0.04145,0.000131,0.003292,0.041726,0.000213,0.005569,0.224029,0.199636,...,0.000152,0.012264,0.005569,0.000268,0.332549,0.000173,0.000112,0.224029,0.000111,Pure Bathing Culture
T04,oh lie love time side room hey,2.039288,0.027262,0.000131,0.000254,0.042914,0.000213,0.110988,8.3e-05,0.000203,...,0.000152,0.000111,0.110988,0.000268,0.00016,0.000173,0.004231,8.3e-05,0.000111,Big Thief
T05,time its life you way love things,11.56252,0.125968,0.000131,0.213745,0.005704,0.443586,0.013096,0.199757,0.19976,...,0.000152,0.422675,0.013096,0.000268,0.332754,0.018046,0.000112,0.199757,0.000111,Holychild
T06,time people way oh love youre eyes,7.889465,0.066888,0.264274,0.149221,0.041699,0.000213,9.6e-05,8.3e-05,0.000203,...,0.326047,0.000111,9.6e-05,0.000268,0.00016,0.110795,0.000112,8.3e-05,0.202501,Ethel Cain
T07,nothing back days case world stone it,5.954851,0.050763,0.000131,0.141688,0.041634,0.249401,0.124418,8.3e-05,0.000203,...,0.000152,0.000111,0.124418,0.000268,0.00016,0.31447,0.000112,8.3e-05,0.000111,Radiohead
T08,pills you anybody lover seduction please girls,2.202249,0.025616,0.000131,0.009739,0.041927,0.248957,9.6e-05,8.3e-05,0.000203,...,0.000152,0.000111,9.6e-05,0.000268,0.00016,0.000173,0.143352,8.3e-05,0.000111,Ava Luna
T09,love yeah time one heart citys you,11.031327,0.123067,0.103399,0.057611,0.149976,0.000213,0.11793,8.3e-05,0.398183,...,0.206687,0.029994,0.11793,0.49749,0.331978,0.111031,0.13858,8.3e-05,0.000111,Mr. Twin Sister


In [9]:
VOCAB.loc['doo']

n                       86
n_chars                  3
p                 0.000447
i                11.129036
max_pos                 NN
max_pos_group           NN
stop                     1
porter_stem            doo
song_dfidf        31.58533
album_dfidf      17.902934
Name: doo, dtype: object