# LDA

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.preprocessing import normalize

# OHCO: the index levels used for CORPUS, ordered from album down to token.
OHCO = [
    'album_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

# Name of a colour scale; not referenced by any other cell visible in this notebook.
colors = 'YlGnBu'

## Topic Model Class

In [2]:
class TopicModel():
    '''
    Convenience wrapper that builds an LDA topic model from a tokenized corpus.

    The constructor assembles one text document per bag. Every other method
    computes one table and stores it as an attribute on the instance; nothing
    is returned. The methods rely on attributes set by earlier ones and are
    meant to be called in this order:

    1. `count_vectorize`           -> `count_engine`, `count_model`, `terms`
    2. `generate_vocab`            -> `vocab`
    3. `generate_dtm`              -> `dtm` (also extends `vocab` and `docs`)
    4. `generate_lda_model`        -> `lda_engine`, `lda_model`, `topic_names`
    5. `generate_phi`              -> `phi`
    6. `generate_theta`            -> `theta`
    7. `generate_topics`           -> `topics`
    8. `append_metadata_to_topics` -> extra columns on `topics` (repeatable)
    '''
    def __init__(self, corpus:pd.DataFrame, lib:pd.DataFrame, bag=list, pos_filter=str):
        '''
        Store the inputs and build the per-bag documents (`self.docs`).

        PARAMETERS:

        `corpus` - pandas DataFrame of tokens. It must expose `pos` and `term_str`
        columns, and the keys in `bag` must be usable by `DataFrame.groupby`

        `lib` - pandas DataFrame of metadata; only read by `append_metadata_to_topics`

        `bag` - groupby key(s) that define one document, e.g. `['album_id']`

        `pos_filter` - regular-expression fragment selecting the part-of-speech tags
        to keep (see `__create_docs` for how it is applied)

        NOTE: the defaults of `bag` and `pos_filter` are the type objects `list`
        and `str` themselves (the signature uses `=` where `:` was probably
        intended), so both arguments should always be passed explicitly.
        '''
        self.corpus = corpus
        self.lib = lib
        self.bag = bag
        self.pos_filter = pos_filter
        self.docs = self.__create_docs(pos_filter)


    def __create_docs(self, pos_filter:str):
        '''
        Build one space-joined document string per bag from `self.corpus`.

        PARAMETERS:

        `pos_filter` - string interpolated into the regex `^{pos_filter}?$`. The
        trailing `?` makes the last element of the pattern optional, so `'NNS'`
        keeps rows whose `pos` is exactly `NN` or `NNS`

        OUTPUTS:

        pandas DataFrame indexed by the `self.bag` groups with a single column
        `doc_str`. Bags with no token passing the filter do not appear.

        EXAMPLE:

        `TopicModel(CORPUS, LIB, ['album_id'], 'NNS').docs`
        '''
        # na=False: rows with a missing `pos` are excluded instead of producing a
        # NaN in the boolean mask (which pandas refuses to index with).
        keep = self.corpus.pos.str.match(fr'^{pos_filter}?$', na=False)

        # Split-apply-combine: concatenate the surviving terms of each bag
        return self.corpus[keep]\
            .groupby(self.bag).term_str\
            .apply(' '.join)\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})


    def count_vectorize(self, max_features=5000, ngram_range = (1,1), stop_words='english'):
        '''
        Fit a `CountVectorizer` on `self.docs.doc_str`.

        PARAMETERS:

        `max_features`, `ngram_range`, `stop_words` - forwarded unchanged to
        `CountVectorizer`

        SETS:

        `self.count_engine` - the fitted vectorizer

        `self.count_model` - the matrix returned by `fit_transform`

        `self.terms` - feature names reported by the vectorizer
        '''
        self.count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
        self.count_model = self.count_engine.fit_transform(self.docs.doc_str)
        self.terms = self.count_engine.get_feature_names_out()


    def generate_lda_model(self, random_state:int, n_topics:int=20, max_iter:int=10, learning_offset:float=50.):
        '''
        Fit the LDA estimator on `self.count_model` (requires `count_vectorize`).

        PARAMETERS:

        `random_state`, `max_iter`, `learning_offset` - forwarded unchanged to the
        estimator

        `n_topics` - passed to the estimator as `n_components`

        SETS:

        `self.lda_engine` - the fitted estimator

        `self.lda_model` - the array returned by `fit_transform`

        `self.topic_names` - labels `T<i>` for `i` in `range(n_topics)`, zero-padded
        to the number of digits in `n_topics` (e.g. `T00`..`T19` for 20 topics)
        '''
        self.lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
        self.lda_model = self.lda_engine.fit_transform(self.count_model)
        pad = len(str(n_topics))
        self.topic_names = [f"T{str(x).zfill(pad)}" for x in range(n_topics)]


    def generate_vocab(self):
        '''
        Create `self.vocab`: an empty-column DataFrame indexed by `self.terms`
        (index name `term_str`). Requires `count_vectorize`.
        '''
        self.vocab = pd.DataFrame(index=self.terms)
        self.vocab.index.name = 'term_str'


    def generate_dtm(self):
        '''
        Create the dense document-term count matrix `self.dtm`.

        Requires `count_vectorize` and `generate_vocab`. Besides `self.dtm`
        (rows = `self.docs.index`, columns = `self.terms`) this also adds:

        `self.vocab['doc_count']` - number of documents with a non-zero count for the term

        `self.docs['term_count']` - sum of all term counts in the document
        '''
        self.dtm = pd.DataFrame(self.count_model.toarray(), index=self.docs.index, columns=self.terms)
        self.vocab['doc_count'] = self.dtm.astype('bool').astype('int').sum()
        self.docs['term_count'] = self.dtm.sum(1)


    def generate_theta(self):
        '''
        Create `self.theta`: `self.lda_model` as a DataFrame with one row per
        document (`self.docs.index`) and one column per topic (`self.topic_names`).
        Requires `generate_lda_model`.
        '''
        # The column labels and their axis name are set together. Naming the axis
        # first and then assigning a plain list to `.columns` would replace the
        # Index and silently drop the 'topic_id' name.
        topic_axis = pd.Index(self.topic_names, name='topic_id')
        self.theta = pd.DataFrame(self.lda_model, index=self.docs.index, columns=topic_axis)


    def generate_phi(self):
        '''
        Create `self.phi`: the estimator's `components_` as a DataFrame with one
        row per topic (index name `topic_id`) and one column per term (columns
        name `term_str`). Requires `count_vectorize` and `generate_lda_model`.
        '''
        self.phi = pd.DataFrame(self.lda_engine.components_, columns=self.terms, index=self.topic_names)
        self.phi.index.name = 'topic_id'
        self.phi.columns.name = 'term_str'


    def generate_topics(self, n_top_terms:int=7):
        '''
        Create `self.topics`, one row per topic. Requires `generate_phi` and
        `generate_theta`.

        PARAMETERS:

        `n_top_terms` - how many of the highest-weighted terms to list per topic

        COLUMNS:

        `top_terms` - the `n_top_terms` terms with the largest `phi` value, joined
        by spaces in descending order of weight

        `doc_weight_sum` / `doc_mean_weight` - column sum / mean of `theta`

        `term_freq` - the topic's row sum of `phi` divided by the sum of all of `phi`
        '''
        self.topics = self.phi.stack().groupby('topic_id')\
            .apply(lambda x: ' '.join(x.sort_values(ascending=False)\
                                      .head(n_top_terms)\
                                        .reset_index().term_str))\
            .to_frame('top_terms')
        self.topics['doc_weight_sum'] = self.theta.sum()
        self.topics['doc_mean_weight'] = self.theta.mean()
        topic_mass = self.phi.sum(1)
        self.topics['term_freq'] = topic_mass / topic_mass.sum()

    def append_metadata_to_topics(self, metadata:str, join_on:str):
        '''
        Add per-metadata-value mean topic weights to `self.topics`.

        `self.theta` is joined to `self.lib`, documents are grouped by the
        `metadata` column, and the mean weight of every topic is taken per group.

        PARAMETERS:

        `metadata` - name of a column in `self.lib`

        `join_on` - passed to `DataFrame.join(on=...)`; the notebook passes either
        a single level name or a list of level names

        ADDS TO `self.topics`:

        one column per distinct value of `self.lib[metadata]` (sorted), holding
        that group's mean weight for the topic, and a column named `metadata`
        holding the value whose mean weight is largest for the topic
        '''
        _md_frame = sorted(self.lib[metadata].value_counts().index.to_list())
        self.topics[_md_frame] = self.theta.join(self.lib, on=join_on).groupby(metadata)[self.topic_names].mean().T
        self.topics[metadata] = self.topics[_md_frame].idxmax(1)

## Read Data

In [3]:
def _read_table(path, keys):
    # Every table is a pipe-delimited text file; `keys` become its index.
    return pd.read_csv(path, sep='|').set_index(keys)


# Library (metadata) tables
LIB = _read_table('../tables/LIB.csv', 'album_id')
# NOTE(review): unlike every other table this path has no '.csv' suffix - confirm the filename.
SONG_LIB = _read_table('../tables/SONG_LIB', ['album_id', 'song_num'])

# Token-level corpus and vocabulary
CORPUS = _read_table('../tables/CORPUS.csv', OHCO)
VOCAB = _read_table('../tables/VOCAB.csv', 'term_str')

# Song-level tables
BOW_SONG = _read_table('../tables/BOW_SONG.csv', ['album_id', 'song_num', 'term_str'])
DTCM_SONG = _read_table('../tables/DTCM_SONG.csv', OHCO[:2])
TFIDF_SONG = _read_table('../tables/TFIDF_SONG.csv', OHCO[:2])
TFIDF_L2_SONG = _read_table('../tables/TFIDF_L2_SONG.csv', ['album_id', 'song_num'])

# Album-level tables
BOW_ALBUM = _read_table('../tables/BOW_ALBUM.csv', ['album_id', 'term_str'])
DTCM_ALBUM = _read_table('../tables/DTCM_ALBUM.csv', OHCO[:1])
TFIDF_ALBUM = _read_table('../tables/TFIDF_ALBUM.csv', OHCO[:1])
TFIDF_L2_ALBUM = _read_table('../tables/TFIDF_L2_ALBUM.csv', ['album_id'])

## Setup

In [4]:
# NOTE(review): the model-building cells below call `count_vectorize(stop_words=stops)`,
# `generate_lda_model(random_state=0)` and `generate_topics(n_top_terms=5)`, so none of
# these names is actually passed to the models - the TopicModel defaults apply instead.

# Vectorizer settings
n_terms = 4000
ngram_range = (1, 2)

# LDA settings
n_topics = 10
max_iter = 20

# Number of terms shown per topic
n_top_terms = 5

In [5]:
import warnings
warnings.filterwarnings('ignore')

### Get stopwords from `VOCAB`

In [6]:
# Terms flagged with stop == 1 in VOCAB, as a plain list for the vectorizer's `stop_words`.
stops = VOCAB.loc[VOCAB.stop == 1].index.to_list()

## Generate Topic Model info with Album as bag

In [7]:
# One document per album, built from tokens whose POS tag matches 'NNS' with the final S optional.
album_model = TopicModel(CORPUS, LIB, bag=['album_id'], pos_filter='NNS')

# Counts first, then the LDA fit, then the derived tables.
album_model.count_vectorize(stop_words=stops)
album_model.generate_vocab()
album_model.generate_dtm()
album_model.generate_lda_model(random_state=0)
album_model.generate_phi()
album_model.generate_theta()
album_model.generate_topics(n_top_terms=5)

# Attach mean topic weights per metadata value, one LIB column at a time.
for album_field in ('genre', 'album_title', 'artist'):
    album_model.append_metadata_to_topics(album_field, 'album_id')

In [8]:
# Five album-level topics with the largest mean document weight.
(
    album_model.topics
    .sort_values('doc_mean_weight', ascending=False)
    .head()
)

Unnamed: 0_level_0,top_terms,doc_weight_sum,doc_mean_weight,term_freq,alternative dance,alternative rock,art pop,brooklyn indie,canadian indie,chamber pop,...,Ethel Cain,Holychild,Metric,Mr. Twin Sister,Pure Bathing Culture,Radiohead,St. Vincent,The National,Yeah Yeah Yeahs,artist
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T03,time love way everything heart,10.642987,0.119584,0.114333,0.084469,0.148827,0.093563,0.32334,0.110175,0.201783,...,0.000251,0.000114,0.110175,0.00027,0.000172,0.000188,0.000125,0.201783,0.000127,MGMT
T08,way home day love baby,10.350072,0.116293,0.101603,0.000152,0.050133,0.119752,0.000228,0.111753,0.39727,...,0.624112,0.000114,0.111753,0.00027,0.000172,0.110971,0.264314,0.39727,0.000127,Ethel Cain
T05,time bam citys love man,10.276546,0.115467,0.116054,0.199796,0.060986,0.166879,0.248933,0.110988,8.8e-05,...,0.000251,0.000114,0.110988,0.00027,0.000172,0.11087,0.14269,8.8e-05,0.199661,Caroline Polachek
T13,way time night door love,9.865845,0.110852,0.091332,0.199585,0.000281,0.130643,0.000228,0.126754,8.8e-05,...,0.371378,0.401517,0.126754,0.25037,0.332463,0.000188,0.000125,8.8e-05,0.199366,Caroline Polachek
T00,sun sacrilege dreamt love way,7.851686,0.088221,0.088754,0.110131,0.150124,0.043983,0.249127,0.000104,0.199784,...,0.000251,0.000114,0.000104,0.330734,0.010834,0.110948,0.020716,0.199784,0.199831,Mr. Twin Sister


In [9]:
# Keep module-level handles on the album model's tables.
PHI_ALBUM, THETA_ALBUM, TOPICS_ALBUM = (
    album_model.phi,
    album_model.theta,
    album_model.topics,
)

## Generate Topic Model info with Song as bag

Two songs are dropped because their lyrics consist only of the word "yeah" repeated many times.

* Make a reduced `SONG_LIB` to be able to add metadata to `TOPIC_SONG`
* You have to run the cells below the following cell first and then loop back through with the updated `song_lib`

In [20]:
# Keep only the songs that will actually yield a document in the song-level model.
#
# This used to be derived from THETA_SONG, which is only created further down, so
# the cell raised NameError on a fresh kernel. The model's document index is simply
# the set of (album_id, song_num) groups left after the POS filter, so it can be
# computed straight from CORPUS.
#
# The pattern must mirror the `pos_filter='NNS'` passed to TopicModel in the next
# cell, which the class expands to the regex '^NNS?$'.
_noun_rows = CORPUS[CORPUS.pos.str.match(r'^NNS?$', na=False)]
_songs_with_docs = _noun_rows.groupby(['album_id', 'song_num']).size().index
song_lib = SONG_LIB[SONG_LIB.index.isin(_songs_with_docs)]

In [38]:
# One document per song, built from tokens whose POS tag matches 'NNS' with the final S optional.
song_keys = ['album_id', 'song_num']
song_model = TopicModel(CORPUS, song_lib, bag=['album_id', 'song_num'], pos_filter='NNS')

# Counts first, then the LDA fit, then the derived tables.
song_model.count_vectorize(stop_words=stops)
song_model.generate_vocab()
song_model.generate_dtm()
song_model.generate_lda_model(random_state=0)
song_model.generate_phi()
song_model.generate_theta()
song_model.generate_topics(n_top_terms=5)

# Attach mean topic weights per metadata value, one song_lib column at a time.
for song_field in ('genre', 'artist', 'title', 'album'):
    song_model.append_metadata_to_topics(song_field, song_keys)

In [47]:
# Five song-level topics with the largest mean document weight, trimmed to the key columns.
(
    song_model.topics
    .sort_values('doc_mean_weight', ascending=False)
    [['top_terms', 'doc_mean_weight', 'artist']]
    .head()
)

Unnamed: 0_level_0,top_terms,doc_mean_weight,artist
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T13,way time something eyes heart,0.070136,Ethel Cain
T05,time war house love arms,0.070091,MGMT
T00,love life sacrilege eyes look,0.063392,Cults
T15,time raindrops guns duh days,0.061916,Yeah Yeah Yeahs
T08,dreamt baby love everything lie,0.06128,Mr. Twin Sister


* T13 interpretation: this topic seems to indicate some waywardness and being lost
* T05 interpretation: this topic seems to be about conflict, maybe relating to domestic relationships
* T00 interpretation: this topic seems to be about infidelity or lust
* T15 interpretation: this topic seems to be about malaise
* T08 interpretation: this topic seems to be about uncertainty in relationships

In [40]:
# Keep module-level handles on the song model's tables.
PHI_SONG, THETA_SONG, TOPIC_SONG = (
    song_model.phi,
    song_model.theta,
    song_model.topics,
)

## PCA on Song topics

In [41]:
# Two components, so the topics can be drawn on a 2-D scatter plot.
pca_engine_phi = PCA(n_components=2)

In [50]:
# Scale every topic row of PHI_SONG to unit L2 norm, then project the rows with the PCA engine.
phi_song_unit = normalize(PHI_SONG, norm='l2', axis=1)
phi_song_projected = pca_engine_phi.fit_transform(phi_song_unit)
PHI_COMPS_SONG = pd.DataFrame(phi_song_projected, index=PHI_SONG.index)

In [51]:
# Scatter the song topics on their two principal components (columns 0 and 1).
# The frame built above is PHI_COMPS_SONG; the previous reference to `PHI_COMPS`
# named a variable that is never defined in this notebook.
px.scatter(PHI_COMPS_SONG.reset_index(), 0, 1,
           size=TOPIC_SONG.doc_mean_weight,
           color=TOPIC_SONG.artist,
           text='topic_id', hover_name=TOPIC_SONG.album, height=600, width=700)

* A few artists are never the highest-weighted artist for any topic.
* HOLYCHILD and Caroline Polachek are separated along PC1, which is interesting given they're relatively similar in genre.
* Using hover info, max artist assignment and max album assignment don't always line up for topics
    * Topics 1 and 12 have agreement between Big Thief and Big Thief albums, and in my opinion those two albums are definitely their most focused.

## Save tables

In [65]:
# The vectorizer output is a sparse matrix; wrap it in a sparse-backed DataFrame so it can be written as CSV.
lda_count_matrix_song = pd.DataFrame.sparse.from_spmatrix(song_model.count_model)

# Write every song-level table as a pipe-delimited file, in the same order as before.
for frame, destination in [
    (PHI_COMPS_SONG, '../tables/PHI_COMPS_SONG.csv'),
    (THETA_SONG, '../tables/THETA_SONG.csv'),
    (PHI_SONG, '../tables/PHI_SONG.csv'),
    (lda_count_matrix_song, '../tables/LDA_COUNT_MATRIX_SONG.csv'),
]:
    frame.to_csv(destination, sep='|')