### <center>LDA Topic Modelling</center>

+ In this notebook we perform topic modelling of the Hansard data using the Latent Dirichlet Allocation (LDA) algorithm.

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from warnings import filterwarnings
filterwarnings(action='ignore', category=DeprecationWarning)

from utils.utils import *

import pyLDAvis
import pyLDAvis.gensim
from pylab import rcParams
%matplotlib inline

  from imp import reload


In [2]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spaCy for Lemmatization
import spacy

  from scipy.linalg.special_matrices import triu
  from .autonotebook import tqdm as notebook_tqdm


LDA analysis of the data: the number of topics is decided based on the perplexity and coherence metrics.

In [3]:
class HOCLDATopic:

    """
         A class used to perform topic modelling on the Hansard data, testing the LDA model
         and plotting the results.

         The methods cover gensim tokenisation, spaCy lemmatization, building the
         dictionary / bag-of-words corpus, training an LdaMulticore model, scoring
         models by perplexity and coherence, and plotting those scores.
    """

    def __init__(self, df_head_dt_gp_21):
        """ Stores the given object on the instance as df_head_dt_gp_21
            df_head_dt_gp_21 - presumably the Hansard DataFrame to analyse (TODO confirm) """
        self.df_head_dt_gp_21 = df_head_dt_gp_21

    ### LDA Analysis ###

    def sent_to_words(self, sentences):
        """ The function performs gensim preprocessing on the sentences
            sentences - data """
        for sentence in sentences:
            # deacc=True removes punctuations
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
            
    def lemmatization(self, texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """ The function lemmatizes the words using spacy
            text - data """

        texts_out = []
        # Loading the spacy module
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        for sent in texts:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def get_lda_data(self, df):
        """ The function creates data for lda analysis and returns the corpus of words
            df - Datframe """

        data = df.speech_processed.values.tolist()
        data_words = list(self.sent_to_words(data))

        data_lemmatized = self.lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']);

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        return id2word, corpus, data_lemmatized

    def get_lda_model(self, id2word, corpus, topics):
        """ The function creates lda returns it
            id2word - dictionary
            corpus - corpus of words
            topics - no of topics """

        # Build LDA model
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=topics)
        doc_lda = lda_model[corpus]
        # Return LDA model
        return lda_model

    def test_lda_model(self, corpus, id2word, lda_model, data_lemmatized):
        """ The function test the lda model using perpexlity and cohesion
            id2word - dictionary
            corpus - corpus of words 
            lda_model - LDA model"""
        perp=[]
        cohe=[]
        for k in range(2,20):
            lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=k)
            perp.append(lda_model.log_perplexity(corpus))
            coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
            cohe.append(coherence_model_lda.get_coherence())

        return perp, cohe

    def plot_lda_test(self, perp, cohe):
        """ The function plots the perplexity and cohesion of lda model
            id2word - dictionary
            corpus - corpus of words 
            lda_model - LDA model"""

        rcParams['figure.figsize'] = 12, 8
        plt.plot(range(2,20),perp,'r-o')

        # Title of the plot
        plt.title("Perpexility in Topics", fontsize=20)
        # Labels of the plot
        plt.xlabel("Topics")
        plt.ylabel("Perpexility")
        plt.show()

        plt.plot(range(2,20),cohe,'g-o')
        # Title of the Coherence plot
        plt.title("Coherence in Topics", fontsize=20)
        plt.xlabel("Topics", fontsize=12)
        plt.ylabel("Coherence",fontsize=12)
        plt.show()
