### <center>Custom Topic Modelling</center>

In this notebook we perform custom topic modelling using the following methods:
+ Document embedding via Cr5 model
+ UMAP for dimensionality reduction
+ KMeans/ HDBSCAN for clustering and finding the topics 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from warnings import filterwarnings
from pylab import rcParams
filterwarnings(action='ignore', category=DeprecationWarning)
filterwarnings(action='ignore', category=FutureWarning)
from utils.utils import *

%matplotlib inline

In [2]:
import sys
sys.path.append("./models/Cr5-master/src")

from gensim.utils import tokenize
from cr5 import Cr5_Model
from scipy.spatial import distance

import umap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class HOCCustomTopic:
    """
         A class used to create the custom topic modelling by using document embeddings via Cr5 model,
         then reducing the embeddings via UMAP and then clustering using KMeans to find the topics.
    """

    base_path = r'./assets/images/'

    def get_model(self, path):
        """ The function builds the Cr5 model and loads its English language data
            path - path to the model
            Returns the Cr5_Model instance after load_langs(['en']) has been called
        """
        # Instantiate the 'joint_28' Cr5 model found under the given path
        cr5 = Cr5_Model(path, 'joint_28')
        # Only the 'en' language is loaded
        cr5.load_langs(['en'])
        return cr5

    def get_cr5_embeddings(self, df, model):
        """ The function computes one Cr5 document embedding per speech
            df - Dataframe with a speech_processed column
            model - model exposing get_document_embedding(tokens, language)
            Returns a list with one embedding per row, in row order
        """
        # Tokenize every speech first
        tokenised_speeches = []
        for speech in df.speech_processed:
            tokenised_speeches.append(list(tokenize(speech)))

        # Then embed every token list as an 'en' document
        return [model.get_document_embedding(tokens, 'en') for tokens in tokenised_speeches]

    def get_umap_embeddings(self, embedding):
        """ The function reduces the Cr5 embeddings with UMAP
            embedding - Cr5 embeddings
            Returns the embeddings projected onto 5 UMAP components
        """
        # Configure the reducer: cosine metric, 15 neighbours, 5 output components
        reducer = umap.UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.2,
            metric='cosine',
        )
        # Fit on the embeddings and project them in one step
        return reducer.fit_transform(embedding)

    def plot_elbow(self, umap_embeddings, k_clusters):
        """ The function draws the Elbow plot of KMeans clustering
            umap_embeddings - UMAP embeddings to cluster
            k_clusters - largest number of clusters to try (1 to k_clusters are fitted)
            Nothing is returned, the figure is displayed with plt.show()
        """
        cluster_counts = range(1, k_clusters + 1)

        # Inertia of a fitted KMeans model for every candidate number of clusters
        inertias = [
            KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=5, random_state=0)
            .fit(umap_embeddings)
            .inertia_
            for n_clusters in cluster_counts
        ]

        plt.plot(cluster_counts, inertias)
        plt.title('The Elbow Method')
        plt.xlabel('Number of clusters')
        plt.ylabel('Score')
        plt.show()

    def kmeans_model(self, embedding, k_clusters):
        """ The function fits KMeans on the embeddings and returns the cluster of each row
            embedding - embeddings
            k_clusters - number of clusters to fit
            Returns the cluster labels predicted for the same embeddings
        """
        # Seeded KMeans so repeated runs give the same clusters
        clusterer = KMeans(
            n_clusters=k_clusters,
            init='k-means++',
            n_init=10,
            max_iter=600,
            tol=0.000001,
            random_state=0,
        )
        # fit() returns the estimator itself, so the prediction can be chained
        return clusterer.fit(embedding).predict(embedding)

    def create_cluster_df(self, df, clusters):
        """ The function aggregates the speeches in the clusters topic wise and returns the dataframe
            df - Dataframe
            clusters - cluster 
        """

        df['Topic'] = clusters
        df['Doc_ID'] = range(len(df))
        # Aggregate the speech processed in the clusters
        docs_per_topic = df.groupby(['Topic'], as_index = False).agg({'speech_processed': ' '.join})
        
        return docs_per_topic

    def plot_pca(self, embeddings, k_clusters):
        """ The function plots the embeddings projected on 2 PCA components, coloured by KMeans cluster
            embeddings - embeddings
            k_clusters - number of clusters to fit on the projected points
            Nothing is returned, the scatter plots are drawn on a new figure
        """
        # Project the embeddings onto two principal components
        projected = PCA(n_components=2).fit_transform(embeddings)

        # Cluster the projected points (no random_state is set here)
        clusterer = KMeans(n_clusters=k_clusters, max_iter=600, algorithm='lloyd')
        clusterer.fit(projected)
        labels = clusterer.predict(projected)

        plt.figure(figsize=(12, 6))
        # Points coloured by their predicted cluster
        x_coords, y_coords = projected[:, 0], projected[:, 1]
        plt.scatter(x_coords, y_coords, c=labels, s=40, cmap='viridis', linewidths=5)
        # Cluster centers drawn on top as large translucent black dots
        centroids = clusterer.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=200, alpha=0.6)

    def c_tf_idf(self, documents, m, ngram_range=(1, 1)):
        """ The function creates the categorical tf-idf matrix
            documents - documents (one string per category)
            m - numerator of the idf ratio log(m / total count of the term);
                presumably the number of original documents - confirm with the caller
            ngram_range - ngram range passed to the count vectorizer
            Returns the tf-idf matrix (one row per term, one column per document)
            and the fitted count vectorizer
        """
        # Fit the count vectorizer (English stop words removed) and count the terms
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
        term_counts = vectorizer.transform(documents).toarray()

        # Term frequency: counts normalised by the total count of each document
        words_per_document = term_counts.sum(axis=1)
        term_frequency = np.divide(term_counts.T, words_per_document)

        # Inverse document frequency as a column vector with one entry per term
        count_per_term = term_counts.sum(axis=0)
        inverse_frequency = np.log(np.divide(m, count_per_term)).reshape(-1, 1)

        # The column vector broadcasts over the document columns
        weighted = np.multiply(term_frequency, inverse_frequency)
        return weighted, vectorizer

    def extract_top_n_words_per_topic(self, tf_idf, count, docs_per_topic, n=20):
        """ The function extracts the top n words per topic
            tf_idf - tf-idf matrix
            count - Count vectorizer
            docs_per_topic - dataframe
        """
        words = count.get_feature_names()
        # Get the topic label in dataframe
        labels = list(docs_per_topic.Topic)
        # Transpose the tf-idf matrix
        tf_idf_transposed = tf_idf.T
        indices = tf_idf_transposed.argsort()[:, -n:]
        # Get the top n words for each topic
        top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
        return top_n_words

    def extract_topic_sizes(self, df):
        """ The function creates the dataframe with the topic sizes
            df - dataframe
        """
        topic_sizes = (df.groupby(['Topic'])
                        .speech_processed
                        .count()
                        .reset_index()
                        .rename({"Topic": "Topic", "speech_processed": "Size"}, axis='columns')
                        .sort_values("Size", ascending=False))
        return topic_sizes
    