<a href="https://colab.research.google.com/github/prith189/GLG_DL/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pipeline suggested by BERTopic:

- Generate embeddings using the Sentence Transformer model (Each block of text is converted to a 384 dimensional vector)

- Reduce the dimensionality using UMAP from 384 dimensions to 5 dimensions

- Cluster the 5 dimensional vectors using HDBSCAN

- For each cluster, run TF-IDF to generate a representation of the topic


Changes made to use the News dataset

- For clustering, HDBSCAN classifies most of the vectors in the embedded space as noise

- KMeans assigns every data point to a cluster (it has no noise class), therefore KMeans was used instead

- In the below notebook, UMAP was used for dimensionality reduction and Kmeans was used for clustering

In [15]:
RUN_SENTENCE_TRANSFORMER = False # True: generate the embeddings from scratch (needs a GPU, otherwise very slow). False: load the saved .npy files.
RUN_UMAP = True # True: reduce the dimensionality of the embeddings with UMAP (requires >100GB of RAM for all data points). False: load the saved reduction and model.
RUN_KMEANS = True # True: run KMeans on the reduced-dimension vectors. False: load the saved model and labels.
EXTRACT_TOPICS = True # True: extract a description of each topic. False: load the saved topics.
TEST_NEW_TEXT = True # True: predict the topic of a few new sample texts at the end of the notebook.

In [17]:
!pip install umap-learn

In [None]:
!pip install transformers[sentencepiece] sentence-transformers

In [None]:
use_drive = False # True: mount Google Drive and use it as BASE_PATH. False: use '/content/'.

In [None]:
# Pick the directory that every input/output file of this notebook lives in.
if not use_drive:
    # Local Colab scratch space.
    BASE_PATH = '/content/'
else:
    # Mount Google Drive and work from the project folder inside it.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_PATH = '/content/drive/My Drive/fourthbrain/'

In [19]:
#!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-3gKYoipfdPkeQHnHa0M2vD7QEug5wNS' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1-3gKYoipfdPkeQHnHa0M2vD7QEug5wNS" -O all-the-news-embeddings-title.npy && rm -rf /tmp/cookies.txt

In [20]:
#!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-5IsScXPtUY5jXVe_83RuQ0uI7eQqDMI' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1-5IsScXPtUY5jXVe_83RuQ0uI7eQqDMI" -O all-the-news-embeddings-title-index.npy && rm -rf /tmp/cookies.txt

In [None]:
#!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1AOiXZ6-nKt-_b2md0hdEj0uSkeiDiRd0' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1AOiXZ6-nKt-_b2md0hdEj0uSkeiDiRd0" -O all-the-news-2-1.csv && rm -rf /tmp/cookies.txt

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import pickle
import os

class FeatureExtraction:
    """Thin wrapper around a pretrained SentenceTransformer used to embed text.

    Arguments:
        model_name: Name of the pretrained model to load.
        device: Device handed to SentenceTransformer (e.g. 'cuda', 'cpu' or a
                CUDA index such as 0). The default ``None`` leaves the choice
                to the library, so the class can also be constructed on a
                machine without a GPU instead of failing on a hard-coded
                CUDA device.
    """
    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', device=None):
        # Load the pretrained model
        self.fe = SentenceTransformer(model_name, device=device)

    def run_fe_batch(self, list_of_input_text):
        """Encode a list of strings and return the model's embeddings for them.

        Arguments:
            list_of_input_text: The texts to embed.
        Returns:
            Whatever ``SentenceTransformer.encode`` returns for the input
            (progress bar disabled).
        """
        return self.fe.encode(list_of_input_text, show_progress_bar=False)


class NewsDataset:
    """Loads the news CSV and keeps only the rows/columns needed for title embeddings.

    Arguments:
        csv_file: Path of the CSV to read. When omitted, the file
                  'all-the-news-2-1.csv' under BASE_PATH is used, so the class
                  no longer depends on a global `csv_file` that is only
                  assigned in a later cell.
    """
    def __init__(self, csv_file=None):
        if csv_file is None:
            csv_file = os.path.join(BASE_PATH, 'all-the-news-2-1.csv')
        self.df = pd.read_csv(csv_file)
        self.preprocess()
        self.ner = None

    def preprocess(self):
        """Drop the unused columns and every row whose title is missing.

        Prints the dataframe shape before and after the NaN titles are removed.
        The original row index is kept (not reset) so that it can be saved
        alongside the embeddings.
        """
        unused_columns = ['Unnamed: 0', 'Unnamed: 0.1', 'date', 'year', 'month',
                          'day', 'article', 'publication']
        self.df = self.df.drop(columns=unused_columns)
        print('Shape of dataframe before dropping nan:{}'.format(self.df.shape))
        self.df = self.df.dropna(subset=['title'])
        print('Shape of dataframe after dropping nan:{}'.format(self.df.shape))

In [None]:
# Title embeddings and the dataframe index they belong to are cached as .npy files.
features_file = os.path.join(BASE_PATH, 'all-the-news-embeddings-title.npy')
idx_file = os.path.join(BASE_PATH, 'all-the-news-embeddings-title-index.npy')
if not RUN_SENTENCE_TRANSFORMER:
    # Reuse the previously saved embeddings and index.
    features = np.load(features_file)
    df_idx = np.load(idx_file)
else:
    # Embed every title from scratch and save both the vectors and the row index.
    feature_extractor = FeatureExtraction()
    news = NewsDataset()
    df_text = news.df['title'].to_list()
    features = feature_extractor.run_fe_batch(df_text)
    df_idx = news.df.index
    np.save(features_file, features)
    np.save(idx_file, df_idx)

In [23]:
# Reduced embeddings are cached as .npy; the fitted UMAP model is pickled so that
# new text can be projected into the same space later in the notebook.
dim_red_embeddings_file = os.path.join(BASE_PATH, 'embeddings-title-umap.npy')
umap_model_file = os.path.join(BASE_PATH, 'umap-model.p')
if(RUN_UMAP):
    from umap import UMAP
    umap_model = UMAP(n_neighbors=15,n_components=5,min_dist=0.0,metric='cosine',low_memory=True, verbose=True)
    umap_model.fit(features)
    dim_red_embeddings = umap_model.transform(features)
    np.save(dim_red_embeddings_file, dim_red_embeddings)
    # Context manager closes the file even if pickling fails.
    with open(umap_model_file, 'wb') as f:
        pickle.dump(umap_model, f)
else:
    dim_red_embeddings = np.load(dim_red_embeddings_file)
    # Load (not dump) the saved model. NOTE: unpickling can execute arbitrary
    # code, so only load model files that you produced yourself.
    with open(umap_model_file, 'rb') as f:
        umap_model = pickle.load(f)

In [29]:
# The fitted KMeans model is pickled and the per-document cluster labels are cached as .npy.
kmeans_model_file = os.path.join(BASE_PATH, 'kmeans_model.p')
labels_file = os.path.join(BASE_PATH, 'umap-kmeans-labels.npy')
import pickle as p  # alias is not used in this cell; kept in case other cells reference `p`
if(RUN_KMEANS):
    from sklearn.cluster import MiniBatchKMeans
    kmn = MiniBatchKMeans(n_clusters=25, verbose=1)
    labels = kmn.fit_predict(dim_red_embeddings)
    print('Number of datapoints in each cluster---')
    print(np.unique(labels, return_counts=True))
    # Context manager closes the file even if pickling fails.
    with open(kmeans_model_file, 'wb') as f:
        pickle.dump(kmn, f)
    np.save(labels_file, labels)
else:
    # NOTE: unpickling can execute arbitrary code, so only load trusted model files.
    with open(kmeans_model_file, 'rb') as f:
        kmn = pickle.load(f)
    labels = np.load(labels_file)

Init 1/3 with method k-means++
Inertia for init 1/3: 17336.4921875
Init 2/3 with method k-means++
Inertia for init 2/3: 16734.111328125
Init 3/3 with method k-means++
Inertia for init 3/3: 16596.646484375
[MiniBatchKMeans] Reassigning 1 cluster centers.
Minibatch step 1/262582: mean batch inertia: 6.0992841720581055
Minibatch step 2/262582: mean batch inertia: 5.47242546081543, ewa inertia: 5.47242546081543
Minibatch step 3/262582: mean batch inertia: 4.346141338348389, ewa inertia: 5.471567608296459
Minibatch step 4/262582: mean batch inertia: 4.34904146194458, ewa inertia: 5.47071261810078
Minibatch step 5/262582: mean batch inertia: 4.4434943199157715, ewa inertia: 5.469930220669216
Minibatch step 6/262582: mean batch inertia: 4.735731601715088, ewa inertia: 5.469371006415787
Minibatch step 7/262582: mean batch inertia: 4.992175579071045, ewa inertia: 5.4690075428001474
Minibatch step 8/262582: mean batch inertia: 4.42432975769043, ewa inertia: 5.468211847067968
Minibatch step 9/262

In [58]:
#The following functions were copied from the BERTopic module to extract topic descriptions from a set of clusters

import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
import numpy as np
import scipy.sparse as sp


class ClassTFIDF(TfidfTransformer):
    """
    Class-based TF-IDF (c-TF-IDF) on top of scikit-learn's TfidfTransformer.

    Every row of the count matrix is expected to be one class, i.e. all
    documents of that class joined into a single document. `fit` derives one
    weight per term from how often the term occurs across all rows, and
    `transform` L1-normalises each row and scales it by those weights.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def fit(self, X, multiplier):
        """Learn the idf vector (global term weights).
        Arguments:
            X: A matrix of term/token counts (dense input is converted to CSR).
            multiplier: Optional factor(s) the idf vector is multiplied by;
                        pass None to leave the weights untouched.
        Returns:
            self
        """
        counts = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(counts):
            counts = sp.csr_matrix(counts)

        # Nothing to learn when idf weighting is switched off.
        if not self.use_idf:
            return self

        n_terms = counts.shape[1]

        # Total occurrences of every term over all rows (classes)
        term_totals = np.squeeze(np.asarray(counts.sum(axis=0)))

        # Mean of the row sums, truncated to an int, used as regularisation
        mean_row_total = int(counts.sum(axis=1).mean())

        # The +1 inside the log keeps every weight positive
        weights = np.log((mean_row_total / term_totals) + 1)

        if multiplier is not None:
            weights = weights * multiplier

        # Stored as a diagonal matrix so transform() can apply it with one product
        self._idf_diag = sp.diags(weights, offsets=0,
                                  shape=(n_terms, n_terms),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X):
        """Transform a count-based matrix to c-TF-IDF
        Arguments:
            X (sparse matrix): A matrix of term/token counts. It is normalised
                               with copy=False, so the input may be modified.
        Returns:
            The c-TF-IDF matrix, or X unchanged when idf weighting is off.
        """
        if not self.use_idf:
            return X

        row_normalised = normalize(X, axis=1, norm='l1', copy=False)
        return row_normalised * self._idf_diag


def c_tf_idf(documents_per_topic):
    """ Calculate a class-based TF-IDF matrix for the joined documents of each topic.
    Arguments:
        documents_per_topic: Dataframe with one row per topic whose 'title' column
                             holds all documents of that topic joined into a
                             single string
    Returns:
        tf_idf: The resulting matrix giving a value (importance score) for each word per topic
        words: The names of the words (uni- and bi-grams) to which values were given,
               in the column order of tf_idf
    """
    documents = preprocess_text(documents_per_topic['title'].values)

    vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words='english')
    X = vectorizer_model.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on older installations.
    if hasattr(vectorizer_model, 'get_feature_names_out'):
        words = vectorizer_model.get_feature_names_out().tolist()
    else:
        words = vectorizer_model.get_feature_names()

    transformer = ClassTFIDF().fit(X, multiplier=None)
    tf_idf = transformer.transform(X)

    return tf_idf, words

def preprocess_text(documents):
    """ Basic preprocessing of text
    Steps (applied to every document, in this order):
        * Lower-case the text
        * Replace newline and tab characters with a space
        * Remove every character that is not a letter, a digit or a space
        * Replace a document that ends up empty with the token "emptydoc"
    Arguments:
        documents: An iterable of strings
    Returns:
        A list with one cleaned string per input document
    """
    cleaned_documents = []
    for doc in documents:
        text = doc.lower().replace("\n", " ").replace("\t", " ")
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        if text == "":
            text = "emptydoc"
        cleaned_documents.append(text)
    return cleaned_documents

def top_n_idx_sparse(matrix, n):
    """ Return indices of top n values in each row of a sparse matrix
    Retrieved from:
        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
    Arguments:
        matrix: The sparse matrix from which to get the top n indices per row.
                Its `indptr`, `indices` and `data` arrays are read row by row.
        n: The number of highest values to extract from each row
    Returns:
        indices: The top n column indices per row, in no particular order.
                 Rows with fewer than n stored values are padded with None
                 (a row without any stored value yields n times None).
    """
    indices = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick > 0:
            top_positions = np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]
            values = list(matrix.indices[le + top_positions])
        else:
            # Empty row (or n == 0): argpartition cannot be called on an empty
            # slice and a "[-0:]" slice would select everything, so skip both.
            values = []
        # Pad so that every row has exactly n entries
        values.extend([None] * (n - len(values)))
        indices.append(values)
    return np.array(indices)

def top_n_values_sparse(matrix, indices):
    """ Look up the matrix values that belong to the given per-row column indices
    Arguments:
        matrix: The sparse matrix the values are read from
        indices: For every row, the column indices to look up; a None entry
                 stands for "no value" and is reported as 0
    Returns:
        top_values: One array of scores per row, stacked into a single array
    """
    per_row_scores = []
    for row_number, column_ids in enumerate(indices):
        row_scores = [0 if column is None else matrix[row_number, column]
                      for column in column_ids]
        per_row_scores.append(np.array(row_scores))
    return np.array(per_row_scores)

def extract_words_per_topic(words,c_tf_idf,labels):
    """ Based on tf_idf scores per topic, extract the top 30 words per topic
    Arguments:
        words: List of all words (sorted according to tf_idf matrix position)
        c_tf_idf: A c-TF-IDF matrix from which to calculate the top words;
                  row i is taken to belong to labels[i]
        labels: A list of topic labels
    Returns:
        topics: Dict mapping each label to a list of (word, score) tuples,
                highest score first. Slots without a word or with a
                non-positive score are filled with ("", 0.00001).
    """
    # Top 30 column indices per row and the scores that go with them
    top_indices = top_n_idx_sparse(c_tf_idf, 30)
    top_scores = top_n_values_sparse(c_tf_idf, top_indices)

    # Order both arrays by ascending score within each row
    ascending = np.argsort(top_scores, 1)
    top_indices = np.take_along_axis(top_indices, ascending, axis=1)
    top_scores = np.take_along_axis(top_scores, ascending, axis=1)

    topics = {}
    for row, label in enumerate(labels):
        ranked_words = []
        # Walk each row backwards so the best scoring word comes first
        for word_index, score in zip(top_indices[row][::-1], top_scores[row][::-1]):
            if word_index is None or not (score > 0):
                ranked_words.append(("", 0.00001))
            else:
                ranked_words.append((words[word_index], score))
        topics[label] = ranked_words

    # NOTE: BERTopic's optional diversity (MMR) re-ranking and top_n_words
    # truncation are not applied in this notebook.
    return topics

In [30]:
# The extracted topic descriptions ({topic id: [(word, score), ...]}) are cached as a pickle.
topics_file = os.path.join(BASE_PATH, 'umap-kmeans-topics.p')
if(EXTRACT_TOPICS):
    csv_file = os.path.join(BASE_PATH, 'all-the-news-2-1.csv')
    # Keep only the rows that were embedded and attach each row's cluster id.
    # `labels` (one cluster id per document) is deliberately NOT overwritten
    # below, so this cell can be re-run without a length mismatch.
    df = pd.read_csv(csv_file, usecols=['title']).iloc[df_idx].assign(Topic=labels)
    n_topics = df['Topic'].unique().shape[0]
    sizes = df.groupby(['Topic']).count().sort_values("title", ascending=False).reset_index()
    topic_sizes = dict(zip(sizes['Topic'], sizes['title']))
    # Sorted topic ids; groupby also sorts its keys, so they line up with the
    # rows of documents_per_topic.
    topic_labels = sorted(topic_sizes.keys())
    # One row per topic with all of its titles joined into a single string
    documents_per_topic = df.groupby(['Topic'], as_index=False).agg({'title': ' '.join})
    c_tf_idf_m, words = c_tf_idf(documents_per_topic)
    topics = extract_words_per_topic(words, c_tf_idf_m, topic_labels)
    with open(topics_file, 'wb') as f:
        pickle.dump(topics, f)
else:
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    with open(topics_file, 'rb') as f:
        topics = pickle.load(f)

In [None]:
def get_sample_text():
    """Return a fresh list of sample headlines used to try out the topic model.

    The trailing comment on each entry names the subject area the headline was
    picked to represent.
    """
    return [
        'Secret Service on the defensive over allegations agents were duped by men impersonating feds', #Government
        'Microsoft and other tech firms take aim at prolific cybercrime gang', #Technology
        'Phoenix Suns favorites to win NBA title, but they still feel disrespected. Are they overlooked?', #Sports
        "Natural gas spikes to highest level since 2008 as rare nor'easter looms", #Business
        "Will rising prices sink Biden’s midterm hopes for Democrats?", #Politics
        "Large and dangerous' tornadoes hit Texas and Oklahoma; South faces more severe weather", #Climate
        "Here is a list of the best beaches in Hawaii and other tropical islands", #Travel
    ]

In [None]:
# Push a few unseen headlines through the same pipeline (embed -> UMAP -> KMeans)
# and show the top words of the topic each one is assigned to.
if(TEST_NEW_TEXT):
    feature_extractor = FeatureExtraction()
    new_text = get_sample_text()
    test_embeddings = feature_extractor.run_fe_batch(new_text)
    test_dim_red_embeddings = umap_model.transform(test_embeddings)
    test_labels = list(kmn.predict(test_dim_red_embeddings))
    for label, text in zip(test_labels, new_text):
        # topics[label] holds (word, score) tuples; only the words can be joined
        top_words = [word for word, _score in topics[label][:5]]
        print('Test text:', text)
        print('Predicted topic:', '_'.join(top_words))