<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [None]:
!pip install turftopic

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Hugging Face checkpoint identifier shared by the encoder and its tokenizer
MODEL_NAME = "answerdotai/ModernBERT-base"

# Instantiate the encoder and the matching tokenizer from that checkpoint
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_text_embedding(text):
    """Embed a text using the global `tokenizer` and `model`.

    The text is tokenized (truncation and padding enabled), run through the
    model with gradient tracking disabled, and the hidden state at the first
    token position (the CLS token) is used as the text-level embedding.

    Args:
        text: Input text handed to the tokenizer.

    Returns:
        NumPy array with the first-token hidden state, with all size-1
        dimensions squeezed out.

    TODO: extend this to produce embeddings sentence by sentence.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so no gradients are needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Index 0 on the sequence axis is the CLS token -> [batch_size, hidden_size]
    first_token_state = hidden_states[:, 0]

    return first_token_state.squeeze().numpy()

In [79]:
# Load the abstracts dataset from its CSV file
dataset = pd.read_csv('gdb_abstract.csv')

# Report the frame's (rows, columns) shape, then print the frame itself
dataset_shape = dataset.shape
print('Node Content:', dataset_shape)
print(dataset)

Node Content: (1000, 4)
     Unnamed: 0                                              title  \
0             0  Phenotypic variability of Niemann-Pick disease...   
1             1  Recurrent hypoglycemia secondary to metformin ...   
2             2  Adaptation of the Ambulatory and Home Care Rec...   
3             3  Multidimensional family therapy in adolescents...   
4             4  Balanced crystalloids versus isotonic saline i...   
..          ...                                                ...   
995         995  Molecular Sex Identification in Dioecious  Hip...   
996         996  Antimicrobial Peptides: Powerful Biorecognitio...   
997         997  Analysis of Metabolites in White Flowers of  M...   
998         998  Improved Cold Tolerance of Mango Fruit with En...   
999         999  Characterization of the Complete Chloroplast G...   

                                              abstract  year  
0    Background Niemann-Pick disease type C (NPC) i...  2018  
1    Back

In [80]:
# Keep only the abstract texts: drop missing entries and renumber the index from 0
abstract_column = dataset['abstract']
abstracts = abstract_column.dropna().reset_index(drop=True)

# Print the resulting Series to verify the extraction
print(abstracts)

0      Background Niemann-Pick disease type C (NPC) i...
1      Background Metformin toxicity is well known to...
2      Background Measuring service use and costs is ...
3      Background Substance use and delinquency are c...
4      Objectives Intravenous fluids are one of the m...
                             ...                        
995    The dioecious property of the sea buckthorn ( ...
996    Bacterial infections represent a serious threa...
997    A total of seven phenolics and 44 metabolites ...
998    Red fruits were suggested to be tolerant to co...
999    Buddleja colvilei Hook.f. & Thomson (Scrophula...
Name: abstract, Length: 1000, dtype: object


In [None]:
from tqdm import tqdm

In [None]:
# Computing the embeddings needs a full ModernBERT pass over every abstract,
# so the result is cached on disk. Reuse the cached CSV when it exists;
# otherwise compute the embeddings so the notebook also runs from a fresh
# checkout (previously this cell failed when the CSV was missing).
EMBEDDINGS_PATH = 'abstract_embeddings.csv'

if os.path.exists(EMBEDDINGS_PATH):
    # Cached run: load the matrix written by the save cell below
    embeddings = pd.read_csv(EMBEDDINGS_PATH).values
else:
    # Cold run: embed each abstract with get_text_embedding
    embeddings = np.array([
        get_text_embedding(str(text))
        for text in tqdm(abstracts, desc="Extracting embeddings")
    ])

# A stale cache would silently pair abstracts with the wrong vectors
if len(embeddings) != len(abstracts):
    raise ValueError(
        f"Expected {len(abstracts)} embeddings, found {len(embeddings)}; "
        f"delete {EMBEDDINGS_PATH} to recompute."
    )

In [81]:
# Persist the embeddings to CSV (without the index column) so a later run can
# reload them instead of recomputing. When `embeddings` was itself loaded from
# this file, this simply rewrites it.
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv('abstract_embeddings.csv', index=False)

# Display the first 10 embedding rows as the cell output
embeddings[:10]

array([[ 0.30497485, -0.20870261, -0.18874016, ..., -1.1213069 ,
         0.6365246 , -0.5493275 ],
       [ 0.46238533, -0.65232116,  0.29970357, ..., -1.2555141 ,
         1.1282659 , -0.3443873 ],
       [-0.23266688, -0.51089686, -0.01024171, ..., -1.5995297 ,
         0.76797116, -0.77237827],
       ...,
       [-0.0948136 , -0.5833389 , -0.36094257, ..., -1.4242927 ,
         0.8000725 , -0.93337435],
       [ 0.38207036, -0.570439  , -0.11699587, ..., -1.4142264 ,
         0.73408806, -0.8472013 ],
       [ 0.39557433, -0.59677   , -0.29335314, ..., -2.138954  ,
         0.7247665 , -0.69985723]])

## Topic Modelling using turftopic
1. Semantic Signal Separation
2. KeyNMF
3. ClusteringTopicModel

### Semantic Signal Separation

In [None]:
# Import the SemanticSignalSeparation model from the turftopic library
from turftopic import SemanticSignalSeparation

In [None]:
# Build a Semantic Signal Separation model with four components and a fixed seed.
# NOTE: this rebinds the global `model`, which until now held the ModernBERT
# encoder; get_text_embedding reads that global, so it will no longer see the
# encoder once this cell has run.
topic_count = 4
model = SemanticSignalSeparation(topic_count, encoder="answerdotai/ModernBERT-base", random_state=42)

# Fit on the abstracts, passing in the precomputed embeddings
doc_topic_matrix = model.fit_transform(abstracts, embeddings=embeddings)

In [82]:
# Print the fitted model's topics (top_k=10: presumably the 10 highest-ranked terms per topic)
model.print_topics(top_k=10)

In [83]:
# Draw the concept compass for indices 0 and 1
# (presumably the topics placed on the two axes; confirm against the turftopic docs)
model.plot_concept_compass(0, 1)

In [None]:
# Give each of the four topics (indices 0-3) a readable name. Each name matches
# the topic's index, so the columns later built from model.topic_names are
# unambiguous (index 3 was previously mislabelled "Topic4").
model.rename_topics({
    0: "Topic0",
    1: "Topic1",
    2: "Topic2",
    3: "Topic3",
})

In [84]:
# Print the topic distribution the fitted model assigns to one example text.
# NOTE(review): this sentence looks like tutorial carry-over and is unrelated
# to the abstracts dataset; consider swapping in a representative abstract.
example_text = "I am a socialist and I am concerned with the growing inequality in our societies. I'd like to see governments do more to prevent the exploitation of workers."
model.print_topic_distribution(example_text)

In [85]:
import plotly.express as px

# Wrap the document-topic matrix in a frame whose columns are the topic names
# (presumably one row per document)
topic_names = model.topic_names
df = pd.DataFrame(doc_topic_matrix, columns=topic_names)

# Pairwise scatter matrix over all topics: diagonal and upper half hidden,
# semi-transparent markers. Coloring by "Topic0" relies on the topics having
# been renamed so that a column with that name exists.
fig = (
    px.scatter_matrix(df, dimensions=topic_names, color="Topic0", template="plotly_white")
    .update_traces(diagonal_visible=False, showupperhalf=False, marker=dict(opacity=0.6))
)
fig.show()