<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [None]:
!pip install turftopic

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Hugging Face checkpoint shared by the encoder and its tokenizer
MODEL_NAME = "answerdotai/ModernBERT-base"

# Instantiate the ModernBERT encoder and the matching tokenizer from that checkpoint
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_text_embedding(text):
    """Return an embedding of ``text`` as a NumPy array.

    The text is tokenized (with truncation and padding enabled) by the
    module-level ``tokenizer`` and passed through the module-level ``model``
    with gradient tracking disabled. Both names are looked up when the
    function is called. The embedding is the last hidden state at token
    position 0 (used here as the CLS / sentence-level vector), with all
    size-1 dimensions squeezed away.

    TODO: extend this to embed the text sentence by sentence.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        numpy.ndarray holding the position-0 hidden state.
    """
    # Turn the raw text into PyTorch tensors the model accepts
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Keep the first token of every sequence -> [batch_size, hidden_size]
    first_token = hidden_states[:, 0, :]

    return first_token.squeeze().numpy()

In [None]:
# Read and load dataset
dataset = pd.read_csv('gdb_abstract.csv')

# Abstract dataset: report its dimensions as (rows, columns)
print('Node Content:', dataset.shape)

# Preview the first rows using the notebook's rich table rendering rather than
# printing the whole frame as plain text
dataset.head()

In [None]:
# Keep only the 'abstract' column, drop missing values and renumber rows from 0
abstracts = dataset['abstract'].dropna().reset_index(drop=True)

# Optional: work on a subset while iterating. Use a slice so `abstracts` stays
# a Series (indexing with a single integer would return one string instead).
# abstracts = abstracts.iloc[:995]

# Display a few samples to verify
abstracts.head()

In [None]:
from tqdm import tqdm

In [None]:
# File holding the cached embedding matrix (one row per abstract)
EMBEDDINGS_CSV = 'abstract_embeddings.csv'

if os.path.exists(EMBEDDINGS_CSV):
    # Reuse the cached embeddings instead of re-running the encoder
    embeddings = pd.read_csv(EMBEDDINGS_CSV).values
else:
    # No cache on disk (e.g. fresh environment): embed every abstract and
    # stack the per-abstract vectors into a single 2-D array
    embeddings = np.vstack([
        get_text_embedding(str(text))
        for text in tqdm(abstracts, desc="Extracting embeddings")
    ])

# Each abstract needs exactly one embedding row; fail early with a clear
# message if the cache was produced from a different set of abstracts
if len(embeddings) != len(abstracts):
    raise ValueError(
        f"Found {len(embeddings)} embedding rows for {len(abstracts)} abstracts; "
        f"delete '{EMBEDDINGS_CSV}' to recompute the embeddings."
    )

In [None]:
# Persist the embedding matrix as CSV (row index omitted) so it can be reloaded later
embedding_df = pd.DataFrame(data=embeddings)
embedding_df.to_csv(path_or_buf='abstract_embeddings.csv', index=False)

# Preview the first ten embedding rows
embeddings[0:10]

In [None]:
# Take the first embedding row as a sample and report its dimensions
sample = embeddings[0]

np.shape(sample)

## Topic Modelling using turftopic
1. Semantic Signal Separation
2. KeyNMF
3. ClusteringTopicModel

### Semantic Signal Separation

In [None]:
# Import the SemanticSignalSeparation model from the turftopic library
from turftopic import SemanticSignalSeparation

In [None]:
# Build a 4-component SemanticSignalSeparation model on the ModernBERT encoder.
# Note: this rebinds `model`, which until now referred to the Hugging Face
# encoder that get_text_embedding() looks up when it is called.
# NOTE(review): the precomputed embeddings are first-token (CLS) vectors;
# confirm turftopic's own use of this encoder yields comparable vectors.
model = SemanticSignalSeparation(
    n_components=4,
    encoder="answerdotai/ModernBERT-base",
    random_state=42,
)

# Fit on the abstracts, reusing the precomputed embeddings, and keep the
# resulting document-topic matrix
doc_topic_matrix = model.fit_transform(abstracts, embeddings=embeddings)

In [None]:
# Print the topics of the fitted model with top_k=10
# (presumably the ten highest-ranked words per topic; confirm in the turftopic docs)
model.print_topics(top_k=10)

In [None]:
# Draw the concept compass for topics 0 and 1
# (presumably one topic per axis; confirm in the turftopic docs)
model.plot_concept_compass(0, 1)

In [None]:
# Give each of the four topics (indices 0-3) a readable name; every label
# carries the index of the topic it names.
model.rename_topics({
    0: "Topic0",
    1: "Topic1",
    2: "Topic2",
    3: "Topic3",
})

In [None]:
# Print the topic distribution the fitted model assigns to a free-text example
model.print_topic_distribution("I am a socialist and I am concerned with the growing inequality in our societies. I'd like to see governments do more to prevent the exploitation of workers.")

In [None]:
import plotly.express as px

# One column per topic, labelled with the model's current topic names
topic_columns = model.topic_names
df = pd.DataFrame(doc_topic_matrix, columns=topic_columns)

# Pairwise scatter plots of the topic columns, coloured by the "Topic0" column
fig = px.scatter_matrix(
    df,
    dimensions=topic_columns,
    color="Topic0",
    template="plotly_white",
)

# Hide the diagonal and the upper half, and make the markers semi-transparent
fig = fig.update_traces(
    diagonal_visible=False,
    showupperhalf=False,
    marker=dict(opacity=0.6),
)
fig.show()

### KeyNMF

In [None]:
!pip install turftopic[topic-wizard]

In [None]:
from turftopic import KeyNMF

# KeyNMF model: 15 components on the ModernBERT encoder, seeded with a phrase
# and a fixed random state
model1 = KeyNMF(
    n_components=15,
    encoder="answerdotai/ModernBERT-base",
    seed_phrase="Religion and Morality",
    random_state=42,
)

# Build the topic data from the abstracts, reusing the precomputed embeddings
topic_data = model1.prepare_topic_data(abstracts, embeddings=embeddings)

In [None]:
# Print the topics stored in the KeyNMF topic data
topic_data.print_topics()

In [None]:
# Print representative documents for 11 (presumably the topic index; confirm in the turftopic docs)
topic_data.print_representative_documents(11)

In [None]:
# Build the word map figure from the topic data and render it
# (presumably relies on the topic-wizard extra installed earlier; confirm)
fig = topic_data.figures.word_map()
fig.show()

In [None]:
import plotly.express as px

# Placeholder group labels (replace with real labels if available)
groups = [
    "group1",
    "group2",
    "group3",
]

# Label the columns with the KeyNMF topic names: the matrix comes from `model1`,
# so its width matches model1's topics, not those of the earlier 4-topic `model`.
doc_topic_df = pd.DataFrame(topic_data.document_topic_matrix, columns=model1.topic_names)

# Seeded generator so the placeholder group assignment is identical on every run
rng = np.random.default_rng(42)
doc_topic_df["group"] = rng.choice(groups, size=len(doc_topic_df))  # Replace with real labels if available

# Mean topic weight per group
group_topic_matrix = doc_topic_df.groupby("group").mean()

# Heatmap with groups on the y-axis and topics on the x-axis
fig = px.imshow(group_topic_matrix,
                labels=dict(x="Topic", y="Group", color="Intensity"),
                x=group_topic_matrix.columns,
                y=group_topic_matrix.index)
fig.show()
