<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Read and load the dataset from the CSV file in the working directory
dataset = pd.read_csv('gdb_dataset.csv')

# Show the dataset
### Abstract Embeddings Sample Dataset
# Report the frame's (rows, columns) shape, then preview only the first rows
# via the notebook's rich display instead of printing the whole DataFrame.
print('Node Content:', dataset.shape)
dataset.head()

In [None]:
# Extract only the 'abstract' column, drop rows where it is missing, and
# renumber the index from 0 so labels match positions after the drop.
abstracts = dataset['abstract'].dropna().reset_index(drop=True)

# Display a few samples to verify (head() keeps the output bounded)
abstracts.head()

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained Sentence-Transformers encoder by checkpoint name.
encoder = SentenceTransformer('paraphrase-MiniLM-L12-v2')

# encode() is documented for a string or a list of strings, so pass a plain
# Python list rather than relying on pandas Series indexing semantics.
embeddings = encoder.encode(abstracts.tolist(), show_progress_bar=True)

In [None]:
# Inspect the output of encoder.encode for the abstracts
# (presumably one embedding row per abstract — confirm with embeddings.shape)
embeddings

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Load ModernBERT tokenizer and model from Hugging Face
# NOTE(review): AutoModelForMaskedLM loads the checkpoint together with a
# masked-language-modelling head. If only encoder hidden states are needed
# downstream, confirm whether AutoModel would be the better fit.
model_name = 'answerdotai/ModernBERT-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer for the same checkpoint
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
# Function to take input texts and return one embedding per text (CLS-position vector)
def get_text_embeddings(text, batch_size=32):
    """Embed each input text with the notebook-level ``tokenizer`` and ``model``.

    Each text is tokenized (padded and truncated), passed through the encoder,
    and represented by the final hidden state at position 0 (the CLS token).

    Args:
        text: The texts to embed. Accepts anything exposing ``.tolist()``
            (e.g. a pandas Series or NumPy array), any other iterable of
            strings, or a single string (treated as one text).
        batch_size: Number of texts tokenized and run through the model per
            forward pass. Bounds peak memory; must be >= 1.

    Returns:
        numpy.ndarray of shape ``[n_texts, hidden_size]``. The result is always
        2-D, including for a single text, so ``result[0]`` is always one
        embedding vector.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Normalize the input to a plain list of texts.
    if isinstance(text, str):
        texts = [text]
    elif hasattr(text, 'tolist'):
        texts = text.tolist()
    else:
        texts = list(text)

    # The global `model` is loaded with a task head (masked LM), whose output
    # object carries logits rather than `last_hidden_state`. `base_model` is
    # the bare encoder underneath (or the model itself if it has no head), and
    # its output does expose `last_hidden_state`.
    encoder_body = model.base_model
    device = model.device

    # Nothing to embed: return an empty matrix with the right width.
    if not texts:
        return torch.empty((0, model.config.hidden_size)).numpy()

    cls_chunks = []
    # Inference only — no gradients needed.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Tokenize this batch; padding is to the longest text in the batch.
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            inputs = inputs.to(device)  # keep inputs on the same device as the model

            # Forward pass through the encoder to get hidden states
            outputs = encoder_body(**inputs)

            # CLS token (position 0) as the text-level embedding: [batch, hidden_size]
            cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    # No squeeze(): a single-text input must stay [1, hidden_size], not collapse to 1-D.
    return torch.cat(cls_chunks, dim=0).numpy()

In [None]:
# Ensure all text entries are strings (handle potential non-string entries)
abstract_cleaned = abstracts.astype(str)

# Get embeddings for each abstract
# (get_text_embeddings tokenizes the texts and returns the CLS-position
# hidden-state vectors as a NumPy array)
full_text_embeddings = get_text_embeddings(abstract_cleaned)

# Show results
# NOTE(review): the [0][:10] indexing below assumes a 2-D result with one row
# per abstract — confirm this also holds when there is only a single abstract.
print("\nEmbeddings shape:", full_text_embeddings.shape)
print("First embedding vector (10 values):", full_text_embeddings[0][:10])