<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the ModernBERT tokenizer and model from Hugging Face.
# `tokenizer` and `model` are notebook-level globals that
# get_text_embedding() reads directly.
MODEL_NAME = "answerdotai/ModernBERT-base"  # checkpoint id shared by tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Function to get input text and return the full-text embedding.
# TODO: edit the code to get embeddings sentence by sentence.
def get_text_embedding(text):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized with truncation and padding enabled, run through
    the model with gradient tracking disabled, and the last-layer hidden
    state at sequence position 0 (the CLS token) is taken as the embedding.

    Args:
        text: Input handed straight to ``tokenizer``.

    Returns:
        numpy.ndarray: The position-0 hidden state with all size-1
        dimensions squeezed out. For a single text this is a 1-D vector of
        length ``hidden_size``; a batch of several texts keeps its leading
        batch dimension.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Put the inputs on the same device as the model so the forward pass
    # still works after the model has been moved to a GPU.
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Forward pass to get hidden states (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings (use CLS token for sentence-level embedding)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: [batch_size, hidden_size]

    # .numpy() only accepts CPU tensors; .cpu() is a no-op when already on CPU.
    return cls_embedding.squeeze().cpu().numpy()

In [32]:
# Read the abstracts dataset from a CSV file given by a relative path
dataset = pd.read_csv('gdb_abstract.csv')

# Show the dataset: first its (rows, columns) shape, then the frame itself.
# NOTE(review): the 'Node Content:' label below prefixes the shape tuple, and
# the recorded output shows an 'Unnamed: 0' column alongside title/abstract/year.
print('Node Content:', dataset.shape)
print(dataset)

Node Content: (1000, 4)
     Unnamed: 0                                              title  \
0             0  Phenotypic variability of Niemann-Pick disease...   
1             1  Recurrent hypoglycemia secondary to metformin ...   
2             2  Adaptation of the Ambulatory and Home Care Rec...   
3             3  Multidimensional family therapy in adolescents...   
4             4  Balanced crystalloids versus isotonic saline i...   
..          ...                                                ...   
995         995  Molecular Sex Identification in Dioecious  Hip...   
996         996  Antimicrobial Peptides: Powerful Biorecognitio...   
997         997  Analysis of Metabolites in White Flowers of  M...   
998         998  Improved Cold Tolerance of Mango Fruit with En...   
999         999  Characterization of the Complete Chloroplast G...   

                                              abstract  year  
0    Background Niemann-Pick disease type C (NPC) i...  2018  
1    Back

In [33]:
# Keep only the 'abstract' column, drop rows whose abstract is missing,
# and renumber the index from 0 so it stays contiguous after the drop
abstracts = dataset['abstract'].dropna().reset_index(drop=True)

# # Debug aid: take a single abstract instead of the whole column
# # (caution: this rebinds `abstracts` to one value rather than a Series)
# abstracts = abstracts[995]

# Display the resulting Series to verify the extraction
abstracts

Unnamed: 0,abstract
0,Background Niemann-Pick disease type C (NPC) i...
1,Background Metformin toxicity is well known to...
2,Background Measuring service use and costs is ...
3,Background Substance use and delinquency are c...
4,Objectives Intravenous fluids are one of the m...
...,...
995,The dioecious property of the sea buckthorn ( ...
996,Bacterial infections represent a serious threa...
997,A total of seven phenolics and 44 metabolites ...
998,Red fruits were suggested to be tolerant to co...


In [None]:
from tqdm import tqdm

In [None]:
# Embed every abstract in order; tqdm wraps the iterable to show progress,
# and str() is applied to each entry before it is embedded.
embeddings = [
    get_text_embedding(str(abstract))
    for abstract in tqdm(abstracts, desc="Extracting embeddings")
]

In [34]:
# Stack the per-abstract embedding vectors into a DataFrame (one row per
# abstract, one column per embedding dimension) and save it as a CSV file.
# index=False keeps the row index out of the file.
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv('abstract_embeddings.csv', index=False)

# Show the first 10 embeddings as a sanity check
embeddings[:10]

[array([ 3.04974854e-01, -2.08702609e-01, -1.88740164e-01, -2.54584432e-01,
        -4.97645229e-01, -3.29334319e-01, -1.26483667e+00, -8.18423390e-01,
         9.52510953e-01, -9.62193012e-01, -3.15317839e-01,  7.95340002e-01,
        -1.11008954e+00, -3.51076961e-01,  1.53460936e-03, -4.94086355e-01,
        -9.92672324e-01,  5.06247878e-01,  3.00714880e-01,  6.64672792e-01,
         3.30274314e-01, -4.88807142e-01,  4.32899743e-01,  4.71427351e-01,
         4.21915680e-01, -5.02351522e-02, -1.89797014e-01,  7.15771675e-01,
         1.32417351e-01,  9.41675901e-01, -1.50587893e+00,  1.40471184e+00,
        -4.76803072e-02, -5.28306603e-01, -5.52664399e-01,  5.29329062e-01,
         6.08932972e-01,  7.10950136e-01, -4.13425803e-01,  1.17305830e-01,
         3.41136307e-01, -1.92102432e-01,  6.54589385e-02,  2.91757494e-01,
        -2.56420057e-02,  1.01304567e+00,  2.84975488e-02,  3.70321006e-01,
        -5.14248490e-01, -2.82540172e-01,  1.94323242e+00, -4.74125922e-01,
        -5.8