<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Load the dataset from the CSV file in the working directory
csv_path = 'gdb_dataset.csv'
dataset = pd.read_csv(csv_path)

# Report the (rows, columns) shape first, then print the frame itself
print('Node Content:', dataset.shape)
print(dataset)

Node Content: (1000, 4)
     Unnamed: 0                                              title  \
0             0  Phenotypic variability of Niemann-Pick disease...   
1             1  Recurrent hypoglycemia secondary to metformin ...   
2             2  Adaptation of the Ambulatory and Home Care Rec...   
3             3  Multidimensional family therapy in adolescents...   
4             4  Balanced crystalloids versus isotonic saline i...   
..          ...                                                ...   
995         995  Molecular Sex Identification in Dioecious  Hip...   
996         996  Antimicrobial Peptides: Powerful Biorecognitio...   
997         997  Analysis of Metabolites in White Flowers of  M...   
998         998  Improved Cold Tolerance of Mango Fruit with En...   
999         999  Characterization of the Complete Chloroplast G...   

                                              abstract  year  
0    Background Niemann-Pick disease type C (NPC) i...  2018  
1    Back

In [3]:
# Keep just the abstract texts: select the column, discard missing
# entries, and renumber the surviving rows from 0
abstracts = (
    dataset['abstract']
    .dropna()
    .reset_index(drop=True)
)

# Show the resulting Series as the cell output
abstracts

Unnamed: 0,abstract
0,Background Niemann-Pick disease type C (NPC) i...
1,Background Metformin toxicity is well known to...
2,Background Measuring service use and costs is ...
3,Background Substance use and delinquency are c...
4,Objectives Intravenous fluids are one of the m...
...,...
995,The dioecious property of the sea buckthorn ( ...
996,Bacterial infections represent a serious threa...
997,A total of seven phenolics and 44 metabolites ...
998,Red fruits were suggested to be tolerant to co...


In [4]:
from sentence_transformers import SentenceTransformer

In [6]:
# Sentence-transformer model used to embed every abstract
encoder = SentenceTransformer('paraphrase-MiniLM-L12-v2')

# Pass a plain list of strings (the input type `encode` documents) instead of
# the pandas Series itself, so the call does not depend on the Series' index
# labels happening to coincide with positions
embeddings = encoder.encode(abstracts.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [11]:
# Inspect the embeddings array returned by encoder.encode
embeddings

array([[-0.18085338, -0.02877363, -0.01457172, ...,  0.25280115,
         0.00423281, -0.11268181],
       [-0.09924375, -0.01678823, -0.02370375, ...,  0.08323234,
         0.04386317, -0.1766841 ],
       [ 0.0769803 , -0.10633864, -0.03089524, ...,  0.23365135,
        -0.14438109, -0.07243013],
       ...,
       [-0.05083591,  0.02510978, -0.15768585, ...,  0.15259632,
         0.2336707 , -0.03094873],
       [-0.03681072,  0.01801709, -0.19822915, ..., -0.04061321,
        -0.11218563,  0.22541869],
       [-0.1522122 ,  0.2695344 , -0.19867367, ...,  0.03437565,
         0.07468198,  0.06973903]], dtype=float32)

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [8]:
# Fetch the pretrained ModernBERT checkpoint from the Hugging Face Hub:
# first its tokenizer, then the model with a masked-language-modelling head.
# NOTE(review): a masked-LM model's output carries `logits`; code that needs
# `last_hidden_state` has to go through the underlying base encoder.
model_name = 'answerdotai/ModernBERT-base'
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForMaskedLM.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [9]:
# Function that takes the input texts and returns one full-text embedding per text (TODO: edit the code to get embeddings sentence by sentence)
def get_text_embeddings(text, batch_size=32):
    """Embed each text as the first-token ([CLS]) vector of the encoder's last layer.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str or sequence of str
        The text(s) to embed. A pandas Series, NumPy array, list or any other
        iterable of strings is accepted; a single string is treated as a
        batch of one.
    batch_size : int, default 32
        Number of texts tokenized and passed through the model at a time, so
        that peak memory does not grow with the size of the whole corpus.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_texts, hidden_size)``. The leading axis is kept
        even when there is only one text.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Normalise the input to a plain list of strings
    if isinstance(text, str):
        texts = [text]
    elif hasattr(text, "tolist"):
        texts = text.tolist()
    else:
        texts = list(text)

    # Nothing to encode: return an empty array with the right width
    if not texts:
        return torch.empty((0, model.config.hidden_size)).numpy()

    # A masked-LM wrapper returns vocabulary logits and has no
    # `last_hidden_state`; `base_model` is the bare encoder underneath
    # (and is the model itself when it is already headless).
    encoder = getattr(model, "base_model", model)

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize this batch only; padding is to the longest text in the batch
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)

        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = encoder(**inputs)

        # First token of every sequence -> [len(batch), hidden_size]
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])

    # Stack the per-batch results; no squeeze, so the result is always 2-D
    return torch.cat(cls_chunks, dim=0).numpy()

In [None]:
# Cast every entry to str (guards against potential non-string values)
abstract_cleaned = abstracts.astype(str)

# Compute the embeddings for all abstracts
full_text_embeddings = get_text_embeddings(abstract_cleaned)

# Report the array's shape, then the leading 10 values of the first vector
print("\nEmbeddings shape:", full_text_embeddings.shape)
first_embedding = full_text_embeddings[0]
print("First embedding vector (10 values):", first_embedding[:10])