<a href="https://colab.research.google.com/github/raz0208/ModernBERT/blob/main/ModernBERT_TokenEmbedding_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Extract embeddings from input text using ModernBERT (Version 1)

In [1]:
# import required libraries
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

### Load NLP and ModernBERT models

In [None]:
# Load the ModernBERT tokenizer and model from Hugging Face.
# MODEL_NAME is the single checkpoint identifier passed to both from_pretrained
# calls, so the tokenizer and the model are always loaded from the same checkpoint.
MODEL_NAME = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

### Extract embeddings based on full text

In [3]:
# Function to get input text and return full text embedding
def get_text_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The input to embed. The notebook passes a single string;
            anything else the tokenizer accepts is forwarded unchanged.

    Returns:
        numpy.ndarray: The last-layer hidden state at token position 0, with
        singleton dimensions squeezed out (a 1-D vector for a single string).
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Keep the inputs on the same device as the model so the forward pass also
    # works when the model has been moved to a GPU.
    inputs = inputs.to(model.device)

    # Forward pass to get hidden states; gradients are not needed for inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings (use the token at position 0 for a sentence-level embedding)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: [batch_size, hidden_size]

    # .cpu() is a no-op for CPU tensors and is required before .numpy() for GPU tensors
    return cls_embedding.squeeze().cpu().numpy()

## Use Neo4j to connect to the graph database

In [4]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/312.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [7]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# Define Neo4j connection settings.
# Credentials must not be stored in the notebook: they are read from the
# environment (NEO4J_USER / NEO4J_PASSWORD) and prompted for when missing.
# NOTE(review): the password that used to be hardcoded in this cell has been
# published with the notebook and should be rotated on the server.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://143.225.233.156:7687")
NEO4J_USER = os.environ.get("NEO4J_USER") or input("Neo4j user: ")
# getpass keeps the typed password out of the saved cell output
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Initialize the driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Connectivity smoke test
def test_connection():
    """Run a constant-returning Cypher query and print the message it yields."""
    with driver.session() as db_session:
        result = db_session.run("RETURN 'Connected to Neo4j' AS message")
        # The query returns exactly one row with a single "message" column
        record = result.single()
        print(record["message"])


if __name__ == "__main__":
    test_connection()

Connected to Neo4j


In [8]:
# Helper that executes a Cypher statement and materialises its rows
def run_query(cypher_query, parameters=None):
    """Execute ``cypher_query`` in a fresh session and return its rows.

    Args:
        cypher_query: Cypher statement to run.
        parameters: Optional mapping of query parameters; a falsy value is
            replaced by an empty dict.

    Returns:
        list: ``record.data()`` for every record of the result, in order.
    """
    bound_params = parameters if parameters else {}
    rows = []
    with driver.session() as db_session:
        # Records must be consumed while the session is still open
        for record in db_session.run(cypher_query, bound_params):
            rows.append(record.data())
    return rows

# Sanity check: fetch up to five arbitrary nodes and print each returned record
query = "MATCH (n) RETURN n LIMIT 5"
results = run_query(query)
for r in results:
    print(r)

{'n': {'date': '1-12-1987', 'journal': 'The Journal of Cell Biology', 'hub': 0.0, 'auth': 2.6175247320960168e-12, 'subjects': 'Articles', 'pmc': 'PMC2114721', 'abstract': 'Meiosis I in males of the Dipteran Sciara coprophila results in the nonrandom distribution of maternally and paternally derived chromosome sets to the two division products. Based on an earlier study (Kubai, D.F. 1982. J. Cell Biol. 93:655-669), I suggested that the meiosis I spindle does not play a direct role in the nonrandom sorting of chromosomes but that, instead, haploid sets are already separated in prophase nuclei well before the onset of spindle formation. Here I report more direct evidence that this hypothesis is true; this evidence was gained from ultrastructural reconstruction analyses of the arrangement of chromosomes in germ line nuclei (prophase nuclei in spermatogonia and spermatocytes) of males heterozygous for an X- autosome chromosome translocation. Because of this translocation, the maternal and p

In [9]:
# Function to find similar nodes using cosine similarity
def find_similar_nodes(text_embedding, top_n):
    """Return the nodes whose linked abstract embedding is most similar to ``text_embedding``.

    Args:
        text_embedding: Query vector; a NumPy array (anything exposing
            ``.tolist()``) or any other iterable of numbers.
        top_n: Maximum number of nodes to return; must be non-negative.

    Returns:
        list: One dict per match with the keys ``"n"`` and ``"similarity"``,
        ordered by descending similarity. Empty when ``top_n`` is 0.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    # The value is bound to Cypher's LIMIT, so make sure it is a plain
    # Python int (e.g. not a NumPy integer) and reject negative values early.
    limit = int(top_n)
    if limit < 0:
        raise ValueError("top_n must be non-negative")
    if limit == 0:
        # Nothing was asked for: skip the database round trip
        return []

    # Query parameters must be plain Python values, not arrays
    if hasattr(text_embedding, "tolist"):
        embedding_list = text_embedding.tolist()
    else:
        embedding_list = list(text_embedding)

    cypher_query = """
    MATCH (n)-[:HAS_EMBEDDING]->(e:ABSTRACT)
    WHERE e.embedding IS NOT NULL
    WITH n, e, gds.similarity.cosine($sent_embedding, e.embedding) AS similarity
    RETURN n, similarity
    ORDER BY similarity DESC
    LIMIT $limit
    """
    parameters = {"sent_embedding": embedding_list, "limit": limit}
    return run_query(cypher_query, parameters)

In [None]:
# # Call function to run similarity query
# similar_nodes = find_similar_nodes(full_text_embedding, top_n=5)

# # show the result
# print(f"\nTop {len(similar_nodes)} similar nodes:")
# for node_data in similar_nodes:
#          print(f"Node: {node_data['n']}, Similarity: {node_data['similarity']:.4f}")

### Execute the app and get output

In [10]:
### --- ### Sample text for test ### --- ###

# 1- This is an application about Breast Cancer.
# 2- Treating high blood pressure, high blood lipids, diabetes.
# 3- Heart failure, heart attack, stroke, aneurysm, peripheral artery disease, sudden cardiac arrest. Deaths: 17.9 million / 32% (2015)
# 4- Heart failure and stroke are common causes of death.

In [11]:
# Example usage (Sentence: This is an application about Breast Cancer.)
if __name__ == "__main__":
    user_text = input("Enter your text: ")

    if not user_text.strip():
        # Nothing to embed: skip the model and database calls
        print("No text entered; nothing to search for.")
    else:
        # Get sentence embedding
        full_text_embedding = get_text_embedding(user_text)
        print("\nSentence Embedding vector shape:", full_text_embedding.shape)
        print("Sentence Embedding (first 10 values):", full_text_embedding[:10])

        # Call function to run similarity query
        similar_nodes = find_similar_nodes(full_text_embedding, top_n=5)

        # Show the result
        print(f"\nTop {len(similar_nodes)} similar nodes:")
        for node_data in similar_nodes:
            print(f"Node: {node_data['n']}, Similarity: {node_data['similarity']:.4f}")

Enter your text: Heart failure, heart attack, stroke, aneurysm, peripheral artery disease, sudden cardiac arrest. Deaths: 17.9 million / 32% (2015)

Sentence Embedding vector shape: (768,)
Sentence Embedding (first 10 values): [ 0.43869126 -0.26175603 -0.7007977   0.22565924 -0.38932806 -0.41834965
 -1.1746391  -0.8032964   0.19345134 -0.10638831]

Top 5 similar nodes:
Node: {'date': '01-7-2023', 'journal': 'Journal of Health Psychology', 'hub': 0.0, 'auth': 2.843039039510408e-56, 'subjects': 'Articles', 'pmc': 'PMC9679309', 'abstract': 'The COVID-19 pandemic continues to impact global psychological wellbeing. To investigate the sustained impact of COVID-19 on wellbeing, the current study longitudinally assessed fear of COVID-19, anxiety, depression, intolerance of uncertainty, worry, sleep quality, loneliness and alcohol use during the pandemic in the United Kingdom. Timepoint 1 (T1; N =\u2009445) took place in February 2021 following the highest number of pandemic-related deaths in t