In [1]:
import os
import getpass
from dotenv import load_dotenv

# Environment setup: read settings from the local env file. override=True lets
# values from the file replace variables already set in the process environment.
load_dotenv('podcast-gds.env', override=True)

# Any Neo4j setting that is still missing (or empty) is requested interactively.
# getpass is used for all three prompts, so the typed value is not echoed.
os.environ['NEO4J_URI'] = os.environ.get('NEO4J_URI') or getpass.getpass('NEO4J_URI:\n')
os.environ['NEO4J_USERNAME'] = os.environ.get('NEO4J_USERNAME') or getpass.getpass('NEO4J_USERNAME:\n')
os.environ['NEO4J_PASSWORD'] = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('NEO4J_PASSWORD:\n')

# Module-level connection settings consumed by the cell that opens the driver.
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')

In [2]:
from neo4j import GraphDatabase

# Open the Neo4j driver that the later cells in this notebook reuse.

# Instantiate the driver with the URI and basic-auth credentials read in the environment cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# Smoke-test the connection by counting every node; as the cell's last expression the result is displayed.
driver.execute_query("MATCH(n) RETURN count(n)")

EagerResult(records=[<Record count(n)=584>], summary=<neo4j._work.summary.ResultSummary object at 0x1292913d0>, keys=['count(n)'])

In [3]:
from openai import OpenAI

# OpenAI client used by embed_text for embedding requests.
# os.environ[...] raises KeyError when OPENAI_API_KEY is unset (unlike the Neo4j settings, it is not prompted for).
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
from neo4j import GraphDatabase
from openai import OpenAI
import nltk
from tiktoken import get_encoding

# Fetch the NLTK 'punkt' tokenizer data; the captured output shows it is skipped when already up to date.
nltk.download('punkt')
# tiktoken's cl100k_base encoding, used by chunk_text to count and slice tokens.
encoding = get_encoding("cl100k_base")

# --- Step 1: Chunking function ---
def chunk_text(text, max_tokens=400, overlap=50):
    """Split ``text`` into sentence-aligned chunks of roughly ``max_tokens`` tokens.

    Sentences come from ``nltk.sent_tokenize`` and are never split, so a single
    sentence longer than ``max_tokens`` still ends up in a chunk of its own
    that exceeds the limit. Token counts use the module-level ``encoding``.

    Args:
        text: The full text to chunk.
        max_tokens: Token budget that triggers a flush of the current chunk.
        overlap: Number of trailing tokens of a flushed chunk that are decoded
            and prepended to the next chunk. Zero or a negative value disables
            the overlap.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        contains no sentences. No chunk is ever the empty string.
    """
    sentences = nltk.sent_tokenize(text)
    chunks, current_chunk, current_length = [], [], 0

    for sentence in sentences:
        sentence_tokens = len(encoding.encode(sentence))
        # Flush only when something has been accumulated: without this guard an
        # oversized first sentence would emit an empty "" chunk.
        if current_chunk and current_length + sentence_tokens > max_tokens:
            finished_chunk = " ".join(current_chunk)
            chunks.append(finished_chunk)
            if overlap > 0:
                # Carry the tail of the flushed chunk forward for context continuity.
                overlap_tokens = encoding.encode(finished_chunk)[-overlap:]
                overlap_text = encoding.decode(overlap_tokens)
                current_chunk = [overlap_text, sentence]
                current_length = len(encoding.encode(overlap_text)) + sentence_tokens
            else:
                # tokens[-0:] is the WHOLE list, so zero overlap must not go
                # through the slice above or every chunk would repeat its predecessor.
                current_chunk = [sentence]
                current_length = sentence_tokens
        else:
            current_chunk.append(sentence)
            current_length += sentence_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# This model produces 1536-dimensional vectors
def embed_text(text):
    """Return the embedding vector for ``text``.

    Sends a single request for the ``text-embedding-3-small`` model through the
    module-level ``client`` and returns the embedding of the first (only)
    item in the response.
    """
    embedding_response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# --- Step 3: Write transaction for chunks ---
def create_chunk_node(tx, episode_number, file_name, order, text, embedding):
    """Transaction function that upserts one Chunk node and links it to its Episode.

    The Episode is merged on ``number`` and the Chunk on ``(order, fileName)``;
    the chunk's ``text`` and ``embedding`` are (re)set on every call, and both
    the HAS_CHUNK and BELONGS_TO_EPISODE relationships are merged.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        episode_number: Value of the Episode ``number`` property.
        file_name: Source file name stored as the chunk's ``fileName``.
        order: Position of the chunk within the file.
        text: Chunk text.
        embedding: Embedding vector stored on the chunk.
    """
    merge_chunk_query = """
    MERGE (ep:Episode {number: $episode_number})
    MERGE (chunk:Chunk {order: $order, fileName: $file_name})
      SET chunk.text = $text,
          chunk.embedding = $embedding
    MERGE (ep)-[:HAS_CHUNK]->(chunk)
    MERGE (chunk)-[:BELONGS_TO_EPISODE]->(ep)
    """
    tx.run(
        merge_chunk_query,
        embedding=embedding,
        text=text,
        order=order,
        file_name=file_name,
        episode_number=episode_number,
    )

def add_chunks_to_neo4j(episode_number, file_name, text):
    """Chunk ``text``, embed every chunk and persist each one to Neo4j.

    Chunks are numbered from 1 in document order. Each chunk is embedded with
    ``embed_text`` and written in its own write transaction through
    ``create_chunk_node``.

    Args:
        episode_number: Episode ``number`` the chunks belong to.
        file_name: Source file name recorded on every chunk.
        text: Full text to split into chunks.
    """
    pieces = chunk_text(text)
    with driver.session() as session:
        for position, piece in enumerate(pieces, 1):
            vector = embed_text(piece)
            session.execute_write(
                create_chunk_node, episode_number, file_name, position, piece, vector
            )



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sangeethar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

import os.path as osp


# Directory that the episode .txt files below are joined onto.
# NOTE(review): absolute path on one user's machine; adjust before running elsewhere.
BASE_DIR = "/Users/sangeethar/workspace/AI-Workspace/neo4j-employee-graph/neo4j-employee-graph/input-podcast-episodes-data/"

#Already Processed
#osp.join(BASE_DIR, "AnthropicAndModelContextProtocol-MCP-WithDavidSoriaParra-ep1836.txt"),
#osp.join(BASE_DIR, "delayed-view-semantics-incremental-data-processing-ep473.txt"),
#osp.join(BASE_DIR, "DuckLake-SimplifyingLakehouseEcosystem-ep480.txt"),
#osp.join(BASE_DIR, "kuzudb-embeddable-graph-database-ep477.txt"),




# --- Step 4: Example usage ---


#### Chunk and Embed PromptsAsFunctions-BAML-Revolution_AI-Engineering-ep2025040307.txt and Iceberg-At-NetflixAndBeyond-RyanBlue-ep1654.txt

In [None]:
# Files for the BAML and Iceberg episodes.
# NOTE(review): a later cell rebinds FILE_PATHS before the batch-processing loop, so when
# the cells run top to bottom the loop sees that later list, not this one.
FILE_PATHS = [
    osp.join(BASE_DIR, "PromptsAsFunctions-BAML-Revolution_AI-Engineering-ep2025040307.txt"),
    osp.join(BASE_DIR, "Iceberg-At-NetflixAndBeyond-RyanBlue-ep1654.txt")
]


#### Chunk and Embed delayed-view-semantics-incremental-data-processing-ep473.txt

In [None]:
# Single-file ingest for episode 473: the file name is stated once and the full
# path is derived from it.
file_name = "delayed-view-semantics-incremental-data-processing-ep473.txt"
file_path = osp.join(BASE_DIR, file_name)

# Read the whole file in text mode (the default mode of open()).
with open(file_path) as f:
    text = f.read()

# Chunk, embed and store the text under episode number 473.
add_chunks_to_neo4j(473, file_name, text)

#### Chunk and Embed Kuzu, MCP, DuckLake Episodes

In [None]:
import os.path as osp


# NOTE(review): absolute path on one user's machine; it repeats the BASE_DIR assignment from the earlier cell.
BASE_DIR = "/Users/sangeethar/workspace/AI-Workspace/neo4j-employee-graph/neo4j-employee-graph/input-podcast-episodes-data/"

# Files consumed by the batch-processing loop; this assignment replaces any FILE_PATHS bound by an earlier cell.
FILE_PATHS = [
    osp.join(BASE_DIR, "AnthropicAndModelContextProtocol-MCP-WithDavidSoriaParra-ep1836.txt"),
    osp.join(BASE_DIR, "DuckLake-SimplifyingLakehouseEcosystem-ep480.txt"),
    osp.join(BASE_DIR, "kuzudb-embeddable-graph-database-ep477.txt"),
]



#### Batch process the files from FILE_PATHS

In [None]:
# Batch ingest: chunk, embed and store every file listed in FILE_PATHS.
print("üöÄ Starting batch processing of all podcast episodes...")

for file_path in FILE_PATHS:
    # Only the base name is stored on the chunk nodes.
    file_name = osp.basename(file_path)

    # The episode number is whatever sits between the last "-ep" and the next "."
    # (expected shape: name-ep###.txt); files that do not parse are skipped.
    try:
        episode_number = int(file_name.rsplit('-ep', 1)[-1].partition('.')[0])
    except (ValueError, IndexError):
        print(f"‚ö†Ô∏è  Warning: Could not extract episode number from {file_name}, skipping...")
        continue

    print(f"\nüìÑ Processing {file_name} (Episode #{episode_number})")

    # A failure on one file is reported and the loop moves on to the next file.
    try:
        with open(file_path) as f:
            text = f.read()

        add_chunks_to_neo4j(episode_number, file_name, text)
        print(f"‚úÖ Successfully processed {file_name}")

    except FileNotFoundError:
        print(f"‚ùå File not found: {file_path}")
    except Exception as e:
        print(f"‚ùå Error processing {file_name}: {e!s}")

print("\nüéâ Batch processing completed!")


#### Helper Methods to create embedding on Episode, Topic, Concept, Technology and ReferenceLink

In [17]:
# Text Summarization Functions for Embedding Generation

def create_embedding_text_for_episode(episode_props):
    """Build the text that is embedded for an Episode node.

    Each truthy property among name, number, description, published_date and
    link contributes one ``"<Label>: <value>"`` segment, in that order;
    segments are joined with ``" | "``. Returns ``""`` when none is set.
    """
    labelled_fields = (
        ('name', 'Episode'),
        ('number', 'Episode Number'),
        ('description', 'Description'),
        ('published_date', 'Published'),
        ('link', 'Link'),
    )
    return " | ".join(
        f"{label}: {episode_props[key]}"
        for key, label in labelled_fields
        if episode_props.get(key)
    )

def create_embedding_text_for_topic(topic_props):
    """Build the text that is embedded for a Topic node.

    Emits ``"Topic: <name>"`` and ``"Description: <description>"`` for the
    properties that are truthy, joined with ``" | "``; ``""`` when neither is.
    """
    labelled_fields = (('name', 'Topic'), ('description', 'Description'))
    return " | ".join(
        f"{label}: {topic_props[key]}"
        for key, label in labelled_fields
        if topic_props.get(key)
    )

def create_embedding_text_for_concept(concept_props):
    """Build the text that is embedded for a Concept node.

    Emits ``"Concept: <name>"`` and ``"Description: <description>"`` for the
    properties that are truthy, joined with ``" | "``; ``""`` when neither is.
    """
    labelled_fields = (('name', 'Concept'), ('description', 'Description'))
    return " | ".join(
        f"{label}: {concept_props[key]}"
        for key, label in labelled_fields
        if concept_props.get(key)
    )

def create_embedding_text_for_technology(tech_props):
    """Build the text that is embedded for a Technology node.

    Emits ``"Technology: <name>"`` and ``"Description: <description>"`` for
    the properties that are truthy, joined with ``" | "``; ``""`` when neither is.
    """
    labelled_fields = (('name', 'Technology'), ('description', 'Description'))
    return " | ".join(
        f"{label}: {tech_props[key]}"
        for key, label in labelled_fields
        if tech_props.get(key)
    )

def create_embedding_text_for_reference_link(ref_props):
    """Build the text that is embedded for a ReferenceLink node.

    Emits ``"Reference: <text>"`` and ``"URL: <url>"`` for the properties that
    are truthy, joined with ``" | "``; ``""`` when neither is.
    """
    labelled_fields = (('text', 'Reference'), ('url', 'URL'))
    return " | ".join(
        f"{label}: {ref_props[key]}"
        for key, label in labelled_fields
        if ref_props.get(key)
    )

# Confirmation that the cell defining the text builders has run.
print("‚úÖ Text summarization functions loaded successfully!")


‚úÖ Text summarization functions loaded successfully!


#### Add Embedding properties to Episode, Topic, Concept, Technology, ReferenceLink

In [None]:
# Neo4j Transaction Functions for Adding Embeddings

def add_embedding_to_episode(tx, episode_number, embedding):
    """Transaction function: set ``embedding`` on the Episode matched by ``number``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        episode_number: Value of the Episode ``number`` property to match.
        embedding: Vector stored in the node's ``embedding`` property.
    """
    set_embedding_query = """
    MATCH (ep:Episode {number: $episode_number})
    SET ep.embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, episode_number=episode_number)

def add_embedding_to_topic(tx, topic_name, embedding):
    """Transaction function: set ``simple_embedding`` on the Topic matched by ``name``.

    Note that the vector is written to ``simple_embedding``, not ``embedding``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        topic_name: Value of the Topic ``name`` property to match.
        embedding: Vector stored in the node's ``simple_embedding`` property.
    """
    set_embedding_query = """
    MATCH (t:Topic {name: $topic_name})
    SET t.simple_embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, topic_name=topic_name)

def add_embedding_to_concept(tx, concept_name, embedding):
    """Transaction function: set ``embedding`` on the Concept matched by ``name``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        concept_name: Value of the Concept ``name`` property to match.
        embedding: Vector stored in the node's ``embedding`` property.
    """
    set_embedding_query = """
    MATCH (c:Concept {name: $concept_name})
    SET c.embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, concept_name=concept_name)

def add_embedding_to_technology(tx, tech_name, embedding):
    """Transaction function: set ``embedding`` on the Technology matched by ``name``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        tech_name: Value of the Technology ``name`` property to match.
        embedding: Vector stored in the node's ``embedding`` property.
    """
    set_embedding_query = """
    MATCH (tech:Technology {name: $tech_name})
    SET tech.embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, tech_name=tech_name)

def add_embedding_to_reference_link(tx, ref_text, ref_url, embedding):
    """Transaction function: set ``embedding`` on the ReferenceLink identified by text and url.

    The match is null-safe. A property-map pattern such as ``{url: $ref_url}``
    never matches when the parameter is null, so a link that lacks ``text`` or
    ``url`` would silently keep no embedding. Here a null parameter matches
    nodes on which that property is absent.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        ref_text: Value of the node's ``text`` property, or None when absent.
        ref_url: Value of the node's ``url`` property, or None when absent.
        embedding: Vector stored in the node's ``embedding`` property.
    """
    tx.run("""
    MATCH (rl:ReferenceLink)
    WHERE (rl.text = $ref_text OR (rl.text IS NULL AND $ref_text IS NULL))
      AND (rl.url = $ref_url OR (rl.url IS NULL AND $ref_url IS NULL))
    SET rl.embedding = $embedding
    """, ref_text=ref_text, ref_url=ref_url, embedding=embedding)

# Confirmation that the cell defining the write-transaction functions has run.
print("‚úÖ Neo4j transaction functions loaded successfully!")


#### Add embedding to Episode, Topic, Concept, Technology, ReferenceLink

In [None]:
# FIXED: Main Execution Function with Proper Transaction Handling
# This fixes ResultConsumedError by collecting data INSIDE the transaction

def _embed_and_store_nodes(session, query, key_fields, build_text, write_tx):
    """Read nodes with ``query``, embed each one and write the vector back.

    Args:
        session: Open Neo4j session used for the read and for every write.
        query: Cypher that returns the columns named in ``key_fields`` plus a
            ``props`` column holding the node's property map.
        key_fields: Column names whose values identify a node; they are passed
            to ``write_tx`` in this order, followed by the embedding.
        build_text: Function turning the property dict into the text to embed.
        write_tx: Write-transaction function ``(tx, *keys, embedding)``.

    Returns:
        The number of nodes for which a write transaction was executed.
    """
    # Records are converted to plain dicts while the read transaction is still
    # open; the result cannot be consumed once it has closed.
    rows = session.execute_read(
        lambda tx: [record.data() for record in tx.run(query)]
    )

    processed = 0
    for row in rows:
        embedding_text = build_text(dict(row['props']))
        if not embedding_text:
            # Nothing to embed for a node with none of the summarised properties.
            continue
        embedding = embed_text(embedding_text)
        if not embedding:
            continue
        node_keys = [row[field] for field in key_fields]
        session.execute_write(write_tx, *node_keys, embedding)
        processed += 1
    return processed


def add_embeddings_to_all_nodes():
    """Process all nodes and add embedding properties - FIXED for transaction scope.

    Handles Episode, Topic, Concept, Technology and ReferenceLink nodes in that
    order, printing a banner and a per-label count for each, then a grand total.
    Each node is embedded with ``embed_text`` and written in its own write
    transaction by the matching ``add_embedding_to_*`` function.
    """
    # (banner, label used in the summary line, read query, key columns,
    #  text builder, write-transaction function)
    node_specs = [
        (
            "\nüì∫ Processing Episode nodes...",
            "Episode",
            """
                MATCH (ep:Episode)
                RETURN ep.number as number, properties(ep) as props
            """,
            ('number',),
            create_embedding_text_for_episode,
            add_embedding_to_episode,
        ),
        (
            "\nüìö Processing Topic nodes...",
            "Topic",
            """
                MATCH (t:Topic)
                RETURN t.name as name, properties(t) as props
            """,
            ('name',),
            create_embedding_text_for_topic,
            add_embedding_to_topic,
        ),
        (
            "\nüí° Processing Concept nodes...",
            "Concept",
            """
                MATCH (c:Concept)
                RETURN c.name as name, properties(c) as props
            """,
            ('name',),
            create_embedding_text_for_concept,
            add_embedding_to_concept,
        ),
        (
            "\nüîß Processing Technology nodes...",
            "Technology",
            """
                MATCH (tech:Technology)
                RETURN tech.name as name, properties(tech) as props
            """,
            ('name',),
            create_embedding_text_for_technology,
            add_embedding_to_technology,
        ),
        (
            "\nüîó Processing ReferenceLink nodes...",
            "ReferenceLink",
            """
                MATCH (rl:ReferenceLink)
                RETURN rl.text as text, rl.url as url, properties(rl) as props
            """,
            ('text', 'url'),
            create_embedding_text_for_reference_link,
            add_embedding_to_reference_link,
        ),
    ]

    with driver.session() as session:
        print("üöÄ Starting embedding generation for all nodes...")

        total_count = 0
        for banner, label, query, key_fields, build_text, write_tx in node_specs:
            print(banner)
            label_count = _embed_and_store_nodes(
                session, query, key_fields, build_text, write_tx
            )
            print(f"‚úÖ Processed {label_count} {label} nodes")
            total_count += label_count

        print("\nüéâ Embedding generation completed!")
        print(f"Total nodes processed: {total_count}")

print("‚úÖ FIXED embedding generation function loaded successfully!")


#### Trigger Embedding creation for Episode, Topic, Concept, Technology, ReferenceLink

In [None]:
# Execute the embedding generation for Episode, Topic, Concept, Technology and ReferenceLink nodes.
# NOTE: add_embeddings_to_all_nodes() prints this same banner itself, so it appears twice in the output.
print("üöÄ Starting embedding generation for all nodes...")
add_embeddings_to_all_nodes()


#### Verify embeddings were created

In [None]:
# Verification: Check that embeddings were created
print("üîç Verifying embeddings were created...")

def _read_count(tx, query):
    """Run a one-row ``count`` query and return the integer.

    The record is read while the transaction is still open; a result handed
    out of the transaction function can no longer be consumed afterwards.
    """
    return tx.run(query).single()["count"]

with driver.session() as session:
    # Check Episode embeddings
    episodes_with_embeddings = session.execute_read(_read_count, """
        MATCH (ep:Episode)
        WHERE ep.embedding IS NOT NULL
        RETURN count(ep) as count
    """)
    print(f"Episodes with embeddings: {episodes_with_embeddings}")

    # Check Topic embeddings. add_embedding_to_topic stores its vector in
    # simple_embedding, so that is the property checked here.
    topics_with_embeddings = session.execute_read(_read_count, """
        MATCH (t:Topic)
        WHERE t.simple_embedding IS NOT NULL
        RETURN count(t) as count
    """)
    print(f"Topics with embeddings: {topics_with_embeddings}")

    # Check Concept embeddings
    concepts_with_embeddings = session.execute_read(_read_count, """
        MATCH (c:Concept)
        WHERE c.embedding IS NOT NULL
        RETURN count(c) as count
    """)
    print(f"Concepts with embeddings: {concepts_with_embeddings}")

    # Check Technology embeddings
    techs_with_embeddings = session.execute_read(_read_count, """
        MATCH (tech:Technology)
        WHERE tech.embedding IS NOT NULL
        RETURN count(tech) as count
    """)
    print(f"Technologies with embeddings: {techs_with_embeddings}")

    # Check ReferenceLink embeddings
    refs_with_embeddings = session.execute_read(_read_count, """
        MATCH (rl:ReferenceLink)
        WHERE rl.embedding IS NOT NULL
        RETURN count(rl) as count
    """)
    print(f"ReferenceLinks with embeddings: {refs_with_embeddings}")

print("‚úÖ Verification completed!")


#### Add Comprehensive topic embedding (topic+concept+technologies)

In [19]:
# Comprehensive Topic Data Collection and Enhanced Embedding Generation

def get_comprehensive_topics_data():
    """Fetch every Topic together with the concepts and technologies it COVERS.

    Returns:
        A list of dicts ordered by topic name, each with ``topic_name``,
        ``concepts`` (maps with ``concept_name`` / ``concept_description``)
        and ``technologies`` (maps with ``tech_name``). Because both matches
        are OPTIONAL, a topic without related nodes yields maps whose values
        are null.
    """

    def _fetch_topic_rows(tx):
        # Convert records to plain dicts before the read transaction closes.
        result = tx.run("""
                MATCH (t:Topic)
                OPTIONAL MATCH (t)-[:COVERS]->(c:Concept)
                OPTIONAL MATCH (t)-[:COVERS]->(tech:Technology)
                RETURN t.name as topic_name,
                       collect(DISTINCT {
                           concept_name: c.name,
                           concept_description: c.description
                       }) as concepts,
                       collect(DISTINCT {
                           tech_name: tech.name
                       }) as technologies
                ORDER BY t.name
            """)
        return [record.data() for record in result]

    with driver.session() as session:
        print("üîç Collecting comprehensive topic data...")
        topic_rows = session.execute_read(_fetch_topic_rows)
        print(f"‚úÖ Collected data for {len(topic_rows)} topics")
        return topic_rows

def create_comprehensive_embedding_text_for_topic(topic_data):
    """Build the embedding text for a topic plus its related concepts and technologies.

    The text has up to three ``" | "``-joined sections: ``"Topic: <name>"``,
    ``"Related Concepts: ..."`` (each concept as ``"Concept: <name>"`` with an
    optional ``" - <description>"``) and ``"Related Technologies: ..."``
    (each as ``"Technology: <name>"``). Entries without a name are skipped.

    Returns:
        The combined text, or None when no section could be produced.
    """
    sections = []

    if topic_data.get('topic_name'):
        sections.append(f"Topic: {topic_data['topic_name']}")

    # Concepts: name is mandatory, description is appended only when present.
    concept_entries = [
        f"Concept: {concept['concept_name']}"
        + (f" - {concept['concept_description']}" if concept.get('concept_description') else "")
        for concept in (topic_data.get('concepts', []) or [])
        if concept.get('concept_name')
    ]
    if concept_entries:
        sections.append("Related Concepts: " + " | ".join(concept_entries))

    # Technologies: name only.
    technology_entries = [
        f"Technology: {tech['tech_name']}"
        for tech in (topic_data.get('technologies', []) or [])
        if tech.get('tech_name')
    ]
    if technology_entries:
        sections.append("Related Technologies: " + " | ".join(technology_entries))

    return " | ".join(sections) if sections else None

def add_comprehensive_embedding_to_topic(tx, topic_name, embedding):
    """Transaction function: store the comprehensive vector in the Topic's ``embedding`` property.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        topic_name: Value of the Topic ``name`` property to match.
        embedding: Vector stored in the node's ``embedding`` property.
    """
    set_embedding_query = """
    MATCH (t:Topic {name: $topic_name})
    SET t.embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, topic_name=topic_name)

def process_comprehensive_topic_embeddings():
    """Embed every topic together with its related concepts and technologies.

    Topic rows come from ``get_comprehensive_topics_data``; each row is turned
    into text, embedded with ``embed_text`` and written in its own write
    transaction by ``add_comprehensive_embedding_to_topic``. Rows that yield no
    text or no embedding are skipped. Progress is printed per topic.
    """
    with driver.session() as session:
        print("üöÄ Starting comprehensive topic embedding generation...")

        written = 0
        for topic_row in get_comprehensive_topics_data():
            name = topic_row['topic_name']

            summary = create_comprehensive_embedding_text_for_topic(topic_row)
            if not summary:
                continue

            vector = embed_text(summary)
            if not vector:
                continue

            session.execute_write(add_comprehensive_embedding_to_topic, name, vector)
            written += 1
            print(f"‚úÖ Processed comprehensive embedding for topic: {name}")

        print(f"\nüéâ Processed {written} topics with comprehensive embeddings!")

# Confirmation that the cell defining the comprehensive-embedding functions has run.
print("‚úÖ Comprehensive topic embedding functions loaded successfully!")


‚úÖ Comprehensive topic embedding functions loaded successfully!


#### Trigger Comprehensive Topic embedding creation

In [20]:
# Execute comprehensive topic embedding generation
# NOTE: process_comprehensive_topic_embeddings() prints this same banner itself, which is why it shows twice in the output.
print("üöÄ Starting comprehensive topic embedding generation...")
process_comprehensive_topic_embeddings()

üöÄ Starting comprehensive topic embedding generation...
üöÄ Starting comprehensive topic embedding generation...
üîç Collecting comprehensive topic data...




‚úÖ Collected data for 6 topics
‚úÖ Processed comprehensive embedding for topic: Apache Iceberg Table Format and Data Lakes
‚úÖ Processed comprehensive embedding for topic: BAML: Prompts as Structured Functions
‚úÖ Processed comprehensive embedding for topic: Delayed View Semantics and Incremental Processing
‚úÖ Processed comprehensive embedding for topic: Duck Lake: The SQL-Backed Open Standard for Simplified Lakehouse Metadata
‚úÖ Processed comprehensive embedding for topic: Embeddable Graph Databases with KuzuDB
‚úÖ Processed comprehensive embedding for topic: Model Context Protocol (MCP): The Open Standard for Context-Aware AI

üéâ Processed 6 topics with comprehensive embeddings!


#### Verifying comprehensive topic embedding was created successfully

In [11]:
# Verification: Check comprehensive embeddings were created (FIXED for transaction scope)
print("üîç Verifying comprehensive topic embeddings were created...")

with driver.session() as session:
    # add_comprehensive_embedding_to_topic stores the comprehensive vector in
    # t.embedding, so that is the property verified here. Checking a differently
    # named property would only report leftovers from an earlier run.
    # Records are collected INSIDE the transaction using .data()
    topics_data = session.execute_read(lambda tx:
        [record.data() for record in tx.run("""
            MATCH (t:Topic)
            WHERE t.embedding IS NOT NULL
            RETURN count(t) as count
        """)]
    )

    # Process collected data AFTER transaction closes
    for record in topics_data:
        print(f"Topics with comprehensive embeddings: {record['count']}")
        break  # Only need the first (and only) record

    # Show a sample of topics with the dimension of the stored vector
    sample_data = session.execute_read(lambda tx:
        [record.data() for record in tx.run("""
            MATCH (t:Topic)
            WHERE t.embedding IS NOT NULL
            RETURN t.name as topic_name,
                   size(t.embedding) as embedding_dimension
            LIMIT 10
        """)]
    )

    print("\nüìä Sample comprehensive embeddings:")
    for record in sample_data:
        print(f"  - {record['topic_name']}: {record['embedding_dimension']} dimensions")

print("‚úÖ Verification completed!")


üîç Verifying comprehensive topic embeddings were created...
Topics with comprehensive embeddings: 6

üìä Sample comprehensive embeddings:
  - Embeddable Graph Databases with KuzuDB: 1536 dimensions
  - Duck Lake: The SQL-Backed Open Standard for Simplified Lakehouse Metadata: 1536 dimensions
  - Delayed View Semantics and Incremental Processing: 1536 dimensions
  - Model Context Protocol (MCP): The Open Standard for Context-Aware AI: 1536 dimensions
  - Apache Iceberg Table Format and Data Lakes: 1536 dimensions
  - BAML: Prompts as Structured Functions: 1536 dimensions
‚úÖ Verification completed!


#### List all the keys to verify the property is listed

In [None]:
// List the distinct property keys present on Topic nodes, sorted alphabetically.
MATCH (e:Topic)
UNWIND keys(e) AS key // Get the list of keys for each Topic node and flatten them
RETURN DISTINCT key AS topicPropertyKey
ORDER BY key

In [None]:
// Return the full property map of every Topic node.
MATCH (e:Topic)
RETURN properties(e)

#### Handle Technology nodes that have a name and id but no embedding -- process them and create their embeddings. These are auto-detected Technology (entity) nodes

In [None]:
# Handle Technology Nodes with Missing Names

def create_embedding_text_for_mentioned_technology(tech_props):
    """Build the embedding text for a Technology node from its name alone.

    Returns:
        ``"Name: <name>"`` when the ``name`` property is truthy, otherwise None.
    """
    if not tech_props.get('name'):
        return None
    return f"Name: {tech_props['name']}"

def process_technology_nodes_without_embeddings():
    """Embed Technology nodes that have a name and an id but no embedding yet.

    Each selected node is turned into text by
    ``create_embedding_text_for_mentioned_technology``, embedded with
    ``embed_text`` and written in its own write transaction by
    ``add_embedding_to_technology`` (which matches on ``name``).
    """

    def _fetch_unembedded_technologies(tx):
        # Convert records to plain dicts before the read transaction closes.
        result = tx.run("""
                MATCH (n:Technology)
                WHERE n.embedding IS NULL 
                  AND n.name IS NOT NULL
                  AND n.id IS NOT NULL
                RETURN n.name as name, properties(n) as props
            """)
        return [record.data() for record in result]

    with driver.session() as session:
        print("üöÄ Processing Technology nodes with missing embeddings...")

        technology_rows = session.execute_read(_fetch_unembedded_technologies)

        written = 0
        for row in technology_rows:
            name = row['name']
            summary = create_embedding_text_for_mentioned_technology(dict(row['props']))
            if not summary:
                continue

            vector = embed_text(summary)
            if not vector:
                continue

            session.execute_write(add_embedding_to_technology, name, vector)
            written += 1
            print(f"‚úÖ Processed Technology: {name}")

        print(f"\nüéâ Processed {written} Technology nodes with missing embeddings!")

# Confirmation that the cell defining the Technology helpers has run.
print("‚úÖ Technology name and embedding functions loaded successfully!")


#### Trigger Process Technology names without embedding

In [None]:
# Execute the Technology node processing: embeds Technology nodes that have a name and id but no embedding yet.
print("üöÄ Starting processing of Technology nodes with missing embeddings...")
process_technology_nodes_without_embeddings()


#### Embedding generation summary

## Embedding Generation Summary

### What was created:
- **Episode embeddings**: Combines name, number, description, published_date, and link
- **Topic embeddings**: `simple_embedding` combines name and description; `embedding` combines the topic name with its related concepts and technologies
- **Concept embeddings**: Combines name and description  
- **Technology embeddings**: Combines name and description
- **ReferenceLink embeddings**: Combines text and url

### How to use the embeddings:

1. **Similarity Search**: Use vector similarity to find related nodes
2. **Semantic Search**: Search for nodes based on meaning rather than exact text
3. **Recommendation Systems**: Find similar episodes, topics, or concepts
4. **Clustering**: Group similar nodes together

### Example Cypher queries with embeddings:

```cypher
// Find episodes similar to a given episode
MATCH (ep1:Episode {number: 1654})
MATCH (ep2:Episode)
WHERE ep1 <> ep2 AND ep2.embedding IS NOT NULL
RETURN ep2.name, ep2.number, 
       gds.similarity.cosine(ep1.embedding, ep2.embedding) as similarity
ORDER BY similarity DESC
LIMIT 5

// Find topics similar to a concept
MATCH (c:Concept {name: "Data Engineering"})
MATCH (t:Topic)
WHERE t.embedding IS NOT NULL
RETURN t.name, 
       gds.similarity.cosine(c.embedding, t.embedding) as similarity
ORDER BY similarity DESC
LIMIT 3
```

### Best Practices:
- Embeddings are generated using OpenAI's `text-embedding-3-small` model
- Each embedding is a 1536-dimensional vector
- Embeddings are stored as arrays in Neo4j
- Use GDS (Graph Data Science) library for similarity calculations
- Consider updating embeddings when node properties change


#### Add Simple Embedding to Topic Nodes

In [12]:
# Add Simple Embedding to Topic Nodes (Extracted from add_embeddings_to_all_nodes)

def add_simple_embedding_to_topic_tx(tx, topic_name, embedding):
    """Transaction function: set ``simple_embedding`` on the Topic matched by ``name``.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        topic_name: Value of the Topic ``name`` property to match.
        embedding: Vector stored in the node's ``simple_embedding`` property.
    """
    set_embedding_query = """
    MATCH (t:Topic {name: $topic_name})
    SET t.simple_embedding = $embedding
    """
    tx.run(set_embedding_query, embedding=embedding, topic_name=topic_name)

def add_simple_embedding_to_topic():
    """Generate and store ``simple_embedding`` for every Topic node.

    Topic-only counterpart of ``add_embeddings_to_all_nodes``: each topic's
    properties are summarised by ``create_embedding_text_for_topic``, embedded
    with ``embed_text`` and written in its own write transaction by
    ``add_simple_embedding_to_topic_tx``.

    Returns:
        The number of Topic nodes for which a write transaction was executed.
    """

    def _fetch_topics(tx):
        # Convert records to plain dicts before the read transaction closes.
        result = tx.run("""
                MATCH (t:Topic)
                RETURN t.name as name, properties(t) as props
            """)
        return [record.data() for record in result]

    with driver.session() as session:
        print("üöÄ Starting simple embedding generation for Topic nodes...")
        print("\nüìö Processing Topic nodes...")

        topic_rows = session.execute_read(_fetch_topics)

        written = 0
        for row in topic_rows:
            name = row['name']
            summary = create_embedding_text_for_topic(dict(row['props']))
            if not summary:
                continue

            vector = embed_text(summary)
            if not vector:
                continue

            session.execute_write(add_simple_embedding_to_topic_tx, name, vector)
            written += 1

        print(f"‚úÖ Processed {written} Topic nodes")
        return written

# Confirmation that the cell defining the simple-embedding functions has run.
print("‚úÖ Simple embedding functions for Topic nodes loaded successfully!")


‚úÖ Simple embedding functions for Topic nodes loaded successfully!


Trigger creation of simple_embedding property to Topic Nodes

In [18]:
# add_simple_embedding_to_topic() writes the name+description vector to
# simple_embedding, so the banner names the simple (not comprehensive) run.
# The call is the cell's last expression, so the returned count is displayed.
print("üöÄ Starting simple topic embedding generation...")
add_simple_embedding_to_topic()

üöÄ Starting comprehensive topic embedding generation...
üöÄ Starting simple embedding generation for Topic nodes...

üìö Processing Topic nodes...
‚úÖ Processed 6 Topic nodes


6