# Neo4j Aura Professional Trial Instance : Episode with Graph Data Science (GDS)

## Use APIKey Credentials generated to work with GDS calls

In [1]:
import os
import json
import getpass
from dotenv import load_dotenv
from neo4j_viz.neo4j import from_neo4j

#get env setup for Free Trial
#load_dotenv('.env', override=True)

#get env set up for PROFESSIONAL free trial with data science graph algorithms enabled
load_dotenv('podcast-gds.env', override=True)

# Neo4j connection variables
if not os.environ.get('NEO4J_URI'):
    os.environ['NEO4J_URI'] = getpass.getpass('NEO4J_URI:\n')
if not os.environ.get('NEO4J_USERNAME'):
    os.environ['NEO4J_USERNAME'] = getpass.getpass('NEO4J_USERNAME:\n')
if not os.environ.get('NEO4J_PASSWORD'):
    os.environ['NEO4J_PASSWORD'] = getpass.getpass('NEO4J_PASSWORD:\n')

# Client credentials for API access
if not os.environ.get('CLIENT_SECRET'):
    os.environ['CLIENT_SECRET'] = getpass.getpass('CLIENT_SECRET:\n')
if not os.environ.get('CLIENT_ID'):
    os.environ['CLIENT_ID'] = getpass.getpass('CLIENT_ID:\n')
if not os.environ.get('CLIENT_NAME'):
    os.environ['CLIENT_NAME'] = getpass.getpass('CLIENT_NAME:\n')

# Assign environment variables
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')
AURA_INSTANCEID = os.getenv('AURA_INSTANCEID')
AURA_INSTANCENAME = os.getenv('AURA_INSTANCENAME')

# Client credentials
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_NAME = os.getenv('CLIENT_NAME')

# OpenAI API key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Print loaded credentials (without sensitive values)
print("‚úÖ Environment variables loaded:")
print(f"  NEO4J_URI: {NEO4J_URI}")
print(f"  NEO4J_USERNAME: {NEO4J_USERNAME}")
print(f"  NEO4J_DATABASE: {NEO4J_DATABASE}")
print(f"  AURA_INSTANCEID: {AURA_INSTANCEID}")
print(f"  AURA_INSTANCENAME: {AURA_INSTANCENAME}")
print(f"  CLIENT_ID: {CLIENT_ID}")
print(f"  CLIENT_NAME: {CLIENT_NAME}")
print(f"  CLIENT_SECRET: {'*' * len(CLIENT_SECRET) if CLIENT_SECRET else 'Not set'}")
print(f"  OPENAI_API_KEY: {'*' * len(OPENAI_API_KEY) if OPENAI_API_KEY else 'Not set'}")

‚úÖ Environment variables loaded:
  NEO4J_URI: neo4j+s://2236ba22.databases.neo4j.io
  NEO4J_USERNAME: neo4j
  NEO4J_DATABASE: neo4j
  AURA_INSTANCEID: 2236ba22
  AURA_INSTANCENAME: Instance02
  CLIENT_ID: Z62ZOi3OpauFjwLDGa9eQngrTLV78BCJ
  CLIENT_NAME: MentalModelAuraPAPIKey
  CLIENT_SECRET: ****************************************************************
  OPENAI_API_KEY: ********************************************************************************************************************************************************************


In [2]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [None]:
def run_query(query, parameters=None):
    with driver.session() as session:
        result = session.run(query, parameters)
        # Collect results as a list
        records = [record.data() for record in result]
        # Print the records
        for record in records:
            print(record)
        return records


In [None]:

# Helper function to run and display Cypher query results
def run_query_v(query, parameters=None):
    with driver.session() as session:
        result = session.run(query, parameters)
        VG = from_neo4j(result)
        return VG

# Run the query; check the connection
query = "MATCH p=()-[]-() limit 20 RETURN p"
vis = run_query_v(query)
vis.render()

#### Check GDS Version

In [None]:
with driver.session() as session:
    # Test 1: Basic GDS version check
    try:
        result = session.run("RETURN gds.version()")
        # Collect results as a list
        records = [record.data() for record in result]
        #version = result.single()['version']
        version = records[0]['gds.version()']
        print(f"‚úÖ GDS Version: {version}")
        print("‚úÖ GDS is available and working!")

        # Print the records
        for record in records:
            print(record)
       
    except Exception as e:
        print(f"‚ùå GDS Version Check Failed: {e}")

In [None]:

def project_graph(tx, source_node_label="Episode", target_node_label="Topic", relationship_label="HAS_TOPIC", graph_name="topicGraph"):
    # Project Episode-Topic graph using standard GDS procedures
    try:
        # First, try the standard GDS procedure (works on most Neo4j instances)
        result = tx.run(
            """
            CALL gds.graph.project(
                $graph_name,
                [$source_node_label,$target_node_label],
                [$relationship_label]
            )
            YIELD graphName, nodeCount, relationshipCount
            RETURN graphName, nodeCount, relationshipCount
            """,
            graph_name=graph_name, source_node_label=source_node_label, target_node_label=target_node_label, relationship_label=relationship_label
        )
        return result.single()
    except Exception as e:
        raise e



In [None]:
# Refactored GDS functions with shared exception handling
def handle_gds_fallback(tx, operation_type, **kwargs):
    """
    Handle GDS fallback when procedures are not found
    Args:
        tx: Neo4j transaction
        operation_type: 'project' or 'similarity'
        **kwargs: Additional parameters for the operation
    """
    if operation_type == 'project':
        print("‚ö†Ô∏è  GDS not available, creating simple graph projection...")
        result = tx.run(
            """
            MATCH (e:Episode)-[r:HAS_TOPIC]->(t:Topic)
            WITH e, t, r
            RETURN count(e) as episodeCount, count(t) as topicCount, count(r) as relationshipCount
            """
        )
        return result.single()
    
    elif operation_type == 'similarity':
        print("‚ö†Ô∏è  GDS not available, using Jaccard similarity on shared topics...")
        episode_number = kwargs.get('episode_number')
        top_n = kwargs.get('top_n', 5)
        
        query = """
        MATCH (e1:Episode {number: $episode_number})-[:HAS_TOPIC]->(t:Topic)<-[:HAS_TOPIC]-(e2:Episode)
        WHERE e1 <> e2
        WITH e1, e2, count(t) as shared_topics
        MATCH (e1)-[:HAS_TOPIC]->(t1:Topic)
        MATCH (e2)-[:HAS_TOPIC]->(t2:Topic)
        WITH e1, e2, shared_topics, count(DISTINCT t1) as topics1, count(DISTINCT t2) as topics2
        WITH e1, e2, shared_topics, topics1, topics2, 
             shared_topics * 1.0 / (topics1 + topics2 - shared_topics) as similarity
        RETURN e2.number AS similar_episode_number, e2.name AS similar_episode_name, similarity
        ORDER BY similarity DESC
        LIMIT $top_n
        """
        result = tx.run(query, episode_number=episode_number, top_n=top_n)
        return [record.data() for record in result]
    
    else:
        raise ValueError(f"Unknown operation type: {operation_type}")




In [None]:
def project_graph_refactored(tx, graph_name="topicGraph"):
    """Project Episode-Topic graph using standard GDS procedures with fallback"""
    try:
        # First, try the standard GDS procedure (works on most Neo4j instances)
        result = tx.run(
            """
            CALL gds.graph.project(
                $graph_name,
                ['Episode','Topic'],
                ['HAS_TOPIC'],
                {memory: 'HEAP:1G'}
            )
            YIELD graphName, nodeCount, relationshipCount
            RETURN graphName, nodeCount, relationshipCount
            """,
            graph_name=graph_name
        )
        return result.single()
    except Exception as e:
        if "ProcedureNotFound" in str(e):
            return handle_gds_fallback(tx, 'project')
        else:
            raise e



In [None]:
def compute_episode_similarity(tx, graph_name="topicGraph", episode_number=None, top_n=5):
    # Compute node similarity using GDS nodeSimilarity
    query = f"""
    CALL gds.nodeSimilarity.stream('{graph_name}')
    YIELD node1, node2, similarity
    WITH gds.util.asNode(node1) AS ep1, gds.util.asNode(node2) AS ep2, similarity
    WHERE ep1.number = $episode_number
    RETURN ep2.number AS similar_episode_number, ep2.name AS similar_episode_name, similarity
    ORDER BY similarity DESC
    LIMIT $top_n
    """
    result = tx.run(query, episode_number=episode_number, top_n=top_n)
    return [record.data() for record in result]



In [None]:
def compute_episode_similarity_refactored(tx, graph_name="topicGraph", episode_number=None, top_n=5):
    """Compute episode similarity using GDS or fallback method"""
    try:
        # Try GDS nodeSimilarity first
        query = f"""
        CALL gds.nodeSimilarity.stream('{graph_name}')
        YIELD node1, node2, similarity
        WITH gds.util.asNode(node1) AS ep1, gds.util.asNode(node2) AS ep2, similarity
        WHERE ep1.number = $episode_number
        RETURN ep2.number AS similar_episode_number, ep2.name AS similar_episode_name, similarity
        ORDER BY similarity DESC
        LIMIT $top_n
        """
        result = tx.run(query, episode_number=episode_number, top_n=top_n)
        return [record.data() for record in result]
    except Exception as e:
        if "ProcedureNotFound" in str(e):
            return handle_gds_fallback(tx, 'similarity', episode_number=episode_number, top_n=top_n)
        else:
            raise e

In [None]:
def drop_graph(tx, graph_name="topicGraph"):
    tx.run(f"CALL gds.graph.drop('{graph_name}') YIELD graphName")

#### Find Similar Episodes based on Topic

In [None]:
# --- Usage ---
episode_number_to_query = 473  # Example episode
with driver.session() as session:
    print("Projecting graph...")
    info = session.execute_write(project_graph, "Episode", "Topic", "HAS_TOPIC", "topicGraph")
    print("Graph projected:", info)

    print(f"\nTop similar episodes for Episode #{episode_number_to_query}:")
    similar_eps = session.execute_read(compute_episode_similarity, "topicGraph", episode_number_to_query, 3)
    for ep in similar_eps:
        print(ep)

    print("\nCleaning up in-memory graph...")
    session.execute_write(drop_graph, "topicGraph")

#### Finding Similar Episodes based on Technology

In [None]:
# --- Usage ---
episode_number_to_query = 473  # Example episode
with driver.session() as session:
    print("Projecting graph...")
    info = session.execute_write(project_graph, "Episode", "Technology", "COVERS_TECHNOLOGY", "technologyGraph")
    print("Graph projected:", info)

    print(f"\nTop similar episodes for Episode #{episode_number_to_query}:")
    similar_eps = session.execute_read(compute_episode_similarity, "technologyGraph", episode_number_to_query, 3)
    for ep in similar_eps:
        print(ep)

    print("\nCleaning up in-memory graph...")
    session.execute_write(drop_graph, "technologyGraph")

#### Manually delete a graph projection

In [None]:
query = """
CALL gds.graph.drop('episode-tech-projection');
"""
try:
    results = run_query(query)
except Exception:
    print("Ignoring error - projection does not exist")

In [None]:
driver.close()

## GDS Test

#### List available GDS graph projections

In [None]:
with driver.session() as session:
        
    # Test 2: Check for specific GDS procedures
    try:
        result = session.run("CALL gds.graph.list()")
        graphs = [record['graphName'] for record in result]
        print(f"üìä Current GDS Graphs: {len(graphs)}")
        if graphs:
            for graph in graphs:
                print(f"  - {graph}")
        else:
            print("  (No graphs currently loaded)")
    except Exception as e:
        print(f"‚ùå GDS List Check Failed: {e}")
        
  


#### List details of available Graph Projections

In [None]:
with driver.session() as session:
        
    # Test 2: Check for specific GDS procedures
    try:
        result = session.run("CALL gds.graph.list() YIELD graphName, nodeCount, relationshipCount, schema") 
        print(f"üìä Graph Projections:")
        for record in result:
            print(f"  - {record['graphName']}:")
            print(f"    Nodes: {record['nodeCount']}")
            print(f"    Relationships: {record['relationshipCount']}")
            print(f"    Schema: {record['schema']}")
    except Exception as e:
        print(f"‚ùå GDS List Check Failed: {e}")

#### Test Refacrtored Graph projection 

In [None]:
# Test the refactored functions
print("üß™ Testing Refactored GDS Functions:")
print("=" * 50)

episode_number_to_query = 473  # Example episode

with driver.session() as session:
    print("1. Testing graph projection...")
    try:
        info = session.execute_write(project_graph_refactored, "topicGraph")
        print(f"‚úÖ Graph projected: {info}")
    except Exception as e:
        print(f"‚ùå Graph projection failed: {e}")


    

print("=" * 50)

#### Test Refactored Similarity Computation

In [None]:
with driver.session() as session:
    print(f"\n2. Testing similarity computation for Episode #{episode_number_to_query}...")
    try:
        similar_eps = session.execute_read(
            compute_episode_similarity_refactored, 
            "topicGraph", 
            episode_number_to_query, 
            3
        )
        print(f"‚úÖ Found {len(similar_eps)} similar episodes:")
        for ep in similar_eps:
            print(f"  - Episode {ep['similar_episode_number']}: {ep['similar_episode_name']} (similarity: {ep['similarity']:.3f})")
    except Exception as e:
        print(f"‚ùå Similarity computation failed: {e}")

#### Test GDS Fallback method directly

In [None]:
with driver.session() as session:
    print(f"\n3. Testing fallback method directly...")
    try:
        # Test the fallback method directly
        fallback_result = session.execute_read(
            lambda tx: handle_gds_fallback(tx, 'similarity', episode_number=episode_number_to_query, top_n=2)
        )
        print(f"‚úÖ Fallback method works: Found {len(fallback_result)} similar episodes")
        for ep in fallback_result:
            print(f"  - Episode {ep['similar_episode_number']}: {ep['similar_episode_name']} (similarity: {ep['similarity']:.3f})")
    except Exception as e:
        print(f"‚ùå Fallback method failed: {e}")

In [None]:
# List Vector Indexes (no creation)



## Create and work with Vector Index on Chunk node and KNN via GDS

#### Show Vector Indexes

In [None]:
print("üîç Showing vector indexes...")
with driver.session() as session:
    result = session.run("SHOW VECTOR INDEXES")
    indexes = [record.data() for record in result]

    if not indexes:
        print("‚ö†Ô∏è  No vector indexes found.")
    else:
        print(f"‚úÖ Found {len(indexes)} vector index(es):\n")
        for idx in indexes:
            print(f"- Name: {idx.get('name', 'N/A')}")
            print(f"  Type: {idx.get('type', 'N/A')}")
            print(f"  State: {idx.get('state', 'N/A')}")
            if 'properties' in idx:
                print(f"  Properties: {idx.get('properties', [])}")
            print()

    has_chunk_index = any(i.get('name') == 'chunkIndex' for i in indexes)
    print(f"üîé chunkIndex present: {'Yes' if has_chunk_index else 'No'}")


#### Create question embedding and Define method to anwer user question using Vector Index

In [6]:
# Vector Search with Question Embedding and Episode Retrieval

from openai import OpenAI

# Initialize OpenAI client
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def create_question_embedding(question):
    """
    Create an embedding for a user's question using OpenAI's text-embedding-3-small model.
    
    Args:
        question (str): The user's question text
        
    Returns:
        list: A 1536-dimensional vector embedding of the question
    """
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=question
    )
    return response.data[0].embedding

def search_episodes_by_question(question, k=5):
    """
    Search for relevant episodes using vector similarity search on chunk embeddings.
    
    Args:
        question (str): The user's question
        k (int): Number of nearest neighbor chunks to retrieve (default: 5)
        
    Returns:
        list: List of dictionaries containing EpisodeTitle, ChunkContent, and SimilarityScore
    """
    # Step 1: Create embedding for the question
    question_embedding = create_question_embedding(question)
    
    # Step 2: Execute vector search query
    with driver.session() as session:
        result = session.run("""
            // Step 1: Query the vector index ('chunkIndex') to find the most similar chunks.
            // $questionEmbedding is the list of floats/integers representing the user's question.
            // $k specifies the number of nearest neighboring chunks to retrieve.
            CALL db.index.vector.queryNodes(
                'chunkIndex',
                $k,
                $questionEmbedding
            )
            YIELD node AS chunk, score

            // Step 2: Match the relationship to find the parent Episode.
            // We use the inverse direction of the BELONGS_TO relationship 
            // to go from the retrieved Chunk node back to the Episode node.
            MATCH (episode:Episode)<-[:BELONGS_TO_EPISODE]-(chunk)

            // Step 3: Return the results, ordered by similarity score.
            RETURN
                episode.name AS EpisodeTitle,
                episode.number AS EpisodeNumber,
                // Return properties of the matching chunk (e.g., its content)
                chunk.text AS ChunkContent, 
                score AS SimilarityScore
            ORDER BY
                SimilarityScore DESC
        """, questionEmbedding=question_embedding, k=k)
        
        # Collect results
        results = []
        for record in result:
            results.append({
                'EpisodeTitle': record['EpisodeTitle'],
                'EpisodeNumber': record['EpisodeNumber'],
                'ChunkContent': record['ChunkContent'],
                'SimilarityScore': record['SimilarityScore']
            })
        
        return results

print("‚úÖ Vector search functions loaded successfully!")


‚úÖ Vector search functions loaded successfully!


### Define Hybrid search to respond to user question (Vector search followed by KNN)

In [8]:
# GDS-Enhanced Vector Search with KNN Relationship Traversal

def search_episodes_gds_by_question(question, k=5, limit=10):
    """
    Extended search that combines vector search with GDS KNN relationships.
    
    This method:
    1. Performs vector search to find seed episodes (like search_episodes_by_question)
    2. Follows pre-calculated SEMANTICALLY_SIMILAR_KNN relationships from seed episodes
    3. Combines and ranks results using both index scores and KNN similarity scores
    
    Args:
        question (str): The user's question
        k (int): Number of nearest neighbor chunks to retrieve for initial search (default: 5)
        limit (int): Total number of results to return (default: 10)
        
    Returns:
        list: List of dictionaries containing:
            - SeedEpisode: Name of the episode found via vector search
            - SeedEpisode_IndexScore: Similarity score from vector index
            - SimilarEpisode: Name of the episode found via KNN relationship
            - KNN_Similarity_Score: Pre-calculated KNN similarity score
    """
    # Step 1: Create embedding for the question
    question_embedding = create_question_embedding(question)
    
    # Step 2: Execute combined vector search + GDS KNN query
    with driver.session() as session:
        result = session.run("""
            // Step 1-2: Query the vector index and find seed episodes
            CALL db.index.vector.queryNodes(
                'chunkIndex',
                $k,
                $questionEmbedding
            )
            YIELD node AS chunk, score AS indexScore

            // Match the relationship to find the parent Episode (seed episode)
            MATCH (seedEpisode:Episode)<-[:BELONGS_TO_EPISODE]-(chunk)

            // Step 3: Follow the pre-calculated KNN relationships from the seed episodes
            OPTIONAL MATCH (seedEpisode)-[r:SEMANTICALLY_SIMILAR_KNN]->(similarEpisode:Episode)

            // Step 4: Combine and rank the results
            RETURN DISTINCT // Use DISTINCT to avoid duplicates if multiple seeds point to the same episode
                seedEpisode.name AS SeedEpisode,
                seedEpisode.number AS SeedEpisodeNumber,
                indexScore AS SeedEpisode_IndexScore,
                similarEpisode.name AS SimilarEpisode,
                similarEpisode.number AS SimilarEpisodeNumber,
                r.knn_score AS KNN_Similarity_Score
            ORDER BY 
                SeedEpisode_IndexScore DESC, // Prioritize results from a stronger index match
                KNN_Similarity_Score DESC // Use KNN score as a secondary rank
            LIMIT $limit // Return the top N overall results
        """, questionEmbedding=question_embedding, k=k, limit=limit)
        
        # Collect results
        results = []
        for record in result:
            results.append({
                'SeedEpisode': record['SeedEpisode'],
                'SeedEpisodeNumber': record['SeedEpisodeNumber'],
                'SeedEpisode_IndexScore': record['SeedEpisode_IndexScore'],
                'SimilarEpisode': record.get('SimilarEpisode'),  # May be None if no KNN relationship
                'SimilarEpisodeNumber': record.get('SimilarEpisodeNumber'),
                'KNN_Similarity_Score': record.get('KNN_Similarity_Score')
            })
        
        return results

print("‚úÖ GDS-enhanced vector search function loaded successfully!")


‚úÖ GDS-enhanced vector search function loaded successfully!


## Test Vector Index on Chunk node with hybrid method (Vector Index + KNN search)

In [16]:
# Example Usage: GDS-Enhanced Vector Search with KNN Relationships

# Example question
user_question = "What is Apache Iceberg and how does it work?"  
#user_question = "What are the top 2 episodes that cover Data Lake concepts"

print(f"üîç GDS-Enhanced Search for: '{user_question}'")
print("=" * 70)

# Perform the GDS-enhanced search
results = search_episodes_gds_by_question(user_question, k=5, limit=10)

# Display results
if results:
    print(f"\n‚úÖ Found {len(results)} results (combining vector search + KNN relationships):\n")
    for i, result in enumerate(results, 1):
        print(f"{i}. Seed Episode: #{result['SeedEpisodeNumber']} - {result['SeedEpisode']}")
        print(f"   Vector Index Score: {result['SeedEpisode_IndexScore']:.4f}")
        
        if result.get('SimilarEpisode'):
            print(f"   ‚Üí Similar Episode (via KNN): #{result['SimilarEpisodeNumber']} - {result['SimilarEpisode']}")
            print(f"   KNN Similarity Score: {result['KNN_Similarity_Score']:.4f}")
        else:
            print(f"   ‚Üí No KNN relationships found for this seed episode")
        print()
else:
    print("\n‚ö†Ô∏è  No results found. Make sure:")
    print("  1. The 'chunkIndex' vector index exists")
    print("  2. Chunk nodes have embeddings")
    print("  3. SEMANTICALLY_SIMILAR_KNN relationships exist between Episode nodes")
    print("  4. KNN relationships have knn_score property")


üîç GDS-Enhanced Search for: 'What is Apache Iceberg and how does it work?'

‚úÖ Found 10 results (combining vector search + KNN relationships):

1. Seed Episode: #1654 - Iceberg at Netflix and Beyond with Ryan Blue
   Vector Index Score: 0.8223
   ‚Üí Similar Episode (via KNN): #473 - Delayed View Semantics In Incremental Data Processing
   KNN Similarity Score: 0.8930

2. Seed Episode: #1654 - Iceberg at Netflix and Beyond with Ryan Blue
   Vector Index Score: 0.8223
   ‚Üí Similar Episode (via KNN): #480 - Simplifying Lakehouse Ecosystem With DuckLake
   KNN Similarity Score: 0.8833

3. Seed Episode: #1654 - Iceberg at Netflix and Beyond with Ryan Blue
   Vector Index Score: 0.8177
   ‚Üí Similar Episode (via KNN): #473 - Delayed View Semantics In Incremental Data Processing
   KNN Similarity Score: 0.8930

4. Seed Episode: #1654 - Iceberg at Netflix and Beyond with Ryan Blue
   Vector Index Score: 0.8177
   ‚Üí Similar Episode (via KNN): #480 - Simplifying Lakehouse Ecosystem With

## Test Vector Index on Chunk node

In [15]:
# Example Usage: Vector Search with Question Embedding

# Example question
#user_question_one = "What is Apache Iceberg and how does it work?" 
user_question = "What are the top 2 episodes that cover Data Lake concepts"


print(f"üîç Searching for episodes related to: '{user_question}'")
print("=" * 70)

# Perform the search
results = search_episodes_by_question(user_question, k=5)

# Display results
if results:
    print(f"\n‚úÖ Found {len(results)} relevant chunks:\n")
    for i, result in enumerate(results, 1):
        print(f"{i}. Episode #{result['EpisodeNumber']}: {result['EpisodeTitle']}")
        print(f"   Similarity Score: {result['SimilarityScore']:.4f}")
        print(f"   Chunk Content: {result['ChunkContent'][:200]}...")
        print()
else:
    print("\n‚ö†Ô∏è  No results found. Make sure:")
    print("  1. The 'chunkIndex' vector index exists")
    print("  2. Chunk nodes have embeddings")
    print("  3. Chunks are connected to Episodes via BELONGS_TO relationship")


üîç Searching for episodes related to: 'What are the top 2 episodes that cover Data Lake concepts'

‚úÖ Found 5 relevant chunks:

1. Episode #480: Simplifying Lakehouse Ecosystem With DuckLake
   Similarity Score: 0.7620
   Chunk Content: Summary
In this episode of the Data Engineering Podcast Hannes M√ºhleisen and Mark Raasveldt, the creators of DuckDB, share their work on Duck Lake, a new entrant in the open lakehouse ecosystem. They ...

2. Episode #1654: Iceberg at Netflix and Beyond with Ryan Blue
   Similarity Score: 0.7401
   Chunk Content:  the ability to have a swappable storage layer. It's the storage layer that you can use underneath Spark. But you can also use it underneath Snowflake now, which is really crazy. When Snowflake starte...

3. Episode #480: Simplifying Lakehouse Ecosystem With DuckLake
   Similarity Score: 0.7317
   Chunk Content:  like, of course, I knew how   SQL worked, like, what a database was, but I'd never really thought about the concept of creating a 