In [1]:
import os
import json
import getpass
from dotenv import load_dotenv

# Environment file for the plain Free Trial instance (disabled)
#load_dotenv('.env', override=True)

# Environment file for the PROFESSIONAL free trial, which has the
# data science graph algorithms enabled
load_dotenv('podcast-gds.env', override=True)

# Prompt for any connection setting the environment does not provide yet
for setting in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD'):
    if not os.environ.get(setting):
        os.environ[setting] = getpass.getpass(f'{setting}:\n')

# Expose the settings as module-level names for the cells that follow
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(setting) for setting in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

In [4]:
from neo4j import GraphDatabase

# Open the Neo4j connection that the query cells below rely on

# Instantiate the driver from the connection settings read in the first cell
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# Test the Neo4j connection by counting every node in the graph
driver.execute_query("MATCH(n) RETURN count(n)")

EagerResult(records=[<Record count(n)=498>], summary=<neo4j._work.summary.ResultSummary object at 0x11fabe750>, keys=['count(n)'])

#### Different ways to get specific properties of nodes

#### Method 1: Return specific properties in the query
# All three lookups fetch the same person. The name is sent as the $name query
# parameter instead of being repeated inside each Cypher string, so changing
# the person means editing one line.
PERSON_NAME = 'Sangeetha Ramadurai'

# Method 1: ask Cypher for the individual properties
print("=== Method 1: Return specific properties ===")
result1 = driver.execute_query(
    "MATCH (p:Person {name: $name}) RETURN p.name, p.email, p.title",
    name=PERSON_NAME,
)
for record in result1.records:
    print(f"Name: {record['p.name']}")
    print(f"Email: {record['p.email']}")
    print(f"Title: {record['p.title']}")
    print("---")

# Method 2: return the whole node and read its properties in Python
print("\n=== Method 2: Access properties from node object ===")
result2 = driver.execute_query(
    "MATCH (p:Person {name: $name}) RETURN p",
    name=PERSON_NAME,
)
for record in result2.records:
    person = record['p']
    print(f"Name: {person.get('name')}")
    print(f"Email: {person.get('email')}")
    print(f"Title: {person.get('title')}")
    print(f"All properties: {dict(person)}")
    print("---")

# Method 3: let Cypher build the property map with properties()
print("\n=== Method 3: Get all properties as a dictionary ===")
result3 = driver.execute_query(
    "MATCH (p:Person {name: $name}) RETURN properties(p) as props",
    name=PERSON_NAME,
)
for record in result3.records:
    props = record['props']
    print(f"All properties: {props}")
    print(f"Specific property 'name': {props.get('name')}")
    print("---")


In [None]:
# Look up one person together with every episode they are connected to
result = driver.execute_query("MATCH (p:Person {name: 'Sangeetha Ramadurai'})-[r]-(ep:Episode) RETURN p, r, ep")

# Show each returned column under its label, one record at a time
column_labels = (("Person", "p"), ("Relationship", "r"), ("Episode", "ep"))
for row in result.records:
    for label, column in column_labels:
        print(f"{label}: {row[column]}")
    print("---")

In [None]:
# List every person-to-episode relationship by name
print("=== Person Names, Episode Names, and Relationship Names ===")

# One row per (person, relationship type, episode), sorted for stable output
relationships_query = """
    MATCH (p:Person)-[r]-(ep:Episode) 
    RETURN p.name as person_name, type(r) as relationship_name, 
    ep.name as episode_name,
    ep.number as episode_number
    ORDER BY p.name, type(r), ep.name
"""
result = driver.execute_query(relationships_query)

for row in result.records:
    print(
        f"Person: {row['person_name']} | Relationship: {row['relationship_name']} | "
        f"Episode: {row['episode_name']} | Episode Number: {row['episode_number']}"
    )

print(f"\nTotal relationships found: {len(result.records)}")


In [None]:
# Read the episode input file; it may hold one episode or a list of them
with open("episode_data/episode_input.json", "r") as f:
    episodes_data = json.load(f)

# A bare object is the legacy single-episode layout; a list holds several
if not isinstance(episodes_data, list):
    print("Loaded single episode (legacy format)")
    single = episodes_data['episode']
    print(f"Episode: {single['name']} (Episode #{single['number']})")
else:
    print(f"Loaded {len(episodes_data)} episodes")
    for position, entry in enumerate(episodes_data, start=1):
        details = entry['episode']
        print(f"Episode {position}: {details['name']} (Episode #{details['number']})")

In [13]:
def create_podcast_and_episode(tx, podcast_title, episode_data):
    """Create or update a podcast and one of its episodes, and link the two.

    The podcast is merged on its title, the same key the person-loading
    queries in this notebook use. A MERGE on the bare ``:Podcast`` label would
    match every existing podcast node and attach the episode to all of them.
    ``pod.id`` is set to the title when the node is created and back-filled on
    existing nodes that do not have one yet.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        podcast_title: Title identifying the podcast.
        episode_data: Mapping with the required keys ``number``, ``name`` and
            ``published_date`` (passed to Cypher ``date()``), plus optional
            ``link`` and ``description`` (both default to ``""``).

    Raises:
        KeyError: If a required key is missing from ``episode_data``.
    """
    tx.run("""
    // Find the podcast by title, creating it when it does not exist yet
    MERGE (pod:Podcast {title: $podcast_title})
      ON CREATE SET pod.id = $podcast_title
      ON MATCH SET pod.id = coalesce(pod.id, $podcast_title)
    WITH pod

    // Create or match episode
    MERGE (ep:Episode {number: $ep_number})
    SET ep.name = $ep_name,
        ep.published_date = date($ep_date),
        ep.link = $ep_link,
        ep.description = $ep_description

    // Connect podcast to episode
    MERGE (pod)-[:HAS_EPISODE]->(ep)
    """,
    podcast_title=podcast_title,
    ep_number=episode_data["number"],
    ep_name=episode_data["name"],
    ep_date=episode_data["published_date"],
    ep_link=episode_data.get("link", ""),
    ep_description=episode_data.get("description", "")
    )

In [6]:
def create_podcast_and_episode_single(tx, podcast_title, episode_data):
    """Merge a podcast (by title) and an episode (by number) and link them.

    ``episode_data`` must provide ``number``, ``name`` and ``published_date``;
    ``link`` and ``description`` fall back to an empty string.
    """
    upsert_query = """
    MERGE (pod:Podcast {title: $podcast_title})
    MERGE (ep:Episode {number: $ep_number})
    SET ep.name = $ep_name,
        ep.published_date = date($ep_date),
        ep.link = $ep_link,
        ep.description = $ep_description
    MERGE (pod)-[:HAS_EPISODE]->(ep)
    """
    query_params = {
        "podcast_title": podcast_title,
        "ep_number": episode_data["number"],
        "ep_name": episode_data["name"],
        "ep_date": episode_data["published_date"],
        "ep_link": episode_data.get("link", ""),
        "ep_description": episode_data.get("description", ""),
    }
    tx.run(upsert_query, **query_params)



In [7]:
def link_transcript_chunks(tx, episode_number, chunks):
    """Link Chunk nodes to an episode, one query per chunk.

    Both the episode and the chunk are looked up with MATCH (chunk by
    ``fileName``), so nothing is created for a chunk that is not found. Each
    linked chunk gets ``fileSource`` set to the entry's own value, or to
    ``ep<episode_number>`` when the entry has none.
    """
    link_query = """
        MATCH (ep:Episode {number: $ep_number})
        MATCH (chunk:Chunk {fileName: $file_name})
        MERGE (ep)-[:HAS_CHUNK]->(chunk)
        SET chunk.fileSource = $file_source
        """
    for chunk_info in chunks:
        tx.run(
            link_query,
            ep_number=episode_number,
            file_name=chunk_info["fileName"],
            file_source=chunk_info.get("fileSource", f"ep{episode_number}"),
        )



In [8]:
def add_reference_links(tx, episode_number, links):
    """Merge a ReferenceLink node per entry and attach it to the episode.

    Each entry in ``links`` must carry ``text`` and ``url``; the pair is the
    merge key of the ReferenceLink node. One query is run per entry.
    """
    reference_query = """
        MATCH (ep:Episode {number: $ep_number})
        MERGE (rl:ReferenceLink {text: $text, url: $url})
        MERGE (ep)-[:HAS_REFERENCE_LINK]->(rl)
        """
    for entry in links:
        tx.run(
            reference_query,
            ep_number=episode_number,
            text=entry["text"],
            url=entry["url"],
        )



In [9]:
def add_topics_concepts_technologies(tx, episode_number, topics):
    """Attach topics, and each topic's concepts and technologies, to an episode.

    Only concept names are stored here; see
    ``add_topics_concepts_technologies_fixed`` for the variant that also
    stores concept descriptions.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        episode_number: Value of ``Episode.number`` to attach the topics to.
        topics: Iterable of mappings with a ``name`` and optional ``concepts``
            and ``technologies`` lists. A concept entry may be a plain name
            string or a mapping with a ``name`` key.
    """
    # Reduce every topic to the shape the query needs. Missing or null lists
    # become empty lists, and concept mappings are reduced to their name
    # because a map cannot be stored as a node property value.
    topic_rows = [
        {
            "name": topic.get("name"),
            "concepts": [
                entry["name"] if isinstance(entry, dict) else entry
                for entry in (topic.get("concepts") or [])
            ],
            "technologies": list(topic.get("technologies") or []),
        }
        for topic in topics
    ]
    tx.run("""
    MATCH (ep:Episode {number: $ep_number})
    UNWIND $topics AS topicData
    MERGE (t:Topic {name: topicData.name})
    MERGE (ep)-[:HAS_TOPIC]->(t)
    // FOREACH instead of chained UNWINDs: an UNWIND over an empty concept list
    // removes the topic row, which would skip that topic's technologies
    FOREACH (conceptName IN topicData.concepts |
        MERGE (c:Concept {name: conceptName})
        MERGE (t)-[:COVERS_CONCEPT]->(c)
    )
    FOREACH (techName IN topicData.technologies |
        MERGE (tech:Technology {name: techName})
        MERGE (t)-[:COVERS_TECHNOLOGY]->(tech)
    )
    """, ep_number=episode_number, topics=topic_rows)



In [10]:
def add_person_to_podcast_episode(tx, podcast_title, episode_number, person_data):
    """Merge hosts, guests and listeners and connect them to podcast and episode.

    ``person_data`` may contain the lists ``hosts``, ``guests`` and
    ``listeners``; every entry must have a ``name``. Roles are processed in
    that order and one query is run per person. Each query also merges the
    podcast (by title), the episode (by number) and the HAS_EPISODE link
    between them.
    """
    host_query = """
        MERGE (host:Person {name: $host_name})
        MERGE (pod:Podcast {title: $podcast_title})
        MERGE (ep:Episode {number: $episode_number})
        MERGE (pod)-[:HAS_EPISODE]->(ep)
        MERGE (host)-[:IS_A_HOST]->(pod)
        MERGE (host)-[:IS_A_HOST]->(ep)
        """
    guest_query = """
        MERGE (guest:Person {name: $guest_name})
        MERGE (pod:Podcast {title: $podcast_title})
        MERGE (ep:Episode {number: $episode_number})
        MERGE (pod)-[:HAS_EPISODE]->(ep)
        MERGE (guest)-[:IS_A_GUEST]->(ep)
        """
    listener_query = """
        MERGE (listener:Person {name: $listener_name})
        MERGE (pod:Podcast {title: $podcast_title})
        MERGE (ep:Episode {number: $episode_number})
        MERGE (listener)-[:SUBSCRIBES_TO]->(pod)
        MERGE (listener)-[:LISTENS_TO]->(pod)
        MERGE (pod)-[:HAS_EPISODE]->(ep)
        MERGE (listener)-[:LISTENS_TO_EPISODE]->(ep)
        MERGE (listener)-[:LEARNING_FROM]->(ep)
        """

    # (key in person_data, name of the query parameter, query to run)
    role_plan = (
        ("hosts", "host_name", host_query),
        ("guests", "guest_name", guest_query),
        ("listeners", "listener_name", listener_query),
    )
    for role_key, name_param, role_query in role_plan:
        for person in person_data.get(role_key, []):
            tx.run(
                role_query,
                podcast_title=podcast_title,
                episode_number=episode_number,
                **{name_param: person["name"]},
            )



In [11]:
# CORRECTED VERSION - handles concept objects (name + description) and topics
# whose concept or technology list is empty or missing
def add_topics_concepts_technologies_fixed(tx, episode_number, topics):
    """Attach topics, concepts (with descriptions) and technologies to an episode.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        episode_number: Value of ``Episode.number`` to attach the topics to.
        topics: List of mappings with a ``name`` and optional ``concepts``
            (mappings with ``name`` and ``description``) and ``technologies``
            (name strings). The list is passed to the query unchanged.
    """
    tx.run("""
    MATCH (ep:Episode {number: $ep_number})
    UNWIND $topics AS topicData
    MERGE (t:Topic {name: topicData.name})
    MERGE (ep)-[:HAS_TOPIC]->(t)
    // FOREACH instead of chained UNWINDs: an UNWIND over an empty or missing
    // concept list removes the topic row, which would skip that topic's
    // technologies; it also repeats the technology merges once per concept
    FOREACH (conceptData IN coalesce(topicData.concepts, []) |
        MERGE (c:Concept {name: conceptData.name})
        SET c.description = conceptData.description
        MERGE (t)-[:COVERS_CONCEPT]->(c)
    )
    FOREACH (techName IN coalesce(topicData.technologies, []) |
        MERGE (tech:Technology {name: techName})
        MERGE (t)-[:COVERS_TECHNOLOGY]->(tech)
    )
    """, ep_number=episode_number, topics=topics)


#### Podcast Node Uniqueness Constraint

In [12]:
# Make sure no two Podcast nodes can share the same id
print("Creating unique constraint for Podcast nodes...")


def _create_podcast_constraint(tx):
    """Run the constraint statement; IF NOT EXISTS makes it safe to re-run."""
    return tx.run("""
            CREATE CONSTRAINT podcast_unique_id IF NOT EXISTS
            FOR (p:Podcast)
            REQUIRE p.id IS UNIQUE
        """)


with driver.session() as session:
    try:
        result = session.execute_write(_create_podcast_constraint)
    except Exception as e:
        print(f"❌ Error creating constraint: {e}")
    else:
        print("✅ Podcast unique constraint created successfully!")
   

Creating unique constraint for Podcast nodes...
✅ Podcast unique constraint created successfully!


#### Episode loader 

In [14]:
# Load every episode from the input file into Neo4j
import json

# Read the episode data; JSON is exchanged as UTF-8, so do not depend on the
# platform's default encoding
with open('episode_data/episode_input.json', 'r', encoding='utf-8') as f:
    episodes_data = json.load(f)

# A single object is the legacy layout; wrap it so the loop handles both cases
episodes_list = episodes_data if isinstance(episodes_data, list) else [episodes_data]

# Release a driver opened earlier in the notebook before `driver` is rebound,
# so its connections are not left open
_previous_driver = globals().get("driver")
if _previous_driver is not None:
    _previous_driver.close()

# -----------------------------
# Main Execution with Fixed Function - Handle Multiple Episodes
# -----------------------------
# The driver is used as a context manager so it is closed even when one of the
# writes below raises
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as driver:
    with driver.session() as session:
        for episode_entry in episodes_list:
            print(f"Processing episode: {episode_entry['episode']['name']} (Episode #{episode_entry['episode']['number']})")

            podcast_title = episode_entry["podcast"]["title"]
            episode_data = episode_entry["episode"]
            episode_number = episode_data["number"]

            # Create podcast and episode
            session.execute_write(create_podcast_and_episode, podcast_title, episode_data)

            # Transcript chunks
            if "transcript_chunks" in episode_data:
                session.execute_write(link_transcript_chunks, episode_number, episode_data["transcript_chunks"])

            # Reference links
            if "reference_links" in episode_data:
                session.execute_write(add_reference_links, episode_number, episode_data["reference_links"])

            # Topics, concepts, technologies - USING FIXED FUNCTION
            if "topics" in episode_data:
                session.execute_write(add_topics_concepts_technologies_fixed, episode_number, episode_data["topics"])

            # Add person relationships (hosts, guests, listeners)
            if "person" in episode_entry:
                session.execute_write(add_person_to_podcast_episode, podcast_title, episode_number, episode_entry["person"])

print(f"Data loaded successfully! Processed {len(episodes_list)} episodes.")


Processing episode: Iceberg at Netflix and Beyond with Ryan Blue (Episode #1654)
Processing episode: Prompts as Functions: The BAML Revolution in AI Engineering (Episode #2025040307)
Data loaded successfully! Processed 2 episodes.


#### Close driver

In [None]:
# Close the Neo4j driver (the loader cell above also closes it after loading)
driver.close()

## Testing

In [None]:
# Inspect every reference link attached to an episode.
# Raw Cypher is not valid Python, so the query is sent through the driver. The
# notebook's driver is closed by this point, so a short-lived one is opened.
reference_link_query = "MATCH (e1:Episode)-[r:HAS_REFERENCE_LINK]-(other) RETURN e1,r,other"
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as check_driver:
    reference_links = check_driver.execute_query(reference_link_query)

reference_links.records

In [None]:
# Dry run: preview what the loader would process (no database calls here)
import json
print("Testing the updated code...")

# Read the episode input file
with open('episode_data/episode_input.json', 'r') as f:
    episodes_data = json.load(f)

# A bare object is the legacy single-episode layout; a list holds several
if not isinstance(episodes_data, list):
    episodes_list = [episodes_data]
    print("Found 1 episode (legacy format)")
else:
    episodes_list = episodes_data
    print(f"Found {len(episodes_list)} episodes to process")

# Summarise each episode: identity, people by role, link and topic counts
for position, episode_entry in enumerate(episodes_list, start=1):
    episode_data = episode_entry['episode']
    person_data = episode_entry.get('person', {})
    topics_data = episode_data.get('topics', [])

    print(f"\nEpisode {position}:")
    print(f"  Name: {episode_data['name']}")
    print(f"  Number: {episode_data['number']}")
    print(f"  Podcast: {episode_entry['podcast']['title']}")
    for role_label, role_key in (("Hosts", "hosts"), ("Guests", "guests"), ("Listeners", "listeners")):
        print(f"  {role_label}: {[person['name'] for person in person_data.get(role_key, [])]}")
    print(f"  Reference Links: {len(episode_data.get('reference_links', []))}")
    print(f"  Topics: {len(topics_data)}")

    # Topic details; nothing is printed when the episode has no topics
    for topic_position, topic in enumerate(topics_data, start=1):
        print(f"    Topic {topic_position}: {topic['name']}")
        concept_names = [concept['name'] for concept in topic.get('concepts', [])]
        print(f"      Concepts ({len(concept_names)}): {concept_names}")
        print(f"      Technologies: {topic.get('technologies', [])}")
