In [1]:
import os
import logging, warnings
from transformers import logging as hf_logging
import transformers
import torch
from huggingface_hub import login
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Logging / warning configuration --------------------------------------
# Raise the log threshold early so transformers and TRL only report errors.
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# Message patterns of UserWarnings that are known to be noisy; each one is
# registered with an "ignore" filter, in this order.
_NOISY_WARNING_PATTERNS = (
    r".*loss_type=None.*ForCausalLMLoss.*",
    r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
)
for _pattern in _NOISY_WARNING_PATTERNS:
    warnings.filterwarnings("ignore", message=_pattern, category=UserWarning)

# NOTE(review): presumably intended to turn off tqdm's notebook widget --
# confirm that something actually reads this variable.
os.environ["TQDM_NOTEBOOK"] = "0"

# --- Hugging Face authentication ------------------------------------------
# Pull variables from a local .env file (if present) into the environment,
# then log in with the token; a missing or empty token aborts the notebook.
load_dotenv()
hf_key = os.environ.get("HUGGINGFACE_API_KEY")
if not hf_key:
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")
login(hf_key)


In [3]:
# Identifier of the checkpoint handed to the pipeline.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build the text-generation pipeline once; later cells call `pipeline(...)`.
# Weights are requested in bfloat16 and device placement is delegated to
# `device_map="auto"`.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.73s/it]


In [12]:
# Smoke-test cell: let the model pick its own topic and generate one short
# HOST/GUEST podcast opening, then print the reply.
#
# The prompt is kept as a triple-quoted block (rather than a single escaped
# line) so it can be read and diffed. The "Topic:" line previously ended in an
# unfinished sentence ("Pick a interesting topic and "); it now states the
# instruction completely.
test_prompt = """Your entire output must start with the exact token: HOST:

If your output contains anything before 'HOST:'—including instructions, explanations, apologies, or any part of this prompt—then your output is invalid.

Your entire output must be ONLY a two-person podcast conversation between HOST and GUEST, formatted strictly as:
HOST: …
GUEST: …
(continue dialogue)

No other text, instructions, or blank lines are permitted.

Content Requirements
Podcast: Adaptive AI Podcast
Topic: Pick an interesting topic
Guest: Create a relevant fictional or historical expert name based on the topic
Length: 150–200 words
Sentences short and clean for TTS.

Dialogue Structure (follow exactly):
HOST: hook about topic
HOST: podcast + host intro
HOST: guest intro
GUEST: brief greeting
HOST: simple first question
GUEST: 3–5 sentence answer
HOST: follow-up question
GUEST: short answer
HOST: closing line: “Let’s get started.”

---BEGIN DIALOGUE---
HOST:"""

# Single-turn chat: the whole prompt is sent as one user message.
messages = [
    {
        "role": "user",
        "content": test_prompt,
    },
]

outputs = pipeline(
    messages,
    max_new_tokens=512,
)

# The last element of "generated_text" is the message whose content is the
# newly generated dialogue.
raw = outputs[0]["generated_text"][-1]
print(raw["content"])

HOST: Welcome to the Adaptive AI Podcast, I'm your host, Alex Chen. Today, we're exploring the fascinating world of underwater ruins, and joining me is Dr. Sofia Rodriguez, a renowned archaeologist specializing in ancient oceanic civilizations.

GUEST: Hi Alex, thanks for having me.

HOST: Dr. Rodriguez, for those who may not be familiar, can you tell us a bit about what draws you to underwater ruins?

GUEST: I've always been captivated by the mysteries hidden beneath the ocean's surface. The possibility of discovering lost cities, temples, and artifacts from ancient cultures is truly exciting. My research has focused on the advanced engineering and architectural skills of civilizations like the Minoans and the Indus Valley people, who built sophisticated structures that withstood the harsh marine environment. By studying these underwater ruins, we can gain insights into the lives and cultures of our ancestors.

HOST: That's fascinating. Can you tell us about a particularly significant

In [13]:
# Re-display the reply held in the current `outputs` variable without
# running generation again.
first_result = outputs[0]
raw = first_result["generated_text"][-1]
print(raw["content"])

HOST: Welcome to the Adaptive AI Podcast, I'm your host, Alex Chen. Today, we're exploring the fascinating world of underwater ruins, and joining me is Dr. Sofia Rodriguez, a renowned archaeologist specializing in ancient oceanic civilizations.

GUEST: Hi Alex, thanks for having me.

HOST: Dr. Rodriguez, for those who may not be familiar, can you tell us a bit about what draws you to underwater ruins?

GUEST: I've always been captivated by the mysteries hidden beneath the ocean's surface. The possibility of discovering lost cities, temples, and artifacts from ancient cultures is truly exciting. My research has focused on the advanced engineering and architectural skills of civilizations like the Minoans and the Indus Valley people, who built sophisticated structures that withstood the harsh marine environment. By studying these underwater ruins, we can gain insights into the lives and cultures of our ancestors.

HOST: That's fascinating. Can you tell us about a particularly significant

In [11]:
import json

# 1. The list of 50 Topics
# One podcast dialogue is generated per entry. Each string is inserted
# verbatim into the prompt's "Topic:" line and is also stored as the "topic"
# field of the saved record, so edits here change both the prompt and the
# dataset.
topics = [
    "The Simulation Theory: Are we living in a video game?",
    "De-extinction: bringing the Woolly Mammoth back",
    "The psychology behind why we believe conspiracy theories",
    "Modern urban legends and their origins",
    "The future of commercial space tourism",
    "The enduring mystery of the Bermuda Triangle",
    "Artificial General Intelligence vs. Human Intuition",
    "The history of unbreakable codes and ciphers",
    "Biohacking and the ethics of human augmentation",
    "The cultural impact of finding alien life",
    "Deep sea exploration: What is really down there?",
    "The theoretical physics of time travel",
    "Minimalist living in a hyper-consumerist world",
    "The Mandela Effect and collective false memories",
    "The search for the Lost City of Atlantis",
    "The science and mystery of lucid dreaming",
    "The future of lab-grown meat and sustainable food",
    "Cryptozoology: The hunt for Bigfoot and Nessie",
    "The Voynich Manuscript: A book no one can read",
    "Psychological tricks used in modern advertising",
    "The history and culture of video game speedrunning",
    "How different languages shape our perception of reality",
    "The rise of digital nomads and van life",
    "The Singularity: When machines become smarter than us",
    "Investigating the reality of paranormal activity",
    "The Fermi Paradox: Where is everybody?",
    "The hidden world of the Dark Web",
    "The Golden Record aboard the Voyager probes",
    "Ancient civilizations that vanished overnight",
    "Applying Stoic philosophy to modern stress",
    "The psychology of cults and charismatic leaders",
    "Smart Cities: The metropolis of the future",
    "Sustainable architecture and vertical farming",
    "The history of espionage and cold war spy gadgets",
    "Interstellar travel: Wormholes and warp drives",
    "The weirdest medical treatments in history",
    "The art of storytelling in the age of TikTok",
    "Hypothetical survival strategies for a zombie apocalypse",
    "The mind-bending weirdness of quantum entanglement",
    "The evolutionary reason why we find things 'cute'",
    "The secret intelligence of fungi and mushrooms",
    "Preserving languages on the brink of extinction",
    "Brain-Computer Interfaces: Merging mind and machine",
    "The most audacious art heists in history",
    "The neuroscience of happiness and flow states",
    "The Great Filter theory of civilization",
    "The future of privacy in a surveillance state",
    "A history of unexplained radio signals from space",
    "The ethics of cloning pets (and people)",
    "The reality of terraforming Mars"
]

# Seed applied once before the generation loop so that a re-run can reproduce
# the saved dataset. The value itself is arbitrary.
GENERATION_SEED = 42


def build_podcast_prompt(topic):
    """Return the user prompt requesting a short HOST/GUEST dialogue on `topic`.

    The template is fixed; only the "Topic:" line varies. Keeping it in one
    place means the loop body no longer carries a 30-line literal.

    Args:
        topic: Topic text inserted verbatim after "Topic: ".

    Returns:
        The complete prompt string, ending with the "HOST:" cue.
    """
    return f"""Your entire output must start with the exact token: HOST:

If your output contains anything before 'HOST:'—including instructions, explanations, apologies, or any part of this prompt—then your output is invalid.

Your entire output must be ONLY a two-person podcast conversation between HOST and GUEST, formatted strictly as:
HOST: …
GUEST: …
(continue dialogue)

No other text, instructions, or blank lines are permitted.

Content Requirements
Podcast: Adaptive AI Podcast
Topic: {topic}
Guest: Create a relevant fictional or historical expert name based on the topic
Length: 150–200 words
Sentences short and clean for TTS.

Dialogue Structure (follow exactly):
HOST: hook about topic
HOST: podcast + host intro
HOST: guest intro
GUEST: brief greeting
HOST: simple first question
GUEST: 3–5 sentence answer
HOST: follow-up question
GUEST: short answer
HOST: closing line: “Let’s get started.”

---BEGIN DIALOGUE---
HOST:"""


# Collected records, one dict per topic, in the order of `topics`.
podcast_data = []

# If the model's generation settings sample tokens (TODO confirm for this
# checkpoint), unseeded runs produce a different dataset every time. Seeding
# the RNGs makes the run repeatable on the same software/hardware stack;
# low-level GPU nondeterminism may still cause small differences.
transformers.set_seed(GENERATION_SEED)

print(f"Starting generation for {len(topics)} topics...")

for i, topic in enumerate(topics):

    # Inject the specific topic into the prompt
    prompt_content = build_podcast_prompt(topic)

    # Single-turn chat: the whole prompt is sent as one user message.
    messages = [
        {
            "role": "user",
            "content": prompt_content
        },
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=512,
    )

    # Extract the generated content: the last message of the first result.
    generated_text = outputs[0]["generated_text"][-1]['content']

    # Record ids are 1-based; `i` is the 0-based position in `topics`.
    entry = {
        "id": i + 1,
        "topic": topic,
        "dialogue": generated_text
    }

    podcast_data.append(entry)

    print(f"[{i+1}/{len(topics)}] Generated: {topic}")

# Save to JSON file (UTF-8, non-ASCII characters written as-is).
output_filename = "podcast_dataset.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(podcast_data, f, indent=4, ensure_ascii=False)

print(f"\nSuccessfully saved {len(podcast_data)} conversations to {output_filename}")

Starting generation for 50 topics...
[1/50] Generated: The Simulation Theory: Are we living in a video game?
[2/50] Generated: De-extinction: bringing the Woolly Mammoth back
[3/50] Generated: The psychology behind why we believe conspiracy theories
[4/50] Generated: Modern urban legends and their origins
[5/50] Generated: The future of commercial space tourism
[6/50] Generated: The enduring mystery of the Bermuda Triangle
[7/50] Generated: Artificial General Intelligence vs. Human Intuition
[8/50] Generated: The history of unbreakable codes and ciphers
[9/50] Generated: Biohacking and the ethics of human augmentation
[10/50] Generated: The cultural impact of finding alien life
[11/50] Generated: Deep sea exploration: What is really down there?
[12/50] Generated: The theoretical physics of time travel
[13/50] Generated: Minimalist living in a hyper-consumerist world
[14/50] Generated: The Mandela Effect and collective false memories
[15/50] Generated: The search for the Lost City of At