<a href="https://colab.research.google.com/github/ritikade2/PDF-to-Podcast/blob/main/PDF_to_Podcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip -q install pypdf2 nltk pydub openai
import os
# NOTE(review): the value stored here is the literal placeholder string
# "OPENAI_API_KEY", not a credential — presumably it is meant to be replaced
# before running the OpenAI-backed cells. Avoid committing a real key here.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [24]:
import os
import re
from typing import List, Tuple
import PyPDF2
# NLP
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
# OpenAI
from openai import OpenAI
# Audio
from pydub import AudioSegment
# Colab UI
from IPython.display import Audio, display, clear_output
from ipywidgets import Dropdown, Button, VBox, Output
from google.colab import files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [25]:
#-------------------------------------
# Config: which summarizer to use
#-------------------------------------
# Mode identifiers accepted wherever a summarizer mode is passed around.
BASELINE_SUMMARY = "nlp"  # extractive bullets built with NLTK
LLM_SUMMARY = "llm"  # bullets written by an OpenAI model
# Model name used when the LLM summarizer is selected.
LLM_MODEL_SUMMARY = "gpt-4o-mini"
# Mode applied when the caller does not choose one; use LLM_SUMMARY for OpenAI.
DEFAULT_SUMMARIZER_MODE = BASELINE_SUMMARY

In [26]:
# Setting up NLTK
# The import cell downloads both 'punkt' and 'punkt_tab', so verify both here
# (not just 'punkt') and fetch whichever tokenizer resource is missing.
for _tokenizer_pkg in ("punkt", "punkt_tab"):
  try:
    nltk.data.find(f"tokenizers/{_tokenizer_pkg}")
  except LookupError:
    nltk.download(_tokenizer_pkg)

In [27]:
#-------------------------------------
# Helper Functions
#-------------------------------------

#------- Reducing Noisy Text --------#

# Sentence-level noise markers: URLs, DOI/arXiv/ISSN/ISBN ids, copyright text.
noisy_pat = re.compile(
    r"(http[s]?://|\bdoi:|arxiv|issn|isbn|©|all rights reserved)",
    flags=re.IGNORECASE,
)


def is_noisy_text(s: str) -> bool:
  """Return True when `s` contains any marker matched by `noisy_pat`.

  Falsy input (None or an empty string) is never considered noisy.
  """
  if not s:
    return False
  return noisy_pat.search(s) is not None

# Line-level noise used for both header and body lines.
# The alternatives are joined with "|" inside one capturing group.
safe_noise_re = re.compile(
    "(" + "|".join((
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9._-]+\.[A-Za-z]{2,}\b",  # e-mail addresses
        r"https?://\S+",  # web links
        r"\bdoi:\s*\S+",  # DOI references
        r"\b(?:arxiv|issn|isbn)\b[:\s]*\S+",  # arXiv / ISSN / ISBN identifiers
        r"\borcid\.org/\S+",  # ORCID profile links
        r"(?:copyright|©|all rights reserved)",  # copyright notices
    )) + ")",
    flags=re.IGNORECASE,
)

# Lower-case words whose presence in a header line hints at an affiliation.
affiliation_hints = {
    "university",
    "department",
    "school",
    "laboratory",
    "institute",
    "faculty",
    "college",
    "center",
    "centre",
}
# An "abstract"/"summary" word plus everything after it, up to (not including)
# a newline that is followed by a keywords / index-terms / introduction /
# numbered heading.
abstract_block = re.compile(
    r"\b(?:abstract|summary)\b\s*[:.\-]?\s*(.+?)"
    r"(?=\n\s*(?:keywords?|index terms?|introduction|1\.|\d\.)\b)",
    flags=re.DOTALL | re.IGNORECASE,
)
# A whole line that begins with "keyword(s)" or "index term(s)".
kw_lines = re.compile(
    r"^\s*(?:keywords?|index terms?)\b.*$",
    flags=re.MULTILINE | re.IGNORECASE,
)


def strip_abstract_and_keywords(text: str) -> str:
  """Return `text` with abstract spans and keyword lines removed.

  Spans matched by `abstract_block` are deleted first, then lines matched by
  `kw_lines`. Both patterns rely on newlines still being present in `text`.
  """
  without_abstract = abstract_block.sub("", text)
  return kw_lines.sub("", without_abstract)

def authorish_header_lines(s: str) -> bool:
  """Decide whether a header line looks like author/affiliation metadata.

  Blank lines count as droppable and return True. Otherwise weighted signals
  are added up and the line is flagged once the total reaches 3.
  """
  stripped = s.strip()
  if not stripped:
    return True  # blank header lines are ignored as well

  lowered = stripped.lower()
  signals = (
      (2, safe_noise_re.search(stripped) is not None),  # e-mails, URLs, DOIs, ...
      (2, any(hint in lowered for hint in affiliation_hints)),  # affiliation words
      (1, stripped.count(',') >= 3),  # comma-heavy, e.g. a list of names
      (1, 20 <= len(stripped) <= 250),  # length between 20 and 250 characters
      (1, re.search(r"[\*\u2020\u2021]", stripped) is not None),  # footnote marks
  )
  total = sum(weight for weight, hit in signals if hit)
  return total >= 3

# Remove author/affiliation noise in header and obvious noise.
def preclean_text(text: str, header_lines: int = 80) -> str:
  """Drop noisy lines from extracted text and collapse it onto one line.

  For multi-line input:
    * lines matching `safe_noise_re` are dropped everywhere;
    * lines flagged by `authorish_header_lines` are dropped in the header,
      i.e. before the first line mentioning "abstract"/"summary", or within
      the first `header_lines` lines when no such line exists;
    * the surviving lines are joined, whitespace is collapsed to single
      spaces, and `strip_abstract_and_keywords` is applied.

  Input with at most one line (for example text already flattened by an
  earlier call) only has its whitespace normalized. The filters above work on
  whole lines, so running them on flattened text would discard all of it as
  soon as it contained a single URL, e-mail or affiliation word. This makes
  repeated calls on the same text safe.

  Args:
    text: Raw or previously cleaned text; None is treated as "".
    header_lines: Header length used when no abstract/summary line is found.

  Returns:
    The cleaned text on a single line.
  """
  text = text or ""
  lines = text.splitlines()
  if len(lines) <= 1:
    # No line structure left to judge: normalize whitespace only.
    return re.sub(r"\s+", " ", text).strip()

  # The header ends at the first abstract/summary line, if there is one.
  abstract_idx = next(
      (i for i, ln in enumerate(lines)
       if re.search(r"\b(abstract|summary)\b[:\s]", ln, flags=re.IGNORECASE)),
      None,
  )
  header_end = header_lines if abstract_idx is None else abstract_idx

  cleaned = []
  for i, ln in enumerate(lines):
    if safe_noise_re.search(ln):
      continue  # obvious noise is dropped in header and body alike
    if i < header_end and authorish_header_lines(ln):
      continue  # author/affiliation heuristics apply to the header only
    cleaned.append(ln)

  out = re.sub(r"\s+", " ", "\n".join(cleaned)).strip()
  # NOTE(review): whitespace is collapsed before this call, so the
  # newline-anchored patterns in strip_abstract_and_keywords rarely match
  # here. Order kept unchanged to avoid altering what the first pass removes;
  # confirm the intended order.
  out = strip_abstract_and_keywords(out)
  return out


#------- PDF Extraction Helper --------#
def extract_text_from_pdf(pdf_path: str) -> str:
  """Read every page of the PDF at `pdf_path` and return its cleaned text.

  Pages without extractable text contribute an empty string. Page texts are
  joined with a blank line and passed through `preclean_text`.
  """
  reader = PyPDF2.PdfReader(pdf_path)
  page_texts = [page.extract_text() or "" for page in reader.pages]
  return preclean_text("\n\n".join(page_texts))


#------- Section & Title Helpers --------#

# Titles
def guess_title(chunk_text: str)-> str:
  """Build a short title from the three most frequent content words.

  Tokens are kept when they consist only of letters/hyphens, are longer than
  three characters and are not stopwords. Returns "Section" when nothing
  qualifies.
  """
  stopwords = set("""a an the and or to of for with on at by from is are
  was were be been being into that this those these which who whom whose
  in out over under as it he she they we you i their our your its""".split())

  freq = Counter()
  for token in word_tokenize(preclean_text(chunk_text)):
    if re.match(r"[A-Za-z\-]+$", token) is None:
      continue
    word = token.lower()
    if len(word) > 3 and word not in stopwords:
      freq[word] += 1

  top_words = [word.capitalize() for word, _ in freq.most_common(3)]
  if not top_words:
    return "Section"
  return " / ".join(top_words)

# Section Text
def split_into_sections(text:str, max_tokens: int = 900) -> List[Tuple[str,str]]:
  """Group the sentences of `text` into (title, chunk_text) sections.

  Sentences are accumulated until adding the next one would push the word
  token count past `max_tokens`; the accumulated chunk is then emitted with a
  title from `guess_title`. A sentence is never split, so one very long
  sentence forms a chunk of its own.
  """
  sections: List[Tuple[str, str]] = []

  def emit(sentence_group: List[str]) -> None:
    # Join the group into one chunk and store it with its guessed title.
    body = " ".join(sentence_group).strip()
    sections.append((guess_title(body), body))

  pending: List[str] = []
  pending_tokens = 0
  for sent in sent_tokenize(text):
    sent_tokens = len(word_tokenize(sent))
    # Flush first when this sentence would overflow a non-empty chunk.
    if pending and pending_tokens + sent_tokens > max_tokens:
      emit(pending)
      pending, pending_tokens = [], 0
    pending.append(sent)
    pending_tokens += sent_tokens

  # Final flush of whatever is left into the last chunk.
  if pending:
    emit(pending)
  return sections


In [28]:
#--------------------------------------
# Summarizers
#--------------------------------------

#------- Baseline NLP Summarizer --------#

# Identify Keywords
def keyword_scores(text: str) -> Counter:
  """Count the content words of `text`.

  A token is counted (lower-cased) when it is made only of letters/hyphens,
  is longer than three characters and is not a stopword.
  """
  stop = set("""a an the and or to of for with on at by from is are was were be
  been being into that this those these which who whom whose in out over
  under as it he she they we you i their our your its""".split())

  alphabetic = (
      token.lower()
      for token in word_tokenize(text)
      if re.match(r'[A-Za-z\-]+$', token)
  )
  return Counter(word for word in alphabetic if word not in stop and len(word) > 3)

# Summarize using NLP
def extractive_bullets(text: str, k: int = 4) -> List[str]:
  """Pick up to `k` distinct high-scoring sentences from `text` as bullets.

  The text is cleaned with `preclean_text`, sentences flagged by
  `is_noisy_text` are discarded, and each remaining sentence is scored by the
  summed frequency (from `keyword_scores`) of its words. Keyword frequencies
  come from the cleaned text, so words that only occur in removed noise do
  not influence the ranking.

  Near-duplicates (same first 60 letters once non-letters are removed) are
  skipped while walking the ranking, so a duplicate among the top sentences
  no longer shrinks the result below `k` when other sentences are available.

  Args:
    text: Section text to summarize.
    k: Maximum number of bullets; values <= 0 yield an empty list.

  Returns:
    Bullet sentences ordered from highest to lowest score.
  """
  cleaned_text = preclean_text(text)
  sentences = [s for s in sent_tokenize(cleaned_text) if not is_noisy_text(s)]
  if not sentences or k <= 0:
    return []

  kw = keyword_scores(cleaned_text)
  scored = []
  # Score each sentence by the keywords it contains.
  for i, s in enumerate(sentences):
    sw = [w.lower() for w in word_tokenize(s) if re.match(r'[A-Za-z\-]+$', w)]
    scored.append((sum(kw[w] for w in sw), i, s))
  scored.sort(reverse=True)

  seen, uniq = set(), []
  # Walk the full ranking so skipped duplicates are replaced by the next best.
  for _, _, s in scored:
    key = re.sub(r"[^a-z]", "", s.lower())[:60]
    if key in seen:
      continue
    seen.add(key)
    uniq.append(s.strip())
    if len(uniq) == k:
      break
  return uniq


#------- LLM extractive bullets (OpenAI) --------#

def _response_text(resp) -> str:
  """Return the plain text carried by a Responses API result.

  `output_text` is read as an attribute (and called only if it turns out to
  be callable). When it is missing or empty, the text of every content item
  under `resp.output` is collected instead; items may be dicts or objects.
  """
  txt = getattr(resp, "output_text", None)
  if callable(txt):
    txt = txt()
  if isinstance(txt, str) and txt.strip():
    return txt.strip()

  parts = []
  for item in getattr(resp, "output", None) or []:
    for piece in getattr(item, "content", None) or []:
      if isinstance(piece, dict):
        piece_text = piece.get("text", "")
      else:
        piece_text = getattr(piece, "text", "")
      if piece_text:
        parts.append(piece_text)
  return "\n".join(parts).strip()


def llm_bullets(text: str, k: int = 4, model: str = LLM_MODEL_SUMMARY) -> List[str]:
  """Summarize `text` into at most `k` bullet sentences with an OpenAI model.

  The text is cleaned with `preclean_text` and sent in a single user message.
  The reply is split into lines and leading bullet/number markers are
  stripped. If the request fails, or the reply contains no usable lines, the
  extractive summarizer is used instead so a section never ends up without
  bullets just because the LLM path failed.

  Args:
    text: Section text to summarize.
    k: Number of bullets requested and maximum number returned.
    model: OpenAI model name.

  Returns:
    A list of bullet sentences.
  """
  cleaned = preclean_text(text)
  try:
    client = OpenAI()
    prompt = (
        f"Summarize the following section into {k} concise, self-contained bullet sentences.\n"
        "Ignore author names, affiliations, emails, URLs, ORCIDs, DOIs, and any 'Abstract', 'Summary', 'Keywords', or 'References' sections.\n"
        "Do not add headers; return each bullet on a new line.\n\nSECTION:\n" + cleaned
    )
    resp = client.responses.create(
        model = model,
        input = [{"role": "user", "content": prompt}]
    )
    txt = _response_text(resp)
  except Exception as err:
    # Deliberate best-effort: report the problem, then use the baseline method.
    print(f"LLM summarization failed ({err}); falling back to extractive bullets.")
    return extractive_bullets(cleaned, k=k)

  lines = [re.sub(r"^[\-\*\d\.)\s]+","", ln).strip() for ln in txt.splitlines() if ln.strip()]
  lines = [ln for ln in lines if ln]  # drop lines that held only a marker
  if not lines:
    return extractive_bullets(cleaned, k=k)
  return lines[:k]

# Wrapper to choose summarizer
def summarize_to_bullets(text: str, k:int = 4, mode: str = DEFAULT_SUMMARIZER_MODE) -> List[str]:
  """Summarize `text` into bullets with the summarizer selected by `mode`.

  Args:
    text: Section text to summarize.
    k: Maximum number of bullets.
    mode: LLM_SUMMARY (case-insensitive) selects the OpenAI summarizer; any
      other value, including None, selects the extractive baseline.

  Returns:
    The list of bullet sentences produced by the chosen summarizer.
  """
  clean_txt = preclean_text(text)
  if (mode or '').lower() == LLM_SUMMARY:
    return llm_bullets(clean_txt, k=k)
  # The baseline result must be returned too; without it this branch gave None.
  return extractive_bullets(clean_txt, k=k)

In [29]:
#--------------------------------------
# Building Outlines
#--------------------------------------

# Sections
def make_section(title:str, text:str, bullets:List[str]) -> dict:
  """Bundle one section's title, text and bullets into a dict."""
  return dict(title=title, text=text, bullets=bullets)

# Outlines: Takes raw sections (title + text) and builds clean outlines with bullet summaries
def build_outline (sections_raw: List[Tuple[str, str]], summarizer_mode: str = DEFAULT_SUMMARIZER_MODE) -> List[dict]:
  """Turn (title, text) sections into outline entries with bullet summaries.

  Args:
    sections_raw: Pairs of section title and section text.
    summarizer_mode: LLM_SUMMARY (case-insensitive) summarizes with
      `llm_bullets`; any other value uses `extractive_bullets`. Previously
      this argument was accepted but ignored.

  Returns:
    One dict per section, as built by `make_section`.
  """
  use_llm = (summarizer_mode or "").lower() == LLM_SUMMARY
  summarize = llm_bullets if use_llm else extractive_bullets

  outline = []
  for title, chunk in sections_raw:
    cleaned_chunk = preclean_text(chunk) # remove noise
    bullets = summarize(cleaned_chunk, k=4) # bullets from the selected summarizer
    outline.append(make_section(title, cleaned_chunk, bullets))
  return outline


In [30]:
#--------------------------------------
# Building Script
#--------------------------------------

# Podcast openers & closers used by the text script
PODCAST_OPEN = "".join([
    "Welcome! Today we're turning a dense PDF into an easy conversation.\n",
    "We will break it down section by section, highlighting the key ideas. \n",
])

PODCAST_CLOSE = "".join([
    "That's a wrap. If you enjoyed this, share it and read the full paper.\n",
    "Thanks for listening. See you next time!\n",
])

# Dialogue
def template_dialogue_for_section(idx: int, sec: dict, next_title: str = None) -> str:
  """Render the script text for one outline section.

  The result has a lead line, one "- bullet" line per bullet, a transition
  line that alternates with `idx`, and, when `next_title` is given, an
  "Up next" line. Parts are joined with newlines.

  Args:
    idx: Zero-based section index; shown as idx + 1 and used to pick the
      transition line.
    sec: Section dict with a "title" and, optionally, "bullets".
    next_title: Title of the following section, or None for the last one.

  Returns:
    The section's script text.
  """
  lines = [f"Section {idx+1}: {sec['title']}. Here's the gist. \n"]
  lines.extend(f"- {b}" for b in (sec.get('bullets') or []))
  # Transition lines. The comma matters: without it the two literals merge
  # into one string and every section gets both sentences.
  transitions = [
      "Nice. Let's keep moving. \n",
      "Got it. On to the next part. \n",
  ]
  lines.append(transitions[idx % len(transitions)])
  if next_title:
    lines.append(f"Up next: {next_title}.\n")
  return "\n".join(lines)

# Script
def join_dialogue(podcast_title: str, outline: List[dict]) -> str:
  """Assemble the full text script: opener, every section, closer.

  Each section is told the title of the section that follows it (None for
  the last one). `podcast_title` is accepted but not used in the text.
  """
  # Title of the following section for each entry; the last one has none.
  upcoming = [sec["title"] for sec in outline[1:]] + [None]
  section_parts = [
      template_dialogue_for_section(pos, sec, next_title=nxt)
      for pos, (sec, nxt) in enumerate(zip(outline, upcoming))
  ]
  return "\n".join([PODCAST_OPEN, *section_parts, PODCAST_CLOSE])

In [31]:
#--------------------------------------
# Audio (TTS and Stitching)
#--------------------------------------
# TTS voice names for the two alternating podcast hosts.
HOST1_VOICE, HOST2_VOICE = "alloy", "verse"

def stitch_audio(files: List[str], out_path: str, fmt: str = "mp3", gap_ms: int = 350):
  """Concatenate audio clips into one file, each followed by a short pause.

  Args:
    files: Paths of the clips, in playback order.
    out_path: Destination path of the combined audio.
    fmt: Export format passed to pydub.
    gap_ms: Length of the silence appended after every clip, in milliseconds.
  """
  track = AudioSegment.silent(duration=0)
  for clip_path in files:
    clip = AudioSegment.from_file(clip_path)
    pause = AudioSegment.silent(duration=gap_ms)
    track = track + (clip + pause)
  track.export(out_path, format=fmt)

def synth_openai_tts_to_file(text: str, out_path: str, model: str = "gpt-4o-mini-tts", voice: str = "alloy", fmt: str = "mp3"):
  """Synthesize `text` with OpenAI text-to-speech and stream it to `out_path`.

  Nothing is requested or written when `text` is empty or only whitespace.
  """
  has_content = bool(text) and bool(text.strip())
  if not has_content:
    return

  request = dict(model=model, voice=voice, input=text, response_format=fmt)
  speech_api = OpenAI().audio.speech.with_streaming_response
  with speech_api.create(**request) as response:
    response.stream_to_file(out_path)


def render_podcast_audio_by_section_from_outline(outline: list, title: str, out_basename: str = "podcast", fmt: str = "mp3") -> str:
  """Synthesize the podcast line by line and stitch the clips into one file.

  Every spoken line (opening, section lead, bullets, transition, "Up next",
  closing) becomes its own clip named `{out_basename}_path{n}.{fmt}`, with
  the two host voices alternating. All clips are then combined, in order,
  into `{out_basename}_full.{fmt}`.

  Args:
    outline: Section dicts with a "title" and, optionally, "bullets".
    title: Accepted for interface compatibility; not used in the audio.
    out_basename: Prefix for the clip files and the final file.
    fmt: Audio format used for synthesis and export.

  Returns:
    Path of the stitched audio file.
  """
  clips = []

  def speak(voice: str, line: str) -> None:
    # Synthesize one line into the next numbered clip and queue it.
    if not line or not line.strip():
      return  # blank text produces no file, so it must not be queued
    path = f"{out_basename}_path{len(clips)}.{fmt}"
    synth_openai_tts_to_file(line, path, voice=voice, fmt=fmt)
    clips.append(path)

  # Opening: each host speaks their own line.
  open_lines = [
      (HOST1_VOICE, "Welcome! Today we're turning a dense PDF into an easy conversation."),
      (HOST2_VOICE, "We will break it down section by section, highlighting the key ideas."),
  ]
  for voice, line in open_lines:
    speak(voice, line)

  # Sections
  transitions = [
      "Nice. Let's keep moving.\n",
      "Got it. On to the next part.\n",
  ]
  for i, sec in enumerate(outline):
    lead_voice = HOST1_VOICE if i % 2 == 0 else HOST2_VOICE
    speak(lead_voice, f"Section {i+1}: {sec['title']}. Here's the gist")

    for j, b in enumerate(sec.get("bullets") or []):
      voice = HOST2_VOICE if (i+j) % 2 == 0 else HOST1_VOICE
      speak(voice, f"- {b}")

    trans_voice = HOST1_VOICE if i % 2 == 0 else HOST2_VOICE
    speak(trans_voice, transitions[i % len(transitions)])

    if i < len(outline) - 1:
      next_voice = HOST2_VOICE if (i+1) % 2 == 0 else HOST1_VOICE
      speak(next_voice, f"Up next: {outline[i+1]['title']}.")

  # Closing: both lines get their own clip.
  close_lines = [
      (HOST1_VOICE, "That's a wrap. If you enjoyed this, share it and read the full paper."),
      (HOST2_VOICE, "Thanks for listening. See you next time!"),
  ]
  for voice, line in close_lines:
    speak(voice, line)

  # Final Stitch
  final_path = f"{out_basename}_full.{fmt}"
  stitch_audio(clips, final_path, fmt=fmt, gap_ms=350)
  return final_path


In [32]:
#--------------------------------------
# Driver
#--------------------------------------
def run_pipeline(pdf_path: str, title: str = "Podcast", max_tokens: int = 250,
                 out_basename: str = "podcast", fmt: str = "mp3",
                 summarizer_mode: str = DEFAULT_SUMMARIZER_MODE) -> str:
  """Run the whole PDF-to-podcast pipeline and return the audio file path.

  Steps: extract and clean the PDF text, split it into sections of roughly
  `max_tokens` tokens, build the bullet outline, print the first 3000
  characters of the text script, then render the audio.
  """
  document_text = extract_text_from_pdf(pdf_path)
  outline = build_outline(
      split_into_sections(document_text, max_tokens=max_tokens),
      summarizer_mode=summarizer_mode,
  )

  # Show a preview of the script before the slower audio rendering starts.
  preview = join_dialogue(title, outline)[:3000]
  print(preview, "\n---\n")

  return render_podcast_audio_by_section_from_outline(
      outline, title, out_basename=out_basename, fmt=fmt
  )

In [33]:
#--------------------------------------
# File Upload + UI
#--------------------------------------

# Upload PDF
uploaded = files.upload()
if not uploaded:
  # Cancelling the upload dialog leaves nothing to process; say so clearly.
  raise ValueError("No PDF was uploaded; re-run this cell and choose a file.")
pdf_filename = next(iter(uploaded))  # first uploaded file

# Summarizer picker; its values are the mode constants from the config cell.
mode_dd = Dropdown(
    options = [('Baseline (NLP)', BASELINE_SUMMARY), ('LLM (OpenAI)', LLM_SUMMARY)],
    value = DEFAULT_SUMMARIZER_MODE,
    description = 'Summarizer:'
    )
run_button = Button(description = 'Run', button_style = 'primary')
out = Output()

def on_run(_):
  """Run the pipeline for the uploaded PDF, then show and download the audio.

  Output is written into the `out` widget, which is cleared on every click.
  """
  with out:
    clear_output()
    print(f"Running with mode: {mode_dd.value}...")
    print("Generating podcast audio... this can take a few minutes. "
          "Please keep this tab open. The audio player and download will appear below when ready.\n\n")
    audio_path = run_pipeline(pdf_filename, title = "Podcast", max_tokens= 250,
                              out_basename= "podcast", fmt= "mp3",
                              summarizer_mode= mode_dd.value)
    display(Audio(filename= audio_path, autoplay= False))
    files.download(audio_path)

run_button.on_click(on_run)
display(VBox([mode_dd, run_button, out]))




Saving 1706.03762v7.pdf to 1706.03762v7 (1).pdf


VBox(children=(Dropdown(description='Summarizer:', options=(('Baseline (NLP)', 'nlp'), ('LLM (OpenAI)', 'llm')…