In [1]:
# Cell 1: Install dependencies (run once)
# This installs commonly used libs. It may take several minutes first run.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install -q diffusers transformers accelerate safetensors huggingface_hub moviepy gTTS soundfile pillow opencv-python-headless
# Install a small version of diffusers if you prefer: uncomment next line to force tiny model usage
# !pip install -q "diffusers[torch]" --upgrade


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Cell 2: Imports and basic setup
import os, io, uuid, math, tempfile, time
from pathlib import Path
from PIL import Image
import numpy as np
from moviepy.editor import ImageSequenceClip, AudioFileClip
import soundfile as sf
from gtts import gTTS            # lightweight TTS (fallback, internet-based but no API key)
from huggingface_hub import login as hf_login
import torch

OUT_DIR = Path("assets")
OUT_DIR.mkdir(exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':



Device: cpu


In [3]:
# Cell 3: (Optional) Hugging Face login for Stable Diffusion models that require authentication
# If you have a HF token, paste it below. If not, skip — the notebook will attempt to use a public model.
HF_TOKEN = "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"  # <-- paste your HF token here if you have one (recommended for reliable SD access)

if HF_TOKEN:
    hf_login(token=HF_TOKEN)
    print("Logged into HF.")
else:
    print("No HF token provided. We'll attempt to load a public/small model.")


Logged into HF.


In [4]:
# Cell 4: Stable Diffusion pipeline setup (diffusers)
# NOTE: model selection: if you have issues with large models, use a tiny test model or pre-cache artifacts.
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

# Choose model to use. If you have a token and accepted license, you can use runwayml/stable-diffusion-v1-5 or sd-2.1.
MODEL_ID = "runwayml/stable-diffusion-v1-5"  # change if you prefer another HF model
IMG_WIDTH = 512
IMG_HEIGHT = 512

try:
    # load pipeline (may require HF token if model gated)
    pipe = StableDiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if device=="cuda" else torch.float32,
    )
    # scheduler & optimizations
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.enable_attention_slicing()
    pipe = pipe.to(device)
    print("Loaded Stable Diffusion model:", MODEL_ID)
except Exception as e:
    print("Warning: failed to load requested SD model. Error:", e)
    print("Switching to tiny stable-diffusion test model (if available)...")
    # fallback: try a tiny testing model (may still fail on some runtimes)
    try:
        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion", torch_dtype=torch.float32)
        pipe = pipe.to(device)
        print("Loaded tiny stable diffusion fallback.")
    except Exception as e2:
        pipe = None
        print("No SD pipeline available. You can still run TTS + video assembly with a pre-supplied image.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

safety_checker/model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loaded Stable Diffusion model: runwayml/stable-diffusion-v1-5


In [5]:
# Cell 5: Image generation function (uses loaded pipeline if available)
import random

def generate_image_sd(prompt, out_path, seed=None, width=IMG_WIDTH, height=IMG_HEIGHT, guidance_scale=7.5, steps=20):
    if pipe is None:
        raise RuntimeError("Stable Diffusion pipeline not available in this runtime.")
    generator = torch.Generator(device=device)
    if seed is None:
        seed = random.randint(0, 2**32 - 1)
    generator.manual_seed(seed)
    result = pipe(prompt, height=height, width=width, guidance_scale=guidance_scale, num_inference_steps=steps, generator=generator)
    img = result.images[0]
    img.save(out_path)
    return out_path

# quick test (uncomment to run, but will call model)
# test_img = OUT_DIR/"test_img.png"
# generate_image_sd("A cozy cabin in snowfall at dusk, cinematic, 3D lighting", test_img)


In [6]:
# Cell 6: TTS function using gTTS (simple, reliable)
def synthesize_gtts(text, out_wav_path, lang="en"):
    tts = gTTS(text=text, lang=lang)
    tmp_mp3 = str(out_wav_path.with_suffix(".mp3"))
    tts.save(tmp_mp3)
    # convert mp3 -> wav using soundfile (pydub could be used too, but soundfile handles wav output)
    # read mp3 as binary (soundfile doesn't read mp3 directly), fallback: use ffmpeg via moviepy to convert
    from moviepy.editor import AudioFileClip
    clip = AudioFileClip(tmp_mp3)
    clip.write_audiofile(str(out_wav_path), fps=22050, verbose=False, logger=None)
    clip.close()
    os.remove(tmp_mp3)
    return out_wav_path


In [7]:
# Cell 7: Make frames from a hero image (simple Ken Burns effect: slow zoom + optional pan)
import cv2
def make_frames_from_image(image_path, n_frames=12, zoom_factor=1.06):
    im = Image.open(image_path).convert("RGB")
    w, h = im.size
    arr = np.array(im)
    cx, cy = w//2, h//2
    frames = []
    # generate frames by cropping with gradually changing scale
    for i in range(n_frames):
        # scale varies from 1/zoom_factor^(n/2) to zoom_factor^(n/2)
        t = (i / max(1, n_frames-1)) - 0.5
        scale = (zoom_factor ** (t * 2))  # centered around 1.0
        nw, nh = int(w / scale), int(h / scale)
        left = max(0, cx - nw//2)
        top = max(0, cy - nh//2)
        crop = arr[top:top+nh, left:left+nw]
        # resize back to original size to get smooth zoom
        frame = Image.fromarray(cv2.resize(crop, (w, h)))
        frames.append(frame)
    return frames


In [8]:

# Cell 8: Assemble frames + audio into MP4
def save_video_from_frames(frames, audio_wav_path, out_mp4_path, fps=24):
    # save temporary frames
    frame_files = []
    for i, f in enumerate(frames):
        p = OUT_DIR/f"frame_{i:03d}.png"
        f.save(p)
        frame_files.append(str(p))
    clip = ImageSequenceClip(frame_files, fps=fps)
    if audio_wav_path and audio_wav_path.exists():
        audio = AudioFileClip(str(audio_wav_path))
        # ensure audio length >= clip duration: optionally loop or trim
        clip = clip.set_duration(audio.duration if audio.duration > clip.duration else clip.duration)
        clip = clip.set_audio(audio.set_duration(clip.duration))
    clip.write_videofile(str(out_mp4_path), codec="libx264", audio_codec="aac")
    # cleanup frames if you want (comment out if you prefer keep them)
    # for p in frame_files: os.remove(p)
    return out_mp4_path


In [9]:
# Cell 9: Planner + Orchestrator (tiny agent-like sequence)
def planner(prompt, duration_seconds=10, n_frames=12):
    # Simple decomposition. Could be extended with rules or an LLM planner.
    plan = {
        "prompt": prompt,
        "n_frames": n_frames,
        "duration": duration_seconds,
    }
    return plan

def run_pipeline(prompt, out_prefix=None, verbose=True):
    if out_prefix is None:
        uid = uuid.uuid4().hex[:8]
        out_prefix = f"poc_{uid}"
    image_path = OUT_DIR/f"{out_prefix}_img.png"
    wav_path = OUT_DIR/f"{out_prefix}_narr.wav"
    video_path = OUT_DIR/f"{out_prefix}_final.mp4"

    plan = planner(prompt)
    if verbose: print("Plan:", plan)

    # 1) Image: generate hero image
    if pipe is not None:
        if verbose: print("Generating image (SD)...")
        generate_image_sd(prompt, image_path)
    else:
        # fallback: use a simple gradient image if SD is unavailable
        if verbose: print("SD pipeline not available — generating fallback placeholder image.")
        im = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(120, 160, 200))
        im.save(image_path)

    # 2) Narration (simple): create a short narration from the prompt
    narration_text = f"{prompt}. A short narrated description of this scene."
    if verbose: print("Synthesizing narration (gTTS)...")
    synthesize_gtts(narration_text, wav_path)

    # 3) Frames
    if verbose: print("Creating frames (Ken Burns)...")
    frames = make_frames_from_image(image_path, n_frames=plan["n_frames"])

    # 4) Assemble video
    if verbose: print("Assembling video...")
    save_video_from_frames(frames, wav_path, video_path, fps=24)

    if verbose: print("Done. Artifacts saved to:", OUT_DIR)
    return {"image": str(image_path), "audio": str(wav_path), "video": str(video_path)}

# Example quick run:
# out = run_pipeline("A cozy cabin in snowfall at dusk, cinematic, warm light", None, True)
# print(out)


In [10]:
# Cell 10: Run the demo (edit the prompt below and run cell)
prompt = "A cozy wooden cabin by a snow-covered pine forest at dusk, warm orange window light, cinematic"
out = run_pipeline(prompt, verbose=True)
print("Artifacts:", out)

# Provide a download link for the generated video (works in Colab)
from google.colab import files
files.download(out["video"])


Plan: {'prompt': 'A cozy wooden cabin by a snow-covered pine forest at dusk, warm orange window light, cinematic', 'n_frames': 12, 'duration': 10}
Generating image (SD)...


  0%|          | 0/20 [00:00<?, ?it/s]

Synthesizing narration (gTTS)...
Creating frames (Ken Burns)...
Assembling video...
Moviepy - Building video assets/poc_ddc8a7b2_final.mp4.
MoviePy - Writing audio in poc_ddc8a7b2_finalTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video assets/poc_ddc8a7b2_final.mp4





Moviepy - Done !
Moviepy - video ready assets/poc_ddc8a7b2_final.mp4
Done. Artifacts saved to: assets
Artifacts: {'image': 'assets/poc_ddc8a7b2_img.png', 'audio': 'assets/poc_ddc8a7b2_narr.wav', 'video': 'assets/poc_ddc8a7b2_final.mp4'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Troubleshooting & Tips
"""
- If Stable Diffusion fails to load due to model gating: provide your Hugging Face token in Cell 3 and re-run Cell 4.
- If Colab kills the runtime (OOM): reduce `IMG_WIDTH` / `IMG_HEIGHT` to 384 or use the tiny fallback model.
- To speed up demo prep for the interview: **pre-generate** a few videos and save them in the repo so you can play them during live demo without regenerating.
- To make the orchestration more "agentic": replace `planner()` with a small LLM-based planner (LangChain + a small HF LLM or local Llama variant) that outputs a sequence of tool calls (Tools: ImageGen, TTS, VideoMake). The current notebook demonstrates the same pattern deterministically.
"""