In [None]:
# ✅ Install necessary packages
!pip install gradio SpeechRecognition moviepy

# ✅ Import required libraries
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from PIL import Image
import os
import gradio as gr
from moviepy.editor import ImageSequenceClip, AudioFileClip
import speech_recognition as sr

# ✅ Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
    print("⚠️ Warning: CUDA (GPU) is not available. Enable GPU in Colab settings for better performance.")

# Half precision is only used on the GPU; on the CPU fallback float16
# inference is generally unsupported or extremely slow, so use float32 there.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# ✅ Load Stable Diffusion models
print("✅ Loading Stable Diffusion models...")
sd_pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=model_dtype
).to(device)

# Reuse the already-loaded (and already-moved) components instead of loading
# the same checkpoint a second time; both pipelines then share one set of
# weights, which halves load time and memory use.
sd_img2img_pipeline = StableDiffusionImg2ImgPipeline(**sd_pipeline.components)

# ✅ Output directory setup
output_dir = "generated_images"
os.makedirs(output_dir, exist_ok=True)

# ✅ Human-related keywords
# Written as whitespace-separated vocabulary and split into a list once.
human_words = (
    "man woman boy girl person people child adult kid "
    "father mother brother sister friend uncle aunt "
    "human guy lady character soldier doctor teacher "
    "wizard witch student player athlete worker employee "
    "boss king queen prince princess hero villain"
).split()

# ✅ Motion-related keywords
motion_words = (
    "running walking playing dancing fighting jumping "
    "moving chasing climbing working spinning flying driving"
).split()

# ✅ Analyze and enhance prompt
def analyze_and_enhance_prompt(prompt, style):
    """Inspect a prompt for human/motion keywords and derive generation hints.

    Args:
        prompt: User prompt text; ``None`` is treated as an empty string.
        style: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(prompt, negative_prompt, motion_addition, motion_strength)``:
        the prompt with extra guidance appended, the negative prompt (empty
        when a human keyword was found), a phrase to add to follow-up frame
        prompts, and the img2img strength (0.35 with motion, 0.20 without).
    """
    prompt = prompt or ""

    # Match whole words only. Plain substring tests produced false positives:
    # "walking"/"working" contain "king", "romantic" contains "man".
    tokens = set(
        "".join(ch if ch.isalpha() else " " for ch in prompt.lower()).split()
    )
    # Also accept simple plurals such as "kids" or "wizards".
    tokens |= {token[:-1] for token in tokens if token.endswith("s")}

    human_in_prompt = any(word in tokens for word in human_words)
    motion_in_prompt = any(word in tokens for word in motion_words)

    negative_prompt = "" if human_in_prompt else "humans, people, person, character, girl, boy, man, woman"
    motion_strength = 0.35 if motion_in_prompt else 0.20
    motion_addition = "dynamic poses, natural movement" if motion_in_prompt else "still, consistent background"

    # NOTE(review): when no human keyword is present the negative prompt above
    # suppresses people while the text appended below asks for a character.
    # The two conflict; confirm the intended behavior before changing either.
    if not human_in_prompt:
        prompt += ", including a character (e.g., a person, character, etc.)"
    if not motion_in_prompt:
        prompt += ", with a still background and no movement"

    return prompt, negative_prompt, motion_addition, motion_strength

# ✅ Refine prompt
def refine_prompt(prompt, style):
    """Validate a prompt and append optional quality/style suffixes.

    Args:
        prompt: Prompt text; ``None`` is treated as empty.
        style: Optional style text; ``None`` or whitespace-only is ignored.

    Returns:
        ``(refined_prompt, None)`` on success, or ``(None, error_message)``
        when the prompt has fewer than two words.
    """
    prompt = (prompt or "").strip()
    word_count = len(prompt.split())
    if word_count < 2:
        return None, "❌ Please enter a more detailed description (minimum 2 words)."
    if word_count < 3:
        prompt += " in high resolution"

    # Strip before testing so a whitespace-only style does not leave a
    # dangling ", " at the end of the prompt.
    style = (style or "").strip()
    if style:
        prompt += f", {style}"
    return prompt, None

# ✅ Main video generation function
def _compose_frame_prompt(final_prompt, scene_addition, frame_number, motion_addition=""):
    """Join the prompt fragments for one frame, skipping empty fragments."""
    fragments = [
        final_prompt,
        scene_addition,
        "cinematic lighting",
        f"frame {frame_number}",
        "full body",
        "all characters fully visible",
        "same faces",
        "same clothes",
        motion_addition,
        "consistent background",
        "high quality",
        "highly detailed",
    ]
    # Dropping empty fragments avoids ", ," when no scene type is selected.
    return ", ".join(fragment for fragment in fragments if fragment)


def generate_video(prompt, style, scene_type, num_images, fps,
                   enable_interpolation, enable_face_animation, add_music,
                   audio_prompt=None, progress=gr.Progress()):
    """Generate frames with Stable Diffusion and compile them into a video.

    The first frame comes from the text-to-image pipeline; each later frame is
    produced by the img2img pipeline from the previous frame. Frames are saved
    as PNG files in ``output_dir`` and assembled into ``animated_video.mp4``.

    Args:
        prompt: Scene description; may be empty when ``audio_prompt`` is given.
        style: Optional style text appended to the prompt.
        scene_type: Key selecting an extra scene description; unknown or empty
            values add nothing.
        num_images: Number of frames to generate (coerced to ``int``).
        fps: Frames per second of the output video.
        enable_interpolation: Unused.
        enable_face_animation: Unused.
        add_music: When true, attach ``sample_music.mp3`` if that file exists.
        audio_prompt: Optional path to an audio file; its transcription is
            used as the prompt when ``prompt`` is empty.
        progress: Gradio progress tracker used for the frame loop.

    Returns:
        ``(video_path, status_message)`` on success, or
        ``(None, error_message)`` when audio processing or prompt validation
        fails.
    """
    prompt = (prompt or "").strip()
    # A slider may deliver a float; range() below requires an int.
    num_images = int(num_images)

    # Audio to text conversion
    if audio_prompt:
        recognizer = sr.Recognizer()
        try:
            with sr.AudioFile(audio_prompt) as source:
                audio_data = recognizer.record(source)
                audio_text = recognizer.recognize_google(audio_data)
                print(f"✅ Transcribed audio: {audio_text}")
                if not prompt:
                    prompt = audio_text.strip()
        except Exception as e:
            # UI boundary: report any transcription failure to the user.
            return None, f"❌ Error processing audio file: {str(e)}"

    # Validate the user's own text first. Enhancement appends extra words, so
    # validating afterwards would let empty or one-word prompts through.
    _, error = refine_prompt(prompt, style)
    if error:
        return None, error

    # Analyze & enhance prompt
    final_prompt, negative_prompt, motion_addition, motion_strength = analyze_and_enhance_prompt(prompt, style)

    # Refine prompt
    final_prompt, error = refine_prompt(final_prompt, style)
    if error:
        return None, error

    print(f"✅ Final prompt: {final_prompt}")
    image_paths = []
    seed = torch.manual_seed(42)

    # Scene types
    scene_descriptions = {
        "Conversation Scene": "two characters talking, mid-conversation, facing each other",
        "Chess Scene": "two characters playing chess, focused on the game, magical atmosphere",
        "Fight Scene": "dynamic action, fighting, intense expressions, detailed background",
        "Magical Scene": "magical environment, glowing elements, fantasy atmosphere"
    }
    scene_addition = scene_descriptions.get(scene_type, "")

    # Generate first frame
    print("🖼️ Generating base image (frame 1)...")
    enhanced_prompt = _compose_frame_prompt(final_prompt, scene_addition, 1)
    first_image = sd_pipeline(
        enhanced_prompt, negative_prompt=negative_prompt, generator=seed
    ).images[0]
    first_image_path = os.path.join(output_dir, "frame_1.png")
    first_image.save(first_image_path)
    image_paths.append(first_image_path)
    prev_image = first_image

    # Generate remaining frames, each one derived from the previous frame
    for i in progress.tqdm(range(2, num_images + 1), desc="Generating frames"):
        print(f"🖼️ Generating frame {i}/{num_images}...")
        enhanced_prompt = _compose_frame_prompt(final_prompt, scene_addition, i, motion_addition)
        new_image = sd_img2img_pipeline(
            prompt=enhanced_prompt,
            image=prev_image,
            strength=motion_strength,
            guidance_scale=8.0,
            negative_prompt=negative_prompt
        ).images[0]
        image_path = os.path.join(output_dir, f"frame_{i}.png")
        new_image.save(image_path)
        image_paths.append(image_path)
        prev_image = new_image

    # Create video
    print("🎥 Compiling video...")
    clip = ImageSequenceClip(image_paths, fps=fps)
    video_path = "animated_video.mp4"
    audio = None

    try:
        # Add music (optional)
        if add_music:
            print("🎶 Adding sample music (demo feature)...")
            audio_path = "sample_music.mp3"
            if os.path.exists(audio_path):
                audio = AudioFileClip(audio_path)
                # Trim so the soundtrack does not run past the last frame.
                if audio.duration > clip.duration:
                    audio = audio.subclip(0, clip.duration)
                clip = clip.set_audio(audio)
            else:
                print("⚠️ No music file found!")

        clip.write_videofile(video_path, codec="libx264", fps=fps, logger=None)
    finally:
        # Release the audio reader even if encoding fails.
        if audio is not None:
            audio.close()

    print("✅ Video generation complete!")
    return video_path, "Note: AI Music, Face Animation, and Interpolation are Pro features."

# ✅ Gradio UI
# Input order must match the positional parameters of generate_video.
iface = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Describe the video scene..."),
        gr.Textbox(label="Style (Optional)", placeholder="e.g., Anime, Realistic, Cyberpunk"),
        gr.Dropdown(choices=["", "Conversation Scene", "Chess Scene", "Fight Scene", "Magical Scene"], label="Scene Type"),
        # step=1 keeps both sliders on whole numbers; the frame count feeds
        # range(), which rejects floats.
        gr.Slider(minimum=10, maximum=100, value=50, step=1, label="Number of Frames"),
        gr.Slider(minimum=5, maximum=30, value=8, step=1, label="FPS"),
        # The three "Pro" checkboxes are display-only (non-interactive).
        gr.Checkbox(label="Enable AI-based Interpolation (Pro)", interactive=False),
        gr.Checkbox(label="Enable Face Animation (Pro)", interactive=False),
        gr.Checkbox(label="Add AI-generated Music (Pro)", interactive=False),
        gr.File(label="Upload Audio Prompt (.wav/.flac/.aiff)", type="filepath", file_types=[".wav", ".flac", ".aiff"])
    ],
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Textbox(label="Status")
    ],
    title="🎬 EchoVid: Echoing Ideas into Videos",
    description="Generate animated videos from text or audio prompts using Stable Diffusion."
)

iface.launch()


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-gyebvkio
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-gyebvkio
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Looking in indexes: https://download.pytorch.org/whl/cu118


ImportError: cannot import name 'cached_download' from 'huggingface_hub' (/usr/local/lib/python3.11/dist-packages/huggingface_hub/__init__.py)