In [1]:
# 1. Image-Based Short Video Generation Using AI
!pip install -q diffusers transformers accelerate torch torchvision safetensors imageio imageio[ffmpeg]

In [2]:
# Image -> short video with Stable Video Diffusion (needs a Colab GPU runtime).
import imageio
import numpy as np
import torch
from diffusers import StableVideoDiffusionPipeline
from google.colab import files  # for uploading files
from IPython.display import Video, display
from PIL import Image

# Settings a reader may want to tune.
FRAME_SIZE = 512   # square output resolution in pixels; must be divisible by 8
NUM_FRAMES = 6     # few frames to keep GPU memory usage low
FPS = 8            # playback rate of the saved clip
SEED = 42          # fixed seed so re-running the cell reproduces the same clip

# Fail early with a readable message instead of a CUDA error during model load.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab choose Runtime > Change runtime type > GPU."
    )

print("Please upload an image file (jpg/png).")
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled; indexing the empty dict would raise IndexError.
    raise ValueError("No file was uploaded. Re-run this cell and choose an image.")
image_path = list(uploaded.keys())[0]  # get uploaded file name
image = Image.open(image_path).convert("RGB")
image = image.resize((FRAME_SIZE, FRAME_SIZE))  # resize for model

model_id = "stabilityai/stable-video-diffusion-img2vid-xt"
pipe = StableVideoDiffusionPipeline.from_pretrained(
    model_id, torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to("cuda")  # use GPU

# height/width are passed explicitly: per the diffusers API docs the pipeline
# otherwise resizes the input to its own default resolution, which would undo
# the memory saving intended by the resize above.
result = pipe(
    image,
    height=FRAME_SIZE,
    width=FRAME_SIZE,
    num_frames=NUM_FRAMES,
    generator=torch.manual_seed(SEED),
)
frames = result.frames[0]  # frames of the first (only) generated video

video_path = "/content/generated_video.mp4"
imageio.mimsave(video_path, [np.array(f) for f in frames], fps=FPS)
print("Video generated and saved at:", video_path)
display(Video(video_path, embed=True, width=560))

Please upload an image file (jpg/png).


Saving 8897423.jpg to 8897423.jpg


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/984 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/3.05G [00:00<?, ?B/s]

image_encoder/model.fp16.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/196M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


  0%|          | 0/25 [00:00<?, ?it/s]

Video generated and saved at: /content/generated_video.mp4


In [1]:
# 2. AI-Based Short Video Generation from Image and Text Inputs Using Diffusion Models
# Generates one clip from an uploaded image and one from a text prompt, loading
# the two models one after the other so they never share GPU memory.

import gc

from diffusers import StableVideoDiffusionPipeline, DiffusionPipeline
import torch, imageio, numpy as np
from PIL import Image
from google.colab import files
from IPython.display import Video, display

# Settings a reader may want to tune.
FRAME_SIZE = 384   # square resolution for the image-based clip; divisible by 8
NUM_FRAMES = 4     # fewer frames to save memory
FPS = 8            # playback rate of both saved clips
SEED = 42          # fixed seed so re-running the cell reproduces the same clips


def _as_uint8_frame(frame):
    """Return one video frame as a uint8 array suitable for the video writer.

    PIL images and uint8 arrays pass through unchanged. Any other dtype is
    presumed to hold floats in [0, 1] (TODO confirm for the installed diffusers
    version) and is clipped and scaled to 0-255.
    """
    arr = np.asarray(frame)
    if arr.dtype == np.uint8:
        return arr
    return (np.clip(arr, 0.0, 1.0) * 255).round().astype(np.uint8)


# Fail early with a readable message instead of a CUDA error during model load.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab choose Runtime > Change runtime type > GPU."
    )

# STEP 1: Upload image
print("Please upload an image file (JPG or PNG):")
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled; indexing the empty dict would raise IndexError.
    raise ValueError("No file was uploaded. Re-run this cell and choose an image.")
image_path = list(uploaded.keys())[0]
print(f"Image uploaded successfully: {image_path}")

image = Image.open(image_path).convert("RGB")
image = image.resize((FRAME_SIZE, FRAME_SIZE))  # small frame to save VRAM

# -------- IMAGE → VIDEO --------
print("\nGenerating video from uploaded image...")
img_model = "stabilityai/stable-video-diffusion-img2vid-xt"

img_pipe = StableVideoDiffusionPipeline.from_pretrained(
    img_model, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# height/width are passed explicitly: per the diffusers API docs the pipeline
# otherwise resizes the input to its own default resolution, which would undo
# the memory saving intended by the resize above.
img_result = img_pipe(
    image,
    height=FRAME_SIZE,
    width=FRAME_SIZE,
    num_frames=NUM_FRAMES,
    generator=torch.manual_seed(SEED),
)
img_frames = img_result.frames[0]

image_video_path = "/content/image_video.mp4"
imageio.mimsave(image_video_path, [_as_uint8_frame(f) for f in img_frames], fps=FPS)

print("\nImage-based video generated.")

# Free the image pipeline from GPU. gc.collect() runs first so that objects
# still held only by reference cycles are destroyed before the CUDA cache is
# emptied; otherwise their memory would stay reserved.
del img_pipe
gc.collect()
torch.cuda.empty_cache()

# -------- TEXT → VIDEO --------
print("\nGenerating video from text prompt...")
text_model = "damo-vilab/text-to-video-ms-1.7b"

text_pipe = DiffusionPipeline.from_pretrained(
    text_model, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

prompt = "A mountain landscape with clouds moving slowly."
text_result = text_pipe(
    prompt, num_frames=NUM_FRAMES, generator=torch.manual_seed(SEED)
)
text_frames = text_result.frames[0]

text_video_path = "/content/text_video.mp4"
imageio.mimsave(text_video_path, [_as_uint8_frame(f) for f in text_frames], fps=FPS)

print("\nText-based video generated.")

# -------- DISPLAY --------
print("\nImage-based Video:")
display(Video(image_video_path, embed=True, width=500))

print("\nText-based Video:")
display(Video(text_video_path, embed=True, width=500))

print("\nBoth videos have been generated successfully.")


Please upload an image file (JPG or PNG):


Saving 8897423.jpg to 8897423.jpg
Image uploaded successfully: 8897423.jpg

Generating video from uploaded image...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/984 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/196M [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/3.05G [00:00<?, ?B/s]

image_encoder/model.fp16.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


  0%|          | 0/25 [00:00<?, ?it/s]


Image-based video generated.

Generating video from text prompt...


model_index.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/2.82G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


  0%|          | 0/50 [00:00<?, ?it/s]




Text-based video generated.

Image-based Video:



Text-based Video:



Both videos have been generated successfully.


In [2]:
# 3. Speach Generative
!pip install gTTS pydub -q
!apt-get install -y -qq ffmpeg
from gtts import gTTS
from IPython.display import Audio
text = "Welcome to Generative AI and Prompt engineering Tutorial"
print("Input text:\n", text)
tts = gTTS(text, lang="en")
tts.save("story_voice.mp3")
Audio("story_voice.mp3")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInput text:
 Welcome to Generative AI and Prompt engineering Tutorial


In [3]:
# 4. Automatic Background Music Integration for Speech Audio using Python
# Synthesizes a spoken sentence, then mixes it over a quiet sine-tone pad.

from gtts import gTTS
from IPython.display import Audio
from pydub import AudioSegment
from pydub.generators import Sine

VOICE_OFFSET_MS = 1000   # narration starts 1 s into the pad
MIN_PAD_MS = 15000       # the pad is never shorter than 15 s
PAD_FREQ_HZ = 220        # frequency of the ambient tone
PAD_GAIN_DB = -10        # gain applied to the tone so it sits under the voice

text = "Welcome to MSRIT Department of MCA."
tts = gTTS(text, lang="en")
tts.save("voice.mp3")

voice = AudioSegment.from_file("voice.mp3").set_frame_rate(44100).set_channels(2)

# overlay() keeps the length of the base segment and cuts off whatever part of
# the overlaid audio runs past it. The pad is therefore sized from the voice so
# that long narrations are never truncated. len() of a segment is in ms.
pad_ms = max(MIN_PAD_MS, VOICE_OFFSET_MS + len(voice))
pad = Sine(PAD_FREQ_HZ).to_audio_segment(duration=pad_ms).apply_gain(PAD_GAIN_DB)

combined = pad.overlay(voice, position=VOICE_OFFSET_MS)  # start after 1s
combined.export("simple_music.wav", format="wav")

# Last expression of the cell: renders an inline audio player.
Audio("simple_music.wav")

Output hidden; open in https://colab.research.google.com to view.

In [4]:
# 5. Design an AI system that can describe artworks or museum exhibits aloud when an image of an artifact is uploaded.
# The system should automatically analyze the image, generate a descriptive caption, and convert it into speech narration.
!pip install -q gTTS transformers torch torchvision pillow -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m127.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m553.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m142.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Caption an uploaded image with BLIP, then read the caption aloud with gTTS.
import torch
from google.colab import files
from gtts import gTTS
from IPython.display import Audio, display
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

BLIP_MODEL_ID = "Salesforce/blip-image-captioning-base"
MAX_CAPTION_TOKENS = 30              # upper bound on generated caption length
NARRATION_PATH = "image_speech.mp3"  # where the spoken caption is written

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Please upload an image (jpg/png)")
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled; indexing the empty dict would raise IndexError.
    raise ValueError("No file was uploaded. Re-run this cell and choose an image.")
image_path = list(uploaded.keys())[0]
image = Image.open(image_path).convert("RGB")

# Load the captioning model and its matching pre/post-processor.
processor = BlipProcessor.from_pretrained(BLIP_MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_ID).to(device)

# Encode the image, generate caption token ids, and decode them to text.
inputs = processor(image, return_tensors="pt").to(device)
out = model.generate(**inputs, max_length=MAX_CAPTION_TOKENS)
caption = processor.decode(out[0], skip_special_tokens=True).strip()
print("\n Generated Description:")

print(caption)

# There is nothing to narrate without a caption, so stop with a clear message
# before handing an empty string to the speech synthesizer.
if not caption:
    raise ValueError("The model produced an empty caption; nothing to narrate.")
tts = gTTS(caption, lang="en")
tts.save(NARRATION_PATH)

print("\n Image analyzed and converted to speech successfully!")
display(Audio(NARRATION_PATH))

Please upload an image (jpg/png)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Saving 8897423.jpg to 8897423 (1).jpg


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]


 Generated Description:
a pirate ship in the ocean

 Image analyzed and converted to speech successfully!
