In [1]:
from langchain.agents import initialize_agent
from langchain.tools import tool
from diffusers import DiffusionPipeline
from langchain_google_genai import ChatGoogleGenerativeAI
from gtts import gTTS

import bitsandbytes as bnb
import torch.nn as nn
import torch

### LLMs

In [2]:
image_generator = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
image_generator.enable_model_cpu_offload()

def quantize_model_to_4bit(model):
    replacements = []

    # Collect layers to be replaced
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            replacements.append((name, module))

    # Replace layers
    for name, module in replacements:
        # Split the name to navigate to the parent module
        *path, last = name.split('.')
        parent = model
        for part in path:
            parent = getattr(parent, part)

        # Create and assign the quantized layer
        quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
        quantized_layer.weight.data = module.weight.data
        if module.bias is not None:
            quantized_layer.bias.data = module.bias.data
        setattr(parent, last, quantized_layer)

    return model

# Quantize the UNet part of the pipeline
image_generator.unet = quantize_model_to_4bit(image_generator.unet)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
image_generator

StableDiffusionXLPipeline {
  "_class_name": "StableDiffusionXLPipeline",
  "_diffusers_version": "0.29.0",
  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
  "feature_extractor": [
    null,
    null
  ],
  "force_zeros_for_empty_prompt": true,
  "image_encoder": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [15]:
llm = ChatGoogleGenerativeAI(model='gemini-1.5-pro')

### Tools

In [4]:
@tool
def generate_script(text:str)->str:
        """Returns a script for given text"""
        response = llm.invoke(text)
        return response.content

@tool
def generate_image(text):
    """generates image for given script"""
    output = image_generator(text)
    image = output.images[0]
    image.show()
    image.save('outputs/images/generated_img.jpg')
    return image

@tool
def generate_speech(text):
    """Generates speech for given script"""
    filename="output.mp3"
    tts = gTTS(text=text, lang='en')
    tts.save(filename)
    return filename

### Generating Script

In [5]:
input = 'a boy dreaming his future'

In [10]:
prompt_template1 = """Create a story for YouTube Shorts. The story should be engaging, concise (up to 60 seconds), and include a hook, main content, and a call-to-action (CTA). Use the information provided below to generate the script. Make sure to use a conversational and energetic tone suitable for a broad audience.

**Title:** [Title of the Short]
**Description:** [Brief description of the video]
**Tags:** [Relevant tags]

**Topic Text:**
{topic}

**story:**

**[0-5 seconds]:** [Hook - an engaging introduction to capture viewers' attention]

**[5-20 seconds]:** [Main Content Part 1 - present the first main point]

**[20-40 seconds]:** [Main Content Part 2 - present the second main point]

**[40-55 seconds]:** [Main Content Part 3 - present the third main point]

**[55-60 seconds]:** [Call-to-action - prompt viewers to like, comment, and subscribe]

Make sure the script is punchy, direct, and visually descriptive to align with the fast-paced nature of YouTube Shorts."""
# give the output in json format


In [11]:
from langchain.prompts import PromptTemplate

In [12]:
prompt1 = PromptTemplate.from_template(prompt_template1)
prompt1

PromptTemplate(input_variables=['topic'], template="Create a story for YouTube Shorts. The story should be engaging, concise (up to 60 seconds), and include a hook, main content, and a call-to-action (CTA). Use the information provided below to generate the script. Make sure to use a conversational and energetic tone suitable for a broad audience.\n\n**Title:** [Title of the Short]\n**Description:** [Brief description of the video]\n**Tags:** [Relevant tags]\n\n**Topic Text:**\n{topic}\n\n**story:**\n\n**[0-5 seconds]:** [Hook - an engaging introduction to capture viewers' attention]\n\n**[5-20 seconds]:** [Main Content Part 1 - present the first main point]\n\n**[20-40 seconds]:** [Main Content Part 2 - present the second main point]\n\n**[40-55 seconds]:** [Main Content Part 3 - present the third main point]\n\n**[55-60 seconds]:** [Call-to-action - prompt viewers to like, comment, and subscribe]\n\nMake sure the script is punchy, direct, and visually descriptive to align with the fa

In [13]:
from langchain.chains import LLMChain

In [16]:
chain1 = LLMChain(llm=llm, prompt=prompt1, output_key='script')

  warn_deprecated(


In [17]:
script = chain1.invoke(input)

In [18]:
print(script)

{'topic': 'a boy dreaming his future', 'script': "## Title: Dream BIG, Little Man! 🤯🚀\n\n**Description:** This kid's got BIG dreams! See what happens when imagination takes over! 🌠\n\n**Tags:** #dreams #motivation #kids #future #imagination #inspiration #shorts \n\n**[0-5 seconds]:** 😴 **(Close-up shot of a young boy sleeping soundly, then his eyes suddenly pop open with a gasp!)**  Whoa! Did you see that?! This little guy just dreamt up his FUTURE! \n\n**[5-20 seconds]:** 🚀 **(Transition to a montage of the boy dressed as an astronaut, exploring a vibrant, animated planet surface. Upbeat, inspiring music plays in the background.)** First stop, OUTER SPACE! He's walking on Mars, discovering alien life, and even having a snack break with a friendly alien! 👽🥪\n\n**[20-40 seconds]:** 👨\u200d🔬 **(Transition to a scene of the boy in a lab coat, surrounded by bubbling beakers and flashing lights. He's excitedly examining a test tube.)** Next up, he's a world-renowned SCIENTIST, finding cures

In [19]:
type(script)

dict

In [38]:
prompt_template2 = """Given the following YouTube Shorts script and topic, extract key sentences that can be used to generate images. The sentences should be visually descriptive and capture the main ideas or scenes from the script.

**Script:**
{script}

**Topic:**
{topic}

**Sentences for Image Generation:**
1. [First descriptive sentence]
2. [Second descriptive sentence]
3. [Third descriptive sentence]
4. [Fourth descriptive sentence]
5. [Fifth descriptive sentence]

Make sure each sentence is concise and vivid, clearly depicting a scene or idea that can be visualized as an image. If any characters are present in the text, ensure all points describe characters with consistent characteristics such as age, gender, etc."""

In [39]:
prompt2 = PromptTemplate.from_template(prompt_template2)

chain2 = LLMChain(llm=llm, prompt=prompt2, output_key='text_img')
output = chain2.run(script)

In [40]:
print(output)

Here are five sentences from the script that are visually descriptive and capture the main ideas for image generation:

1. **A young boy lays sleeping soundly, his face lit by a sudden gasp and wide, dreaming eyes.** (Captures the opening scene and the "dream" element)
2. **Dressed as an astronaut, the boy explores a vibrant alien planet, walking amongst strange plants and curious creatures.** (Focuses on the space exploration dream) 
3. **Surrounded by bubbling beakers and flashing lights, the boy, now a scientist in a lab coat, examines a glowing test tube with a look of wonder.** (Depicts the science and discovery aspect) 
4. **Bathed in bright stage lights, the boy, holding an electric guitar, plays to a cheering crowd with the energy of a rockstar.** (Illustrates the music and performance dream)
5. **A thought bubble above the boy's head fills with colorful images of an astronaut, a scientist, and a musician, representing the endless possibilities of his dreams.** (Provides a symb

In [27]:
prompt_template3 = """Given the following YouTube Shorts script and topic, extract key sentences that can be used for a voice-over. The sentences should capture the main ideas or scenes from the script, suitable for narration, and be crafted to grab viewers' attention.

**Script:**
{script}

**Topic:**
{topic}

**Sentences for Voice-Over:**
1. [First attention-grabbing voice-over sentence]
2. [Second engaging voice-over sentence]
3. [Third compelling voice-over sentence]
4. [Fourth captivating voice-over sentence]
5. [Fifth intriguing voice-over sentence]

Make sure each sentence is concise, clear, and designed to capture the viewer's interest, effectively conveying the main points or scenes from the script. If any characters are present in the text, ensure all points describe characters with consistent characteristics such as age, gender, etc. Additionally, consider the tone needed to maintain engagement and interest throughout the video.
"""

In [28]:
prompt3 = PromptTemplate.from_template(prompt_template3)

chain3 = LLMChain(llm=llm, prompt=prompt3, output_key='text_speech')
output_= chain2.run(script=output['script'], topic=output['topic'])

In [29]:
output_

"## Sentences for Image Generation:\n\n1. **Ganesha, the elephant-headed god, sits happily with a mountain of sweets piled before him, his stomach wrapped in a snake-belt.** (This captures Ganesha's love for sweets and the story of his burst stomach)\n2. **Shiva, adorned with snakes and a crescent moon, performs the Tandav, a cosmic dance, surrounded by swirling galaxies and exploding stars.** (This visualizes Shiva's power as the master of dance and destruction)\n3. **Lakshmi, the beautiful goddess of wealth, sits gracefully on a lotus flower, her hand outstretched, showering golden coins, while a watchful owl perches beside her.** (This depicts Lakshmi's association with wealth, prosperity, and her fickle nature represented by the owl)\n4. **A montage of vibrant and powerful Hindu deities, including Vishnu, Brahma, Saraswati, and Hanuman, each displaying their unique attributes and powers.** (This showcases the diversity and grandeur of Hindu Gods) \n5. **A curious young person watch

In [23]:
from langchain.chains import SequentialChain

In [41]:
overall_chain = SequentialChain(
    chains=[chain1, chain2, chain3],
    input_variables=["topic"],
    output_variables=["script", "text_img", "text_speech"],
    verbose=True
)

In [42]:
input = 'Indian Gods'

In [43]:
output = overall_chain(input)
output



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


{'topic': 'Indian Gods',
 'script': '## Title: 🤯 3 Indian Gods You WON\'T Believe Exist! 🤯\n\n**Description:** Think you know Indian mythology? Think again! These lesser-known deities are WILD. 🤯\n\n**Tags:** #indianmythology #mythology #india #gods #facts #didyouknow #shorts\n\n**(Video starts with fast-paced, upbeat traditional Indian music playing in the background)**\n\n**[0-5 seconds]**\n\n**(Visually: A montage of famous Indian gods like Ganesha, Shiva, and Lakshmi flashes across the screen.)**\n\n**YouTuber (Energetic, fast-talking):**  Yo, myth fans! We all know Ganesh, right? Remover of obstacles, loves his modak?  But get this…Indian mythology is PACKED with lesser-known deities who are even MORE mind-blowing! 🤯\n\n**(Music cuts to a dramatic sting as the title card appears: 3 Indian Gods You WON\'T Believe Exist!)**\n\n**[5-20 seconds]**\n\n**(Visually: An image of a dog-headed man with a weapon appears.)**\n\n**YouTuber:**  First up: Bhairava! This fierce dude is Shiva’s bo

In [44]:
type(script)

dict

### Generating Images

In [30]:
from langchain.agents import create_react_agent, AgentExecutor

In [47]:
# ReAct template
prompt_template4 = """Based on the following topics, generate images that visually represent the key scenes or ideas discussed. Ensure each image is detailed and visually descriptive. You have access to the following tools:

{tools}

Use the following format:

Question: topics to generate images

Thought: you should always think about what to do

Action: the action to take, should be one of [{tool_names}]

Action Input: the input to the action

Observation: the result of the action

... (this Thought/Action/Action Input/Observation can repeat N times)

Thought: I now know the final answer

Final Answer: the final answer to the original input question

Begin!

Question: {input}

Thought:{agent_scratchpad}"""

In [48]:
prompt4 = PromptTemplate.from_template(prompt_template4)

In [50]:
prompt4

PromptTemplate(input_variables=['agent_scratchpad', 'input', 'tool_names', 'tools'], template='Based on the following topics, generate images that visually represent the key scenes or ideas discussed. Ensure each image is detailed and visually descriptive. You have access to the following tools:\n\n{tools}\n\nUse the following format:\n\nQuestion: topics to generate images\n\nThought: you should always think about what to do\n\nAction: the action to take, should be one of [{tool_names}]\n\nAction Input: the input to the action\n\nObservation: the result of the action\n\n... (this Thought/Action/Action Input/Observation can repeat N times)\n\nThought: I now know the final answer\n\nFinal Answer: the final answer to the original input question\n\nBegin!\n\nQuestion: {input}\n\nThought:{agent_scratchpad}')

In [51]:
agent = create_react_agent(llm, tools=[generate_image], prompt =prompt4)
agent_executor = AgentExecutor(agent=agent, tools=[generate_image])

In [None]:
agent_executor.invoke({"input": output['output']})

In [None]:
agent_executor.invoke({"input": output_})

### Generating Speech

In [52]:
# ReAct template
prompt_template5 = """For the following topics, seperate each topic and generate speech for each one. One speech for one topic. You have access to the following tools:

{tools}

Use the following format:

Question: topics to generate images

Thought: you should always think about what to do

Action: the action to take, should be one of [{tool_names}]

Action Input: the input to the action

Observation: the result of the action

... (this Thought/Action/Action Input/Observation can repeat N times)

Thought: I now know the final answer

Final Answer: the final answer to the original input question

Begin!

Question: {input}

Thought:{agent_scratchpad}"""

In [53]:
prompt5 = PromptTemplate.from_template(prompt_template5)

In [54]:
agent2 = create_react_agent(llm, tools=[generate_speech], prompt =prompt5)
agent_executor2 = AgentExecutor(agent=agent2, tools=[generate_speech])

In [56]:
output['text_speech']

"Here are five sentences for the voice-over, designed to capture attention and highlight the script's key points:\n\n1.  **Think you know Indian gods? Get ready to have your mind blown!** (Energetic, challenging tone)\n2.  **Meet Bhairava, the ultimate guard dog...with a twist!** (Intriguing, slightly humorous)\n3. **Ever heard of a goddess of intoxication?  Get ready to party with Madhavi!** (Playful, suggestive of fun facts)\n4. **Forget billionaires, Kubera is the REAL god of wealth!** (Emphasizes scale, uses modern reference)\n5. **Which god surprised you the most? Let me know in the comments!**  (Encourages interaction, continues the conversation) \n"

In [None]:
import langchain
langchain.debug = True
agent_executor2.invoke({"input": output['text_speech']})
langchain.debug = False

In [4]:
import pyttsx3

def generate_speech(text, lang='en', speed=200, voice='default', filename='output.mp3'):
    """
    Generates speech for given script.
    """
    engine = pyttsx3.init()
    
    # Set language and voice
    voices = engine.getProperty('voices')
    if voice == 'default':
        voice_id = voices[0].id
    else:
        # Try to find the voice with the given name
        voice_id = None
        for v in voices:
            if voice in v.name:
                voice_id = v.id
                break
        if not voice_id:
            raise ValueError(f"Voice '{voice}' not found.")
    
    engine.setProperty('voice', voice_id)
    engine.setProperty('rate', speed)

    engine.save_to_file(text, filename)
    engine.runAndWait()
    
    return filename

# Example usage
text = "Get ready to have your mind blown by these fascinating facts about Indian Gods!"
filename = generate_speech(text, lang='en', speed=180, voice='default', filename='output.mp3')
print(f"Speech generated and saved as: {filename}")

Speech generated and saved as: output.mp3
