In [12]:
from crewai import Task, Agent, Crew, Process
from langchain.tools import tool
import re
import os
from langchain_groq import ChatGroq
llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048)

In [13]:
llm.invoke('hi').content

"Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?"

In [3]:
from diffusers import DiffusionPipeline, StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
import bitsandbytes as bnb
import torch.nn as nn
import torch
import pyttsx3
import os

pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

def quantize_model_to_4bit(model):
    replacements = []

    # Collect layers to be replaced
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            replacements.append((name, module))

    # Replace layers
    for name, module in replacements:
        # Split the name to navigate to the parent module
        *path, last = name.split('.')
        parent = model
        for part in path:
            parent = getattr(parent, part)

        # Create and assign the quantized layer
        quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
        quantized_layer.weight.data = module.weight.data
        if module.bias is not None:
            quantized_layer.bias.data = module.bias.data
        setattr(parent, last, quantized_layer)

    return model

pipe.unet = quantize_model_to_4bit(pipe.unet)
pipe.enable_model_cpu_offload()

def generate_speech(text, speech_dir='./outputs/audio', lang='en', speed=170, voice='default', num=0):
    """
    Generates speech for given script.
    """
    engine = pyttsx3.init()
    
    # Set language and voice
    voices = engine.getProperty('voices')
    if voice == 'default':
        voice_id = voices[1].id
    else:
        # Try to find the voice with the given name
        voice_id = None
        for v in voices:
            if voice in v.name:
                voice_id = v.id
                break
        if not voice_id:
            raise ValueError(f"Voice '{voice}' not found.")
    
    engine.setProperty('voice', voice_id)
    engine.setProperty('rate', speed)
    os.remove(os.path.join(speech_dir, f'speech_{num}.mp3')) if os.path.exists(os.path.join(speech_dir, f'speech_{num}.mp3')) else None
    engine.save_to_file(text, os.path.join(speech_dir, f'speech_{num}.mp3'))
    engine.runAndWait()

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [14]:
import cv2
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
from langchain.pydantic_v1 import BaseModel, Field

class VideoGeneration(BaseModel):
    images_dir : str = Field(description='Path to images directory, such as "outputs/images"')
    speeches_dir : str = Field(description='Path to speeches directory, such as "outputs/speeches"')

@tool(args_schema=VideoGeneration)
def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
    """Creates video using images and audios with zoom-in effect"""
    images_paths = os.listdir(images_dir)
    audio_paths = os.listdir(speeches_dir)
    # print(images_paths, audio_paths)
    clips = []
    
    for i in range(min(len(images_paths), len(audio_paths))):
        # Load the image
        img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))
        
        # Load the audio file
        audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))
        
        # Set the duration of the video clip to the duration of the audio file
        videoclip = img_clip.set_duration(audioclip.duration)
        
        # Apply zoom-in effect to the video clip
        zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)
        
        # Add audio to the zoomed video clip
        zoomed_clip = zoomed_clip.set_audio(audioclip)
        
        clips.append(zoomed_clip)
    
    # Concatenate all video clips
    final_clip = concatenate_videoclips(clips)
    
    # Write the result to a file
    final_clip.write_videofile("outputs/final_video/final_video.mp4", codec='libx264', fps=24)
    
    return "final_video.mp4"

def apply_zoom_in_effect(clip, zoom_factor=1.2):
    width, height = clip.size
    duration = clip.duration

    def zoom_in_effect(get_frame, t):
        frame = get_frame(t)
        zoom = 1 + (zoom_factor - 1) * (t / duration)
        new_width, new_height = int(width * zoom), int(height * zoom)
        resized_frame = cv2.resize(frame, (new_width, new_height))
        
        # Calculate the position to crop the frame to the original size
        x_start = (new_width - width) // 2
        y_start = (new_height - height) // 2
        cropped_frame = resized_frame[y_start:y_start + height, x_start:x_start + width]
        
        return cropped_frame

    return clip.fl(zoom_in_effect, apply_to=['mask'])

# Example usage
# image_paths = "outputs/images"
# audio_paths = "outputs/audio"

# video_path = create_video_from_images_and_audio(image_paths, audio_paths)
# print(f"Video created at: {video_path}")

In [15]:
class ImageGeneration(BaseModel):
    text : str = Field(description='description of sentence used for image generation')
    num : int = Field(description='sequence of description passed this tool. Used in image saving path.')

class SpeechGeneration(BaseModel):
    text : str = Field(description='description of sentence used for image generation')
    num : int = Field(description='sequence of description passed this tool. Used in image saving path.')

In [16]:
@tool(args_schema=ImageGeneration)
def image_generator(text,num):
    """Generates images for the given narration.
    Saves it to images_dir and return path"""
    images_dir = './outputs/images'
    if num==1:
        for filename in os.listdir(images_dir):
            file_path = os.path.join(images_dir, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
    
    if '<image>' in text:
        text = re.findall(r'<image>(.*?)<image>', text)
    else:
        text = text
    image = pipe(text, num_inference_steps=6, guidance_scale=2, width=720, height=1280, verbose=0).images[0]
    # print(num)
    os.remove(os.path.join(images_dir, f'image{num}.jpg')) if os.path.exists(os.path.join(images_dir, f'image{num}.jpg')) else None
    image.save(os.path.join(images_dir, f'image{num}.jpg'))
    return f'image {num} generated.'#f'image generated for "{text}" and saved to directory {images_dir} as image{num}.jpg'

@tool
def speech_generator(text, num):
    """Generates speech for given text
    Saves it to speech_dir and return path"""
    speech_dir = './outputs/speeches'

    if num==1:
        for filename in os.listdir(speech_dir):
            file_path = os.path.join(speech_dir, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
    
    if '<narration>' in text:
        text = re.findall(r'<narration>(.*?)<narration>', text)
    else:
        text = text
    # print(num)
    generate_speech(text, speech_dir, num=num)
    return f'speech {num} generated.'#f'speech generated for "{text}" and saved to directory {speech_dir} as speech{num}.mp3'

In [24]:
script_agent = Agent(
    role='Senior Content Writer',
    goal='Craft engaging, concise, and informative narrations for YouTube short videos',
    backstory="""As a seasoned content writer, you excel at breaking down complex topics into captivating narratives that educate and entertain audiences. Your expertise lies in writing concise, attention-grabbing scripts for YouTube short videos.""",
    verbose=True,
    llm=llm,
    allow_delegation=False
)

image_descriptive_agent = Agent(
    role='Visual Storyteller',
    goal='Design stunning, contextually relevant visuals for YouTube short videos',
    backstory='With a keen eye for visual storytelling, you create compelling imagery that elevates the narrative and captivates the audience. You will ',
    verbose=True,
    llm=llm,
    allow_delegation=False
)

img_speech_generating_agent = Agent(
    role='Multimedia Content Creator',
    goal='Generate high-quality images and speeches for YouTube short videos one after another based on provided descriptions.',
    backstory='As a multimedia expert, you excel at creating engaging multimedia content that brings stories to life.',
    verbose=True,
    llm=llm,
    allow_delegation=False
)

editor = Agent(
    role = 'Video editor',
    goal = 'To make a video for YouTube shorts.',
    backstory = "You are a video editor working for a YouTube creator",
    verbose=True,
    llm=llm,
    allow_delegation = False,
    tools = [create_video_from_images_and_audio]
)

story_writing_task = Task(
    description='Write an engaging narration for a YouTube short video on the topic: {topic}',
    expected_output="""A short paragraph suitable for narrating in five seconds that provides an immersive experience to the audience. Follow the below example for output length and format.

    **Example input:**
    Ancient Wonders of the World

    **Output format:**
    Discover the ancient wonders! From the Great Pyramid to the Hanging Gardens, these marvels still captivate our imaginations today.
    """,
    agent=script_agent
)


img_text_task = Task(
    description='Given the narration, visually describe each sentence in the narration which will be used as a prompt for image generation.',
    expected_output="""Sentences encoded in <narration> and <image> tags. Follow the below example for output format.

    **Example input:**
    Discover the ancient wonders! From the Great Pyramid to the Hanging Gardens, these marvels still captivate our imaginations today.

    **Output format:**
    <narration>1. Discover the ancient wonders!<narration>
    <image>1. A stunning view of various ancient wonders, showcasing their grandeur and mystery.<image>

    <narration>2. From the Great Pyramid to the Hanging Gardens,<narration>
    <image>2. The Great Pyramid of Giza and the lush Hanging Gardens depicted in their full glory.<image>
    
    <narration>3. these marvels still captivate our imaginations today.<narration>
    <image>3. People today gazing at replicas or illustrations of these ancient wonders, filled with awe.<image>
    """,
    agent=image_descriptive_agent,
    context=[story_writing_task]
)


# process_script_task = Task(
#     description="Extract text for image and speech generation from a provided script.",
#     expected_output="A dictionary containing lists of texts for image generation and speech generation.",
#     agent=ScriptSynthesizer
# )

img_generation_task = Task(
    description='Given the input generate images for each sentence enclosed in <image> tag.',
    expected_output="""Acknowledgement of image generation""",
    tools = [image_generator],
    context = [img_text_task],
    # async_execution=True,
    agent=img_speech_generating_agent
)

speech_generation_task = Task(
    description='Given the input generate speech for each sentence enclosed in <narration> tag.',
    expected_output="""Acknowledgement of speech generation""",
    tools = [speech_generator],
    context = [img_text_task],
    # async_execution=True,
    agent=img_speech_generating_agent
)

make_video_task = Task(
    description = 'Create video using images and speeches from the forlders "outpus/images" and "outputs/speeches"',
    expected_output = "output video path",
    agent=editor,
    context = [img_generation_task, speech_generation_task]
)

In [25]:
crew = Crew(
    agents=[script_agent, image_descriptive_agent, img_speech_generating_agent, editor],
    tasks=[story_writing_task, img_text_task, img_generation_task,speech_generation_task,make_video_task],
    process = Process.sequential,
    # cache = True,
    # memory=True,
    verbose=2
)

# result = crew.kickoff(inputs={'topic': 'Italy'})



In [26]:
result

'"final_video.mp4"'

In [28]:
os.path.dirname(os.path.abspath(__name__))

'c:\\Users\\prudh\\Desktop\\Python\\Generative AI\\GenerativeAI-Projects\\10sec Shorts'

In [None]:
import gradio as gr
import os


def generate_video(topic):
    crew.kickoff(inputs={'topic': topic})
    return os.path.join(os.path.dirname(os.path.abspath(__name__)), 'outputs/final_video/final_video.mp4')

app = gr.Interface(
    fn=generate_video,
    inputs='text',
    # outputs=gr.Video(value=os.path.join('outputs/final_video/video.mp4'),label="Generated Video", width=720/2, height=1280/2),
    outputs = gr.Video(format='mp4',label="Generated Video", width=720/2, height=1280/2),
    title="ShortsIn",
    description="Shorts generator"
)

app.launch()
#os.path.dirname(os.path.abspath(__file__))

[32;1m[1;3mLet's start generating images for each sentence enclosed in the <image> tag.

Thought: I need to generate high-quality images for each sentence enclosed in the <image> tag.

Action: image_generator
Action Input: {"text": "A dreamy, soft-focused shot of a person walking through a bustling French market, surrounded by vibrant flowers, pastel-colored buildings, and a subtle Eiffel Tower silhouette in the background, evoking a sense of romance and whimsy.", "num": 1}[0m[95m 

I tried reusing the same input, I must stop using this action input. I'll try something else instead.


[00m
