In [1]:
from crewai import Task, Agent, Crew, Process
from langchain.tools import tool
import re
import os
from langchain_groq import ChatGroq
# Shared chat model: every Agent defined further down receives this object via `llm=llm`.
# NOTE(review): ChatGroq presumably picks up its API key from the environment — confirm it is set before running.
llm = ChatGroq(model='llama3-70b-8192', temperature=1.0, max_tokens=2048)

In [2]:
from diffusers import DiffusionPipeline, StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
import bitsandbytes as bnb
import torch.nn as nn
import torch
import pyttsx3
import os

# Load the "sd-community/sdxl-flash" SDXL checkpoint with float16 weights and move the pipeline to CUDA.
pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
# Swap in a DPMSolverSinglestepScheduler built from the current scheduler's config,
# overriding timestep_spacing with "trailing".
pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

def quantize_model_to_4bit(model):
    """Swap every ``nn.Linear`` found inside ``model`` for a ``bnb.nn.Linear4bit``.

    The module tree is scanned completely before anything is replaced, so the
    hierarchy is never modified while it is being iterated. Each replacement
    layer is created with the same in/out feature counts and bias setting as
    the layer it replaces, and its ``weight.data`` (and ``bias.data`` when a
    bias exists) is pointed at the original layer's tensors.

    Args:
        model: Module whose linear sub-layers should be replaced. It is
            modified in place.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Phase 1: snapshot the (dotted name, layer) pairs that need replacing.
    linear_layers = [
        (qualified_name, layer)
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, nn.Linear)
    ]

    # Phase 2: walk to each layer's owner and rebind the attribute.
    for qualified_name, layer in linear_layers:
        parent_path, _, attr_name = qualified_name.rpartition('.')
        owner = model
        if parent_path:
            for step in parent_path.split('.'):
                owner = getattr(owner, step)

        has_bias = layer.bias is not None
        replacement = bnb.nn.Linear4bit(layer.in_features, layer.out_features, bias=has_bias)
        replacement.weight.data = layer.weight.data
        if has_bias:
            replacement.bias.data = layer.bias.data
        setattr(owner, attr_name, replacement)

    return model

# Replace the UNet's nn.Linear layers with 4-bit layers (see quantize_model_to_4bit),
# then switch on the pipeline's model CPU offload.
# NOTE(review): the pipeline was already moved to 'cuda' when it was created; confirm that
# combining that with enable_model_cpu_offload() is intended.
pipe.unet = quantize_model_to_4bit(pipe.unet)
pipe.enable_model_cpu_offload()

def generate_speech(text, speech_dir='./outputs/audio', lang='en', speed=170, voice='default', num=0):
    """
    Generates speech for given script.

    The text is synthesised with pyttsx3 and written to
    ``<speech_dir>/speech_<num>.mp3``; an existing file with that name is
    removed first.

    Args:
        text: Narration to speak.
        speech_dir: Directory the audio file is written to. Created if it
            does not exist yet.
        lang: Currently unused; kept so existing callers keep working.
        speed: Value assigned to the engine's 'rate' property.
        voice: 'default', or a substring of the name of an installed voice.
        num: Index used in the output file name.

    Returns:
        The path of the audio file that was requested from the engine.

    Raises:
        ValueError: If ``voice`` is not 'default' and no installed voice name
            contains it.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices') or []

    if voice == 'default':
        # The second installed voice is the historical default. Fall back to the
        # first one (or to the engine's own default) instead of raising
        # IndexError on systems with fewer than two voices.
        if len(voices) > 1:
            voice_id = voices[1].id
        elif voices:
            voice_id = voices[0].id
        else:
            voice_id = None
    else:
        # First voice whose name contains the requested string.
        voice_id = next((v.id for v in voices if voice in v.name), None)
        if not voice_id:
            raise ValueError(f"Voice '{voice}' not found.")

    if voice_id is not None:
        engine.setProperty('voice', voice_id)
    engine.setProperty('rate', speed)

    # A missing output directory would otherwise make the save fail.
    os.makedirs(speech_dir, exist_ok=True)
    speech_path = os.path.join(speech_dir, f'speech_{num}.mp3')
    if os.path.exists(speech_path):
        os.remove(speech_path)

    # NOTE(review): the container/codec pyttsx3 actually writes may depend on the
    # platform driver — confirm the '.mp3' extension matches the real content.
    engine.save_to_file(text, speech_path)
    engine.runAndWait()
    return speech_path

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips

@tool
def create_video_from_images_and_audio(image_dir, audio_dir):
    """Creates video using images and audios"""
    # The one-line docstring above is what langchain's @tool hands to the agent
    # as the tool description, so it is kept verbatim.
    #
    # Behaviour: pairs the files of `image_dir` and `audio_dir` in natural
    # (numeric-aware) filename order, shows each image for the duration of its
    # audio file, concatenates the segments and writes them to `output_path`.
    # Returns the path of the written video.
    # Raises ValueError when either directory contains no files.
    output_path = "outputs/final_video/final_video.mp4"

    def _natural_key(filename):
        # re.split with a capture group alternates text / digit runs, so odd
        # positions are always numbers: "image10.jpg" sorts after "image2.jpg".
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(re.split(r'(\d+)', filename))
        ]

    # os.listdir returns entries in arbitrary order; sort both listings so that
    # image N is paired with speech N and segments appear in sequence.
    images_paths = sorted(os.listdir(image_dir), key=_natural_key)
    audio_paths = sorted(os.listdir(audio_dir), key=_natural_key)
    print(images_paths, audio_paths)

    if not images_paths or not audio_paths:
        raise ValueError(
            f"Need at least one image in '{image_dir}' and one audio file in '{audio_dir}' to build a video."
        )

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    audio_clips = []
    final_clip = None
    try:
        clips = []
        # zip stops at the shorter listing, matching the original min(...) bound.
        for image_name, audio_name in zip(images_paths, audio_paths):
            audioclip = AudioFileClip(os.path.join(audio_dir, audio_name))
            audio_clips.append(audioclip)

            img_clip = ImageClip(os.path.join(image_dir, image_name))
            # Show the still image exactly as long as its narration lasts.
            videoclip = img_clip.set_duration(audioclip.duration).set_audio(audioclip)
            clips.append(videoclip)

        final_clip = concatenate_videoclips(clips)
        final_clip.write_videofile(output_path, codec='libx264', fps=24)
    finally:
        # Release the audio readers even if rendering fails.
        if final_clip is not None:
            final_clip.close()
        for audioclip in audio_clips:
            audioclip.close()

    # Report where the file really is (the old return value omitted the directory).
    return output_path

# Example usage
# image_paths = "outputs/images"
# audio_paths = "outputs/audio"

# video_path = create_video_from_images_and_audio(image_paths, audio_paths)
# print(f"Video created at: {video_path}")

In [6]:
from langchain.pydantic_v1 import BaseModel, Field
class ImageGeneration(BaseModel):
    # Argument schema used by `image_generator` (passed as `args_schema`).
    # Plain comments are used instead of a class docstring so the generated
    # schema itself stays unchanged.

    # Prompt text for the image.
    text: str = Field(
        description='description of sentence used for image generation',
    )
    # Sequence index; ends up in the saved file name.
    num: int = Field(
        description='sequence of description passed this tool. Used in image saving path.',
    )

class SpeechGeneration(BaseModel):
    # Argument schema describing the inputs of a speech-generation tool.
    # The field descriptions were copy-pasted from the image schema and talked
    # about images; they now describe narration / audio.
    # Plain comments are used instead of a class docstring so that only the
    # field descriptions of the generated schema change.

    # Narration sentence to be spoken.
    text : str = Field(description='narration sentence used for speech generation')
    # Sequence index; ends up in the saved audio file name.
    num : int = Field(description='sequence of narration passed this tool. Used in speech saving path.')

In [7]:
@tool(args_schema=ImageGeneration)
def image_generator(text,num):
    """Generates images for the given narration.
    Saves it to images_dir and return path"""
    # The docstring above is used by langchain's @tool as the tool description
    # shown to the agent, so it is kept verbatim.
    #
    # text: image prompt, optionally wrapped in <image>...<image> (or
    #       <image>...</image>) tags.
    # num:  sequence index used in the output file name image<num>.jpg.
    # Returns a human-readable acknowledgement string for the agent.
    images_dir = './outputs/images'

    # re.findall returns a list. Passing that list on (as before) made the
    # pipeline treat it as a batch of prompts, leaked a list repr into the
    # acknowledgement, and failed outright when nothing matched.
    tagged = re.findall(r'<image>(.*?)</?image>', text, flags=re.DOTALL)
    if tagged:
        # One call produces one image, so use the first tagged prompt.
        text = tagged[0].strip()
    else:
        # Untagged input, or a stray/unbalanced tag: just drop the markers.
        text = text.replace('</image>', '').replace('<image>', '').strip()

    image = pipe(text, num_inference_steps=6, guidance_scale=2, width=720, height=1280, verbose=0).images[0]

    # Make sure the target directory exists before saving.
    os.makedirs(images_dir, exist_ok=True)
    image_path = os.path.join(images_dir, f'image{num}.jpg')
    if os.path.exists(image_path):
        os.remove(image_path)
    image.save(image_path)
    return f'image generated for "{text}" and saved to directory {images_dir} as image{num}.jpg'

@tool
def speech_generator(text, num):
    """Generates speech for given text
    Saves it to speech_dir and return path"""
    # The docstring above is used by langchain's @tool as the tool description
    # shown to the agent, so it is kept verbatim.
    #
    # text: narration, optionally wrapped in <narration>...<narration> (or
    #       <narration>...</narration>) tags.
    # num:  sequence index used in the output file name speech_<num>.mp3.
    # Returns a human-readable acknowledgement string for the agent.
    speech_dir = './outputs/audio'

    # re.findall returns a list; the speech engine needs a plain string, so the
    # tagged pieces are joined instead of forwarding the list itself.
    tagged = re.findall(r'<narration>(.*?)</?narration>', text, flags=re.DOTALL)
    if tagged:
        text = ' '.join(piece.strip() for piece in tagged)
    else:
        # Untagged input, or a stray/unbalanced tag: just drop the markers.
        text = text.replace('</narration>', '').replace('<narration>', '').strip()

    generate_speech(text, speech_dir, num=num)
    # generate_speech writes 'speech_<num>.mp3' (with an underscore); report that name.
    return f'speech generated for "{text}" and saved to directory {speech_dir} as speech_{num}.mp3'

In [11]:
# Agent that writes the narration; it is assigned to `story_writing_task`.
# No tools are attached and delegation is disabled.
script_agent = Agent(
    role='Senior Content Writer',
    goal='Craft engaging, concise, and informative narrations for YouTube short videos',
    backstory="""As a seasoned content writer, you excel at breaking down complex topics into captivating narratives that educate and entertain audiences. Your expertise lies in writing concise, attention-grabbing scripts for YouTube short videos.""",
    verbose=True,
    llm=llm,
    allow_delegation=False
)

# Agent that turns each narration sentence into an image prompt; it is assigned
# to `img_text_task`. No tools are attached and delegation is disabled.
# The backstory previously ended in the unfinished fragment "You will ", which
# has been dropped so the prompt ends on a complete sentence.
image_descriptive_agent = Agent(
    role='Visual Storyteller',
    goal='Design stunning, contextually relevant visuals for YouTube short videos',
    backstory='With a keen eye for visual storytelling, you create compelling imagery that elevates the narrative and captivates the audience.',
    verbose=True,
    llm=llm,
    allow_delegation=False
)

# Agent shared by `img_generation_task` and `speech_generation_task`.
# It has no tools of its own here; each of those tasks supplies its tool via `tools=[...]`.
img_speech_generating_agent = Agent(
    role='Multimedia Content Creator',
    goal='Generate high-quality images and speech for YouTube short videos based on provided narrations',
    backstory='As a multimedia expert, you excel at creating engaging multimedia content that brings stories to life.',
    verbose=True,
    llm=llm,
    allow_delegation=False
)

# editor = Agent(
#     role='Video editor',
#     goal='To make a full video for YouTube shorts.',
#     backstory="""You are a video editor working for a YouTube creator.
#     You are expert in making small videos by using only images and audios
#     Then you wil combine these small videos to make full video""",
#     verbose=True,
#     llm=llm,
#     allow_delegation = False,
#     tools = [video_create, concatenate_videos]
# )

# Agent that assembles the final video; it is assigned to `make_video_task` and is
# the only agent that carries a tool directly (`create_video_from_images_and_audio`).
editor = Agent(
    role = 'Video editor',
    goal = 'To make a video for YouTube shorts.',
    backstory = "You are a video editor working for a YouTube creator",
    verbose=True,
    llm=llm,
    allow_delegation = False,
    tools = [create_video_from_images_and_audio]
)

# First task of the pipeline: write the narration for `{topic}` (the placeholder is
# filled from the `inputs` dict given to `crew.kickoff`).
# The expected-output prompt had misspelled instructions ("immensive experice",
# "Folow"); only those words were corrected, the example is unchanged.
story_writing_task = Task(
    description='Write an engaging narration for a YouTube short video on the topic: {topic}',
    expected_output="""A short paragraph suitable for narrating in five seconds that also provides an immersive experience to the audience. Follow the below example for output length and format.

    **Example:**

    **topic:**
    Powerful Kings of History

    **narration:**
    In the pages of history, powerful kings have shaped the destinies of nations.
    From Alexander the Great to Genghis Khan, their conquests have etched unforgettable legacies across civilizations.
    Their leadership continues to inspire awe and fascination to this day.
    """,
    agent=script_agent
)

# Second task: rewrite the narration (context: `story_writing_task`) as alternating
# <narration>…<narration> / <image>…<image> lines. The tag format — same marker for
# opening and closing — is what `image_generator` / `speech_generator` search for,
# so it is left exactly as is. Only the prompt typos ("outupt", missing space
# after the comma) were corrected.
img_text_task = Task(
    description='Given the narration, visually describe each sentence in the narration which will be used as a prompt for an image generation.',
    expected_output="""Sentences encoded in <narration> and <image> tags. Follow the below example for output format.

    example input:
    In the pages of history, powerful kings have shaped the destinies of nations. From Alexander the Great to Genghis Khan, their conquests have etched unforgettable legacies across civilizations. Their leadership continues to inspire awe and fascination to this day.

    output format:
    <narration>In the pages of history, powerful kings have shaped the destinies of nations.<narration>
    <image>An epic portrayal of ancient kings standing triumphantly, clad in regal attire, commanding their kingdoms with strength and wisdom, amidst grandeur and splendor.<image>
    <narration>From Alexander the Great to Genghis Khan, their conquests have etched unforgettable legacies across civilizations.<narration>
    <image>Dramatic portraits of Alexander the Great and Genghis Khan, adorned in battle armor, leading their armies across vast landscapes and leaving a lasting mark on history.<image>
    <narration>Their leadership continues to inspire awe and fascination to this day.<narration>
    <image>A powerful visual of kings seated on thrones, symbols of authority and ambition, evoking admiration and wonder, against a backdrop of their enduring achievements.<image>
    """,
    agent=image_descriptive_agent,
    context=[story_writing_task]
)

# Third task: generate one image per <image>-tagged sentence from `img_text_task`'s
# output, using the `image_generator` tool supplied at task level.
img_generation_task = Task(
    description='Given the input generate images for each sentence enclosed in <image> tag.',
    expected_output="""Acknowledgement of image generation""",
    tools = [image_generator],
    context = [img_text_task],
    # async_execution=True,
    agent=img_speech_generating_agent
)

# Fourth task: generate one audio file per <narration>-tagged sentence from
# `img_text_task`'s output, using the `speech_generator` tool supplied at task level.
speech_generation_task = Task(
    description='Given the input generate speech for each sentence enclosed in <narration> tag.',
    expected_output="""Acknowledgement of speech generation""",
    tools = [speech_generator],
    context = [img_text_task],
    # async_execution=True,
    agent=img_speech_generating_agent
)

# video_create_task = Task(
#     description='Create a video using images and audio from the folder "outpus/images" and "outputs/audio"',
#     expected_output="""Acknowledgement of small videos generation""",
#     agent=editor,
#     context = [img_generation_task, speech_generation_task],
    
# )

# final_video_create_task = Task(
#     description = 'create a full video using the small videos from the folder "outputs/temp_videos"',
#     expected_output = """Acknowledgement of full video generation""",
#     context = [video_create_task],
#     agent = editor,
# )

# Final task: have the `editor` agent combine the generated images and audio into
# one video. The directory names in the description are what the agent passes to
# the video tool, so they must match where the generator tools save their files
# ('./outputs/images' and './outputs/audio'); the previous "outpus/images" typo
# pointed the agent at a directory that does not exist.
make_video_task = Task(
    description = 'Create video using images and audios from the folders "outputs/images" and "outputs/audio"',
    expected_output = "output video path",
    agent=editor,
    context = [img_generation_task, speech_generation_task]
)

In [16]:
# Assemble the four agents and five tasks into one crew. `Process.sequential` is
# requested, and the tasks are listed in pipeline order:
# narration -> tagged prompts -> images -> speech -> video.
crew = Crew(
    agents=[script_agent, image_descriptive_agent, img_speech_generating_agent, editor],
    tasks=[story_writing_task, img_text_task, img_generation_task,speech_generation_task,make_video_task],
    process = Process.sequential,
    # cache = True,
    # memory=True,
    verbose=2
)

# Run the crew; 'topic' fills the `{topic}` placeholder in story_writing_task's description.
result = crew.kickoff(inputs={'topic': 'Africa'})



[1m[95m [2024-07-02 11:46:47][DEBUG]: == Working Agent: Senior Content Writer[00m
[1m[95m [2024-07-02 11:46:47][INFO]: == Starting Task: Write an engaging narration for a YouTube short video on the topic: Africa[00m


[1m> Entering new CrewAgentExecutor chain...[0m
Formatting errors incremented
[32;1m[1;3m[0m[32;1m[1;3mThought: I now can give a great answer

Final Answer:

**topic:**
Africa

**narration:**
From the majestic savannas to the vibrant cities, Africa beats with a rhythm of rich culture and untamed beauty. 
Home to over 50 countries and 2,000 languages, this incredible continent is a treasure trove of diversity and wonder. 
Join us as we embark on an unforgettable journey to explore the heart of Africa.[0m

[1m> Finished chain.[0m
[1m[92m [2024-07-02 11:46:48][DEBUG]: == [Senior Content Writer] Task output: **topic:**
Africa

**narration:**
From the majestic savannas to the vibrant cities, Africa beats with a rhythm of rich culture and untamed beauty. 
Home

  0%|          | 0/6 [00:00<?, ?it/s]

[95m 

image generated for "A breathtaking aerial view of the African savannas, with acacia trees scattered across the horizon, followed by a seamless transition to a bustling cityscape, showcasing a kaleidoscope of colors, patterns, and textures, symbolizing the continent's rich cultural heritage and untamed beauty." and saved to directory ./outputs/images as image1.jpg
[00m
[32;1m[1;3mThought:
Thought: I need to generate images for each sentence enclosed in <image> tag.

Action: image_generator
Action Input: {"text": "A breathtaking aerial view of the African savannas, with acacia trees scattered across the horizon, followed by a seamless transition to a bustling cityscape, showcasing a kaleidoscope of colors, patterns, and textures, symbolizing the continent's rich cultural heritage and untamed beauty.", "num": 1}[0m[95m 

I tried reusing the same input, I must stop using this action input. I'll try something else instead.


[00m
[32;1m[1;3mThought: I need to generate image

KeyboardInterrupt: 

In [13]:
# Display the value returned by crew.kickoff().
result

'./final_video.mp4'