In [None]:
# Python version requested for the conda environment created below.
python_version = "3.9"

In [None]:
# Name of the conda environment; also used to build the PATH entry further down.
env_name = "name"

In [None]:
# Create the environment with the chosen Python version and pip, answering
# every prompt with "yes" (-y).
!conda create -n {env_name} python={python_version} pip -y

In [None]:
# Clean conda's caches non-interactively (-y: yes to prompts, -a: all).
!conda clean -ya

In [None]:
import os

In [None]:
# Remember the current PATH so the environment's bin directory can be prepended to it.
old_path = os.environ["PATH"]

In [None]:
# Put the new environment's bin directory first so that later `!python` / `!pip`
# calls resolve to it.
# NOTE(review): assumes conda environments live under /opt/conda/envs - confirm
# for the target runtime.
new_path = f"/opt/conda/envs/{env_name}/bin:{old_path}"

In [None]:
# Apply the modified PATH to the notebook's environment.
%env PATH=$new_path

In [None]:
# Print the interpreter version that `python` now resolves to.
!python --version

In [None]:
# Download and run the Ollama install script.
!curl -fsSL https://ollama.com/install.sh | sh

# To support background processes in Kaggle: replace IPython's shell hook with
# os.system so the trailing `&` below can leave the server running.
import os
get_ipython().system = os.system

# Start the Ollama server in the background.
!ollama serve &

# Explore more ollama models here: https://ollama.com/library
# NOTE(review): this pull is issued immediately after the server is launched;
# if the server is not yet accepting connections the pull may fail - consider
# waiting briefly or retrying.
!ollama pull zephyr

In [None]:
# Explore more ollama models here: https://ollama.com/library
!ollama pull mistral

In [None]:
!ollama pull llama3

In [None]:
!ollama pull qwen2

In [None]:
# This is the model tag passed to main.py in the final cell of this notebook.
!ollama pull llama3:instruct

In [None]:
# System speech package.
# NOTE(review): presumably required by the phoneme-based TTS model loaded in
# main.py - confirm.
!apt-get install espeak-ng -y

In [None]:
# Python dependencies imported by main.py (-q keeps pip's output quiet).
# NOTE(review): main.py imports `moviepy.editor`; no versions are pinned here,
# so confirm the installed moviepy release still provides that module.
!pip install -q tts accelerate moviepy diffusers langchain pydub openai-whisper ffmpeg-python Pillow

In [None]:
# Provides `langchain_community.llms.Ollama`, which main.py imports.
!pip install langchain-community

In [None]:
%%writefile main.py
import torch
from pydub import AudioSegment
from diffusers import StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
from langchain_community.llms import Ollama
from langchain import PromptTemplate, LLMChain
from moviepy.editor import *
from TTS.api import TTS
import whisper
import os
import json
import ffmpeg
from PIL import Image, ImageDraw, ImageFont

# Initialize models and pipeline
def initialize_models():
    """Load the image-generation pipeline and the text-to-speech model.

    The compute device is ``"cuda"`` when ``torch.cuda.is_available()`` is
    true and ``"cpu"`` otherwise. Both models are moved to that device.

    Returns:
        tuple: ``(pipe, tts, device)`` where ``pipe`` is the
        ``StableDiffusionXLPipeline`` for ``sd-community/sdxl-flash``, ``tts``
        is the ``TTS`` model ``tts_models/en/ljspeech/tacotron2-DDC_ph`` and
        ``device`` is the device string chosen above.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Half precision is only used on the GPU. On the CPU fallback the weights
    # are loaded in float32, because float16 inference is generally not
    # supported by CPU kernels and would fail at generation time.
    dtype = torch.float16 if device == "cuda" else torch.float32

    # Load stable diffusion pipeline
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "sd-community/sdxl-flash", torch_dtype=dtype
    ).to(device)
    # Swap in a single-step DPM solver built from the pipeline's own scheduler
    # configuration, with "trailing" timestep spacing.
    pipe.scheduler = DPMSolverSinglestepScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )

    # Initialize TTS model
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC_ph", progress_bar=False).to(device)

    return pipe, tts, device


# Models are loaded once at import time and shared by the functions below
# through these module-level names.
pipe, tts, device = initialize_models()

# LLM initialization
def get_llm(model_name):
    """Build the Ollama LLM client for ``model_name``.

    Prints which model is being built, then returns an ``Ollama`` instance
    created with that model name and a temperature of 0.4.
    """
    print(f"Building LLM Model: {model_name}")
    llm = Ollama(model=model_name, temperature=0.4)
    return llm

# Prompt templates
def build_template():
    """Return the two prompt templates used to request a video script.

    Both templates contain the ``{topic}` placeholder and include a worked
    example using ``Title:``, ``Description:``, ``Visual:`` and ``Narration:``
    lines, which is the line format ``parse_llm_response`` looks for.

    Returns:
        tuple: ``(quiz_template, fact_template)`` - note the order: the quiz
        template comes first even though it is defined second.
    """
    fact_template = '''
You are a talented YouTube content creator working for a famous YouTuber. Your task is to create a 35-second YouTube Short script on "Top 3 facts about {topic}" for a wide audience. Start with "Here are the Top 3 facts about {topic}. The last one will blow your mind." Present the topics in descending order from 3rd to 1st. Be creative and engaging.

Close the script by asking viewers to like, subscribe, and share the video.

Example:

Title: Top 5 Mysterious Facts About Ancient Indian Men
Description: Intriguing facts about ancient Indian men.
Visual: A statue of an ancient Indian man in the city.
Narration: "Here are the Top 5 Mysterious Facts About Ancient Indian Men. The last one will blow your mind!"
Visual: Ancient Indian men doing yoga, transitioning to a modern yoga class.
Narration: "Fact 5: Yoga, practiced by millions today, was created by ancient Indian men thousands of years ago."
Visual: An ancient Indian man observing the night sky, transitioning to a modern planetarium.
Narration: "Fact 4: Ancient Indian men were stellar astronomers who calculated the Earth's circumference and theorized about gravity centuries before Western science."
Visual: Ruins of the Indus Valley Civilization with undeciphered scripts.
Narration: "Fact 3: The Indus Valley Civilization's script remains undeciphered. Their urban planning and drainage systems are still a mystery."
Visual: Ancient Ayurvedic texts, transitioning to modern herbal medicine.
Narration: "Fact 2: Ayurveda, one of the oldest holistic healing systems, was pioneered by ancient Indian men. Their texts detail surgical procedures and herbal medicines."
Visual: Artistic depictions of Vimanas flying, transitioning to ancient scriptures and modern interpretations.
Narration: "Fact 1: The Vimana, described in ancient texts as flying machines, suggest advanced technology or extraterrestrial contact."
Visual: YouTube's Like and Share buttons.
Narration: "If these facts blew your mind, like, subscribe, and share this video. See you next time!"

Create your script following this format. Keep it short, interesting, and simple.
    '''  # end of fact_template
    quiz_template = '''
You are a skilled YouTube content creator working for a renowned Investigation quiz Creator. Your task is to craft a 55-second YouTube Short script focused on an intriguing investigation quiz on {topic}.

Begin by setting the scene, introduce each suspect individually, and present compelling clues to your audience. Ensure your narration is creative and captivating.

Conclude the script by inviting viewers to identify the culprit in the comments, and encourage them to like, subscribe, and share the video.

Example:

Title: Quick Investigation Quiz: Who Stole the Diamond Necklace?
Description: Can you solve the mystery in 55 seconds?
Visual: A detective's magnifying glass focusing on a diamond necklace.
Narration: "Hey detectives! Ready for a quick investigation quiz? Let's see if you can crack the case in 55 seconds!"
Visual: A hotel room in disarray, with a broken window illustrating the crime scene.
Narration: "Here's the scene: A valuable diamond necklace has gone missing from this hotel room. The room was ransacked, and the window was broken from the inside. There are three suspects who were in the hotel at the time."
Visual: Mr. White, looking concerned.
Narration: "Suspect A: Mr. White, the hotel manager, who has access to all rooms."
Visual: Ms. Blue, looking sly.
Narration: "Suspect B: Ms. Blue, a famous jewel thief recently spotted in the area."
Visual: Mr. Red, looking nervous.
Narration: "Suspect C: Mr. Red, a guest staying in the room next door, known for his shady dealings."
Visual: An illustration of some evidence boxes in the police locker.
Narration: "Here are the clues:"
Visual: A glove on the floor, with a magnifying glass hovering over it.
Narration: "Clue 1: A glove was found on the floor, but it doesn't belong to the room's owner."
Visual: A key card in the hallway.
Narration: "Clue 2: A room key card was found in the hallway, registered to Mr. Red."
Visual: Security camera footage showing Ms. Blue entering the hotel.
Narration: "Clue 3: A security camera shows Ms. Blue entering the hotel at the time of the theft."
Visual: The three suspects (i.e., Mr. White, Ms. Blue, and Mr. Red) side by side.
Narration: "Who do you think stole the diamond necklace? Was it Mr. White, Ms. Blue, or Mr. Red? Comment your answer below!"
Visual: YouTube's Like and Share buttons.
Narration: "Don't forget to like, share, and subscribe for more investigation quizzes!"

Craft your script following this structure. Keep it concise, intriguing, and engaging for your audience.
    '''  # end of quiz_template
    return quiz_template, fact_template

# LLM chain
def get_llm_chain(template, llm):
    """Pipe a prompt built from ``template`` into ``llm``.

    The prompt declares a single input variable, ``topic``. The returned
    object is the result of ``prompt | llm``.
    """
    print("Building LLM Chain with prompt and model")
    topic_prompt = PromptTemplate(input_variables=["topic"], template=template)
    chain = topic_prompt | llm
    return chain

# Parsing LLM response
def parse_llm_response(response):
    """Extract the title, description, visuals and narrations from a script.

    Each line is classified by the label (``Title:``, ``Description:``,
    ``Visual:`` or ``Narration:``) that appears *earliest* in it, so label
    words occurring later inside the text (for example a narration that
    mentions "Title:") no longer cause the line to be misclassified. Anything
    before the label (indentation, bullets, numbering, markdown asterisks) is
    ignored, and the value is taken from just after the colon whether or not
    a space follows it. Surrounding whitespace and markdown ``*`` emphasis
    are trimmed from the value; quotation marks are kept.

    Args:
        response: The raw text returned by the LLM.

    Returns:
        tuple: ``(title, description, visuals, narrations)``. ``title`` and
        ``description`` are strings (empty when absent; the last occurrence
        wins). ``visuals`` and ``narrations`` are lists in order of
        appearance.
    """
    print("Parsing response from LLM model")

    labels = ("Title", "Description", "Visual", "Narration")

    title = ""
    description = ""
    visuals = []
    narrations = []

    for line in response.splitlines():
        # Find every label present and keep the one that starts first.
        hits = [
            (position, label)
            for label in labels
            for position in [line.find(label + ":")]
            if position != -1
        ]
        if not hits:
            continue
        position, label = min(hits)

        # Text after "<label>:", minus whitespace and markdown emphasis such
        # as the trailing "**" of "**Title:** Foo".
        value = line[position + len(label) + 1:].strip().strip("*").strip()

        if label == "Title":
            title = value
        elif label == "Description":
            description = value
        elif label == "Visual":
            visuals.append(value)
        else:
            narrations.append(value)

    return title, description, visuals, narrations

# Convert text to images
def convert_text_to_image(visuals, output_dir='images'):
    """Render one image per visual description and return the saved paths.

    ``output_dir`` is created when it does not exist. Each description is
    sent to the module-level ``pipe`` with ", realism" appended, at 768x1344
    with 7 inference steps and guidance scale 3. The first generated image is
    saved as ``output_image<index>.png`` inside ``output_dir``.

    Returns:
        list: File paths of the saved images, in the order of ``visuals``.
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        os.makedirs(output_dir)

    print("Converting text to images")

    saved_paths = []
    for index, scene in enumerate(visuals):
        target = os.path.join(output_dir, f'output_image{index}.png')
        generation = pipe(
            f"{scene}, realism",
            width=768,
            height=1344,
            num_inference_steps=7,
            guidance_scale=3,
        )
        first_image = generation.images[0]
        first_image.save(target)
        saved_paths.append(target)

    return saved_paths

# Change audio speed
def change_speed(audio, speed_factor):
    """Return ``audio`` sped up (or slowed down) by ``speed_factor``.

    The raw samples are re-labelled with a frame rate scaled by
    ``speed_factor`` and the result is then converted back to the original
    frame rate.
    """
    scaled_rate = int(audio.frame_rate * speed_factor)
    retimed = audio._spawn(audio.raw_data, overrides={"frame_rate": scaled_rate})
    return retimed.set_frame_rate(audio.frame_rate)

# Generate speech from text
def generate_speech(narrations, speed_factor=1.25, output_dir='speeches'):
    """Synthesize one WAV file per narration and return the file paths.

    ``output_dir`` is created when it does not exist. Each narration is
    written to ``speech<index>.wav`` by the module-level ``tts`` model, then
    reloaded, passed through ``change_speed`` with ``speed_factor`` and
    exported over the same file.

    Returns:
        list: Paths of the generated WAV files, in the order of ``narrations``.
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        os.makedirs(output_dir)

    wav_paths = []
    for index, sentence in enumerate(narrations):
        wav_path = os.path.join(output_dir, f"speech{index}.wav")

        # Synthesize first, then overwrite the file with the re-timed audio.
        tts.tts_to_file(text=sentence, file_path=wav_path)
        retimed = change_speed(AudioSegment.from_file(wav_path), speed_factor)
        retimed.export(wav_path, format="wav")

        wav_paths.append(wav_path)

    return wav_paths

# Generate video
def generate_video(image_files, audio_files, output_file):
    """Assemble the final video from per-scene images and narration audio.

    Each image is shown for the duration of the audio file paired with it.
    The concatenated narration is written to ``output_audio.mp3`` and
    transcribed (word timings go to ``data.json``). Subtitle frames are
    rendered at 768x1344 and overlaid, background music is mixed in, and the
    result is encoded to ``output_file`` with libx264 video and AAC audio.

    Args:
        image_files: Paths of the scene images, in playback order.
        audio_files: Paths of the narration audio files, in the same order.
        output_file: Destination path of the rendered video.

    Raises:
        ValueError: If there is not at least one image/audio pair.
    """
    image_files = list(image_files)
    audio_files = list(audio_files)

    # zip() below stops at the shorter list; say so instead of silently
    # dropping the unmatched scenes.
    if len(image_files) != len(audio_files):
        print(
            f"Warning: {len(image_files)} images but {len(audio_files)} audio files; "
            "unmatched items will be ignored."
        )

    scene_pairs = list(zip(image_files, audio_files))
    if not scene_pairs:
        # Fail early with a clear message rather than deep inside moviepy.
        raise ValueError("generate_video needs at least one image/audio pair")

    video_clips = []
    for image, audio in scene_pairs:
        audio_clip = AudioFileClip(audio)
        img_clip = ImageClip(image).set_duration(audio_clip.duration).set_audio(audio_clip)
        video_clips.append(img_clip)

    final_video = concatenate_videoclips(video_clips, method="compose")
    final_video = final_video.set_fps(24)

    # Export the narration track on its own so it can be transcribed.
    final_audio = final_video.audio
    final_audio.fps = 44100
    audio_file_path = 'output_audio.mp3'
    final_audio.write_audiofile(audio_file_path)

    # The next two steps communicate through data.json on disk.
    transcribe_audio_with_timestamps(audio_file_path)

    line_level_timestamps = split_text_into_lines()

    # Subtitle frames use the same size as the generated images.
    create_subtitle_images_with_highlight(line_level_timestamps, 768, 1344)

    final_video = add_highlighted_subtitles(final_video, line_level_timestamps)

    final_video = generate_video_with_bg_audio(final_video, "/kaggle/input/prisms/prism.mp3")

    final_video.write_videofile(output_file, codec="libx264", audio_codec="aac")


# Transcribe audio with timestamps
def transcribe_audio_with_timestamps(audiofilename):
    """Transcribe ``audiofilename`` and store word-level timings in data.json.

    Loads the "medium" Whisper model on the module-level ``device`` and
    transcribes with word timestamps enabled. The output file holds a list of
    ``{"word", "start", "end"}`` dictionaries, one per word of every segment,
    with each word stripped of surrounding whitespace.
    """
    model = whisper.load_model("medium", device=device)
    transcription = model.transcribe(audiofilename, word_timestamps=True)

    word_entries = []
    for segment in transcription['segments']:
        for token in segment['words']:
            word_entries.append({
                'word': token['word'].strip(),
                'start': token['start'],
                'end': token['end'],
            })

    with open('data.json', 'w') as out_file:
        json.dump(word_entries, out_file, indent=4)

# Split text into lines for subtitles
def split_text_into_lines():
    """Group the word timings stored in data.json into short subtitle lines.

    Words are appended to the current line one at a time. The line is closed
    *after* adding a word when any of these holds: the summed word durations
    exceed 1 second, the space-joined text is longer than 10 characters, or
    the gap between this word and the previous one exceeds 1.5 seconds.

    Returns:
        list: One dictionary per line with keys ``"word"`` (the joined text),
        ``"start"``, ``"end"`` and ``"textcontents"`` (the word entries).
    """
    with open('data.json', 'r') as in_file:
        entries = json.load(in_file)

    max_chars = 10
    max_duration = 1
    max_gap = 1.5

    def as_subtitle(chunk):
        # Build the line-level record for a finished group of words.
        return {
            "word": " ".join(item["word"] for item in chunk),
            "start": chunk[0]["start"],
            "end": chunk[-1]["end"],
            "textcontents": chunk,
        }

    subtitles = []
    pending = []
    elapsed = 0
    previous_end = None  # end time of the preceding word; None for the first

    for entry in entries:
        pending.append(entry)
        elapsed += entry["end"] - entry["start"]
        joined = " ".join(item["word"] for item in pending)

        too_long = elapsed > max_duration
        too_wide = len(joined) > max_chars
        gap_too_big = previous_end is not None and entry["start"] - previous_end > max_gap
        previous_end = entry["end"]

        if too_long or too_wide or gap_too_big:
            subtitles.append(as_subtitle(pending))
            pending = []
            elapsed = 0

    # Flush whatever is left once the input is exhausted.
    if pending:
        subtitles.append(as_subtitle(pending))

    return subtitles

# Create subtitle images
def create_subtitle_image(text, current_word_idx, width, height, font_path='/kaggle/input/boldttf/Roboto-Bold.ttf', font_size=80):
    """Render one subtitle line on a transparent canvas, highlighting a word.

    The line is centred horizontally and vertically. Every word is drawn with
    a 2-pixel black outline; the word at ``current_word_idx`` is filled red
    and the others white.

    Args:
        text: The subtitle line; it is split on whitespace into words.
        current_word_idx: Index (within the split words) of the word to
            highlight.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        font_path: TrueType font file to use. A falsy value, or a font that
            fails to load, falls back to Pillow's default font.
        font_size: Font size passed to ``ImageFont.truetype``.

    Returns:
        PIL.Image.Image: An RGBA image of size ``(width, height)``.
    """
    img = Image.new('RGBA', (width, height), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Deliberately broad: any font-loading problem should degrade to the
    # default font rather than abort the whole render.
    try:
        font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
    except Exception as e:
        print(f"Error loading font: {e}. Using default font.")
        font = ImageFont.load_default()

    words = text.split()
    if not words:
        # Nothing to draw; return the empty transparent canvas.
        return img

    # Measure everything once, up front.
    boxes = [draw.textbbox((0, 0), word, font=font) for word in words]
    word_widths = [box[2] - box[0] for box in boxes]
    line_height = max(box[3] - box[1] for box in boxes)
    space_width = draw.textbbox((0, 0), ' ', font=font)[2]

    # The rendered line spans the words plus the gaps between them, so both
    # must be counted for the text to be centred.
    total_width = sum(word_widths) + space_width * (len(words) - 1)
    x_offset = (width - total_width) // 2

    # One shared y for the whole line keeps all words on the same baseline;
    # centring each word by its own height would make short words sit lower.
    y_offset = (height - line_height) // 2

    outline_offset = 2

    for idx, word in enumerate(words):
        fill = (255, 0, 0, 255) if idx == current_word_idx else (255, 255, 255, 255)

        # Outline: stamp the word in black at every offset around its position.
        for x in range(-outline_offset, outline_offset + 1):
            for y in range(-outline_offset, outline_offset + 1):
                if x != 0 or y != 0:
                    draw.text((x_offset + x, y_offset + y), word, font=font, fill=(0, 0, 0, 255))

        draw.text((x_offset, y_offset), word, font=font, fill=fill)
        x_offset += word_widths[idx] + space_width

    return img

def create_subtitle_images_with_highlight(linelevel_timestamps, width, height, output_dir='subtitles'):
    """Save one subtitle frame per word of every line, each highlighting that word.

    ``output_dir`` is created when it does not exist. For line ``i`` and word
    ``j`` the frame is rendered by ``create_subtitle_image`` from the line's
    full text and saved as ``subtitle_<i>_<j>.png``.
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        os.makedirs(output_dir)

    for line_idx, entry in enumerate(linelevel_timestamps):
        line_text = entry['word']
        for word_idx, _ in enumerate(entry['textcontents']):
            frame = create_subtitle_image(line_text, word_idx, width, height)
            frame.save(os.path.join(output_dir, f"subtitle_{line_idx}_{word_idx}.png"))

# Add highlighted subtitles to video
def add_highlighted_subtitles(video, linelevel_timestamps, input_dir='subtitles'):
    """Overlay the pre-rendered subtitle frames on ``video``.

    For each word of each line, the frame ``subtitle_<i>_<j>.png`` from
    ``input_dir`` is shown from the word's start time for the word's
    duration, positioned at ("center", "bottom"). Missing frames are reported
    and skipped.

    Returns:
        A ``CompositeVideoClip`` of ``video`` followed by all overlays.
    """
    overlays = []

    for line_idx, entry in enumerate(linelevel_timestamps):
        for word_idx, timing in enumerate(entry['textcontents']):
            frame_path = os.path.join(input_dir, f"subtitle_{line_idx}_{word_idx}.png")

            if os.path.exists(frame_path):
                overlay = ImageClip(frame_path)
                overlay = overlay.set_start(timing['start'])
                overlay = overlay.set_duration(timing['end'] - timing['start'])
                overlay = overlay.set_position(("center", "bottom"))
                overlays.append(overlay)
            else:
                print(f"Subtitle image {frame_path} does not exist.")

    return CompositeVideoClip([video, *overlays])

def generate_video_with_bg_audio(video_clip, bg_file_path):
    """Mix background music from ``bg_file_path`` under the clip's own audio.

    The music is taken starting 25 seconds into the file, for the length of
    the video, and played at 0.15 volume. The clip's existing audio is scaled
    to ``1.0 - 0.15``. The two are combined and set as the clip's audio.

    Returns:
        The video clip with the combined audio track.
    """
    music_start = 25  # seconds into the background track
    music_volume = 0.15
    narration_volume = 1.0

    # Cut a slice of the track exactly as long as the video.
    clip_length = video_clip.duration
    music_source = AudioFileClip(bg_file_path)
    music_slice = music_source.subclip(music_start, music_start + clip_length)
    quiet_music = music_slice.volumex(music_volume)

    # Lower the existing audio by the amount the music adds.
    narration = video_clip.audio
    lowered_narration = narration.volumex(narration_volume - music_volume)

    mixed = CompositeAudioClip([lowered_narration, quiet_music])
    return video_clip.set_audio(mixed)

# Main function to generate video with subtitles and background music
def main(model_name, niche, topic):
    """Run the whole pipeline: script, images, speech, then the final video.

    Args:
        model_name: Ollama model used to write the script.
        niche: ``"quiz"`` selects the quiz template; any other value selects
            the facts template.
        topic: Subject passed to the prompt.

    The rendered video is written to ``output_video.mp4``.
    """
    llm = get_llm(model_name)

    quiz_template, fact_template = build_template()
    if niche == "quiz":
        chosen_template = quiz_template
    else:
        chosen_template = fact_template

    chain = get_llm_chain(chosen_template, llm)
    script = chain.invoke(topic)

    # Only the visuals and narrations feed the video; title and description
    # are parsed but not used here.
    _title, _description, visuals, narrations = parse_llm_response(script)

    images = convert_text_to_image(visuals)
    speeches = generate_speech(narrations)

    generate_video(images, speeches, 'output_video.mp4')

# Execute main function
if __name__ == "__main__":
    # ``sys`` is not imported explicitly at the top of this file; importing it
    # here avoids depending on the name leaking in through the wildcard
    # ``from moviepy.editor import *``.
    import sys

    # Expect exactly three arguments: model name, niche and topic.
    if len(sys.argv) != 4:
        sys.exit(f"Usage: python {sys.argv[0]} <model_name> <niche> <topic>")

    model_name, niche, topic = sys.argv[1:]
    main(model_name, niche, topic)

In [None]:
# Run the pipeline. Arguments, in order: Ollama model name, niche ("quiz"
# selects the quiz template, anything else the facts template), and topic.
!python ./main.py "llama3:instruct" "fact" "Kohinoor"