# Imports

In [6]:
# Path of the video file this notebook works on.
video_path = "V1.mp4"

In [7]:
import os
import json

from textwrap import dedent
import moviepy.editor as mp
from pydub import AudioSegment
import speech_recognition as sr
from moviepy.editor import VideoFileClip

from langchain_groq import ChatGroq
from crewai import Agent, Task, Process, Crew


# Functions

## Getting Text From Audio

In [8]:
def transcribe_video(video_path):
    """Transcribe the speech in a video with Google Speech Recognition.

    The audio track is written to a temporary WAV file, transcribed in a
    single request, and the temporary file is removed afterwards.

    Args:
        video_path: Path of the video file to transcribe.

    Returns:
        The recognized text. If recognition fails, a human-readable error
        message is returned in its place (same string type as a transcript).

    Raises:
        ValueError: If the video has no audio track.
    """
    recognizer = sr.Recognizer()
    audio_path = "temp_audio.wav"

    transcript = ""
    try:
        video = mp.VideoFileClip(video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in {video_path!r}")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the readers the clip keeps open on the video file.
            video.close()

        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
        transcript = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        transcript = "Google Speech Recognition could not understand audio"
    except sr.RequestError as e:
        transcript = f"Could not request results from Google Speech Recognition service; {e}"
    finally:
        # The audio export now happens inside the try, so a partially written
        # file is cleaned up as well.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return transcript

## Getting captions from LLM

In [9]:
# Read the Groq API key from the environment so the secret never lives in the
# notebook or its version history. Export GROQ_API_KEY before starting the
# kernel. NOTE: any key that was previously committed in this cell should be
# revoked and replaced.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY is not set; requests to Groq will fail.")
# Example input text (matches the transcript produced for V1.mp4).
Text_Input = """I love you Grand rising good Vibes to all status I'm ok happy Monday Happy Tuesday happy Wednesday happy Thursday happy Friday and a good New Year everyone"""
# Name of the Groq-hosted model to use.
LLM_Model = "llama3-70b-8192"
def create_gif_caption_identifier_task(api_key, model, text_input):
    """Ask a Groq-hosted LLM to pick GIF-caption sentences out of a text.

    Builds a one-agent, one-task CrewAI crew around a ``ChatGroq`` model and
    runs it sequentially.

    Args:
        api_key: Groq API key passed to ``ChatGroq``.
        model: Groq model name passed to ``ChatGroq``.
        text_input: Text in which to look for caption-worthy sentences.

    Returns:
        Whatever ``Crew.kickoff()`` returns. The prompt asks the model to
        answer with a JSON object holding a ``"captions"`` list.
    """
    groq_llm = ChatGroq(
        api_key=api_key,
        model=model
    )

    caption_agent = Agent(
        role="GIF Caption Finder",
        goal="Identify sentences that can be used as text over GIFs",
        backstory=dedent("""\
            You're a GIF caption expert tasked with identifying sentences that are ideal for adding text overlays to GIFs. 
            Your role is crucial in selecting text fragments that convey impactful and memorable messages to enhance the visual appeal and communication of the GIFs. 
            Your task is to carefully read the provided text and list all sentences that are suitable for GIF captions, ensuring they are concise, relevant, and engaging.
        """),
        verbose=True,
        allow_delegation=True,
        llm=groq_llm,
        memory=True,
    )

    # Dedent the template BEFORE inserting the text. Interpolating first (as
    # an f-string does) lets a multi-line text_input with unindented lines
    # remove the common leading whitespace, so dedent would strip nothing and
    # the whole prompt would stay indented. Doubled braces are literal braces.
    task_description = dedent("""\
        **INPUT:**
            * **Text:**
                {text_input}
        **TASK:**
            Identify sentences within the text that can be used as captions for GIFs. ENSURE THE SELECTED SENTENCES ARE EXACTLY AS IN THE INPUT TEXT. NO SINGLE WORD OR LETTER SHOULD BE DIFFERENT OR NEW, and the sentences should be impactful, very short and contextually appropriate for a GIF.

        **OUTPUT:**
            Output the selected sentences as a JSON object, with each sentence being a separate entry.

        **EXAMPLE OUTPUT:**
            {{
                "captions": [
                    "This is a perfect moment!",
                    "I can't believe this is happening.",
                    "Let's make this unforgettable."
                ]
            }}
    """).format(text_input=text_input)

    caption_task = Task(
        description=task_description,
        agent=caption_agent,
        expected_output=dedent("""\
            {
                "captions": [
                    "Sentence 1",
                    "Sentence 2",
                    ...
                    "Sentence n"
                ]
            }
        """),
        async_execution=False
    )

    crew = Crew(
        agents=[caption_agent],
        tasks=[caption_task],
        verbose=2,
        process=Process.sequential,
    )

    return crew.kickoff()

## Getting the exact clip for each GIF caption, along with the respective list of captions

In [None]:
def extract_audio_from_video(video_path, audio_path):
    """Write the audio track of a video to an audio file.

    Args:
        video_path: Path of the source video.
        audio_path: Destination path for the extracted audio.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the readers the clip keeps open on the video file.
        video.close()

def transcribe_audio_chunk(recognizer, audio_chunk):
    """Transcribe one audio file and return the text in lower case.

    Args:
        recognizer: Recognizer used to record and transcribe the audio.
        audio_chunk: Audio file to open with ``sr.AudioFile``.

    Returns:
        The lower-cased transcription, or "" when the speech could not be
        understood.
    """
    with sr.AudioFile(audio_chunk) as wav_source:
        recorded = recognizer.record(wav_source)
        try:
            spoken_text = recognizer.recognize_google(recorded)
        except sr.UnknownValueError:
            return ""
        else:
            return spoken_text.lower()

def find_sentence_times(audio_path, target_sentence, initial_chunk_duration=2000, max_chunk_duration=10000):
    """Estimate when a sentence is spoken inside an audio file.

    The audio is cut into overlapping chunks, each chunk is transcribed, and
    the transcript is searched for ``target_sentence`` (case-insensitively).
    If no chunk contains it, the chunk length grows by one second and the
    scan restarts, until ``max_chunk_duration`` is exceeded.

    The position inside a matching chunk is a linear interpolation over
    character offsets in the transcript, so the returned times are estimates.

    Args:
        audio_path: Path of the audio file to search.
        target_sentence: Sentence to look for.
        initial_chunk_duration: First chunk length to try, in milliseconds.
        max_chunk_duration: Largest chunk length to try, in milliseconds.

    Returns:
        ``(start_time, end_time)`` in seconds, or ``(None, None)`` when the
        sentence was not found (or is empty).

    Raises:
        ValueError: If ``initial_chunk_duration`` is not positive.
    """
    import tempfile  # local import: only this function needs it

    if initial_chunk_duration <= 0:
        raise ValueError("initial_chunk_duration must be a positive number of milliseconds")

    target_sentence = target_sentence.lower().strip()
    if not target_sentence:
        # "" is a substring of every transcript (even an empty one), which
        # would otherwise lead to a division by zero below.
        return None, None

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_file(audio_path)
    audio_length = len(audio)  # milliseconds

    chunk_duration = initial_chunk_duration
    # Overlap stays at half the INITIAL chunk length, so consecutive chunks
    # start (chunk_duration - overlap_duration) ms apart.
    overlap_duration = initial_chunk_duration // 2

    # Keep the scratch WAV out of the working directory and guarantee cleanup.
    with tempfile.TemporaryDirectory() as scratch_dir:
        chunk_path = os.path.join(scratch_dir, "chunk.wav")

        while chunk_duration <= max_chunk_duration:
            for i in range(0, audio_length, chunk_duration - overlap_duration):
                chunk = audio[i:i + chunk_duration]
                # export() hands back an open file handle; close it right away.
                chunk.export(chunk_path, format="wav").close()

                text = transcribe_audio_chunk(recognizer, chunk_path)
                if target_sentence not in text:
                    continue

                sentence_start_index = text.find(target_sentence)
                sentence_end_index = sentence_start_index + len(target_sentence)

                # The last chunk of the file can be shorter than
                # chunk_duration; scale by its real length so the estimate
                # never runs past the end of the audio.
                chunk_seconds = len(chunk) / 1000.0
                chunk_start_seconds = i / 1000.0

                start_time = chunk_start_seconds + sentence_start_index / len(text) * chunk_seconds
                end_time = chunk_start_seconds + sentence_end_index / len(text) * chunk_seconds
                return start_time, end_time

            chunk_duration += 1000  # not found: retry with chunks one second longer

    return None, None

def save_video_clip(video_path, start_time, end_time, output_path):
    """Cut a time range out of a video and save it as an H.264 file.

    Args:
        video_path: Path of the source video.
        start_time: Start of the clip, passed to ``subclip``.
        end_time: End of the clip, passed to ``subclip``. A numeric value
            larger than the video duration is clamped to the duration.
        output_path: Destination path for the encoded clip.
    """
    video = VideoFileClip(video_path)
    try:
        # Estimated end times can overshoot slightly; never ask for frames
        # past the end of the video. Non-numeric time formats are left as-is.
        if isinstance(end_time, (int, float)) and video.duration is not None:
            end_time = min(end_time, video.duration)

        clip = video.subclip(start_time, end_time)
        clip.write_videofile(output_path, codec="libx264")
    finally:
        # Release the readers the clip keeps open on the video file.
        video.close()

## Adding Captions

# Test

In [10]:
# Use the configured video_path instead of repeating the file name here, so
# changing the input video only needs one edit.
Text_Transcript = transcribe_video(video_path)
print(Text_Transcript)

MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
I love you Grand rising good Vibes to all status I'm ok happy Monday Happy Tuesday happy Wednesday happy Thursday happy Friday and a good New Year everyone


In [11]:
# Ask the LLM to pick caption-worthy sentences out of the transcript.
GIF_Sentences = create_gif_caption_identifier_task(
    api_key=GROQ_API_KEY,
    model=LLM_Model,
    text_input=Text_Transcript,
)
print(GIF_Sentences)

[1m[95m [2024-06-21 15:44:48][DEBUG]: == Working Agent: GIF Caption Finder[00m
[1m[95m [2024-06-21 15:44:48][INFO]: == Starting Task: **INPUT:**
    * **Text:**
        I love you Grand rising good Vibes to all status I'm ok happy Monday Happy Tuesday happy Wednesday happy Thursday happy Friday and a good New Year everyone
**TASK:**
    Identify sentences within the text that can be used as captions for GIFs. ENSURE THE SELECTED SENTENCES ARE EXACTLY AS IN THE INPUT TEXT. NO SINGLE WORD OR LETTER SHOULD BE DIFFERENT OR NEW, and the sentences should be impactful, very short and contextually appropriate for a GIF.

**OUTPUT:**
    Output the selected sentences as a JSON object, with each sentence being a separate entry.

**EXAMPLE OUTPUT:**
    {
        "captions": [
            "This is a perfect moment!",
            "I can't believe this is happening.",
            "Let's make this unforgettable."
        ]
    }
[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m

In [13]:
# The crew result is not guaranteed to be bare JSON: it may be a result object
# rather than a str, and the LLM may wrap the JSON in prose or code fences.
# Parse only the outermost {...} span of its text form.
raw_crew_output = str(GIF_Sentences)
json_start = raw_crew_output.find("{")
json_end = raw_crew_output.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(f"No JSON object found in the LLM output: {raw_crew_output!r}")

temp = json.loads(raw_crew_output[json_start:json_end + 1])
GIF_Sentences_LIST = temp["captions"]
print(GIF_Sentences_LIST)

['Grand rising', 'Happy Monday', 'Happy Tuesday', 'Happy Wednesday', 'Happy Thursday', 'Happy Friday', 'Good Vibes to all', "I'm ok", 'Happy New Year everyone']


In [None]:
video_path = "V1.mp4"
audio_path = "audio.wav"

# Pull the audio track out once; every caption search below reuses it.
extract_audio_from_video(video_path, audio_path)

# Maps "caption_<n>" to the file holding the clip for the n-th caption.
clips_paths = {}

for i, caption in enumerate(GIF_Sentences_LIST, 1):
    start_time, end_time = find_sentence_times(audio_path, caption)

    # Captions that could not be located in the audio get no clip.
    if start_time is None:
        print(f"The sentence '{caption}' was not found in the audio.")
        continue

    output_path = f"output_clip_{i}.mp4"
    save_video_clip(video_path, start_time, end_time, output_path)
    clips_paths[f"caption_{i}"] = output_path
    print(f"The sentence '{caption}' starts at {start_time} seconds and ends at {end_time} seconds. Saved to {output_path}")

print(clips_paths)
