
## Convert a video to a podcast (2 people - video format)

In [None]:
import sieve

#### Step 1. Download a YouTube video

In [None]:
# Look up the hosted "sieve/youtube_to_mp4" function first, then set its inputs.
youtube_to_mp4 = sieve.function.get("sieve/youtube_to_mp4")

# Inputs are passed positionally to run() in this order:
# url, resolution, include_audio.
url = "https://www.youtube.com/watch?v=AKJfakEsgy0"
include_audio = True
resolution = "highest-available"

# `output_video` is reused by the summarization step that follows.
output_video = youtube_to_mp4.run(url, resolution, include_audio)

# Print the `path` attribute of the returned object.
print(output_video.path)

#### Step 2. Summarize it into a conversational style

In [None]:

# Look up the hosted "sieve/visual-qa" function.
visual_qa = sieve.function.get("sieve/visual-qa")

# NOTE(review): the next step parses lines shaped like "Speaker: text"; this
# prompt does not request that format explicitly — confirm the model output matches.
prompt = "Summarize the video into a conversation between two people."
backend = "gemini-1.5-flash" #for more complex tasks use "gemini-1.5-pro"
audio_context = True
# NOTE(review): presumably the frame sampling rate; 30 may be costly — TODO confirm.
fps = 30

# Arguments are passed positionally: video, backend, prompt, fps, audio_context.
output = visual_qa.run(output_video, backend, prompt, fps, audio_context)

#### Step 3. Process summary text (summary as conversation between 2 people)

In [None]:
def _split_speaker(line):
    """Split one dialogue line into a ``(speaker, text)`` tuple.

    The speaker is everything before the first colon and the text is
    everything after it, both stripped of surrounding whitespace. Only the
    first colon is treated as the separator, so a label or colon that appears
    again inside the spoken text is left untouched. A line without any colon
    is attributed to the speaker ``"Unknown"``.
    """
    label, colon, remainder = line.partition(":")
    if not colon:
        # No "Speaker:" prefix at all: keep the whole sentence as the text.
        return "Unknown", line.strip()
    return label.strip(), remainder.strip()


# Split the summary on newlines and drop lines that are empty or whitespace-only.
conversation_list = [line for line in output.split('\n') if line.strip()]

# Get a list of dialogues with each item as a tuple of format (speaker_id, text).
summary_conversation_list = [_split_speaker(line) for line in conversation_list]

# Display the processed conversation
for speaker, text in summary_conversation_list:
    print(f"{speaker}: {text}")

Person 1: Did you know that giant clams are more efficient at converting energy from the sun than solar panels or even leaves? <br><br>
Person 2: That's really interesting! What's their secret? <br><br>
Person 1: They have algae living inside them. The algae takes light from the sun and converts it to energy for itself and the clam. <br><br>
Person 2: Wow! So we could learn from them to make better solar energy?<br><br>
Person 1: That's right. People are trying to extract energy from algae, but our current process is only 10% efficient. These clams are 67% efficient.<br><br>
Person 2: How do they do it?<br><br>
Person 1: It's because of these shimmery cells on the clam. They reflect sunlight down onto the algae inside the clam, evenly coating them with the perfect dose of light.<br><br>
Person 2: Fascinating! There's so much to learn from nature. <br><br>

#### Step 4. Convert each conversation turn text to speech & generate its talking avatar

Feed each turn of the processed conversation (`summary_conversation_list`) to *sieve/tts*, alternating between the speaker 1 and speaker 2 voices → generates audio1, audio2, audio3, … (odd-numbered files belong to speaker 1 and even-numbered files belong to speaker 2).
During each turn, also generate its talking avatar.


In [None]:
import subprocess

def reencode_video(input_path, output_path):
    """
    Re-encode a video to normalize codec properties.

    Runs ``ffmpeg`` to transcode ``input_path`` to 30 fps H.264 video
    (``libx264``, preset ``fast``, CRF 23) with AAC audio, so that separately
    generated clips end up with the same stream parameters.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the re-encoded video to write. An existing file
            at this path is overwritten.

    Returns:
        None. Success or failure is reported on stdout; a failing or missing
        ``ffmpeg`` does not raise.
    """
    command = [
        "ffmpeg",
        # Overwrite without prompting: re-running the notebook would otherwise
        # stop on the already-existing output file.
        "-y",
        "-loglevel", "warning",
        "-i", input_path,
        "-r", "30",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        output_path
    ]

    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except FileNotFoundError:
        # The ffmpeg executable itself could not be launched.
        print("Error occurred while re-encoding video: ffmpeg executable not found.")
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while re-encoding video: {input_path}")
        print(e.stderr)  # ffmpeg writes its diagnostics to stderr
    else:
        print(f"Re-encoded: {input_path} -> {output_path}")
        print(result.stdout)  # Optionally print the standard output

In [None]:
# Step 4. Convert each conversation-turn's text to speech & generate its talking avatar.
tts = sieve.function.get("sieve/tts")
portrait_avatar = sieve.function.get("sieve/portrait-avatar")

# tts inputs:
print("generating tts audio and its avatar video...")
odd_voice = "cartesia-commercial-man"
even_voice = "cartesia-sweet-lady"
reference_audio = sieve.File(url="")  # omitting this argument makes the tts call raise an error

# portrait-avatar inputs
# Both images pass the URL by keyword so it cannot be bound to a different
# positional parameter of sieve.File.
odd_image = sieve.File(url="https://storage.googleapis.com/sieve-prod-us-central1-public-file-upload-bucket/c4d968f5-f25a-412b-9102-5b6ab6dafcb4/4c35f0b2-5925-4acc-8870-0b06641fd5f6-boy.jpg")
even_image = sieve.File(url="https://storage.googleapis.com/sieve-prod-us-central1-public-file-upload-bucket/dea37047-9b88-44b7-aacb-a5f4745f1f2d/db7a439e-24f8-40cd-b29d-43935e1a2ae7-input-source_image.jpg")
aspect_ratio = "16:9"

turn = 0  # stays 0 when there are no conversation turns
normalized_videos = []
for turn, (speaker, text) in enumerate(summary_conversation_list, start=1):
    # Voice and portrait alternate with the turn number (1-based), not with the
    # parsed speaker label: odd turns use the first pair, even turns the second.
    if turn % 2 != 0:
        parity, voice, image = "odd", odd_voice, odd_image
    else:
        parity, voice, image = "even", even_voice, even_image

    target_audio = tts.run(voice, text, reference_audio, "curiosity")
    avatar_video = portrait_avatar.run(source_image=image, driving_audio=target_audio, aspect_ratio=aspect_ratio)
    print(f'{parity} turn: done tts and avatar generation for turn-{turn}')

    # Encode generated video to ensure same frame rate, video codec, audio codec and similar video quality.
    normalized_video = f"normalized_{turn}.mp4"
    reencode_video(avatar_video.path, normalized_video)
    normalized_videos.append(normalized_video)
print("done generating video avatars for all individual conversation turns!")

#### Step 5. Merge video files

In [None]:
def merge_videos(video_list_file):
    """
    Merge videos listed in the video_list_file into a single video.

    Every non-blank line of ``video_list_file`` is taken as a video path. Each
    video is re-encoded with ``reencode_video`` into ``normalized_<n>.mp4``
    (1-based, in the current directory), the normalized names are written to
    ``normalized_videos.txt``, and ffmpeg's concat demuxer joins them with
    stream copy into ``merged_output_video.mp4``. All listed videos are
    merged; there is no cap on their number.

    Args:
        video_list_file: Path of a text file with one video path per line.

    Returns:
        The output path ``"merged_output_video.mp4"``. The path is returned
        even when the list is empty or ffmpeg fails; those problems are
        printed rather than raised.
    """
    output_path = "merged_output_video.mp4"

    # Step 1: Read the video list, ignoring blank lines and stray whitespace.
    with open(video_list_file, 'r') as f:
        video_files = [line.strip() for line in f if line.strip()]

    if not video_files:
        # Nothing to concatenate; skip ffmpeg instead of feeding it an empty list.
        print(f"No videos listed in {video_list_file}; nothing to merge.")
        return output_path

    # Step 2: Normalize (re-encode) each video so stream copy can join them.
    # NOTE(review): outputs are named normalized_<n>.mp4; confirm the list file
    # does not itself name those files, or input and output would coincide.
    normalized_videos = []
    for i, video in enumerate(video_files, start=1):
        normalized_video = f"normalized_{i}.mp4"
        reencode_video(video, normalized_video)
        normalized_videos.append(normalized_video)

    # Step 3: Write the normalized video list in the concat demuxer's format.
    with open('normalized_videos.txt', 'w') as f:
        f.writelines(f"file '{name}'\n" for name in normalized_videos)

    # Step 4: Concatenate the normalized videos without re-encoding.
    command = [
        "ffmpeg",
        "-y",  # overwrite a merged file left over from a previous run
        "-loglevel", "warning",
        "-f", "concat",
        "-safe", "0",
        "-i", "normalized_videos.txt",
        "-c", "copy",
        output_path
    ]

    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except FileNotFoundError:
        # The ffmpeg executable itself could not be launched.
        print("Error occurred while merging videos: ffmpeg executable not found.")
    except subprocess.CalledProcessError as e:
        print("Error occurred while merging videos.")
        print(e.stderr)  # ffmpeg writes its diagnostics to stderr
    else:
        print("Merge successful!")
        print(result.stdout)  # Optionally print the standard output
    return output_path

# Example usage: merge the clips listed, one path per line, in 'videos.txt'.
# NOTE(review): no visible cell writes 'videos.txt' — confirm the file exists
# (and lists the clips to merge) before running this cell.
final_output = merge_videos('videos.txt')