
## Convert a video to a podcast (2 people - video format)

In [None]:
import sieve

#### Step 1. Download a YouTube video

In [None]:
# Handle to the hosted "sieve/youtube_to_mp4" function used for the download.
youtube_to_mp4 = sieve.function.get("sieve/youtube_to_mp4")

# Source video; the commented-out URL is an alternative sample input.
# url = "https://www.youtube.com/watch?v=AKJfakEsgy0"
url = "https://youtube.com/shorts/D-F32ieZ4WA?si=X7QzBXMEuJM6d-E4"

# Download settings, passed positionally after the URL.
include_audio = True
resolution = "highest-available"

# Blocking call; the result is reused by the summarization step below.
output_video = youtube_to_mp4.run(url, resolution, include_audio)

print(output_video.path)

#### Step 2. Summarize it into a conversational style

In [None]:
visual_summarizer = sieve.function.get("sieve/visual-qa")

# Fields of one conversation turn: who speaks and what they say.
_turn_properties = {
    "speaker_name": {"type": "string", "description": "The speaker name"},
    "dialogue": {"type": "string", "description": "dialogue"},
}

# Structured-output schema handed to the summarizer: a list of turn objects.
function_json = {
    "type": "list",
    "items": {"type": "object", "properties": _turn_properties},
}

backend = "gemini-1.5-flash"
prompt = (
    "Summarize the video into a conversation between two people. "
    "Denote first speaker as 'Person 1' and second speaker as 'Person 2'."
)

# The result is iterated turn by turn in the next step (entry['dialogue']).
summary_as_conversation = visual_summarizer.run(
    output_video,
    backend,
    prompt,
    fps=1,
    audio_context=True,
    function_json=function_json,
)
print("Summary: \n", summary_as_conversation)

#### Step 3. Convert each conversation turn text to speech & generate its talking avatar

Feed each turn of the conversation (`summary_as_conversation`) to *sieve/tts*, alternating between the speaker-1 and speaker-2 voices, to generate audio1, audio2, audio3, … (odd-numbered turns belong to speaker 1 and even-numbered turns belong to speaker 2).
For each turn, also generate the talking-avatar video from that turn's audio.


In [None]:
import subprocess

def reencode_video(input_path, output_path):
    """
    Re-encode a video to normalize codec properties.

    Runs ffmpeg to transcode ``input_path`` into ``output_path`` at 30 fps,
    with H.264 video (libx264, preset "fast", CRF 23) and AAC audio, so that
    clips coming from different generations share the same stream parameters.

    Args:
        input_path: Path of the video to re-encode.
        output_path: Path the re-encoded video is written to.

    Returns:
        None. A non-zero ffmpeg exit status is reported on stdout together
        with ffmpeg's stderr; it is not raised to the caller.
    """
    command = [
        "ffmpeg",
        "-loglevel", "warning",
        "-i", input_path,
        "-r", "30",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        output_path
    ]

    # check=True turns a non-zero ffmpeg exit status into CalledProcessError;
    # both output streams are captured so they can be echoed below.
    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except subprocess.CalledProcessError as e:
        # Report the failure against the right operation and input file.
        print(f"Error occurred while re-encoding {input_path}.")
        print(e.stderr)
    else:
        print(f"Re-encoded: {input_path} -> {output_path}")
        print(result.stdout)

In [None]:
# Step 3. Convert each conversation-turn's text to speech & generate its talking avatar.
tts = sieve.function.get("sieve/tts")
portrait_avatar = sieve.function.get("sieve/portrait-avatar")

# tts inputs:
print("generating tts audio and its avatar video...")
odd_voice = "cartesia-commercial-man"
even_voice = "cartesia-sweet-lady"
# An empty reference file is passed explicitly: omitting this argument raises an error.
reference_audio = sieve.File(url="")

# portrait-avatar inputs (both images are remote, so both are passed as url=)
odd_image = sieve.File(url="https://storage.googleapis.com/sieve-prod-us-central1-public-file-upload-bucket/c4d968f5-f25a-412b-9102-5b6ab6dafcb4/4c35f0b2-5925-4acc-8870-0b06641fd5f6-boy.jpg")
even_image = sieve.File(url="https://storage.googleapis.com/sieve-prod-us-central1-public-file-upload-bucket/dea37047-9b88-44b7-aacb-a5f4745f1f2d/db7a439e-24f8-40cd-b29d-43935e1a2ae7-input-source_image.jpg")
aspect_ratio = "1:1"

normalized_videos = []
# Turns are numbered from 1: odd turns use the first speaker, even turns the second.
for turn, entry in enumerate(summary_as_conversation, start=1):
    if turn % 2 != 0:
        label, voice, image = "odd", odd_voice, odd_image
    else:
        label, voice, image = "even", even_voice, even_image

    target_audio = tts.run(voice, entry['dialogue'], reference_audio, "curiosity")
    avatar_video = portrait_avatar.run(source_image=image, driving_audio=target_audio, aspect_ratio=aspect_ratio)
    print(f'{label} turn: done tts and avatar generation for turn-{turn}')

    # Encode generated video to ensure same frame rate, video codec, audio codec and similar video quality.
    normalized_video = f"normalized_{turn}.mp4"
    reencode_video(avatar_video.path, normalized_video)
    normalized_videos.append(normalized_video)
print("done generating video avatars for all individual conversation turns!")

#### Step 4. Merge video files

In [None]:
def merge_videos(video_list_file):
    """
    Merge videos listed in the video_list_file into a single video.

    Every listed video is first re-encoded with ``reencode_video`` into
    ``normalized_<n>.mp4`` (n counted from 1 in list order). The normalized
    files are then concatenated with ffmpeg's concat demuxer using stream copy.

    Args:
        video_list_file: Path to a text file with one input video path per
            line. Blank lines are ignored.

    Returns:
        The output path, "merged_output_video.mp4". The path is returned even
        when ffmpeg reports a failure; the failure is printed, not raised.
    """
    # Step 1: Read the video list, dropping surrounding whitespace and blank lines
    with open(video_list_file, 'r') as f:
        video_files = [line.strip() for line in f if line.strip()]

    # Step 2: Normalize (re-encode) every video in the list
    normalized_videos = []
    for index, video in enumerate(video_files, start=1):
        normalized_video = f"normalized_{index}.mp4"
        reencode_video(video, normalized_video)
        normalized_videos.append(normalized_video)

    # Step 3: Write the normalized video list in the concat demuxer's "file '<path>'" format
    with open('normalized_videos.txt', 'w') as f:
        f.writelines(f"file '{normalized_video}'\n" for normalized_video in normalized_videos)

    # Step 4: Concatenate the normalized videos without re-encoding them again
    output_path = "merged_output_video.mp4"
    command = [
        "ffmpeg",
        "-f", "concat",
        "-safe", "0",
        "-i", "normalized_videos.txt",
        "-loglevel", "warning",
        "-c", "copy",
        output_path
    ]

    # check=True turns a non-zero ffmpeg exit status into CalledProcessError.
    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except subprocess.CalledProcessError as e:
        print("Error occurred while merging videos.")
        print(e.stderr)
    else:
        print("Merge successful!")
        print(result.stdout)
    return output_path

# Example usage: 'videos.txt' must already exist and list one input video
# path per line; the value returned is the path of the merged video.
final_output = merge_videos('videos.txt')

### [Advanced] Parallelized Code Using concurrent.futures

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Step 3 (parallel). Convert each conversation-turn's text to speech & generate its talking avatar.
# Reuses tts, portrait_avatar, odd_voice/even_voice, odd_image/even_image and
# aspect_ratio from the sequential Step 3 cell, and reencode_video from above.
print("generating tts audio and its avatar video...")
reference_audio = sieve.File(url="")  # Required argument to avoid errors


def generate_turn_avatar(turn, dialogue):
    """Run TTS and then avatar generation for one turn; return the avatar video.

    Odd turns use the first speaker's voice and image, even turns the second
    speaker's. Both remote calls block, so this is meant to run in a worker
    thread, one per turn.
    """
    if turn % 2 != 0:  # Odd-turn of conversation
        voice, image = odd_voice, odd_image
    else:  # Even-turn
        voice, image = even_voice, even_image
    target_audio = tts.run(voice, dialogue, reference_audio, "curiosity")
    return portrait_avatar.run(
        source_image=image,
        driving_audio=target_audio,
        aspect_ratio=aspect_ratio,
    )


normalized_videos = []
turn_results = {}  # Dictionary to store normalized videos by turn

# One worker task per turn: the TTS -> avatar chain of a turn stays sequential,
# but different turns overlap instead of waiting for each other.
with ThreadPoolExecutor() as executor:
    future_to_turn = {
        executor.submit(generate_turn_avatar, turn, entry['dialogue']): turn
        for turn, entry in enumerate(summary_as_conversation, start=1)
    }

    # Process avatar generation results as they complete
    for future in as_completed(future_to_turn):
        turn = future_to_turn[future]
        try:
            avatar_video = future.result()  # Re-raises any error from the worker
            print(f"Done TTS and avatar generation for turn-{turn}")

            # Re-encode the video
            normalized_video = f"normalized_{turn}.mp4"
            reencode_video(avatar_video.path, normalized_video)

            # Store normalized video path in a dictionary
            turn_results[turn] = normalized_video
        except Exception as e:
            # Best effort: a failed turn is reported and skipped, the rest continue.
            print(f"Error processing turn-{turn}: {e}")

# Append normalized videos to the list in the sequential order of turns
normalized_videos.extend(turn_results[turn] for turn in sorted(turn_results))


**Advantages of This Approach**
- Concurrency: Multiple TTS and avatar generation tasks are submitted and executed in parallel, reducing the overall processing time.<br>
- Asynchronous Result Handling: *as_completed* ensures that completed tasks are processed immediately, improving efficiency. <br>
- Scalability: Can handle a larger number of turns or tasks without significant changes to the code.
<br><br>
NB:
- Correct Ordering: The `normalized_videos` list is populated in the order of the conversation turns, regardless of the order in which the tasks complete.