In [1]:
import time
import os
import json
import imageio
import requests
import argparse
from tqdm import tqdm
from moviepy.editor import VideoFileClip
from utils import encode_image
from movieseq import MovieSeq

import openai
from openai import OpenAI

import whisperx

In [2]:
# Set up OPENAI KEY and Models
# Credentials are read from the environment instead of being written into the
# notebook, so they never end up in a saved or shared copy of this file.
# Export OPENAI_API_KEY (and HF_TOKEN) before starting the kernel.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )

# Please provide Huggingface tokens to access speaker-identify model
# NOTE(review): if HF_TOKEN is unset this passes None as use_auth_token;
# whether that works depends on a cached Hugging Face login -- confirm.
HF_TOKEN = os.getenv("HF_TOKEN")

# load whisperx model
model = whisperx.load_model('large-v3', device='cuda')
# Alignment model is loaded for English only (language_code='en').
model_a, metadata = whisperx.load_align_model(language_code='en', device='cuda')
diarize_model = whisperx.DiarizationPipeline(model_name='pyannote/speaker-diarization-3.1', use_auth_token=HF_TOKEN, device='cuda')

No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../home/qinghong/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu117. Bad things might happen unless you revert torch to 1.x.


In [3]:
# Path of the input video to analyse.
vid_url = 'input_clip.mp4'

# Number of consecutive speaker turns merged into one dialogue chunk;
# one key frame is sampled at the start of each chunk.
cut_clip = 4

# Character bank: character name -> reference photo of that character.
char_dir = 'char_bank'
char_bank = {
    name: f'{char_dir}/{photo}'
    for name, photo in (
        ('Mia', 'character1.jpg'),
        ('Sebastian', 'character2.jpg'),
    )
}

In [4]:
def prepare_dialogue(video_path):
    """Transcribe a video's audio and attach speaker labels to the transcript.

    Uses the module-level ``model``, ``model_a``/``metadata`` and
    ``diarize_model`` objects, in this order: load audio, transcribe,
    align, diarize, assign speakers.

    Args:
        video_path: Path handed to ``whisperx.load_audio``.

    Returns:
        The value returned by ``whisperx.assign_word_speakers`` for the
        aligned transcript. Callers in this notebook read its
        ``'segments'`` list.
    """
    waveform = whisperx.load_audio(video_path)
    transcript = model.transcribe(waveform, batch_size=32)
    aligned = whisperx.align(
        transcript["segments"],
        model_a,
        metadata,
        waveform,
        return_char_alignments=False,
        device='cuda',
    )

    # Diarization runs on the raw audio; its output is then merged into the
    # aligned transcript.
    speaker_turns = diarize_model(waveform)
    return whisperx.assign_word_speakers(speaker_turns, aligned)

def _group_speaker_turns(segments):
    """Merge consecutive same-speaker segments into "SPEAKER: text" turns.

    Segments without a ``'speaker'`` key are skipped. Returns a pair
    ``(turns, starts)`` of equal length, where ``starts[i]`` is the
    ``'start'`` value of the first segment in ``turns[i]``.
    """
    turns = []
    starts = []
    speaker = None
    texts = []

    for seg in segments:
        if 'speaker' not in seg:
            continue
        if speaker is None:
            # First labelled segment opens the first turn.
            speaker = seg['speaker']
            starts.append(seg['start'])
        elif seg['speaker'] != speaker:
            # Speaker changed: flush the finished turn and open a new one.
            turns.append(f"{speaker}: {' '.join(texts)}")
            texts = []
            speaker = seg['speaker']
            starts.append(seg['start'])
        texts.append(f"{seg['text']}")

    if texts:
        turns.append(f"{speaker}: {' '.join(texts)}")

    return turns, starts


def prepare_context(vid_url, cut_clip, output_dir='frames'):
    """Build dialogue chunks and matching key frames for a video.

    The video is transcribed and diarized via ``prepare_dialogue``,
    consecutive segments by the same speaker are merged into turns, and every
    ``cut_clip`` turns are joined into one dialogue chunk. One frame is
    grabbed at the start time of each chunk and written as a JPEG.

    Args:
        vid_url: Path of the video; passed to ``prepare_dialogue`` and
            ``VideoFileClip``.
        cut_clip: Number of speaker turns per dialogue chunk (must be >= 1).
        output_dir: Directory the key frames are written to. Created if
            missing. Defaults to ``'frames'``.

    Returns:
        ``(diag_list, frame_list)``: the dialogue chunk strings and the paths
        of the saved frames, index-aligned.

    Raises:
        ValueError: If ``cut_clip`` is smaller than 1.
    """
    # Validate before the expensive ASR/diarization pass.
    if cut_clip < 1:
        raise ValueError(f"cut_clip must be >= 1, got {cut_clip!r}")

    # Context -- Dialogue / Subtitles
    vid_asr_id = prepare_dialogue(vid_url)
    turns, starts = _group_speaker_turns(vid_asr_id['segments'])

    # Chunk the turns; each chunk's timestamp is the start of its first turn.
    diag_list = [
        " ".join(turns[i:i + cut_clip])
        for i in range(0, len(turns), cut_clip)
    ]
    time_list = starts[::cut_clip]

    os.makedirs(output_dir, exist_ok=True)

    # Video sampling: save one key frame per dialogue chunk.
    frame_list = []
    video = VideoFileClip(vid_url)
    try:
        for timestamp in time_list:
            frame = video.get_frame(timestamp)
            frame_path = os.path.join(output_dir, f"{timestamp:.1f}.jpg")
            imageio.imwrite(frame_path, frame)
            frame_list.append(frame_path)
    finally:
        # Release the underlying reader even if a frame grab or write fails.
        video.close()

    return diag_list, frame_list

In [5]:
# Instantiate the MovieSeq object used by the query cells below.
movieseq = MovieSeq()
# Transcribe/diarize the video and save one key frame per dialogue chunk.
diag_list, frame_list = prepare_context(vid_url, cut_clip)

Detected language: en (0.98) in first 30s of audio...


In [11]:
# Character identification: pass the character bank, key frames, dialogue
# chunks and the question to MovieSeq; the returned string is the cell output.
query = "Who is in this video?"
movieseq.get_response(char_bank, frame_list, diag_list, query)

'Based on the provided dialogues and the photos, the video features two characters: Mia and Sebastian. Mia is the character in the first photo you provided, and Sebastian is the character in the second photo. The dialogue involves a conversation between these two characters, with Mia being referred to as SPEAKER_00 and Sebastian as SPEAKER_01.'

In [13]:
# Visual question about a named character, using the same context as the
# other query cells (character bank, key frames, dialogue chunks).
query = "What colors did Sebastian wear?"
movieseq.get_response(char_bank, frame_list, diag_list, query)

'Sebastian is wearing a white shirt and a dark-colored jacket, which appears to be either black or navy blue.'

In [12]:
# Inference question about the relationship between the two characters,
# using the same context as the other query cells.
query = "Can you infer what feelings Sebastian has for Mia?"
movieseq.get_response(char_bank, frame_list, diag_list, query)

'Based on the dialogue provided, it seems that Sebastian\'s feelings towards Mia evolve over the course of their conversation. Initially, there is a sense of tension and perhaps irritation, as indicated by his admission of being "an asshole" and his defensive stance about being a "serious musician." However, as the conversation progresses, there is a shift in tone. Sebastian becomes more curious and somewhat playful, especially when he asks if he has seen Mia in anything and when he makes a light-hearted comment about her being a barista.\n\nBy the end of the conversation, Sebastian\'s tone appears to be more respectful and intrigued. His final line, "Guess I\'ll see you in the movies," suggests a level of acknowledgment and perhaps a budding interest in Mia. Overall, it seems that Sebastian\'s feelings transition from initial defensiveness to curiosity and a hint of admiration.'