In [3]:
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
    pipeline,
)
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain import HuggingFacePipeline
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory
from langchain.prompts.prompt import PromptTemplate
from threading import Thread
import time
import torch
import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio, Video
from Wav2Lip import inference as wav2lip

In [2]:
# Model loading: the chat LLM plus the SpeechT5 text-to-speech stack
# (processor, acoustic model and HiFi-GAN vocoder).
# IMPORTANT: restart the kernel (or delete the existing model variable) before
# re-running this cell. The previously loaded model still occupies VRAM, so
# transformers may start placing the new weights in CPU RAM instead.
# This cell should not run for more than about 30 s.

modelPath = "/home/models/vicuna/13B-1.3"
tokenizer = LlamaTokenizer.from_pretrained(modelPath)
llmModel = AutoModelForCausalLM.from_pretrained(
    modelPath,
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.float16,
)
print("Loaded LLM model.")

speechModelPath = "/home/models/speecht5_tts/"
ganPath = "/home/models/speecht5_hifigan/"
processor = SpeechT5Processor.from_pretrained(speechModelPath)
speechModel = SpeechT5ForTextToSpeech.from_pretrained(speechModelPath)
vocoder = SpeechT5HifiGan.from_pretrained(ganPath)
print("Loaded text to speech model.")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loaded LLM model.
Loaded text to speech model.


In [3]:
# Wrap the loaded LLM and tokenizer in a Hugging Face text-generation pipeline
# so that LangChain can drive it.
llmPipeline = pipeline(
    "text-generation",
    model=llmModel,
    temperature=0,
    max_new_tokens=512,
    tokenizer=tokenizer,
)

# Prompt layout: a short system preamble, the running conversation supplied
# through {history}, then the new user turn marked with USER:/ASSISTANT:.
template = """The following is a friendly conversation between a human and an AI. The AI is helpful, detailed, but also concise.

Current conversation:
{history}
USER: {input}
ASSISTANT:"""
prompt = PromptTemplate(input_variables=["history", "input"], template=template)
llm = HuggingFacePipeline(pipeline=llmPipeline)

# The memory renders earlier turns into {history}. Both role prefixes are set
# so the rendered history uses the same USER:/ASSISTANT: markers as the
# template above; with only ai_prefix overridden, human turns would be written
# with the library's default human prefix and the prompt would mix two formats.
# NOTE(review): `k` is the window-size option of ConversationBufferWindowMemory;
# ConversationSummaryBufferMemory is normally bounded via `max_token_limit`.
# Confirm that `k=3` has the intended effect here.
llmChain = ConversationChain(
    llm=llm,
    prompt=prompt,
    verbose=False,
    memory=ConversationSummaryBufferMemory(
        llm=llm,
        k=3,
        human_prefix="USER",
        ai_prefix="ASSISTANT",
    ),
)

In [16]:
# Speaker-embedding files (.npy) keyed by a three-letter speaker code.
# The male/female notes describe the voice each entry selects.
speaker_embeddings = {
    # male
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    # female
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    # male
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    # male
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    # female
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

def __textToSpeech(text, speaker):
    """Synthesize one piece of text into int16 audio samples.

    Args:
        text: Text to speak. Empty or whitespace-only text produces no audio.
        speaker: Speaker name; its first three characters select the entry
            in ``speaker_embeddings``.

    Returns:
        A NumPy ``int16`` array of samples (zero-length for blank text).
    """
    if not text.strip():
        return np.zeros(0).astype(np.int16)

    # Tokenize, then truncate to the longest input the TTS model is configured for.
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    token_ids = token_ids[..., :speechModel.config.max_text_positions]

    # Load the speaker vector from disk and add a leading dimension.
    embedding_path = speaker_embeddings[speaker[:3]]
    voice = torch.tensor(np.load(embedding_path)).unsqueeze(0)

    waveform = speechModel.generate_speech(token_ids, voice, vocoder=vocoder)
    # Scale by 32767 and cast to int16 (presumably float samples in [-1, 1] -- TODO confirm).
    return (waveform.numpy() * 32767).astype(np.int16)

def textToSpeech(text:str, speaker="RMS"):
    """Synthesize multi-line text, one newline-separated piece at a time.

    Args:
        text: Text to speak; it is split on ``"\\n"`` and each piece is
            synthesized separately.
        speaker: Speaker name forwarded to the per-piece synthesizer.

    Returns:
        The per-piece ``int16`` sample arrays concatenated in order.
    """
    clips = [__textToSpeech(line, speaker) for line in text.split("\n")]
    return np.concatenate(clips)

In [4]:
# Stand-alone lip-sync run: drives the source video with the audio in
# temp/tts.wav and displays the rendered clip.
# NOTE(review): temp/tts.wav must already exist when this cell runs; presumably
# it comes from an earlier queryLLMWithVoice/queryLLMWithVideo call -- confirm.
from Wav2Lip import inference as wav2lip

wav2lip.main(
    video_path="videos/kennedy1min.mp4",
    audio_path="temp/tts.wav",
    out_path="result.mp4",
    checkpoint_path="checkpoints/wav2lip_gan.pth",
    batch_size=32,
    is_static=False,
)
Video("result.mp4")

Reading video frames...
Number of frames available for inference: 2398
(80, 3759)
Length of mel chunks: 1404


100%|██████████| 44/44 [00:25<00:00,  1.69it/s]


Load checkpoint from: checkpoints/wav2lip_gan.pth


  2%|▏         | 1/44 [00:26<19:12, 26.79s/it]

Model loaded


100%|██████████| 44/44 [00:30<00:00,  1.46it/s]


In [17]:
def queryLLMWithVoice(input):
    """Ask the conversation chain a question and return the reply as audio.

    The reply text is printed, synthesized with ``textToSpeech`` and written
    to ``temp/tts.wav``.

    Args:
        input: The user message passed to ``llmChain.predict``.

    Returns:
        An IPython ``Audio`` object built from the synthesized samples.
    """
    sample_rate = 16000  # rate used for both the WAV file and the player

    reply = llmChain.predict(input=input)
    print(reply)

    waveform = textToSpeech(reply)
    write('temp/tts.wav', sample_rate, waveform)
    return Audio(waveform, rate=sample_rate)

def queryLLMWithVideo(input):
    """Ask the conversation chain a question and return a lip-synced video reply.

    The reply text is printed, synthesized with ``textToSpeech`` and written
    to ``temp/tts.wav``; Wav2Lip then renders ``result.mp4`` from the source
    video and that audio file.

    Args:
        input: The user message passed to ``llmChain.predict``.

    Returns:
        An IPython ``Video`` object pointing at ``result.mp4``.
    """
    text = llmChain.predict(input=input)
    print(text)
    speech = textToSpeech(text)
    write('temp/tts.wav', 16000, speech)
    # Use the same keyword arguments as the stand-alone wav2lip.main call in
    # this notebook (explicit checkpoint_path, is_static rather than static),
    # so both call sites agree on one signature.
    wav2lip.main(
        video_path="videos/kennedy1min.mp4",
        audio_path="temp/tts.wav",
        out_path="result.mp4",
        checkpoint_path="checkpoints/wav2lip_gan.pth",
        batch_size=32,
        is_static=False,
    )
    return Video("result.mp4")
    

In [15]:
# Example run: ask a role-play question; the returned Video object is the cell's displayed result.
queryLLMWithVideo("You are Kennedy. What do you think of Donald Trump as a person?")

  As President John F. Kennedy, I do not believe it is appropriate for me to comment on the personal characteristics of individual politicians. It is important for me to focus on the issues and policies that are important to our country and to work towards finding constructive solutions to the challenges we face.

I believe that it is important for our country to come together and to work towards finding common ground, regardless of our political differences. I also believe in the importance of upholding the values and principles that have made our country great, including the rule of law, respect for human rights, and a commitment to democracy and freedom.

Overall, I believe that it is important for us to focus on the issues that matter and to work towards finding constructive solutions to the challenges we face as a nation.
Reading video frames...
Number of frames available for inference: 2398
(80, 3759)
Length of mel chunks: 1404
Load checkpoint from: checkpoints/wav2lip_gan.pth
mo

  0%|          | 0/44 [00:00<?, ?it/s]

beginning face detection


100%|██████████| 44/44 [00:32<00:00,  1.36it/s]
100%|██████████| 44/44 [00:37<00:00,  1.18it/s]
