# Demo of mWhisper-Flamingo

# Setup

In [None]:
# Sanity check: the environment should report numpy 1.22.0 and tensorboard 2.14.0
import numpy
print(numpy.__version__)
import tensorboard
print(tensorboard.__version__)

1.22.0
2.14.0


In [None]:
import sys
import os
import numpy as np
import torch
from scipy.io import wavfile
import whisper
from utils import add_noise



In [3]:
# verify that we are using the local whisper
print(whisper.__file__)

/data/sls/u/meng/roudi/whisper-flamingo/whisper/__init__.py


In [5]:
# download data and models
!wget https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/demo.tar.gz
!wget https://data.csail.mit.edu/public-release-sls/whisper-flamingo/noise.tar.gz
!wget https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/whisper_multi-all_small.pt

--2025-01-30 17:46:05--  https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/demo.tar.gz
Resolving data.csail.mit.edu (data.csail.mit.edu)... 128.52.131.233
Connecting to data.csail.mit.edu (data.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1277423 (1.2M) [application/x-gzip]
Saving to: ‘demo.tar.gz.1’


2025-01-30 17:46:05 (106 MB/s) - ‘demo.tar.gz.1’ saved [1277423/1277423]

--2025-01-30 17:46:06--  https://data.csail.mit.edu/public-release-sls/whisper-flamingo/noise.tar.gz
Resolving data.csail.mit.edu (data.csail.mit.edu)... 128.52.131.233
Connecting to data.csail.mit.edu (data.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 766132 (748K) [application/x-gzip]
Saving to: ‘noise.tar.gz’


2025-01-30 17:46:06 (69.8 MB/s) - ‘noise.tar.gz’ saved [766132/766132]

--2025-01-30 17:46:07--  https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/whispe

In [6]:
# Unpack the demo archive (the clips referenced later live under demo/).
!tar -xf demo.tar.gz
# Unpack the noise archive, then overwrite each test.tsv so that it holds the
# absolute path of its noise wav on this machine.
!tar -xf noise.tar.gz
!echo $(pwd)/noise/babble/muavic/babble_all.wav > ./noise/babble/muavic/test.tsv
!echo $(pwd)/noise/babble/lrs3/noise.wav > ./noise/babble/lrs3/test.tsv

# Process Video

In [4]:
from IPython.display import HTML
from base64 import b64encode
def play_video(video_path, width=200):
  """Return an inline HTML5 player for a local media file.

  The whole file is read and embedded in the markup as a base64 ``data:`` URL
  (always labelled ``video/mp4``), so nothing has to be served separately.

  Args:
    video_path: Path of the file to embed.
    width: Value written into the ``<video>`` tag's ``width`` attribute.

  Returns:
    The ``HTML`` display object wrapping the ``<video>`` markup.
  """
  # The context manager closes the handle as soon as the bytes are read,
  # instead of leaving an open file for the garbage collector.
  with open(video_path, 'rb') as media_file:
    encoded = b64encode(media_file.read()).decode()
  data_url = "data:video/mp4;base64," + encoded
  return HTML(f"""
  <video width={width} controls>
        <source src="{data_url}" type="video/mp4">
  </video>
  """)

# Play video

In [7]:
origin_clip_path = "demo/muavic_es_9VA26uZPqYA_0000.wav"
mouth_roi_path = "demo/muavic_es_9VA26uZPqYA_0000.mp4"

Original Audio Only:

In [8]:
play_video(origin_clip_path, width=300)

Video after face detection, normalization to the reference mean face, and cropping (used as input to AV-HuBERT):

In [9]:
play_video(mouth_roi_path, width=300)

# Add Babble Noise

In [10]:
import IPython
clean_input = whisper.load_audio(origin_clip_path)
print("Original input")
IPython.display.Audio(clean_input, rate=16000)

Original input


In [11]:
noise_fn = 'noise/babble/lrs3/noise.wav'
sample_rate, noise = wavfile.read(noise_fn)
print("Babble noise based on LRS3")
# Play back at the rate stored in the wav header instead of assuming 16 kHz.
IPython.display.Audio(noise, rate=sample_rate)

Babble noise based on LRS3


In [12]:
# Signal-to-noise ratio passed to add_noise: negative values make the noise
# stronger, positive values make it weaker. Other settings to try: 0, -2.5.
SNR = -5.0
# The waveform is multiplied by 32768.0 before add_noise and the result is divided
# by the same factor afterwards (presumably add_noise works on int16-range
# amplitudes - TODO confirm in utils).
noisy_input = add_noise(clean_input * 32768.0, [noise_fn], noise_snr=SNR).flatten().astype(np.float32) / 32768.0
print("Original input with babble noise added at {} SNR".format(SNR))
IPython.display.Audio(noisy_input, rate=16000)

Original input with babble noise added at -5.0 SNR


In [34]:
babble_multilingual = 'demo/merged_audio_equalized.wav'
sample_rate, noisy_multilingual = wavfile.read(babble_multilingual)
print("Multilingual babble noise for demo")
# Play back at the rate stored in the wav header instead of assuming 16 kHz.
IPython.display.Audio(noisy_multilingual, rate=sample_rate)

Multilingual babble noise for demo


# Transcribe audio with Whisper Small (original OpenAI weights)

In [13]:
import whisper
model = whisper.load_model("small", download_root='models/')

100%|███████████████████████████████████████| 461M/461M [05:08<00:00, 1.57MiB/s]


Whisper dropout rate : 0.0


In [14]:
def decode_audio(input, model, lang="en"):
    """Transcribe a waveform with an audio-only Whisper model (beam size 1).

    Args:
        input: Audio samples; passed through ``whisper.pad_or_trim`` before the
            log-mel features are computed.
        model: Whisper model; its ``device`` attribute decides where the
            features are placed.
        lang: Language code handed to the decoder.

    Returns:
        The ``text`` attribute of the decoding result.
    """
    use_fp16 = torch.cuda.is_available()  # half precision only when CUDA is present
    decode_opts = whisper.DecodingOptions(
        fp16=use_fp16,
        language=lang,
        beam_size=1,
        without_timestamps=True,
    )
    features = whisper.log_mel_spectrogram(whisper.pad_or_trim(input)).to(model.device)
    return whisper.decode(model, features, decode_opts).text

In [15]:
result = decode_audio(clean_input, model, lang ='es')
print("Transcribing original input : {}".format(result))

Transcribing original input : Tenía cinco años y empezaba a leer mi querido abuelo Carlos Felipe me regaló un libro de aventuras.


In [16]:
result = decode_audio(noisy_input, model, lang='es')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : y


In [38]:
result = decode_audio(noisy_multilingual, model, lang='es')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : y


### The multilingual babble noise has speakers in 4 other languages

In [39]:
result = decode_audio(noisy_multilingual, model, lang='en')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : The union shall be swallowed by the community, and the rules and the behavior of the community will be granted.


In [40]:
result = decode_audio(noisy_multilingual, model, lang='fr')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : Et donc, on va aussi pour le plan de l'église, pour les plus grandes.


In [41]:
result = decode_audio(noisy_multilingual, model, lang='it')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : E il suo regio in cibo è stato accortato in cibo, e non è stato accortato in cibo.


In [42]:
result = decode_audio(noisy_multilingual, model, lang='ru')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : И еще один чар, который сфальтажили, и он сфальтажил.


# Transcribe / Translate audio with Multilingual-Whisper Small (ours, fine-tuned on MuAViC)

In [20]:
# Start from the stock Whisper small model, then overwrite its weights with the fine-tuned checkpoint.
whisper_ft_model = whisper.load_model("small", download_root='models/')
# NOTE(security): torch.load unpickles the file; only open checkpoints from a trusted source.
state_dict = torch.load('models/whisper_multi-all_small.pt', map_location=torch.device('cpu'))
state_dict = state_dict['state_dict']
# Strip the leading 'model.' only where it is present, so a key saved without
# the prefix keeps its name instead of losing its first six characters.
state_dict_updated = {(k[len('model.'):] if k.startswith('model.') else k): v
                      for k, v in state_dict.items()}
whisper_ft_model.load_state_dict(state_dict_updated)

Whisper dropout rate : 0.0


<All keys matched successfully>

### Es (Spanish) Transcription
Note: our model does not capitalize text or add punctuation (apart from apostrophes) because of its text normalization.

In [21]:
result = decode_audio(clean_input, whisper_ft_model, lang='es')
print("Transcribing original input : {}".format(result))

Transcribing original input : yo tenía 5 años y empezaba a leer mi querido abuelo carlos felipe me regaló un libro de aventuras


In [22]:
result = decode_audio(noisy_input, whisper_ft_model, lang='es')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : y no solo el mundo sino la vida que tenemos para poder cambiar el mundo


In [43]:
result = decode_audio(noisy_multilingual, whisper_ft_model, lang='es')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : y de hecho es posible que la gente crezca y crezca en lo que pasa en el mundo


### The multilingual babble noise has speakers in 4 other languages

In [45]:
result = decode_audio(noisy_multilingual, whisper_ft_model, lang='en')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : and yet again we're not sure what that actually is but i think we need to be able to do something about it


In [46]:
result = decode_audio(noisy_multilingual, whisper_ft_model, lang='fr')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : et pourtant les sorties de la société ont fait un effet très fort et très fort


In [48]:
result = decode_audio(noisy_multilingual, whisper_ft_model, lang='it')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : e i ragazzi sono stati dei ragazzi che hanno fatto un esempio di un'idea di un'idea di cose


In [49]:
result = decode_audio(noisy_multilingual, whisper_ft_model, lang='ru')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : и ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз ещё раз


# Multilingual Whisper-Flamingo (Audio-Visual)

In [None]:
# Download video models
!wget https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/whisper-flamingo_multi-all_small.pt
!wget https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/mavhubert_only_weights.pt

--2025-01-30 17:56:27--  https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/whisper-flamingo_multi-all_small.pt
Resolving data.csail.mit.edu (data.csail.mit.edu)... 128.52.131.233
Connecting to data.csail.mit.edu (data.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2611373899 (2.4G)
Saving to: ‘whisper-flamingo_multi-all_small.pt’


2025-01-30 17:59:38 (13.0 MB/s) - ‘whisper-flamingo_multi-all_small.pt’ saved [2611373899/2611373899]

--2025-01-30 17:59:40--  https://data.csail.mit.edu/public-release-sls/mwhisper-flamingo/models/mavhubert_only_weights.pt
Resolving data.csail.mit.edu (data.csail.mit.edu)... 128.52.131.233
Connecting to data.csail.mit.edu (data.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 

In [None]:
# Settings for the audio-visual small model; all five values are passed to load_model.
model_type = 'small'  # Whisper model name
checkpoint= 'models/whisper-flamingo_multi-all_small.pt'  # fine-tuned weights to load on top
use_av_hubert_encoder = 1  # 1 -> load_model casts the video modules to fp16 when CUDA is available
av_fusion = 'separate'  # 'separate' -> load_model enables video input and gated cross attention
video_model_path = 'models/mavhubert_only_weights.pt'  # forwarded to whisper.load_model as video_model_path

In [60]:
def load_model(model_type, checkpoint, use_av_hubert_encoder, av_fusion, video_model_path):
    """Build a Whisper model with the video branch and optionally load a checkpoint.

    Args:
        model_type: Model name handed to ``whisper.load_model`` (e.g. 'small', 'medium').
        checkpoint: Path to a file whose ``'state_dict'`` entry holds the weights,
            or ``None`` to keep the weights ``whisper.load_model`` returned.
        use_av_hubert_encoder: Forwarded as ``av_hubert_encoder``. When it equals 1
            and CUDA is available, the video modules are cast to half precision.
        av_fusion: Fusion mode; ``'separate'`` turns on the video input and the
            gated cross-attention layers.
        video_model_path: Forwarded to ``whisper.load_model`` as ``video_model_path``.

    Returns:
        The model object created by ``whisper.load_model``.
    """
    uses_separate_fusion = av_fusion == 'separate'

    print("Loading Whisper")
    whisper_model = whisper.load_model(model_type,
                                    download_root='models',
                                    video=True if uses_separate_fusion else 0,
                                    video_model_path=video_model_path,
                                    av_hubert_encoder=use_av_hubert_encoder,
                                    av_fusion=av_fusion,
                                    add_gated_x_attn=1 if uses_separate_fusion else 0)

    if checkpoint is not None:
        print("Loading checkpoint")
        # NOTE(security): torch.load unpickles the file; only open checkpoints
        # from a trusted source.
        state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))
        print(state_dict.keys())
        state_dict = state_dict['state_dict']
        # Strip the leading 'model.' only where it is present, so keys saved
        # without the prefix are kept intact instead of losing six characters.
        prefix = 'model.'
        state_dict_updated = {(k[len(prefix):] if k.startswith(prefix) else k): v
                              for k, v in state_dict.items()}
        try: # newer models have learnable scaler init 1
            whisper_model.load_state_dict(state_dict_updated)
        except RuntimeError as e:
            # load_state_dict reports missing/unexpected keys as RuntimeError.
            # Anything else (e.g. KeyboardInterrupt) is left to propagate.
            print(str(e))
            print("Loading weights with strict=False")
            whisper_model.load_state_dict(state_dict_updated, strict=False)

    if torch.cuda.is_available() and use_av_hubert_encoder == 1:
        # NOTE(review): .half() converts an nn.Module in place but returns a copy
        # for a bare Parameter/Tensor - confirm video_projection_scalar is a Module.
        whisper_model.encoder.video_projection_scalar.half()
        whisper_model.encoder.video_model.half()
        if uses_separate_fusion:
            # Known sizes keep their original layer counts; any other model type
            # falls back to the number of decoder blocks instead of a KeyError.
            model_to_num_layers = {'small': 12, 'medium': 24, 'large-v2': 32}
            num_layers = model_to_num_layers.get(model_type)
            if num_layers is None:
                num_layers = len(whisper_model.decoder.blocks)
            for i in range(num_layers):
                block = whisper_model.decoder.blocks[i]
                block.attn_gate.data = block.attn_gate.half()
                block.ff_gate.data = block.ff_gate.half()
    return whisper_model


In [None]:
whisper_flamingo_multi_small = load_model(model_type, checkpoint, use_av_hubert_encoder, av_fusion, video_model_path)

Loading Whisper
Whisper dropout rate : 0.0
Loading AV-HuBERT encoder


2025-01-31 10:18:59 | INFO | avhubert.hubert_pretraining | current directory is /data/sls/u/meng/roudi/whisper-flamingo
2025-01-31 10:18:59 | INFO | avhubert.hubert_pretraining | AVHubertPretrainingTask Config {'_name': 'av_hubert_pretraining', 'data': '/mnt/ssd3/jh/Exp/iclr24/pretraining/', 'input_modality': '???', 'labels': ['unit'], 'label_dir': '/mnt/ssd3/jh/Exp/iclr24/pretraining/', 'label_rate': 25, 'sample_rate': 25, 'normalize': True, 'enable_padding': False, 'max_sample_size': 2000, 'min_sample_size': 5, 'max_trim_sample_size': 400, 'single_target': False, 'random_crop': True, 'pad_audio': False, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['audio', 'video'], 'is_s2s': False, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': False, 'add_eos': False}
2025-01-31 10:18:59 | IN

Using AV-HuBERT encoder with parameters: 325142504
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Loading checkpoint
dict_keys(['state_dict'])


In [68]:
from utils import load_video_feats
def decode_audio_video(audio, video_path, model, lang="en", beam_size=1):
    """Transcribe a waveform together with its mouth-region video.

    Args:
        audio: Audio samples handed to ``whisper.log_mel_spectrogram``.
        video_path: Path given to ``load_video_feats`` (called with ``train=False``).
        model: Audio-visual model; it is switched to eval mode and its ``decode``
            method receives the audio features, the options and the video tensor.
        lang: Language code handed to the decoder.
        beam_size: Beam size handed to the decoder.

    Returns:
        The ``text`` attribute of the decoding result.
    """
    cuda_ok = torch.cuda.is_available()

    # Note: we don't pad the audio to 30s
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    frames = torch.tensor(load_video_feats(video_path, train=False).astype(np.float32))
    # add a batch axis, then [B, T, H, W, C] -> [B, C, T, H, W]
    frames = frames.unsqueeze(0).permute((0, 4, 1, 2, 3)).contiguous()
    if cuda_ok:
        # fp16 on the model's device; without CUDA the tensor is left untouched
        frames = frames.half().to(model.device)

    model.eval() # AV-HuBERT batch norm and dropout
    decode_opts = whisper.DecodingOptions(fp16=cuda_ok, language=lang,
                                          without_timestamps=True, beam_size=beam_size)
    return model.decode(mel, decode_opts, frames).text

### Es (Spanish) Transcription

In [None]:
result = decode_audio_video(clean_input, mouth_roi_path, whisper_flamingo_multi_small, lang='es')
print("Transcribing original input : {}".format(result))

Transcribing original input : cuando tenía cinco años y empezaba a leer mi querido abuelo carlos felipe me regaló un libro de aventuras


In [None]:
result = decode_audio_video(noisy_input, mouth_roi_path, whisper_flamingo_multi_small, lang='es')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : cuando tenía cinco años empecé a hablar me dijeron bueno que los filipinas no tuvieran nada con un libro de aventuras


In [None]:
result = decode_audio_video(noisy_multilingual, mouth_roi_path, whisper_flamingo_multi_small, lang='es')
print("Transcribing noisy input : {}".format(result))

  audio = torch.from_numpy(audio)


Transcribing noisy input : y torciendo a noche pensaba en mi querido abuelo que no felizmente me regaló un libro de aventuras


### The multilingual babble noise has speakers in 4 other languages

In [None]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_en_aoGJP02CtPA_00002.mp4", whisper_flamingo_multi_small, lang='en')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : if any child is found violating these rules it is severely represented


In [None]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_fr_0u7tTptBo9I_0081.mp4", whisper_flamingo_multi_small, lang='fr')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : et pourtant les scientifiques de la société pratiquent un effet de repas génétique à 12 000 km


In [None]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_it_AILi62xo7j8_0033.mp4", whisper_flamingo_multi_small, lang='it')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : e perciò con qualche importante rumore non si è deciso altre cose


In [None]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_ru_dd5R6wgQlp8_0103.mp4", whisper_flamingo_multi_small, lang='ru')
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : и несмотря на способность развязанных жизни это поверьте не принёс


# mWhisper-Flamingo Medium

In [73]:
# Switch to the medium model. use_av_hubert_encoder, av_fusion and
# video_model_path keep the values assigned for the small model.
# NOTE(review): no visible cell downloads
# models/whisper-flamingo_multi-all_medium.pt - it must already be in place
# (TODO: add the download).
model_type = 'medium'
checkpoint= 'models/whisper-flamingo_multi-all_medium.pt'
whisper_flamingo_multi_medium = load_model(model_type, checkpoint, use_av_hubert_encoder, av_fusion, video_model_path)

Loading Whisper
Whisper dropout rate : 0.0
Loading AV-HuBERT encoder


2025-01-31 10:23:33 | INFO | avhubert.hubert_pretraining | current directory is /data/sls/u/meng/roudi/whisper-flamingo
2025-01-31 10:23:33 | INFO | avhubert.hubert_pretraining | AVHubertPretrainingTask Config {'_name': 'av_hubert_pretraining', 'data': '/mnt/ssd3/jh/Exp/iclr24/pretraining/', 'input_modality': '???', 'labels': ['unit'], 'label_dir': '/mnt/ssd3/jh/Exp/iclr24/pretraining/', 'label_rate': 25, 'sample_rate': 25, 'normalize': True, 'enable_padding': False, 'max_sample_size': 2000, 'min_sample_size': 5, 'max_trim_sample_size': 400, 'single_target': False, 'random_crop': True, 'pad_audio': False, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['audio', 'video'], 'is_s2s': False, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': False, 'add_eos': False}
2025-01-31 10:23:33 | IN

Using AV-HuBERT encoder with parameters: 325142504
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Adding gated x attn layers
Loading checkpoint
dict_keys(['state_dict'])


In [74]:
result = decode_audio_video(noisy_multilingual, mouth_roi_path, whisper_flamingo_multi_medium, lang='es', beam_size=5)
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : y yo tenía 5 años empecé a hablar y me dijeron abuelo a qué no felipe me marcaron con un libro de aventuras


In [75]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_en_aoGJP02CtPA_00002.mp4", whisper_flamingo_multi_medium, lang='en', beam_size=5)
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : if any child is found violent in the schools it is severely reprimanded


In [76]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_fr_0u7tTptBo9I_0081.mp4", whisper_flamingo_multi_medium, lang='fr', beam_size=5)
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : et pourtant les scientifiques des nations unies pratiquent un effrénement des pêcheurs à l'étranger


In [77]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_it_AILi62xo7j8_0033.mp4", whisper_flamingo_multi_medium, lang='it', beam_size=5)
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : e perché le donne fanno con qualche importante e luminosa decisione altre cose


In [82]:
result = decode_audio_video(noisy_multilingual, "demo/muavic_ru_dd5R6wgQlp8_0103.mp4", whisper_flamingo_multi_medium, lang='ru', beam_size=5)
print("Transcribing noisy input : {}".format(result))

Transcribing noisy input : и единственное способ начиная с вас на жизнь это поверить и прийти к природе
