<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/audio/whisperat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install whisper-at

In [None]:
# download a sample audio
!pip -q install wget
import wget,IPython
wget.download('https://www.dropbox.com/s/7eznyazmc1pmw9h/case_closed.wav?dl=1', '/content/sample_audio.flac')
IPython.display.Audio('/content/sample_audio.flac')

In [None]:
# The importable module is whisper_at (underscore); the pip package is whisper-at (hyphen).
import whisper_at as whisper

# The only new setting in Whisper-AT: the temporal resolution for audio tagging.
# 10 means an audio event prediction every 10 seconds (hop and window = 10s).
audio_tagging_time_resolution = 10

# For the large, medium and small models, low-dim projection AT models are
# provided to save compute:
# model = whisper.load_model("large-v1", at_low_compute=True)
model = whisper.load_model("large-v1")

result = model.transcribe(
    "/content/sample_audio.flac",
    at_time_res=audio_tagging_time_resolution,
)

# Print each ASR segment with its start/end time.
for segment in result['segments']:
    print(segment['start'], 's-', segment['end'], 's', segment['text'])

In [None]:
# The translation task is supported as well; audio tagging is requested in the same call.
result = model.transcribe(
    "/content/sample_audio.flac",
    task='translate',
    at_time_res=audio_tagging_time_resolution,
)
print(result["text"])

Get the Audio Tagging Results

In [None]:
import torchaudio

# Audio length, reused by a later cell when reporting the tagging output shape.
# Assumes torchaudio returns a (channels, samples) tensor — TODO confirm.
audio, sr = torchaudio.load('/content/sample_audio.flac')
audio_len = audio.shape[1] / sr

# Turn the audio-tagging output of the latest transcribe() call into per-window tag entries.
audio_tag_result = whisper.parse_at_label(
    result,
    language='follow_asr',
    top_k=5,
    p_threshold=-2,
    include_class_list=list(range(527)),
)
for segment in audio_tag_result:
    print("time: ", segment['time'], 'Audio Tag Dict: ', segment['audio tags'])

In [None]:
# Repeat the transcription with a finer audio-tagging resolution (2 instead of 10).
audio_tagging_time_resolution = 2
result = model.transcribe(
    "/content/sample_audio.flac",
    at_time_res=audio_tagging_time_resolution,
)
print(
    'Audio length is {:.2f}, at time resolution is {:.1f}, Whisper-AT output in shape'.format(
        audio_len, audio_tagging_time_resolution
    ),
    result['audio_tag'].shape,
)

audio_tag_result = whisper.parse_at_label(
    result,
    language='follow_asr',
    top_k=5,
    p_threshold=-2,
    include_class_list=list(range(527)),
)
for segment in audio_tag_result:
    print(segment)


In [None]:
# Re-parse the same result with language='ja' instead of 'follow_asr'.
audio_tag_result = whisper.parse_at_label(
    result,
    language='ja',
    top_k=5,
    p_threshold=-2,
    include_class_list=list(range(527)),
)
for segment in audio_tag_result:
    print(segment)

Transcribe video

In [11]:
from IPython.display import HTML
from base64 import b64encode
# Fetch the sample video; swap in a different URL to try your own video.
wget.download(
    'https://www.dropbox.com/s/pzc72c59xtluuc0/case_closed.mp4?dl=1',
    '/content/sample_video.mp4',
)

'/content/sample_video.mp4'

In [None]:
!pip install -q ffmpeg-python
import os,ffmpeg,cv2

def dubbing_video(video_path, out_video_path, text_info, font_size=0.5, font_v_pos=0.95, font_color=(0, 0, 255)):
    """Burn timed text captions into a video while keeping its audio track.

    The audio is first extracted to a temporary file, every frame is then
    re-written with the captions that are active at that frame's timestamp,
    and finally the captioned frames and the audio are merged into
    ``out_video_path``. Temporary files are removed even if a step fails.

    Args:
        video_path: Path of the input video.
        out_video_path: Path of the captioned output video.
        text_info: Iterable of ``(start, end, text)`` tuples. ``start`` and
            ``end`` are compared with the frame time in seconds
            (frame index / fps); a caption is drawn while
            ``start <= time < end``.
        font_size: Font scale passed to ``cv2.putText``.
        font_v_pos: Vertical text position as a fraction of the frame height.
        font_color: Colour tuple passed to ``cv2.putText``.
    """
    temp_audio_path = './temp_audio.wav'
    temp_video_path = './temp_video.mp4'

    # Materialise once: the captions are scanned again for every frame, so a
    # one-shot iterator would otherwise be exhausted after the first frame.
    captions = list(text_info)

    try:
        extract_audio(video_path, temp_audio_path)

        video = cv2.VideoCapture(video_path)
        output_video = None
        try:
            # Get video properties
            frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = video.get(cv2.CAP_PROP_FPS)

            # Create output video writer
            output_video = cv2.VideoWriter(temp_video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps,
                                           (frame_width, frame_height))

            # Drawing parameters do not change between frames, so compute them once.
            text_position = (0, int(frame_height * font_v_pos))
            font = cv2.FONT_HERSHEY_SIMPLEX
            thickness = 1

            # Process each frame of the video
            current_frame = 0
            while video.isOpened():
                ret, frame = video.read()
                if not ret:
                    break

                # Calculate current time in seconds
                current_time = current_frame / fps

                # Half-open interval: on a frame that falls exactly on the
                # boundary between two back-to-back captions only the later
                # one is drawn, instead of both on top of each other.
                for text_start, text_end, text in captions:
                    if text_start <= current_time < text_end:
                        cv2.putText(frame, text, text_position, font, font_size, font_color,
                                    thickness, cv2.LINE_AA)

                # Write the frame to the output video
                output_video.write(frame)
                current_frame += 1
        finally:
            # Release video resources even if reading or writing failed.
            video.release()
            if output_video is not None:
                output_video.release()

        combine_audio_video(temp_video_path, temp_audio_path, out_video_path)
    finally:
        # Clean up whichever temporary files were actually created.
        for temp_path in (temp_video_path, temp_audio_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

def combine_audio_video(video_path, audio_path, output_path):
    """Merge a video file and an audio file into ``output_path`` via ffmpeg.

    An existing file at ``output_path`` is overwritten.
    """
    streams = [ffmpeg.input(video_path), ffmpeg.input(audio_path)]
    ffmpeg.output(*streams, output_path).overwrite_output().run()

def extract_audio(video_path, output_path):
    """Write the audio stream of ``video_path`` to ``output_path`` via ffmpeg.

    An existing file at ``output_path`` is overwritten.
    """
    audio_stream = ffmpeg.input(video_path).audio
    job = ffmpeg.output(audio_stream, output_path).overwrite_output()
    job.run()

# Transcribe (and audio-tag) the audio track of the sample video.
extract_audio('/content/sample_video.mp4', '/content/sample_audio_from_video.wav')
result = model.transcribe("/content/sample_audio_from_video.wav", at_time_res=audio_tagging_time_resolution)

# ASR Output, as (start, end, text) captions
text_segments = result['segments']
text_annotation = [(x['start'], x['end'], x['text']) for x in text_segments]

# Audio Tagging Output
audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-2, include_class_list=list(range(527)))

# One caption per tagging window: the first element of every tag entry
# (presumably the label — confirm against parse_at_label), joined with '; '.
all_seg = [
    (
        segment['time']['start'],
        segment['time']['end'],
        '; '.join(tag[0] for tag in segment['audio tags']),
    )
    for segment in audio_tag_result
]

# First pass overlays the audio tags, second pass overlays the transcript slightly higher up.
dubbing_video('/content/sample_video.mp4', '/content/sample_video_at.mp4', all_seg)
dubbing_video('/content/sample_video_at.mp4', '/content/sample_video_at_text.mp4', text_annotation, font_color=(0,255,0), font_v_pos=0.85)
# Note: this removes the downloaded source video as well as the intermediate file,
# so the download cell has to be re-run before this cell can be run again.
os.remove('/content/sample_video.mp4')
os.remove('/content/sample_video_at.mp4')

# Read the final video through a context manager so the file handle is closed promptly.
with open('/content/sample_video_at_text.mp4', 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

In [None]:
# Render an inline HTML5 video player whose source is the base64 data URL
# (`data_url`) built from the captioned video in the previous cell.
HTML(f"""
<video width="100%" height="100%" controls>
      <source src="{data_url}" type="video/mp4">
</video>""")