In [None]:
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available


def transcribe_v2(
    audio_path="videos/index_arb.mp3",
    model_name="openai/whisper-large-v3-turbo",
    device="cuda:0",
    chunk_length_s=30,
    batch_size=24,
):
    """Transcribe an audio file with a Hugging Face Whisper ASR pipeline.

    Args:
        audio_path: Audio file handed to the pipeline.
        model_name: Checkpoint to load; select one from
            https://huggingface.co/openai/whisper-large-v3#model-details
        device: Device string passed to ``pipeline`` ("cuda:0", or "mps" for
            Mac devices).
        chunk_length_s: Chunk length in seconds forwarded to the pipeline call.
        batch_size: Batch size forwarded to the pipeline call.

    Returns:
        Whatever the pipeline call returns. With ``return_timestamps=True``
        the result has ``text`` and ``chunks``; ``chunks`` is a list of items,
        each with ``text`` and ``timestamp``, e.g.
        ``{'timestamp': (1188.0, 1193.0), 'text': '因为当然就是虽然是我们的book size很小'}``.
    """
    # Use FlashAttention-2 only when transformers reports it as available;
    # otherwise fall back to the "sdpa" attention implementation.
    attn_implementation = (
        "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
    )
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        torch_dtype=torch.float16,
        device=device,
        model_kwargs={"attn_implementation": attn_implementation},
    )
    # Return the result: the previous version built it and then dropped it,
    # so callers always received None.
    return pipe(
        audio_path,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        return_timestamps=True,
    )

Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [8]:
from main import split_transcript_into_clips, get_transcript_from_audio

# Relative path of the audio track to transcribe.
audio_file = "videos/index_arb.mp3"
# get_transcript_from_audio is a project helper imported from main; the type
# of its return value is not visible from this notebook.
transcript = get_transcript_from_audio(audio_file)

  checkpoint = torch.load(fp, map_location=device)


In [9]:
# Segment the transcript into clips with the project helper imported from main.
# The repr displayed by the following cell shows the result as
# SegmentedClips(clips=[RawClip(start_ts=..., end_ts=..., summary=...), ...]).
clips = split_transcript_into_clips(transcript)

In [10]:
# Bare expression as the last line of the cell, so Jupyter displays its repr.
clips

SegmentedClips(clips=[RawClip(start_ts=60.28, end_ts=91.92, summary='This clip introduces the concept of Index Arbitrage, explaining that it is a market-neutral trading strategy aimed at profiting from the price discrepancies between index futures and the fair value of the underlying index.'), RawClip(start_ts=177.3, end_ts=203.92, summary='In this segment, the speaker discusses the theoretical aspects of Index Arbitrage with a practical example using the S&P 500 index and how to identify overpriced and underpriced assets to generate profit.'), RawClip(start_ts=346.14, end_ts=371.82, summary='This clip covers practical concerns in Index Arbitrage such as transaction costs, borrow costs, and market impact, highlighting how these factors can affect profitability.'), RawClip(start_ts=681.0, end_ts=694.5, summary='The speaker explains the operational complexities and requirements associated with executing Index Arbitrage strategies and discusses how banks and hedge funds have different app

In [12]:
# NOTE(review): the recorded output of this cell is
# "ImportError: cannot import name 'edit_clips' from 'main'", i.e. main.py did
# not expose edit_clips when this was last run — confirm whether the helper
# was renamed, removed, or not yet written.
from main import edit_clips

# Video file handed to edit_clips together with the clips computed earlier.
# NOTE(review): presumably the video counterpart of videos/index_arb.mp3 — confirm.
video_path = "videos/index_arb.mov"
edit_clips(video_path, clips)

ImportError: cannot import name 'edit_clips' from 'main' (/home/ruizeli/dev/clipit/main.py)

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here and
# does not continue into the experimental cells below — confirm intent.
raise NotImplementedError

In [None]:
import whisper_timestamped as whisper
import json
import torch

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the "large-v3" model onto the chosen device, then the audio to decode.
model = whisper.load_model("large-v3", device=device)
audio = whisper.load_audio("videos/index_arb.mp3")

result = whisper.transcribe(model, audio)

# Pretty-print the whole result; ensure_ascii=False keeps non-ASCII
# characters as-is instead of \u escapes.
pretty_result = json.dumps(result, indent=2, ensure_ascii=False)
print(pretty_result)

In [None]:
# NOTE(review): this rebinds the name `whisper`, which an earlier cell bound
# to whisper_timestamped (`import whisper_timestamped as whisper`). Cells run
# after this one see this module under that name instead.
import whisper

# Show what whisper.available_models() reports.
whisper.available_models()

In [None]:
# Load the "large-v3-turbo" model on CUDA and transcribe with word-level timestamps.
model = whisper.load_model("large-v3-turbo", device="cuda")
transcript = model.transcribe(audio="videos/index_arb.mp3", word_timestamps=True)

# One printed line per segment: every word followed by its [start/end] times.
for segment in transcript["segments"]:
    timed_words = [
        f"{word['word']}[{word['start']}/{word['end']}]" for word in segment["words"]
    ]
    print("".join(timed_words))

In [None]:
# Inspect the top-level keys of the transcription result; other cells in this
# notebook read "text" and "segments" from it.
transcript.keys()

In [18]:
from pathlib import Path

# Write the transcription to a text file in this format:
#   <full transcript text>
#   (blank line)
#   # Timestamps:
#   [<start>s -> <end>s] <segment text>      (one line per segment)
outfile = "output/turbo_transcription.txt"

# Create the output directory first: open(..., "w") raises FileNotFoundError
# when the parent directory does not exist.
Path(outfile).parent.mkdir(parents=True, exist_ok=True)

with open(outfile, "w", encoding="utf-8") as f:
    # Save the text
    f.write(transcript["text"])
    f.write("\n\n# Timestamps:\n")
    # Save the segments with their timestamps
    for chunk in transcript["segments"]:
        f.write(f"[{chunk['start']}s -> {chunk['end']}s] {chunk['text']}\n")