https://huggingface.co/openai/whisper-large-v3

https://huggingface.co/docs/transformers/en/model_doc/whisper

https://stackoverflow.com/questions/73822353/how-can-i-get-word-level-timestamps-in-openais-whisper-asr

https://community.openai.com/t/how-to-get-whispers-api-to-add-timestamps-to-the-transcripts/501788/3 


Whisper

Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. 

Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning.

Whisper was proposed in the paper "Robust Speech Recognition via Large-Scale Weak Supervision" by Alec Radford et al. from OpenAI.
The original code repository can be found at https://github.com/openai/whisper

In [8]:
import os
import time
import glob
import pandas as pd

from soundfile import read

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

In [9]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision on GPU; full precision on CPU (the original used float16 in both branches).
# NOTE(review): the original comment suggested switching to "int8" when GPU memory is low;
# that presumably needs a quantization setup rather than a plain dtype change - confirm before use.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

In [10]:
model_id = "openai/whisper-large-v3"

# Keyword arguments for loading the checkpoint.
# Optional (UNIX only, after `pip install flash-attn --no-build-isolation`):
#   add "attn_implementation": "flash_attention_2" to this dict.
load_options = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": False,
    "use_safetensors": True,
}

# Load the speech seq2seq model and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_options)
model.to(device)

# Processor for the same checkpoint; its tokenizer and feature extractor feed the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

In [11]:
# Settings for the speech-recognition pipeline built from the loaded model and processor.
pipeline_options = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "max_new_tokens": 128,
    "chunk_length_s": 30,
    "batch_size": 32,  # reduce if low on GPU mem
    "return_timestamps": True,
    "torch_dtype": torch_dtype,
    "device": device,
}

pipe = pipeline("automatic-speech-recognition", **pipeline_options)

In [19]:
# Collect the WAV files to transcribe.
# glob returns matches in arbitrary order, so sort them to get a stable
# processing order (and stable "file N" numbering) across runs.
# Alternative relative location: "./intakes/*.wav"
wav_files = sorted(glob.glob("/Users/Willi/Documents/code/HUGGINGFACE_WISHPERER/AUDIO/*.wav"))
print(wav_files)

['/Users/Willi/Documents/code/HUGGINGFACE_WISHPERER/AUDIO/harvard.wav']


In [18]:
# Debug check: show the list of WAV paths that will be processed.
print(wav_files)

[]


In [20]:
import pandas as pd

def create_dataframe(res):
    """
    Build a table of timed text segments from a list of chunk dictionaries.

    Args:
        res: A list of dictionaries, each with a 'timestamp' entry (an indexable
            pair: start at position 0, end at position 1) and a 'text' entry.

    Returns:
        A Pandas DataFrame with the columns 'start_time', 'end_time' and 'text',
        one row per item of 'res', in the same order.
    """
    starts = []
    ends = []
    texts = []

    # Single pass over the chunks, splitting each timestamp pair into its two ends
    for chunk in res:
        stamp = chunk['timestamp']
        texts.append(chunk['text'])
        starts.append(stamp[0])
        ends.append(stamp[1])

    columns = {'start_time': starts, 'end_time': ends, 'text': texts}
    return pd.DataFrame(columns)

In [22]:
results = []  # Raw pipeline output of every processed file, in processing order

# Directory for the plain-text transcripts; created if it doesn't exist.
# NOTE: the directory names keep their original spelling so existing output locations stay valid.
text_transcripts = 'text_transtripts'
os.makedirs(text_transcripts, exist_ok=True)


# Directory for the per-chunk timestamp tables; created if it doesn't exist
data_transcripts = 'data_transtripts'
os.makedirs(data_transcripts, exist_ok=True)

for i, wav_file in enumerate(wav_files):

    # Wall-clock start of this file's transcription
    start_time = time.time()

    # File name without directory and extension; reused for both output files
    filename, _ = os.path.splitext(os.path.basename(wav_file))

    result = pipe(
        wav_file,
        # attn_implementation="eager",
        # return_timestamps="word",
        generate_kwargs={"language": "dutch"},
    )
    # Keep the raw output so it stays available after the loop
    results.append(result)

    # Show how long the transcription of the current audio file took
    processing_time = time.time() - start_time
    print(f"Processed file {i+1} ({wav_file}) in {processing_time:.2f} seconds")

    # Get the WAV duration (number of samples / sample rate) using soundfile
    audio_data, sample_rate = read(wav_file)
    duration = len(audio_data) / sample_rate
    print(f"  - Duration: {duration:.2f} seconds")

    # Retrieve the chunked data and store the times as whole seconds.
    # Missing timestamps become -1. Assigning by column name (instead of
    # writing back through .iloc) makes sure the integer dtype is kept.
    data = create_dataframe(result["chunks"])
    for column in ("start_time", "end_time"):
        data[column] = data[column].fillna(-1).astype(int)

    # Save the transcript under the audio file's name with a .txt extension.
    # Explicit UTF-8 so non-ASCII characters are written the same on every platform.
    transcript_path = os.path.join(text_transcripts, f"{filename}.txt")
    with open(transcript_path, "w", encoding="utf-8") as text_file:
        text_file.write(result["text"])

    # Save the DataFrame to an Excel file under the same base name
    data.to_excel(os.path.join(data_transcripts, f"{filename}.xlsx"))

You have passed language=dutch, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=dutch.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processed file 1 (/Users/Willi/Documents/code/HUGGINGFACE_WISHPERER/AUDIO/harvard.wav) in 36.71 seconds
  - Duration: 18.36 seconds
