In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
model_id = "openai/whisper-small"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import os
import re
def get_all_files(folder_path):
    all_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            full_path = os.path.join(root, file)
            all_files.append(full_path)
    return all_files

# Example usage
folder_path = "speech_calib"  # Replace with your folder path
files = get_all_files(folder_path)
def extract_number(file_path):
    match = re.search(r'(\d+)\.(opus|mp3)', file_path)
    return int(match.group(1)) if match else float('inf')

# Sort the file paths by the extracted number
sorted_file_paths = sorted(files, key=extract_number)
sorted_file_paths

['speech_calib\\jishnu\\1.opus',
 'speech_calib\\rashmi_mp3\\1.mp3',
 'speech_calib\\speaker4\\sentence_1.mp3',
 'speech_calib\\jishnu\\2.opus',
 'speech_calib\\rashmi_mp3\\2.mp3',
 'speech_calib\\speaker4\\sentence_2.mp3',
 'speech_calib\\jishnu\\3.opus',
 'speech_calib\\rashmi_mp3\\3.mp3',
 'speech_calib\\speaker4\\sentence_3.mp3',
 'speech_calib\\jishnu\\4.opus',
 'speech_calib\\rashmi_mp3\\4.mp3',
 'speech_calib\\speaker4\\sentence_4.mp3',
 'speech_calib\\jishnu\\5.opus',
 'speech_calib\\rashmi_mp3\\5.mp3',
 'speech_calib\\speaker4\\sentence_5.mp3',
 'speech_calib\\jishnu\\6.opus',
 'speech_calib\\rashmi_mp3\\6.mp3',
 'speech_calib\\speaker4\\sentence_6.mp3',
 'speech_calib\\jishnu\\7.opus',
 'speech_calib\\rashmi_mp3\\7.mp3',
 'speech_calib\\speaker4\\sentence_7.mp3',
 'speech_calib\\jishnu\\8.opus',
 'speech_calib\\rashmi_mp3\\8.mp3',
 'speech_calib\\speaker4\\sentence_8.mp3',
 'speech_calib\\jishnu\\9.opus',
 'speech_calib\\rashmi_mp3\\9.mp3',
 'speech_calib\\speaker4\\sentence_

In [15]:
import librosa 
from tqdm import tqdm 
ts=[]
language_token_id = processor.tokenizer.convert_tokens_to_ids("<|en|>")
for file in tqdm(sorted_file_paths):
        y,sr=librosa.load(file,sr=16000)
        inputs = processor(y, sampling_rate=sr, return_tensors="pt").input_features
        inputs = inputs.to(device, dtype=torch_dtype)
        with torch.no_grad():
            generated_ids = model.generate(
            inputs,
            forced_decoder_ids=[[0, language_token_id]]  # [0, ID] forces the language
            )
            transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            ts.append(transcript.lower())

100%|██████████| 30/30 [01:26<00:00,  2.90s/it]


In [17]:
from difflib import SequenceMatcher

def calculate_stt_accuracy(transcripts, predictions):
    """
    Calculate word-level and character-level accuracy for speech-to-text predictions.
    
    Parameters:
    - transcripts (list of str): List of ground truth text (actual transcripts).
    - predictions (list of str): List of predicted text from the model.
    
    Returns:
    - dict: A dictionary with word-level and character-level accuracy.
    """
    assert len(transcripts) == len(predictions), "Lists must have the same length."
    
    total_word_matches = 0
    total_words = 0
    total_char_matches = 0
    total_chars = 0
    
    for transcript, prediction in zip(transcripts, predictions):
        # Word-level accuracy
        transcript_words = transcript.split()
        prediction_words = prediction.split()
        total_word_matches += len(set(transcript_words) & set(prediction_words))
        total_words += len(transcript_words)
        
        # Character-level accuracy
        matcher = SequenceMatcher(None, transcript, prediction)
        total_char_matches += sum(block.size for block in matcher.get_matching_blocks())
        total_chars += len(transcript)
    
    word_accuracy = (total_word_matches / total_words) * 100 if total_words > 0 else 0
    char_accuracy = (total_char_matches / total_chars) * 100 if total_chars > 0 else 0
    
    return {
        "word_accuracy": word_accuracy,
        "char_accuracy": char_accuracy
    }

# Example usage
sentences=[
    "for some people this room might be",
    "the scariest place on earth",
    "behind these black curtains are deadly spiders",
    "hundreds of them",
    "and what were gonna do is poke them make them angry",
    "and then suction the venom that appears",
    "at the end of their really long fangs",
    "this is about as close as i ever want to get to a funnel",
    "and were doing it for a very good reason",
    "this is a funnel web spider"
]
transcripts=[]
for s in sentences:
    for i in range(3):
        transcripts.append(s)
predictions = ts
accuracy = calculate_stt_accuracy(transcripts, predictions)
print("Accuracy:", accuracy)


Accuracy: {'word_accuracy': 81.81818181818183, 'char_accuracy': 98.12332439678283}


In [18]:
transcripts,predictions

(['for some people this room might be',
  'for some people this room might be',
  'for some people this room might be',
  'the scariest place on earth',
  'the scariest place on earth',
  'the scariest place on earth',
  'behind these black curtains are deadly spiders',
  'behind these black curtains are deadly spiders',
  'behind these black curtains are deadly spiders',
  'hundreds of them',
  'hundreds of them',
  'hundreds of them',
  'and what were gonna do is poke them make them angry',
  'and what were gonna do is poke them make them angry',
  'and what were gonna do is poke them make them angry',
  'and then suction the venom that appears',
  'and then suction the venom that appears',
  'and then suction the venom that appears',
  'at the end of their really long fangs',
  'at the end of their really long fangs',
  'at the end of their really long fangs',
  'this is about as close as i ever want to get to a funnel',
  'this is about as close as i ever want to get to a funnel',
