In [None]:
!nvidia-smi
!pip install python-dotenv matplotlib google-cloud-speech deepgram-sdk levenshtein git+https://github.com/openai/whisper.git datasets
!sudo apt update && sudo apt install ffmpeg

Tue Oct 15 15:39:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import torch
import whisper
import json
import numpy as np
import typer
import os
import sys
import re

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from datasets import Dataset
from datasets import Audio
from google.cloud import speech
from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)

from Levenshtein import distance
from rich import print
from rich.progress import track
from hashlib import sha512
from re import split
from pathlib import Path
from collections import defaultdict
from dotenv import load_dotenv



In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
# Folder scanned by measure() for audio files and their sibling .txt transcripts.
FOLDER_BASE = Path("/content/drive/MyDrive/testData/")
# JSON file in which measure() accumulates the benchmark results.
RESULTS = FOLDER_BASE / "benchmark.json"
# Audio file suffixes that measure() picks up.
EXTS = { '.mp3', '.wav' }
# Seed the results file with an empty JSON object so read_results() can always parse it.
# write_text() creates the file when it is missing, so no separate shell `touch` is needed.
if not RESULTS.exists() or RESULTS.stat().st_size == 0:
  RESULTS.write_text('{\n}\n')
print(DEVICE)

In [None]:
def google(file):
    """Transcribe an audio file with Google Cloud Speech-to-Text.

    Args:
        file: Path of the audio file; its bytes are sent inline with the request.

    Returns:
        The recognized words (strings) taken from the top alternative of every
        result in the response, in response order. An empty list when the
        service returns no results.
    """
    client = speech.SpeechClient.from_service_account_file(
        "google-api/service_account.json"
    )

    # Use a separate name for the handle so the `file` argument is not rebound.
    with open(file, "rb") as handle:
        audio = speech.RecognitionAudio(content=handle.read())

    # Bias recognition towards the three colour words of the benchmark.
    context = {"boost": 10, "phrases": ["rot", "grün", "blau"]}

    # NOTE(review): encoding and sample rate are fixed to MP3 / 44.1 kHz while EXTS
    # also admits '.wav' — confirm how WAV inputs are meant to be handled.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="de-DE",
        speech_contexts=[context],
        max_alternatives=3,
        enable_word_time_offsets=True,
        enable_word_confidence=True,
    )

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)

    # Walk every result instead of indexing results[0]: that raised IndexError on
    # an empty response and silently dropped everything after the first result.
    words = []
    for result in response.results:
        if result.alternatives:
            words.extend(w.word for w in result.alternatives[0].words)
    return words


def deepgram(file: Path):
    """Transcribe an audio file through Deepgram's prerecorded REST endpoint.

    The API key is read from the DEEPGRAM_API_KEY environment variable.

    Args:
        file: Path of the audio file; its bytes are uploaded as the request buffer.

    Returns:
        The words (strings) of the first alternative of the first channel.
    """
    client = DeepgramClient(os.getenv("DEEPGRAM_API_KEY"))

    # Read the whole file up front; the handle is closed before the request is sent.
    with open(file, "rb") as audio_handle:
        audio_bytes = audio_handle.read()
    payload: FileSource = {"buffer": audio_bytes}

    # Request options: German nova-2 model, no smart formatting, colour words as keywords.
    options = PrerecordedOptions(
        model="nova-2",
        language="de-DE",
        smart_format=False,
        keywords=["rot", "grün", "blau"],
    )

    # Send the audio for transcription.
    response = client.listen.rest.v("1").transcribe_file(payload, options)

    # Extract the word strings from the top alternative of the first channel.
    top_alternative = response.results.channels[0].alternatives[0]
    return [entry.word for entry in top_alternative.words]

def run_test_whisper_multilingual(file: Path, model=None):
    """Transcribe `file` as German with the multilingual Whisper 'large' model.

    Args:
        file: Path of the audio file to transcribe.
        model: Optional already-loaded model exposing
            `transcribe(path, language=...)`. When omitted, the 'large'
            checkpoint is loaded on DEVICE for this call (and dynamically
            quantized when CUDA is unavailable), exactly as before. Passing a
            model lets a caller avoid reloading the checkpoint for every file.

    Returns:
        The transcription as a list of lower-cased words with all
        non-word characters removed. An empty list when the transcription
        contains no word characters.
    """
    if model is None:
        model = whisper.load_model('large', device=DEVICE)

        # supposedly faster on cpu-only setup
        if not torch.cuda.is_available():
            model = torch.quantization.quantize_dynamic(
                model, {torch.nn.Linear}, dtype=torch.qint8
            )

    test_result = model.transcribe(file.as_posix(), language="de")
    text = test_result["text"]

    # Normalize once: punctuation and whitespace runs -> single space, then lower-case.
    normalized = re.sub(r'\W+', ' ', text).strip().lower()
    # re.split on an empty string yields [''], which would count as a phantom word.
    words = re.split(r'\s+', normalized) if normalized else []

    print('original_output: ' + text)
    print('stripped_output: ' + ', '.join(words))
    return words

def run_test_whisper_german(file: Path, pipe=None):
    """Transcribe `file` with the German fine-tuned Whisper large-v3 model.

    Args:
        file: Path of the audio file to transcribe.
        pipe: Optional ready-made speech-recognition pipeline (a callable that
            takes the decoded audio and returns a mapping with a "text" key).
            When omitted, the "primeline/whisper-large-v3-german" model,
            processor and pipeline are built for this call, exactly as before.
            Passing a pipeline lets a caller avoid rebuilding it for every file.

    Returns:
        The transcription as a list of lower-cased words with all
        non-word characters removed. An empty list when the transcription
        contains no word characters.
    """
    if pipe is None:
        model_id = "primeline/whisper-large-v3-german"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=TORCH_DTYPE, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(DEVICE)
        processor = AutoProcessor.from_pretrained(model_id)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=TORCH_DTYPE,
            device=DEVICE,
        )

    # Decode the file through a one-row dataset with an Audio() column.
    audio_dataset = Dataset.from_dict({"audio": [ file.as_posix() ]}).cast_column("audio", Audio())
    test_result = pipe(audio_dataset[0]["audio"])
    text = test_result["text"]

    # Normalize once: punctuation and whitespace runs -> single space, then lower-case.
    normalized = re.sub(r'\W+', ' ', text).strip().lower()
    # re.split on an empty string yields [''], which would count as a phantom word.
    words = re.split(r'\s+', normalized) if normalized else []

    print('original_output: ' + text)
    print('stripped_output: ' + ', '.join(words))
    return words

In [None]:
def post_process(inputs: list):
    """Map each recognized word onto one of the colour labels, or mark it as other.

    A word is eligible for a colour when it starts with one of that colour's
    prefixes and the Levenshtein distance between the rest of the word and the
    rest of the colour name (both taken after the prefix length) does not
    exceed the limit attached to that prefix. Among eligible colours the one
    with the smallest distance wins; ties go to the first one encountered.

    Args:
        inputs: Words to classify; each one is lower-cased before matching.

    Returns:
        A list of the same length holding 'rot', 'grün', 'blau' or
        'other (<lower-cased word>)' for every input word.
    """
    # colour -> {required prefix: maximum allowed distance of the remainder}
    prefix_rules = {'rot': {'ro': 2},
                    'grün': {'grü': 2, 'grue': 2, 'gru': 3},
                    'blau': {'bla': 2, 'pla': 1}}

    def _eligible(word):
        # Yield (distance, colour) for every prefix rule the word satisfies.
        for colour, rules in prefix_rules.items():
            for prefix, limit in rules.items():
                if not word.startswith(prefix):
                    continue
                cut = len(prefix)
                tail_distance = distance(word[cut:], colour[cut:])
                if tail_distance <= limit:
                    yield tail_distance, colour

    labels = []
    for word in [raw.lower() for raw in inputs]:
        # min() keeps the first of several equally close candidates.
        best = min(_eligible(word), key=lambda pair: pair[0], default=None)
        if best is None:
            labels.append('other (' + word + ')')
        else:
            labels.append(best[1])
    return labels

def measure():
    """Benchmark the speech-to-text backends on every audio file in FOLDER_BASE.

    An audio file is picked up when its suffix is in EXTS (compared
    case-insensitively) and a sibling ``.txt`` reference transcript exists.
    For each file the per-backend post-processed transcript and its
    Levenshtein distance to the reference are stored under the file's name in
    the RESULTS JSON file. Backends whose section is already present are
    skipped, so an interrupted run can be resumed. The results file is
    rewritten after every processed file.
    """
    # Sorted so the processing order is the same on every run (iterdir order is arbitrary).
    files = sorted(
        file
        for file in FOLDER_BASE.iterdir()
        if file.suffix.lower() in EXTS and file.with_suffix(".txt").exists()
    )

    results = read_results()

    for file in track(files):
        trans = transcript(file)
        digest = hash(file)

        # Start a fresh entry for new files, and also when the stored hash no
        # longer matches the audio bytes: results cached for different audio
        # content would otherwise be kept forever.
        cached = results.get(file.name)
        if cached is None or cached.get("hash") != digest:
            results[file.name] = {"transcript": trans, "hash": digest}

        # if "results_google" not in results[file.name]:
        #     res = post_process(google(file))
        #     results[file.name]["results_google"] = {
        #         "transcript": res,
        #         "levenshtein": distance(trans, res),
        #     }

        # if "results_deepgram" not in results[file.name]:
        #     res = post_process(deepgram(file))
        #     results[file.name]["results_deepgram"] = {
        #         "transcript": res,
        #         "levenshtein": distance(trans, res),
        #     }

        if "results_whisper_multilingual" not in results[file.name]:
            res = post_process(run_test_whisper_multilingual(file))
            results[file.name]["results_whisper_multilingual"] = {
                "transcript": res,
                "levenshtein": distance(trans, res),
            }

        if "results_whisper_german" not in results[file.name]:
            res = post_process(run_test_whisper_german(file))
            results[file.name]["results_whisper_german"] = {
                "transcript": res,
                "levenshtein": distance(trans, res),
            }

        # Persist after each file so finished work survives an interruption.
        # ensure_ascii=False keeps words such as 'grün' readable in the UTF-8 file.
        with open(RESULTS, "w", encoding="utf-8") as handle:
            json.dump(results, handle, indent=2, ensure_ascii=False)

def hash(file: Path):
    """Return the SHA-512 hex digest of the complete contents of `file`.

    Note: at module level this name shadows the built-in ``hash``.
    """
    hasher = sha512()
    hasher.update(file.read_bytes())
    return hasher.hexdigest()


def transcript(file: Path) -> list[str]:
    """Read the reference transcript that sits next to an audio file.

    Args:
        file: Path of the audio file; the transcript is the same path with the
            suffix replaced by ``.txt``.

    Returns:
        The whitespace-separated words of the transcript. An empty list when
        the transcript file is empty or contains only whitespace.
    """
    # utf-8-sig decodes plain UTF-8 as well, but drops a leading byte-order mark
    # so it cannot end up glued to the first word.
    text = file.with_suffix(".txt").read_text(encoding="utf-8-sig").strip()
    # split() on an empty string would return [''] — a phantom word.
    return split(r"\s+", text) if text else []

def read_results() -> dict:
    """Parse the UTF-8 encoded JSON stored in RESULTS and return the decoded value."""
    raw_json = RESULTS.read_text(encoding="utf-8")
    return json.loads(raw_json)

In [None]:
# Load the "large" Whisper checkpoint into the notebook-level name `model`.
# NOTE(review): run_test_whisper_multilingual() loads its own copy on each call and
# does not read this global — confirm that a later cell actually uses `model`.
model = whisper.load_model("large")





  0%|                                              | 0.00/2.88G [00:00<?, ?iB/s][A[A[A[A



  0%|                                  | 64.0k/2.88G [00:01<15:19:40, 55.9kiB/s][A[A[A[A



  0%|                                     | 128k/2.88G [00:01<7:05:43, 121kiB/s][A[A[A[A



  0%|                                     | 352k/2.88G [00:01<2:05:08, 411kiB/s][A[A[A[A



  0%|                                     | 640k/2.88G [00:01<1:04:43, 795kiB/s][A[A[A[A



  0%|                                      | 992k/2.88G [00:01<39:28, 1.30MiB/s][A[A[A[A



  0%|                                     | 1.50M/2.88G [00:01<23:35, 2.18MiB/s][A[A[A[A



  0%|                                     | 1.84M/2.88G [00:01<22:00, 2.34MiB/s][A[A[A[A



  0%|                                     | 2.15M/2.88G [00:01<21:06, 2.44MiB/s][A[A[A[A



  0%|                                     | 3.19M/2.88G [00:02<11:35, 4.43MiB/s][A[A[A[A



  0%|                               