# ECS 289G: Automatic Speech Recongition using OpenAI Whisper model

This notebook conists the code for performing Automatic Speech Recognition using OpenAI whisper-large-v3 model.

In [1]:
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-sl1og3c6
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-sl1og3c6
  Resolved https://github.com/huggingface/transformers.git to commit 2c658b5a4282f2e824b4e23dc3bcda7ef27d5827
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting accelerate
  Downloadi

In [2]:
!pip install mltu

Collecting mltu
  Downloading mltu-1.1.7-py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.15.0 (from mltu)
  Downloading onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting coloredlogs (from onnxruntime>=1.15.0->mltu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m548.5 kB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.15.0->mltu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading mltu-1.1.7-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[

In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Using GPU if available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Defining the openai/whisper-large-v3 model
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Processing the pretrained model
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Testing on validation set of LJ Speech dataset
import typing
import numpy as np
import pandas as pd
from tqdm import tqdm
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

# Considering only first 20 data items
df = pd.read_csv("/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Models/05_sound_to_text/20231128/val.csv").values.tolist()
df = df[:20]

accum_cer, accum_wer = [], []

for wav_path, label in tqdm(df):
    # Predicting the output of the audio file supplied in .wav format
    result = pipe(wav_path)
    text = result["text"]

    true_label = "".join([l for l in label.lower() if l in "abcdefghijklmnopqrstuvwxyz'?!. "])

    cer = get_cer(text, true_label)
    wer = get_wer(text, true_label)

    accum_cer.append(cer)
    accum_wer.append(wer)
    print(str(cer) + " " + str(wer))

# Printing Average CER and Avergae WER on LJ Speech dataset
print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

  5%|▌         | 1/20 [02:12<41:49, 132.10s/it]

0.13333333333333333 0.5


 10%|█         | 2/20 [04:02<35:45, 119.18s/it]

0.08333333333333333 0.4444444444444444


 15%|█▌        | 3/20 [05:51<32:29, 114.70s/it]

0.05825242718446602 0.29411764705882354


 20%|██        | 4/20 [07:35<29:28, 110.56s/it]

0.06976744186046512 0.2222222222222222


 25%|██▌       | 5/20 [09:25<27:32, 110.15s/it]

0.0379746835443038 0.1111111111111111


 30%|███       | 6/20 [11:15<25:41, 110.08s/it]

0.06451612903225806 0.3125


 35%|███▌      | 7/20 [13:07<24:00, 110.84s/it]

0.037037037037037035 0.15


 40%|████      | 8/20 [14:57<22:07, 110.59s/it]

0.16304347826086957 0.13333333333333333


 45%|████▌     | 9/20 [16:46<20:10, 110.02s/it]

0.02654867256637168 0.06666666666666667


 50%|█████     | 10/20 [18:33<18:12, 109.21s/it]

0.04 0.1875


 55%|█████▌    | 11/20 [20:25<16:30, 110.09s/it]

0.0625 0.4117647058823529


 60%|██████    | 12/20 [22:15<14:39, 110.00s/it]

0.22 0.4117647058823529


 65%|██████▌   | 13/20 [24:12<13:04, 112.09s/it]

0.06060606060606061 0.18181818181818182


 70%|███████   | 14/20 [26:03<11:10, 111.81s/it]

0.2125984251968504 0.30434782608695654


 75%|███████▌  | 15/20 [27:52<09:14, 111.00s/it]

0.0673076923076923 0.35294117647058826


 80%|████████  | 16/20 [29:36<07:15, 108.91s/it]

0.0425531914893617 0.1


 85%|████████▌ | 17/20 [31:33<05:33, 111.27s/it]

0.04838709677419355 0.25


 90%|█████████ | 18/20 [33:21<03:40, 110.30s/it]

0.16455696202531644 0.3125


 95%|█████████▌| 19/20 [35:10<01:49, 109.87s/it]

0.16216216216216217 0.75


100%|██████████| 20/20 [36:55<00:00, 110.77s/it]

0.013333333333333334 0.0
Average CER: 0.08839057300237042, Average WER: 0.27485160104885165





In [4]:
# Testing it with custom audios (the audios which we obatined after running speaker diarization and audio splicing)
import os

directory_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Dataset/spliced"
output_file_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/all_transcriptions.txt"

files = []

for filename in os.listdir(directory_path):
    files.append(filename)

sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

# Opening the output file once for writing
with open(output_file_path, 'w') as output_file:
    for filename in sorted_files:
        if os.path.isfile(os.path.join(directory_path, filename)):
            file_path = os.path.join(directory_path, filename)
            result = pipe(file_path)
            speaker_id = filename.split('_')[1]
            transcription_text = result["text"]

            # Writing to the text file
            output_file.write(f"Speaker {speaker_id}: {transcription_text}\n")

            # Printing the transciption
            print(f"Speaker {speaker_id}: {transcription_text}\n")

    print(f"All transcriptions successfully stored in {output_file_path}")

Speaker 00:  you

Speaker 00:  Hey Shreyas, how's your day going?

Speaker 01:  Hi Rohit, my day is going good, how about yours?

Speaker 00:  Oh, that's great. Yeah, even mine's going great. I just got done with my artificial intelligence presentation a while ago, and now I'm working on my project for it. How about you?

Speaker 01:  I am just working on an assignment for my linear algebra course which is due by midnight

Speaker 00:  Oh, great. So, let's see how this audio transcript works in our Hugging Face model.

Speaker 00:  Shud.

All transcriptions successfully stored in /content/drive/MyDrive/Fall 2023/ECS 289G/Project/all_transcriptions.txt
