In [3]:
import os
import re
import librosa
import pandas as pd
from progressbar import progressbar
from datasets import load_metric

In [4]:
# path where you untar the downloaded file (~path~of~/t-seed/LibriSpeech/)
data_folder = "/home/ubuntu/speech_data/t-seed/LibriSpeech/"

In [5]:
variation = "other" # possbile values (clean, other)

In [6]:
tsv_file = os.path.join(data_folder, variation, "test.tsv")

In [7]:
data_frame = pd.read_csv(tsv_file, sep="\t")

In [8]:
data_frame.shape

(2939, 3)

In [9]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,1688-142285-0000.wav,THERE'S IRON THEY SAY IN ALL OUR BLOOD AND A G...
1,1,1688-142285-0001.wav,MARGARET SAID MISTER HALE AS HE RETURNED FROM ...
2,2,1688-142285-0002.wav,YOU DON'T MEAN THAT YOU THOUGHT ME SO SILLY


In [10]:
# add full path to the file names in the column data_frame["audios"]

data_frame["audios"] = data_frame["audios"].apply(lambda x: os.path.join(data_folder, variation, x))


# lower case the transcripts in the column data_frame["text"]

data_frame["text"] = data_frame["text"].apply(lambda x: x.lower())

# removing special characters from the transcripts in the column data_frame["text"]

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(sentence):
    sentence = re.sub(chars_to_ignore_regex, '', sentence)
    return sentence

data_frame["text"] = data_frame["text"].apply(lambda x: remove_special_characters(x))

In [11]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,there's iron they say in all our blood and a g...
1,1,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,margaret said mister hale as he returned from ...
2,2,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,you don't mean that you thought me so silly


In [12]:
# viewing path of a single file
# "/home/ubuntu/speech_data/t-seed/LibriSpeech/" will be path where you untar the downloaded file
print(data_frame["audios"][1])

/home/ubuntu/speech_data/t-seed/LibriSpeech/other/1688-142285-0001.wav


In [13]:
# check one file from the data_frame for specifications
audio_array, sampling_rate = librosa.load(data_frame["audios"][1])

print(sampling_rate)

22050


In [14]:
# resampling audio file to 16 KHz

audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=16000)
print(sampling_rate)

16000


In [15]:
# getting unique words in the transcripts to use it with the language modeler

words = " ".join(list(data_frame["text"])).split()
unique_words = list(set(words))
print(len(unique_words))

7597


### Prediction

In [16]:

import config
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

transcriber_processor = Wav2Vec2Processor.from_pretrained("wav2vec2-large-rbg-tamil/")
transcriber_processor.tokenizer.do_lower_case = True
transcriber_encoder_model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-large-rbg-tamil/")

transcriber_encoder_model  = transcriber_encoder_model.to("cuda")

In [17]:
# prediction against base model with and without language modeler

def get_transcriptions(audio_path, unique_words, e_type="base_model"):
    
    encoder_text, decoder_text = "None", "None" # "Not disclosed"
    
    return encoder_text, decoder_text

In [18]:
without_lm_op = list()
with_lm_op = list()
e_type = "enhanced"
for item in progressbar(data_frame["audios"]):
    e_text, d_text = get_transcriptions(item, unique_words, e_type=e_type)
    without_lm_op.append(e_text)
    with_lm_op.append(d_text)

data_frame["without_lm"] = without_lm_op
data_frame["with_lm"] = with_lm_op


  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1
100% (2939 of 2939) |####################| Elapsed Time: 0:38:35 Time:  0:38:35


In [19]:
data_frame.to_csv("results/LibriSpeech-"+ variation + "-" + e_type + ".tsv", sep="\t", index=False)

In [20]:
data_frame.head(5)

Unnamed: 0.1,Unnamed: 0,audios,text,without_lm,with_lm
0,0,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,there's iron they say in all our blood and a g...,there's ion they say in all our blood and a gr...,there's in they say in all our blood and a gra...
1,1,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,margaret said mister hale as he returned from ...,margaret said mister hale as he returned from ...,margaret said mister hale as he returned from ...
2,2,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,you don't mean that you thought me so silly,you don't mean that you thought me so silly,you don't mean that you thought me so silly
3,3,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,i really liked that account of himself better ...,i really liked that account of himself better ...,i really liked that account of himself better ...
4,4,/home/ubuntu/speech_data/t-seed/LibriSpeech/ot...,his statement of having been a shop boy was th...,his statement of having been a shopboy was the...,his statement of having been a shop boy was th...


In [21]:
wer_metric = load_metric("wer")
without_lm_wer = wer_metric.compute(predictions=data_frame["without_lm"], references=data_frame["text"])
with_lm_wer = wer_metric.compute(predictions=data_frame["with_lm"], references=data_frame["text"])

In [22]:
without_lm_wer 

0.09181743499608352

In [23]:
with_lm_wer

0.053015685000859715