In [3]:
import os
import re
import librosa
import pandas as pd
from progressbar import progressbar
from datasets import load_metric

In [4]:
# path where you untar the downloaded file (~path~of~/t-seed/LibriSpeech/)
data_folder = "/home/ubuntu/speech_data/t-seed/LibriAdapt/en-us"

In [5]:
variation1 = "rain" # possbile values (clean, rain, wind, laughter)

In [6]:
variation2 = "matrix" # possbile values (matrix, nexus6, pseye, respeaker, shure, usb)

In [7]:
tsv_file = os.path.join(data_folder,  variation2 + ".tsv")

In [8]:
data_frame = pd.read_csv(tsv_file, sep="\t")

In [9]:
data_frame.shape

(2600, 3)

In [10]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,8555-292519-0000.wav,brighter than early dawn's most brilliant dye ...
1,1,8555-292519-0001.wav,guided by you how we might stroll towards deat...
2,2,8555-292519-0002.wav,venice


In [11]:
# add full path to the file names in the column data_frame["audios"]

data_frame["audios"] = data_frame["audios"].apply(lambda x: os.path.join(
    data_folder, variation1, variation2, "test", x))


# lower case the transcripts in the column data_frame["text"]

data_frame["text"] = data_frame["text"].apply(lambda x: x.lower())

# removing special characters from the transcripts in the column data_frame["text"]

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(sentence):
    sentence = re.sub(chars_to_ignore_regex, '', sentence)
    return sentence

data_frame["text"] = data_frame["text"].apply(lambda x: remove_special_characters(x))

In [12]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,brighter than early dawn's most brilliant dye ...
1,1,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,guided by you how we might stroll towards deat...
2,2,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,venice


In [13]:
# viewing path of a single file
# "/home/ubuntu/speech_data/t-seed/LibriSpeech/" will be path where you untar the downloaded file
print(data_frame["audios"][1])

/home/ubuntu/speech_data/t-seed/LibriAdapt/en-us/rain/matrix/test/8555-292519-0001.wav


In [14]:
# check one file from the data_frame for specifications
audio_array, sampling_rate = librosa.load(data_frame["audios"][1])

print(sampling_rate)

22050


In [15]:
# resampling audio file to 16 KHz

audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=16000)
print(sampling_rate)

16000


In [16]:
# getting unique words in the transcripts to use it with the language modeler

words = " ".join(list(data_frame["text"])).split()
unique_words = list(set(words))
print(len(unique_words))

7991


### Prediction

In [17]:

import config
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

transcriber_processor = Wav2Vec2Processor.from_pretrained("wav2vec2-large-rbg-tamil/")
transcriber_processor.tokenizer.do_lower_case = True
transcriber_encoder_model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-large-rbg-tamil/")

transcriber_encoder_model  = transcriber_encoder_model.to("cuda")

In [19]:
# prediction against base model with and without language modeler

def get_transcriptions(audio_path, unique_words, e_type="base_model"):
    
    
    encoder_text, decoder_text = "None", "None" # "Not disclosed"
    
    return encoder_text, decoder_text

In [None]:
without_lm_op = list()
with_lm_op = list()
e_type = "enhanced"
for item in progressbar(data_frame["audios"]):
    e_text, d_text = get_transcriptions(item, unique_words, e_type=e_type)
    without_lm_op.append(e_text)
    with_lm_op.append(d_text)

data_frame["without_lm"] = without_lm_op
data_frame["with_lm"] = with_lm_op
data_frame.to_csv("results/LibriAdapt-"+ variation1 +"-"+ variation2 + "-" + e_type + ".tsv", sep="\t", index=False)

  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1
 31% (829 of 2600) |######               | Elapsed Time: 0:20:14 ETA:   0:37:19

In [None]:
data_frame.head(5)

In [None]:
wer_metric = load_metric("wer")
without_lm_wer = wer_metric.compute(predictions=data_frame["without_lm"], references=data_frame["text"])
with_lm_wer = wer_metric.compute(predictions=data_frame["with_lm"], references=data_frame["text"])

In [26]:
without_lm_wer 

0.09664169616836506

In [27]:
with_lm_wer

0.06065288399475815