In [1]:
import os
import random
import re
import tempfile

import librosa
import numpy as np
import pandas as pd
from datasets import load_metric
from progressbar import progressbar
from pydub import AudioSegment
random.seed(10)

In [2]:
# path where you untar the downloaded file
# (set the COMMON_VOICE_DATA_DIR environment variable to point the notebook at
# another machine's copy of the data; the original location stays the default)
data_folder = os.environ.get("COMMON_VOICE_DATA_DIR", "/home/ubuntu/speech_data/t-seed/CommonVoice/")

In [3]:
# experiment settings; all three are also embedded in the name of the results file
variation = "Transliterated"
e_type = "base"
# background noise mixed into every clip: "none", "rain", "wind" or "laughter"
noise_variation = "laughter"

In [4]:
# location of the Tamil test.tsv manifest inside the data folder
tsv_file = os.path.join(data_folder, "Tamil", "test.tsv")

In [5]:
# load the tab-separated manifest; its "audios" and "text" columns are used below
data_frame = pd.read_csv(tsv_file, sep="\t")

In [6]:
data_frame.shape

(1781, 3)

In [7]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,common_voice_ta_21428682.wav,taim pillaaium sralen rezhundde
1,1,common_voice_ta_23721822.wav,buniarum neeim enbundai eannalum borundaukve
2,2,common_voice_ta_23796845.wav,kdai utkl avridam bdindaupon vishiam


In [8]:
# turn the bare file names in data_frame["audios"] into full paths

tamil_dir = os.path.join(data_folder, "Tamil")
data_frame["audios"] = data_frame["audios"].map(lambda file_name: os.path.join(tamil_dir, file_name))


# convert the transcripts in data_frame["text"] to lower case

data_frame["text"] = data_frame["text"].map(str.lower)

# removing special characters from the transcripts in the column data_frame["text"]

# Raw string: the backslashes are meant for the regex engine, not for Python's
# string-literal parser (a plain literal triggers invalid-escape warnings).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(sentence):
    """Return `sentence` with every character matched by `chars_to_ignore_regex` removed.

    Parameters
    ----------
    sentence : str
        A single transcript.

    Returns
    -------
    str
        The transcript without the listed punctuation/symbol characters;
        all other characters (including whitespace) are left untouched.
    """
    return re.sub(chars_to_ignore_regex, '', sentence)

# strip the special characters from every transcript
data_frame["text"] = data_frame["text"].map(remove_special_characters)

In [9]:
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,taim pillaaium sralen rezhundde
1,1,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,buniarum neeim enbundai eannalum borundaukve
2,2,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,kdai utkl avridam bdindaupon vishiam


In [10]:
# viewing path of a single file
# it starts with data_folder ("/home/ubuntu/speech_data/t-seed/CommonVoice/"), i.e. the path where you untar the downloaded file
print(data_frame["audios"][1])

/home/ubuntu/speech_data/t-seed/CommonVoice/Tamil/common_voice_ta_23721822.wav


In [11]:
# check one file from the data_frame for specifications
# NOTE(review): no `sr` is passed here, so the rate printed below is presumably
# librosa's default resampling rate rather than the file's native rate --
# pass sr=None to inspect the native rate; TODO confirm.
audio_array, sampling_rate = librosa.load(data_frame["audios"][1])

print(sampling_rate)

22050


In [12]:
# resampling audio file to 16 KHz
# (the same 16000 rate is requested again when clips are loaded in get_transcriptions)

audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=16000)
print(sampling_rate)

16000


In [13]:
# collect the vocabulary of the transcripts (intended for the language model)

words = [token for transcript in data_frame["text"] for token in transcript.split()]
unique_words = list(set(words))
print(len(unique_words))

5701


### Prediction

In [14]:

import config
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# the processor and the CTC model are both loaded from the same checkpoint
model_checkpoint = "wav2vec2-large-rbg-tamil"

transcriber_processor = Wav2Vec2Processor.from_pretrained(model_checkpoint)
transcriber_processor.tokenizer.do_lower_case = True

# load the acoustic model and place it on the GPU
transcriber_encoder_model = Wav2Vec2ForCTC.from_pretrained(model_checkpoint).to("cuda")

In [15]:
# folder holding the background-noise clips
noise_folder = "noise/matrix/"

# file names of the noise clips, grouped by noise type
_noise_file_names = {
    'rain': [
        '1-56311-A-10.wav', '1-63871-A-10.wav', '2-101676-A-10.wav', '2-117625-A-10.wav',
        '5-181766-A-10.wav', '5-188655-A-10.wav', '3-157487-A-10.wav', '3-157615-A-10.wav',
        '4-160999-A-10.wav', '4-161127-A-10.wav',
    ],
    'wind': [
        '1-51037-A-16.wav', '1-69760-A-16.wav', '2-104952-A-16.wav', '2-104952-B-16.wav',
        '3-246513-A-16.wav', '3-246513-B-16.wav', '4-144083-A-16.wav', '4-144083-B-16.wav',
        '5-157204-A-16.wav', '5-157204-B-16.wav',
    ],
    'laughter': [
        '1-72695-A-26.wav', '1-73123-A-26.wav', '2-109759-A-26.wav', '2-109759-B-26.wav',
        '3-152912-A-26.wav', '3-152997-A-26.wav', '4-132803-A-26.wav', '4-132810-A-26.wav',
        '5-242932-B-26.wav', '5-244526-A-26.wav',
    ],
}

# noise type -> list of clip paths inside noise_folder
noise_dict = {
    noise_name: [os.path.join(noise_folder, clip_name) for clip_name in clip_names]
    for noise_name, clip_names in _noise_file_names.items()
}

# per-type aliases (same list objects as stored in noise_dict)
rain_noises = noise_dict['rain']
wind_noises = noise_dict['wind']
laughter_noises = noise_dict['laughter']

In [16]:
# prediction against base model with and without language modeler

def get_transcriptions(audio_path, unique_words, e_type="base", noise_type="none"):
    """Transcribe one audio file, optionally after mixing in background noise.

    Parameters
    ----------
    audio_path : str
        Path of the wav file to transcribe.
    unique_words : list
        Accepted for interface compatibility; not used by the code shown here
        (the language-model decoding step is not disclosed).
    e_type : str
        Accepted for interface compatibility; not used by the code shown here.
    noise_type : str
        "none" to transcribe the clip as is, otherwise a key of `noise_dict`;
        one clip of that type is picked at random and overlaid on the audio.

    Returns
    -------
    tuple of (str, str)
        `encoder_text` is the greedy (argmax) decoding of the model output;
        `decoder_text` is always the placeholder string "None".

    Raises
    ------
    KeyError
        If `noise_type` is neither "none" nor a key of `noise_dict`.
    """
    TARGET_SAMPLING_RATE = 16000

    if noise_type == "none":
        audio_array, _ = librosa.load(audio_path, sr=TARGET_SAMPLING_RATE)
    else:
        GAIN_CONSTANT = -34.
        NOISE_VOLUME = 0.3
        noise_file = random.choice(noise_dict[noise_type])
        audio = AudioSegment.from_file(audio_path, 'wav').normalize()
        background = AudioSegment.from_file(noise_file, 'wav').normalize()
        # attenuate the noise; NOISE_VOLUME is used here instead of a repeated literal
        background = background.apply_gain(GAIN_CONSTANT * (1. - NOISE_VOLUME))
        # loop the noise so that it covers the whole utterance
        output = audio.overlay(background, position=0, loop=True)
        output = output.set_frame_rate(TARGET_SAMPLING_RATE)
        # Round-trip through a private scratch file instead of a fixed "temp.wav"
        # in the working directory: nothing is left behind and parallel runs
        # cannot overwrite each other's mix.
        with tempfile.TemporaryDirectory() as scratch_dir:
            mixed_path = os.path.join(scratch_dir, "temp.wav")
            # export() hands back the open file object; close it so the data
            # is fully written before it is read again
            output.export(mixed_path, format='wav').close()
            audio_array, _ = librosa.load(mixed_path, sr=TARGET_SAMPLING_RATE)

    inputs = transcriber_processor(audio_array, sampling_rate=TARGET_SAMPLING_RATE,
                                   return_tensors="pt", padding=True)
    # run on whatever device the model was placed on
    inputs = inputs.to(transcriber_encoder_model.device)
    with torch.no_grad():
        character_probability = transcriber_encoder_model(inputs.input_values,
                                                          attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(character_probability, dim=-1)
    encoder_text = transcriber_processor.batch_decode(predicted_ids)[0]

    decoder_text = "None" # "not disclosed"

    return encoder_text, decoder_text

In [17]:
# Build the output path and create its folder up front, so that a missing
# "results" directory cannot make the final save fail after the long run.
results_path = "results/CommonVoice-Tamil-" + e_type + "-" + variation + "-" + noise_variation + ".tsv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)

without_lm_op = list()
with_lm_op = list()

# transcribe every clip; e_type comes from the settings cell rather than a repeated literal
for item in progressbar(data_frame["audios"]):
    e_text, d_text = get_transcriptions(item, unique_words, e_type=e_type, noise_type=noise_variation)
    without_lm_op.append(e_text)
    with_lm_op.append(d_text)

# store both transcriptions next to the reference text and save everything
data_frame["without_lm"] = without_lm_op
data_frame["with_lm"] = with_lm_op
data_frame.to_csv(results_path, sep="\t", index=False)

  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1
100% (1781 of 1781) |####################| Elapsed Time: 0:13:30 Time:  0:13:30


In [18]:
data_frame.head(5)

Unnamed: 0.1,Unnamed: 0,audios,text,without_lm,with_lm
0,0,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,taim pillaaium sralen rezhundde,taim pillaium srlin niyerundae,taim pillaaium sralen nedund
1,1,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,buniarum neeim enbundai eannalum borundaukve,boriddu meeim enkulvi ennalum dovitpuvudve,boradu neeim en udvi eannalum kovith udvi
2,2,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,kdai utkl avridam bdindaupon vishiam,kdai muthid laribm bdindau bolnher,kdai muth arib bdindau bonr sare
3,3,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,neengal nernru keiraib boryel sappittirkla,neengal nernru keirai boraiil sappittirkla,neengal nernru keiraib borul sappittirkla
4,4,/home/ubuntu/speech_data/t-seed/CommonVoice/Ta...,vaithl dmizher nurhl,vaitholl dmizher nurhl,vaith ull dmizher nurhl


In [19]:
# word error rate of both transcription columns against the reference transcripts
wer_metric = load_metric("wer")
reference_texts = data_frame["text"]
without_lm_wer = wer_metric.compute(references=reference_texts, predictions=data_frame["without_lm"])
with_lm_wer = wer_metric.compute(references=reference_texts, predictions=data_frame["with_lm"])

In [20]:
without_lm_wer 

0.6357485198759515

In [21]:
with_lm_wer

0.2864392444319143