In [1]:
from polyglot.downloader import downloader
# download the polyglot embedding and transliteration packages one after another
for package_name in (
    "embeddings2.en",
    "embeddings2.hi",
    "embeddings2.ta",
    "transliteration2.hi",
    "transliteration2.ta",
):
    downloader.download(package_name)

import os
import re
import numpy as np
import librosa
import pandas as pd
from progressbar import progressbar
from datasets import load_metric
from pydub import AudioSegment
import random
random.seed(10)

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/ubuntu/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package embeddings2.hi to
[polyglot_data]     /home/ubuntu/polyglot_data...
[polyglot_data]   Package embeddings2.hi is already up-to-date!
[polyglot_data] Downloading package embeddings2.ta to
[polyglot_data]     /home/ubuntu/polyglot_data...
[polyglot_data]   Package embeddings2.ta is already up-to-date!
[polyglot_data] Downloading package transliteration2.hi to
[polyglot_data]     /home/ubuntu/polyglot_data...
[polyglot_data]   Package transliteration2.hi is already up-to-date!
[polyglot_data] Downloading package transliteration2.ta to
[polyglot_data]     /home/ubuntu/polyglot_data...
[polyglot_data]   Package transliteration2.ta is already up-to-date!


In [2]:
# path where you untar the downloaded file; "transcription.txt" and the
# "Audios" folder are both looked up relative to this directory
data_folder = "/home/ubuntu/speech_data/t-seed/MSR/microsoftspeechcorpusindianlanguages/ta-in-Test/"

In [3]:
# run configuration; all three values become part of the results file name
variation = "Transliterated"
e_type="base"
# noise type handed to get_transcriptions ("none" skips the noise mixing)
noise_variation = "laughter" # none, rain, wind, laughter

In [4]:
# transcription file that sits directly inside the data folder
tsv_file = os.path.join(data_folder, "transcription.txt")

In [5]:
# tab-separated file read without a header row, so the columns are labelled 0 and 1;
# column 0 is later turned into audio file names, column 1 holds the transcripts
data_frame = pd.read_csv(tsv_file, sep="\t", header=None)

In [6]:
data_frame.shape

(3081, 2)

In [7]:
data_frame.head(3)

Unnamed: 0,0,1
0,440203,நேபாளத்தில் ஏற்பட்ட நிலநடுக்கத்தில் பாதிக்கப்ப...
1,520021,நீங்க அழகா இருக்கீங்க என்று சொல்வது அப்போதைக்க...
2,150447,சம்பந்தப்பட்ட கடல் பகுதிகளில் உள்ள மாநில அரசிட...


In [8]:
# build the zero-padded .wav file name for every id in data_frame[0]; the full path is added when data_frame["audios"] is created
import re
import polyglot
from polyglot.text import Text


# left-pad every id from column 0 with zeros up to nine characters and append ".wav"
new_list = [str(item).rjust(9, "0") + ".wav" for item in data_frame[0]]
    
    

def get_transliterations(t):
    """Transliterate ``t`` (hinted as Tamil) to "en" and return the pieces joined by single spaces."""
    source_text = Text(t, hint_language_code="ta")
    transliterated_words = list(source_text.transliterate("en"))
    return " ".join(transliterated_words)

# transliterate every transcript of column 1 and keep the result in the
# column data_frame["text"]
data_frame["text"] = data_frame[1].apply(get_transliterations)

# removing special characters from the transcripts in the column data_frame["text"]

# Punctuation and quote characters that are stripped from the transcripts.
# Written as a raw string: the previous plain literal relied on Python passing
# unknown escapes such as "\," through unchanged, which newer Python versions
# warn about. Inside a character class only "-" needs escaping, so the set of
# matched characters is exactly the same as before.
chars_to_ignore_regex = r'[,?.!\-;:"“%‘”�]'

def remove_special_characters(sentence):
    """Return ``sentence`` with every character matched by ``chars_to_ignore_regex`` removed.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The text without the ignored characters; all other characters,
        including whitespace, are left as they are.
    """
    return re.sub(chars_to_ignore_regex, '', sentence)

# strip the ignored characters from every transcript in data_frame["text"]
data_frame["text"] = data_frame["text"].apply(remove_special_characters)

In [9]:
data_frame.head(3)

Unnamed: 0,0,1,text
0,440203,நேபாளத்தில் ஏற்பட்ட நிலநடுக்கத்தில் பாதிக்கப்ப...,nebalthil atpat nilndukthil badikpptt mkluku a...
1,520021,நீங்க அழகா இருக்கீங்க என்று சொல்வது அப்போதைக்க...,neenga azhka irukkeinga enru solvdu appothikek...
2,150447,சம்பந்தப்பட்ட கடல் பகுதிகளில் உள்ள மாநில அரசிட...,smbandpptt kdl bucudiklil ull manil arsidam id...


In [10]:
# turn every generated file name into <data_folder>/Audios/<file name>
data_frame["audios"] = [os.path.join(data_folder, "Audios", file_name) for file_name in new_list]

In [11]:
# viewing path of a single file
# data_folder is the path where you untar the downloaded file
print(data_frame["audios"][1])

/home/ubuntu/speech_data/t-seed/MSR/microsoftspeechcorpusindianlanguages/ta-in-Test/Audios/000520021.wav


In [12]:
# check one file from the data_frame for specifications
# sr=None keeps the file's native sampling rate; without it librosa.load
# resamples to its 22050 Hz default, so the printed value would say nothing
# about the file itself
audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=None)

print(sampling_rate)

22050


In [13]:
librosa.load(data_frame["audios"][1])

(array([-0.0006323 , -0.00071442, -0.00069315, ...,  0.00147659,
         0.00207009,  0.00138447], dtype=float32),
 22050)

In [14]:
# resampling audio file to 16 KHz
# (the same rate get_transcriptions loads audio at and passes to the processor)

audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=16000)
print(sampling_rate)

16000


In [15]:
# collect the whitespace-separated tokens of every transcript; the distinct
# ones are the vocabulary for the language modeler

words = [token for transcript in data_frame["text"] for token in transcript.split()]
unique_words = list(set(words))
print(len(unique_words))

11896


### Prediction

In [16]:

import config
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# the processor and the CTC model are loaded from the same checkpoint
checkpoint_name = "wav2vec2-large-rbg-tamil"

transcriber_processor = Wav2Vec2Processor.from_pretrained(checkpoint_name)
transcriber_processor.tokenizer.do_lower_case = True

# load the model and move it to the GPU in one expression
transcriber_encoder_model = Wav2Vec2ForCTC.from_pretrained(checkpoint_name).to("cuda")

In [17]:
# noise folder path
noise_folder = "noise/matrix/"


def _in_noise_folder(file_names):
    """Return the given file names joined onto ``noise_folder``, order preserved."""
    return [os.path.join(noise_folder, file_name) for file_name in file_names]


rain_noises = _in_noise_folder([
    '1-56311-A-10.wav', '1-63871-A-10.wav',
    '2-101676-A-10.wav', '2-117625-A-10.wav',
    '5-181766-A-10.wav', '5-188655-A-10.wav',
    '3-157487-A-10.wav', '3-157615-A-10.wav',
    '4-160999-A-10.wav', '4-161127-A-10.wav',
])
wind_noises = _in_noise_folder([
    '1-51037-A-16.wav', '1-69760-A-16.wav',
    '2-104952-A-16.wav', '2-104952-B-16.wav',
    '3-246513-A-16.wav', '3-246513-B-16.wav',
    '4-144083-A-16.wav', '4-144083-B-16.wav',
    '5-157204-A-16.wav', '5-157204-B-16.wav',
])
laughter_noises = _in_noise_folder([
    '1-72695-A-26.wav', '1-73123-A-26.wav',
    '2-109759-A-26.wav', '2-109759-B-26.wav',
    '3-152912-A-26.wav', '3-152997-A-26.wav',
    '4-132803-A-26.wav', '4-132810-A-26.wav',
    '5-242932-B-26.wav', '5-244526-A-26.wav',
])

# lookup from the noise type name to its list of candidate noise files
noise_dict = dict(rain=rain_noises, wind=wind_noises, laughter=laughter_noises)

In [18]:
# prediction against base model with and without language modeler

def get_transcriptions(audio_path, unique_words, e_type="base", noise_type="none"):
    """Transcribe one audio file with the wav2vec2 model, optionally after mixing in noise.

    Parameters
    ----------
    audio_path : str
        Path of the wav file to transcribe.
    unique_words : list
        Accepted for interface compatibility; not used inside this function.
    e_type : str
        Accepted for interface compatibility; not used inside this function.
    noise_type : str
        "none" transcribes the clip as it is. Any other value must be a key of
        ``noise_dict``; one of the files listed under it is picked with
        ``random.choice`` and overlaid on the clip.

    Returns
    -------
    tuple of (str, str)
        ``encoder_text`` is the argmax decoding of the model logits;
        ``decoder_text`` is always the placeholder string "None".

    Raises
    ------
    KeyError
        If ``noise_type`` is neither "none" nor a key of ``noise_dict``.

    Notes
    -----
    When noise is mixed in, the mixture is written to "temp.wav" (a path
    relative to the working directory) and read back from there.
    """
    TARGET_SAMPLING_RATE = 16000

    if noise_type == "none":
        audio_array, _ = librosa.load(audio_path, sr=TARGET_SAMPLING_RATE)
    else:
        GAIN_CONSTANT = -34.
        NOISE_VOLUME = 0.3
        noise_file = random.choice(noise_dict[noise_type])
        audio = AudioSegment.from_file(audio_path, 'wav').normalize()
        background = AudioSegment.from_file(noise_file, 'wav').normalize()
        # NOISE_VOLUME now actually drives the gain (it used to be defined but
        # unused, with 0.3 repeated here as a literal)
        background = background.apply_gain(GAIN_CONSTANT * (1. - NOISE_VOLUME))
        # overlay from the start of the clip with loop=True
        output = audio.overlay(background, position=0, loop=True)
        output = output.set_frame_rate(TARGET_SAMPLING_RATE)
        output.export("temp.wav", format='wav')
        audio_array, _ = librosa.load("temp.wav", sr=TARGET_SAMPLING_RATE)

    inputs = transcriber_processor(audio_array, sampling_rate=TARGET_SAMPLING_RATE,
                                   return_tensors="pt", padding=True)
    # follow whatever device the model lives on instead of repeating "cuda" here
    inputs = inputs.to(transcriber_encoder_model.device)
    with torch.no_grad():
        character_probability = transcriber_encoder_model(inputs.input_values,
                                                          attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(character_probability, dim=-1)
    encoder_text = transcriber_processor.batch_decode(predicted_ids)[0]

    decoder_text = "None" # "not disclosed"

    return encoder_text, decoder_text

In [19]:
# create the output folder before the long transcription loop, so the final
# to_csv cannot fail on a missing directory after all clips have been processed
os.makedirs("results", exist_ok=True)

without_lm_op = list()
with_lm_op = list()

for item in progressbar(data_frame["audios"]):
    # pass the configured e_type (the same value that goes into the file name)
    # rather than a second hard-coded "base"
    e_text, d_text = get_transcriptions(item, unique_words, e_type=e_type, noise_type=noise_variation)
    without_lm_op.append(e_text)
    with_lm_op.append(d_text)

# one prediction per row, in the same order as data_frame["audios"]
data_frame["without_lm"] = without_lm_op
data_frame["with_lm"] = with_lm_op
data_frame.to_csv("results/MSR-Tamil-"+ e_type  + "-" + variation + "-" + noise_variation + ".tsv", sep="\t", index=False)

  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1
100% (3081 of 3081) |####################| Elapsed Time: 0:35:58 Time:  0:35:58


In [20]:
data_frame.head(5)

Unnamed: 0,0,1,text,audios,without_lm,with_lm
0,440203,நேபாளத்தில் ஏற்பட்ட நிலநடுக்கத்தில் பாதிக்கப்ப...,nebalthil atpat nilndukthil badikpptt mkluku a...,/home/ubuntu/speech_data/t-seed/MSR/microsofts...,nabalthil iarptt nilenrukthil badikpptt mkluku...,nebalthil arpptt nilndukthil badikpptt mkluku ...
1,520021,நீங்க அழகா இருக்கீங்க என்று சொல்வது அப்போதைக்க...,neenga azhka irukkeinga enru solvdu appothikek...,/home/ubuntu/speech_data/t-seed/MSR/microsofts...,neenga azhka irukkeinga enru solvdu appothik...,neenga azhka irukkeinga enru solvdu appothik...
2,150447,சம்பந்தப்பட்ட கடல் பகுதிகளில் உள்ள மாநில அரசிட...,smbandpptt kdl bucudiklil ull manil arsidam id...,/home/ubuntu/speech_data/t-seed/MSR/microsofts...,samandpt kdlbucurealil ull manil arsidam iduku...,sarand kdl bucudeail ull manil arsidam idu kur...
3,80140,தமிழ்நாட்டு திரைப்பட இயக்குனர்கள் சங்கம் இன்று...,dmizhnnattu diraippd iykunerkl sngam inru ndat...,/home/ubuntu/speech_data/t-seed/MSR/microsofts...,dmizhnattud diraippd akunerkl sngam enru ndath...,dmizhnnattu diraippd aku neril sngam enru ndat...
4,1300218,பாகிஸ்தானில் பிறந்ததை தவிர அந்த பிஞ்சுகள் செய்...,bakistanil branddai dvir anth binjukl said bav...,/home/ubuntu/speech_data/t-seed/MSR/microsofts...,bakitnil branddaid dvir anthb binjukl said bav...,bakistanil branddai dvir anthb binjukl said ba...


In [21]:
wer_metric = load_metric("wer")

# both prediction columns are scored against the same reference transcripts
reference_texts = data_frame["text"]
without_lm_wer = wer_metric.compute(predictions=data_frame["without_lm"], references=reference_texts)
with_lm_wer = wer_metric.compute(predictions=data_frame["with_lm"], references=reference_texts)

In [22]:
without_lm_wer 

0.7450738916256158

In [23]:
with_lm_wer

0.434900109469075