### Data Preparation

In [1]:
# to reproduce the results, we kept only the test files in the repo, without making any changes to them
# you can download them with the following commands

# !pip install gdown
# !gdown --id 1pyha4gUFtUG-pIS17IVKAm88II3GgknR
# !tar -xvf LibriAdapt.tar.gz

In [2]:
# installing the libraries required for evaluation

# !pip install librosa
# !pip install datasets
# !pip install pandas
# !pip install jiwer
# !pip install progressbar

In [3]:
import os
import re
import subprocess
import sys

import librosa
import pandas as pd
from datasets import load_metric
from progressbar import progressbar

In [4]:
# path where you untar the downloaded file
# Set the LIBRIADAPT_DATA_DIR environment variable to point at your own copy;
# the fallback below is the location originally hard-coded in this notebook.
data_folder = os.environ.get("LIBRIADAPT_DATA_DIR",
                             "/home/ubuntu/speech_data/t-seed/LibriAdapt/en-us")

In [5]:
variation1 = "rain" # possible values (clean, rain, wind, laughter)

In [6]:
variation2 = "matrix" # possible values (matrix, nexus6, pseye, respeaker, shure, usb)

In [7]:
# the manifest is stored as <data_folder>/<variation2>.tsv
tsv_file = os.path.join(data_folder, f"{variation2}.tsv")

In [8]:
# load the tab-separated manifest selected by variation2 (it provides the "audios" and "text" columns used below)
data_frame = pd.read_csv(tsv_file, sep="\t")

In [9]:
# (rows, columns) of the loaded manifest
data_frame.shape

(2600, 3)

In [10]:
# preview the first three rows as read from the manifest
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,8555-292519-0000.wav,brighter than early dawn's most brilliant dye ...
1,1,8555-292519-0001.wav,guided by you how we might stroll towards deat...
2,2,8555-292519-0002.wav,venice


In [11]:
# Expand every bare file name in data_frame["audios"] to
# <data_folder>/<variation1>/<variation2>/test/<file name>
data_frame["audios"] = data_frame["audios"].apply(
    lambda file_name: os.path.join(data_folder, variation1, variation2, "test", file_name)
)

# Bring the reference transcripts in data_frame["text"] to lower case
data_frame["text"] = data_frame["text"].apply(
    lambda transcript: transcript.lower()
)

# removing special characters from the transcripts in the column data_frame["text"]

# Raw string on purpose: the backslashes are regex escapes. In a normal string
# literal most of them (\, \? \. ...) are invalid Python escape sequences and
# trigger a DeprecationWarning/SyntaxWarning. The set of matched characters is
# unchanged; the apostrophe is intentionally not part of it.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(sentence):
    """Return ``sentence`` with every character in ``chars_to_ignore_regex`` removed.

    Args:
        sentence: Transcript text to clean.

    Returns:
        The text without the ignored punctuation/symbol characters; all other
        characters (letters, digits, spaces, apostrophes) are kept as they are.
    """
    return re.sub(chars_to_ignore_regex, '', sentence)

# strip the ignored punctuation from every reference transcript
data_frame["text"] = data_frame["text"].apply(remove_special_characters)

In [12]:
# preview the same rows after path expansion and transcript clean-up
data_frame.head(3)

Unnamed: 0.1,Unnamed: 0,audios,text
0,0,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,brighter than early dawn's most brilliant dye ...
1,1,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,guided by you how we might stroll towards deat...
2,2,/home/ubuntu/speech_data/t-seed/LibriAdapt/en-...,venice


In [13]:
# viewing path of a single file
# "/home/ubuntu/speech_data/t-seed/LibriAdapt/en-us" (data_folder) is the path where the downloaded file was untarred
print(data_frame["audios"][1])

/home/ubuntu/speech_data/t-seed/LibriAdapt/en-us/rain/matrix/test/8555-292519-0001.wav


In [14]:
# check one file from the data_frame for specifications
# sr=None keeps the sampling rate stored in the file. librosa's default
# (sr=22050) resamples while loading, so the printed value would always be
# 22050 no matter how the file was recorded.
audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=None)

print(sampling_rate)

22050


In [15]:
# resampling audio file to 16 kHz (sr=16000 is the rate later passed to the transcriber in get_transcriptions)

audio_array, sampling_rate = librosa.load(data_frame["audios"][1], sr=16000)
print(sampling_rate)

16000


In [16]:
# getting unique words in the transcripts to use it with the language modeler

words = " ".join(data_frame["text"]).split()
# Sorted so the vocabulary (and the lexicon file written from it) has the same
# order on every kernel start; iterating a plain set of strings does not
# guarantee that because of hash randomisation.
unique_words = sorted(set(words))
print(len(unique_words))

7991


### Prediction

In [17]:
# cell to be removed
import config
import torch
import torchaudio
from common.transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from stack.VoiceTranscriber import VoiceDecoder
from stack.VoiceEnhancer import Enhancer

from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import AudioFile, convert_audio, save_audio

def load_track(track, device, audio_channels, samplerate):
    """Load an audio file as a tensor on ``device``.

    The file is first read with demucs' ``AudioFile`` (its failure messages
    below refer to ffmpeg). If that yields nothing, ``torchaudio.load`` is used
    and the result is converted with ``convert_audio``.

    Args:
        track: Path of the audio file.
        device: Device the waveform is moved to.
        audio_channels: Channel count requested from the reader/converter.
        samplerate: Sampling rate requested from the reader/converter.

    Returns:
        The loaded waveform tensor.

    Raises:
        SystemExit: via ``sys.exit(1)`` when neither backend can load the file
            (the collected backend errors are printed first).
    """
    errors = {}
    wav = None

    try:
        wav = AudioFile(track).read(
            streams=0,
            samplerate=samplerate,
            channels=audio_channels).to(device)
    except FileNotFoundError:
        errors['ffmpeg'] = 'Ffmpeg is not installed.'
    except subprocess.CalledProcessError:
        errors['ffmpeg'] = 'FFmpeg could not read the file.'

    if wav is None:
        try:
            # The notebook imports the package as `torchaudio`; the previous
            # `ta` alias was never defined and raised NameError here.
            wav, sr = torchaudio.load(str(track))
        except RuntimeError as err:
            errors['torchaudio'] = err.args[0]
        else:
            wav = wav.to(device)
            wav = convert_audio(wav, sr, samplerate, audio_channels)

    if wav is None:
        print(f"Could not load file {track}. "
              "Maybe it is not a supported file format? ")
        for backend, error in errors.items():
            print(f"When trying to load using {backend}, got the following error: {error}")
        sys.exit(1)
    return wav

def make_batch(wav_tensor, batch_size=10):
    """Split a ``(channels, samples)`` waveform into ``batch_size`` equal frames.

    The waveform is zero-padded at the end only as far as needed to make its
    length a multiple of ``batch_size``. A length that is already a multiple
    is left untouched (previously a whole extra ``batch_size`` of zeros was
    appended in that case).

    Args:
        wav_tensor: 2-D tensor indexed as ``[channel, sample]``.
        batch_size: Number of consecutive frames to cut the waveform into.

    Returns:
        Tensor of shape ``(batch_size, channels, frame_size)`` where frame ``i``
        holds samples ``[i * frame_size, (i + 1) * frame_size)``.
    """
    channels, length = wav_tensor.shape
    pad_value = (-length) % batch_size
    if pad_value:
        # new_zeros keeps the padding on the waveform's own device and dtype,
        # so torch.cat does not fail for CUDA or non-float32 input.
        pad_tensor = wav_tensor.new_zeros(channels, pad_value)
        wav_tensor = torch.cat((wav_tensor, pad_tensor), 1)
    frame_size = wav_tensor.shape[1] // batch_size
    # One reshape instead of batch_size successive torch.cat calls.
    frames = wav_tensor.reshape(channels, batch_size, frame_size)
    return frames.transpose(0, 1).contiguous()

def destroy_batch(batch_out, index):
    """Concatenate the entries of a batch back together along dimension 1.

    Args:
        batch_out: Tensor whose first dimension enumerates the batch entries.
        index: ``None`` to concatenate ``batch_out[i]`` directly, otherwise the
            position selected from every entry (``batch_out[i][index]``)
            before concatenating. ``0`` is a valid index.

    Returns:
        The selected entries joined with ``torch.cat(..., 1)`` in batch order.

    Raises:
        ValueError: if ``batch_out`` has no entries.
    """
    batch_len = batch_out.shape[0]
    if batch_len == 0:
        raise ValueError("destroy_batch needs at least one batch entry")

    # `is None` rather than `== None`: identity is the intended check and it
    # cannot be hijacked by an overloaded __eq__.
    if index is None:
        pieces = [batch_out[i] for i in range(batch_len)]
    else:
        pieces = [batch_out[i][index] for i in range(batch_len)]
    return torch.cat(pieces, 1)


def get_batch_size(temp_wav):
    """Choose a batch size from the length of ``temp_wav``.

    The length ``temp_wav.shape[1]`` is measured in whole blocks of 250000
    samples and mapped to a batch size through fixed upper bounds:
    0 -> 1, <=4 -> 2, <=7 -> 5, <=12 -> 10, <=17 -> 15, <=22 -> 20,
    <=27 -> 25, anything longer -> 30.
    """
    size = int(temp_wav.shape[1]/250000)

    if size == 0:
        return 1

    # (inclusive upper bound on `size`, batch size), checked in ascending order
    thresholds = ((4, 2), (7, 5), (12, 10), (17, 15), (22, 20), (27, 25))
    for upper_bound, b_size in thresholds:
        if size <= upper_bound:
            return b_size
    return 30


# Voice-separation model loading is disabled (kept commented out):
# model = pretrained.get_model('mdx')
# voice_separation_model = model.models[3]
# voice_separation_model = voice_separation_model.to("cuda")

# Processor and CTC encoder are both loaded from config.encoder_path
transcriber_processor = Wav2Vec2Processor.from_pretrained(config.encoder_path)
# the reference transcripts were lower-cased above, so the tokenizer is set to lower-case too
transcriber_processor.tokenizer.do_lower_case = True
transcriber_encoder_model = Wav2Vec2ForCTC.from_pretrained(config.encoder_path)

# Keep only vocabulary entries that are not blank
lex_words = [word for word in unique_words if word.strip()]

# One lexicon line per word: "<word> <char 1> <char 2> ... |"
temp_lexicon = "temp_lex.txt"
with open(temp_lexicon, "w+") as f:
    for word in lex_words:
        f.write(" ".join([word, *word, "|"]) + "\n")

# Decoder built from the processor, config.decoder_path and the lexicon file written above
transcriber_decoder_model = VoiceDecoder(transcriber_processor,
                                         config.decoder_path,
                                         temp_lexicon, lexicon_option=True)
transcriber_encoder_model  = transcriber_encoder_model.to("cuda")

# Enhancer weights come from config.enhancer_path and are mapped straight onto cuda:0
# NOTE(review): no .eval() is called on the enhancer here — confirm whether it contains
# train-mode-sensitive layers (dropout / batch norm).
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
noise_remover_model = Enhancer()
state_dict = torch.load(config.enhancer_path,
                        map_location=torch.device("cuda:0"))
noise_remover_model.load_state_dict(state_dict)
noise_remover_model = noise_remover_model.to("cuda")

In [18]:
# model.samplerate

In [19]:
# prediction against base model with and without language modeler

def get_transcriptions(audio_path, unique_words, e_type="base_model"):
    """Transcribe one audio file with and without the language-model decoder.

    The file is loaded at 16 kHz, passed through ``noise_remover_model``,
    blended with the unprocessed signal, peak-limited and fed to
    ``transcriber_encoder_model``. The encoder logits are decoded twice:
    greedily (argmax) and with ``transcriber_decoder_model``.

    Args:
        audio_path: Path of the audio file to transcribe.
        unique_words: Not used by this function; kept so existing calls work.
        e_type: Not used by this function; kept so existing calls work.

    Returns:
        Tuple ``(encoder_text, decoder_text)``: the greedy transcription and
        the transcription produced by the language-model decoder.
    """
    # The voice-separation pre-processing stage that used to run here is disabled.

    audio_array, sampling_rate = librosa.load(audio_path, sr=16000)
    audio_array = torch.tensor(audio_array).to("cuda").unsqueeze(0)

    # Pure inference: run the enhancer under no_grad as well, otherwise every
    # call records an autograd graph for the enhancer that is never used.
    with torch.no_grad():
        output = noise_remover_model(audio_array)

        # Share of the unprocessed signal mixed back into the enhanced one
        dry = 0.7
        output = (1 - dry) * output + dry * audio_array

        # Scale down only when the peak exceeds 1; quieter signals are left as they are
        wav = output / max(output.abs().max().item(), 1)
        wav = wav.squeeze(0).squeeze(0)

        inputs = transcriber_processor(wav, sampling_rate=sampling_rate,
                                       return_tensors="pt", padding=True)
        inputs = inputs.to("cuda")
        character_probability = transcriber_encoder_model(inputs.input_values,
                                                          attention_mask=inputs.attention_mask).logits

    # Greedy decoding straight from the encoder logits (no language model)
    predicted_ids = torch.argmax(character_probability, dim=-1)
    encoder_text = transcriber_processor.batch_decode(predicted_ids)[0]

    # The decoder is given log-probabilities on the CPU
    character_probability = torch.nn.functional.log_softmax(character_probability.float(), dim=-1)
    character_probability = character_probability.to("cpu")
    lm_tokens, lm_scores = transcriber_decoder_model.decode(character_probability)
    prediction_ids = lm_tokens[0][:]
    decoder_text = transcriber_processor.batch_decode(prediction_ids)[0]

    return encoder_text, decoder_text

In [None]:
# Transcribe every test file and store both hypotheses next to the references.
e_type = "enhanced"
results_path = "results/LibriAdapt-"+ variation1 +"-"+ variation2 + "-" + e_type + ".tsv"

# Create the output folder before the long inference loop: to_csv does not
# create missing parent directories, and failing at the very end would throw
# away the whole run.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

without_lm_op = list()
with_lm_op = list()
for item in progressbar(data_frame["audios"]):
    e_text, d_text = get_transcriptions(item, unique_words, e_type=e_type)
    without_lm_op.append(e_text)
    with_lm_op.append(d_text)

data_frame["without_lm"] = without_lm_op
data_frame["with_lm"] = with_lm_op
data_frame.to_csv(results_path, sep="\t", index=False)

  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1
 31% (829 of 2600) |######               | Elapsed Time: 0:20:14 ETA:   0:37:19

In [None]:
# preview the first rows, now including the without_lm / with_lm columns
data_frame.head(5)

In [None]:
# Word error rate of both hypothesis columns against the cleaned reference transcripts
# NOTE(review): load_metric is reported as deprecated in newer `datasets` releases — confirm before upgrading
wer_metric = load_metric("wer")
without_lm_wer = wer_metric.compute(predictions=data_frame["without_lm"], references=data_frame["text"])
with_lm_wer = wer_metric.compute(predictions=data_frame["with_lm"], references=data_frame["text"])

In [26]:
# WER of the greedy transcriptions (no language model)
without_lm_wer 

0.09664169616836506

In [27]:
# WER of the language-model-decoded transcriptions
with_lm_wer

0.06065288399475815

In [None]:
# NOTE(review): leftover scratch cell, not part of the evaluation — candidate for removal
print("hi")