Creates audio samples merged with background noise (saved as WAV files), transcribes the noisy samples with the ASR model, and translates the generated transcriptions. Then saves a CSV file containing the path to each noisy sample, the original transcript, the noise type, the ASR transcript, and the translation.

Install packages

In [None]:
!pip install -r requirements.txt
!pip install librosa
!pip install transformers
!pip install sentencepiece

In [1]:
# Imported libraries
import librosa
import transformers
import torchaudio
import sentencepiece
import numpy as np
import math
import regex as re
import csv

from scipy.io.wavfile import write
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, AutoTokenizer, AutoModelForSeq2SeqLM

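The add_background_noise() function uses the librosa library to load the speech file and a noise segment of the same duration, computes the RMS of both signals to scale the noise to the requested signal-to-noise ratio (SNR), adds the scaled noise to the speech, and resamples the mixture to 16 kHz.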

In [2]:
def add_background_noise(s_path, n_path, offset=0.0, SNR=10):
    '''
    Mix a speech recording with background noise at a given signal-to-noise ratio.

    Input:

    s_path: a string containing the path for the speech file (.wav format)

    n_path: a string containing the path for the noise file (.wav format)

    offset: a float representing the starting point, in seconds, to start loading the noise file (default = 0)

    SNR: a number representing the Signal to Noise Ratio in dB (default = 10)

    Returns:
    audio sample with noise, resampled to 16 kHz

    Raises:
    ValueError: if no noise could be loaded at the given offset, or if the
    loaded noise segment is completely silent (it cannot be scaled to an SNR)
    '''
    target_sr = 16000

    sample, sr = librosa.load(s_path)
    duration_s = librosa.get_duration(y=sample, sr=sr)
    # Load the same duration of noise, explicitly at the speech sampling rate
    # so both signals share one time base.
    noise, _ = librosa.load(n_path, sr=sr, offset=offset, duration=duration_s)
    if noise.size == 0:
        raise ValueError(
            f"No noise could be loaded from {n_path!r} at offset {offset} s"
        )
    # The noise segment can come out shorter than the speech (offset close to
    # the end of the file, or rounding while resampling). np.resize repeats
    # the noise cyclically / truncates it so the two arrays can be added.
    if noise.shape != sample.shape:
        noise = np.resize(noise, sample.shape)

    rms_s = math.sqrt(np.mean(sample**2))  # RMS of the speech
    rms_n = math.sqrt(rms_s**2 / (pow(10, SNR / 10)))  # RMS the noise must have for this SNR
    rms_n_current = math.sqrt(np.mean(noise**2))  # RMS the noise has now
    if rms_n_current == 0:
        raise ValueError(
            f"Noise segment loaded from {n_path!r} is silent; cannot scale it to SNR={SNR}"
        )
    noise = noise * (rms_n / rms_n_current)  # Scale the noise to the required RMS

    sample_noise = sample + noise
    return librosa.resample(sample_noise, orig_sr=sr, target_sr=target_sr)

In [3]:
def asr_model(wav_file, model, processor):
    '''
    Transcribe a wav file with a CTC speech recognition model.

    Input:

    wav_file: a string containing the path for the audio file

    model: the acoustic model; it is called on the processed input values and
    must return a mapping with a 'logits' entry

    processor: the processor used both to prepare the model input and to
    decode the logits (its decode() result must expose a .text attribute)

    Returns:
    the decoded transcript
    '''
    import torch  # local import: the file only imports torchaudio by name

    target_sr = 16000

    waveform, sample_rate = torchaudio.load(wav_file)
    # torchaudio returns (channels, frames). Averaging over the channel axis
    # yields a 1-D mono signal; for mono files it is the same data as before.
    waveform = waveform.mean(dim=0)
    # Resampling is required to match the model's frequency, i.e.
    # the frequency of the dataset the model was trained on.
    # The resampling method is left at the library default (Hann-windowed
    # sinc interpolation), because the explicit name for it differs between
    # torchaudio releases.
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
    speech_array = resampler(waveform).numpy()
    inputs = processor(speech_array, sampling_rate=target_sr, return_tensors="pt")['input_values']
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(inputs)['logits']
    # A single file is decoded, so drop the batch dimension; for several
    # files keep it and call batch_decode() instead.
    outputs = logits.numpy().squeeze(0)
    return processor.decode(outputs).text

In [4]:
import functools


@functools.lru_cache(maxsize=4)
def _load_translation_model(model_name):
    '''Load the tokenizer and seq2seq model for model_name once and reuse them on later calls.'''
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def translate_uk_en(model_name, ukrainian_text):
    '''
    Translate text with a pretrained sequence-to-sequence model.

    Input:

    model_name: a string with the name (or path) of the pretrained translation model

    ukrainian_text: the text to translate

    Returns:
    the translation of the first input sequence, as a string
    '''
    # Loading the model is by far the most expensive step, so it is cached
    # instead of being repeated for every sentence.
    tokenizer, model = _load_translation_model(model_name)
    batch = tokenizer(ukrainian_text, return_tensors='pt', padding=True)
    generated = model.generate(**batch)
    # Only the first translation is returned, so only that one is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [34]:
# Speech samples to process. Each entry is [file name, reference transcript]:
# the file name is looked up under 'samples/' by the generation loop, and the
# transcript is stored in the 'original_transcript' column of the CSV.
s_list = [
    ["common_voice_uk_32857863.wav", "Я чув і знав лиш одне"],
    ["common_voice_uk_32327955.wav", "Бойка дивує цей неймовірний тон але йому зовсім він не до вподоби"],
    ["common_voice_uk_29194547.wav", "Камрад покірно ліг і поклавши голову на лапи зітхнув"],
    ["common_voice_uk_25651279.wav", "Відтоді я встиг об'їхати увесь світ"],
    ["common_voice_uk_28716623.wav", "Червоноармійці вхопили мої руки і скрутили назад"],
    ["common_voice_uk_26940711.wav", "І тут наперед приготуйтеся"],
    ["common_voice_uk_33459867.wav", "Навіщо воно мені здалося"],
    ["common_voice_uk_32413786.wav", "От за це люблю"],
    ["common_voice_uk_29092801.wav", "Любуйся як Дидона стогне"],
    ["common_voice_uk_25828714.wav", "Повертаючись із Орди, помер в дорозі великий Володимирський князь Ярослав Ярославович"],
    ["common_voice_uk_33339057.wav", "Господь вислухав їх молитви"],
    ["common_voice_uk_21349472.wav", "Взяти ноги на плечі"],
    ["common_voice_uk_27109869.wav", "Туди за районного отамана Кузь- менко піде"],
    ["common_voice_uk_28744536.wav", "Що пропаду от лиш не видно"],
    ["common_voice_uk_21639338.wav", "Іде в три дороги"]
]

# Background noise recordings. Each entry is [path, offset]: the offset is
# the position, in seconds, at which add_background_noise() starts reading
# the noise file. The noise type is the part of the file name between
# 'noise/' and '_ch01.wav'.
n_list = [
    ['noise/bus_ch01.wav', 0.0],
    ['noise/cafeteria_ch01.wav', 22.0], 
    ['noise/car_ch01.wav', 13.0],
    ['noise/field_ch01.wav', 79.0],
    ['noise/hallway_ch01.wav', 222.0], 
    ['noise/kitchen_ch01.wav', 145.0],
    ['noise/living_ch01.wav', 32.0],
    ['noise/meeting_ch01.wav', 3.0],
    ['noise/metro_ch01.wav', 0.0],
    ['noise/office_ch01.wav', 162.0],
    ['noise/park_ch01.wav', 42.0],
    ['noise/resto_ch01.wav', 0.0],
    ['noise/river_ch01.wav', 85.0], 
    ['noise/square_ch01.wav', 0.0],
    ['noise/station_ch01.wav', 59.0],
    ['noise/traffic_ch01.wav', 0.0],
    ['noise/washing_ch01.wav', 150.0]
]

In [22]:
# Speech recognition: the CTC acoustic model and its processor with a
# language-model decoder, both loaded from the same pretrained checkpoint.
sr_model = Wav2Vec2ForCTC.from_pretrained("Yehor/wav2vec2-xls-r-300m-uk-with-small-lm")
processor = Wav2Vec2ProcessorWithLM.from_pretrained("Yehor/wav2vec2-xls-r-300m-uk-with-small-lm")

# Name of the translation model; it is passed to translate_uk_en(), which
# loads the corresponding tokenizer and model.
nmt_model = "Helsinki-NLP/opus-mt-tc-big-zle-en"

# CSV column names (same order as the rows appended to data_translated)
# and the list that collects one row per generated noisy sample.
header = ['path', 'original_transcript', 'noise_type', 'asr_transcript', 'translation']
data_translated = []

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading the LM will be faster if you build a binary file.
Reading /home/studio-lab-user/.cache/pyctcdecode/models--Yehor--wav2vec2-xls-r-300m-uk-with-small-lm/snapshots/bbd936400e7566ba44560440aa4abd05b5983c17/language_model/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [36]:
import os  # used to make sure the output directory exists

# add_background_noise() always returns audio resampled to 16 kHz, so the
# wav files must be written with that rate (not the noise file's native one).
output_sr = 16000
output_dir = 'samples_with_noise_snr5'
os.makedirs(output_dir, exist_ok=True)

for s_file in s_list:
    # File name of the speech sample without its '.wav' extension
    s_name = re.findall(r'(\S+?)\.wav', s_file[0])[0]
    for n_file in n_list:
        # Extract noise type
        noise_type = re.findall(r'/(\S+?)_ch01\.wav', n_file[0])[0]
        # Save file path
        file_path = output_dir + '/' + s_name + '_' + noise_type + ".wav"
        # Add background noise and save wav file
        noisy_sample = add_background_noise('samples/' + s_file[0], n_file[0], offset=n_file[1], SNR=5)
        write(file_path, rate=output_sr, data=noisy_sample)
        # Create asr transcript
        asr_transcript = asr_model(file_path, sr_model, processor)
        # Translate transcript and save all the file info into a list
        translation = translate_uk_en(nmt_model, asr_transcript)
        data_translated.append([file_path, s_file[1], noise_type, asr_transcript, translation])



In [None]:
# Persist the collected results as a CSV table
with open('cv_translated_with_noise_snr5.csv', mode='w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    # The header row goes first, followed by one row per noisy sample
    writer.writerows([header, *data_translated])