In [1]:
import nemo

In [1]:
# This is the working directory for this tutorial. 
working_dir = 'am_finetuning/'
!mkdir -p $working_dir

# Import the necessary dependencies.
import wget
import glob
import os
import subprocess
import tarfile

# The AN4 directory will be created in `data_dir`. It is currently set to the `working_dir`.
data_dir = os.path.abspath(working_dir)

# Download the AN4 dataset if it doesn't already exist in `data_dir`. 
# This will take a few moments...
# We also set `an4_path` which points to the downloaded an4 dataset
if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir)
    print(f"AN4 dataset downloaded at: {an4_path}")
else:
    print("AN4 dataset tarfile already exists. Proceed to the next step.")
    an4_path = data_dir + '/an4_sphere.tar.gz'

AN4 dataset tarfile already exists. Proceed to the next step.


In [3]:
# Extract the AN4 tarball and convert its .sph recordings to 16 kHz .wav files.
# The whole step is skipped when the `an4` directory is already present.
if not os.path.exists(data_dir + '/an4/'):
    # Untar; the context manager closes the archive even if extraction fails.
    with tarfile.open(an4_path) as tar:
        tar.extractall(path=data_dir)
    print("Completed untarring the an4 tarfile")
    # Convert .sph to .wav (using sox)
    print("Converting .sph to .wav...")
    sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)
    for sph_path in sph_list:
        # Same path with the 4-character '.sph' suffix swapped for '.wav'.
        wav_path = sph_path[:-4] + '.wav'
        # converting to 16kHz wav
        cmd = ["sox", sph_path, "-r", "16000", wav_path]
        result = subprocess.run(cmd)
        if result.returncode != 0:
            # Keep going, but do not hide a failed conversion.
            print(f"sox failed (exit code {result.returncode}) for {sph_path}")
    print("Finished converting the .sph files to .wav files")
else:
    # This branch runs when the directory already exists, so say so.
    print("AN4 dataset directory already exists. Skipping extraction and conversion.")

Completed untarring the an4 tarfile
Converting .sph to .wav...
Finished converting the .sph files to .wav files


In [10]:
import librosa

# Sanity check: load one AN4 recording without down-mixing and print its array shape.
# The path is derived from `data_dir` so the cell does not depend on one user's home directory.
sph_example = os.path.join(data_dir, "an4/wav/an4_clstk/fash/an251-fash-b.sph")
sample, _ = librosa.load(sph_example, mono=False)
print(sample.shape)

(22050,)


In [8]:
!pip install mutagen
from mutagen.wave import WAVE

# Import the necessary libraries.
import json
import librosa

# Method to build a manifest.
def build_manifest(transcripts_path, manifest_path, wav_path):
    """Write a JSON-lines manifest for the utterances in a transcription file.

    Each non-blank input line is expected to look like
    ``<s> transcript </s> (fileID)``. For every such line one JSON object with
    the keys ``audio_filepath``, ``duration`` and ``text`` is written to the
    manifest, followed by a newline.

    Args:
        transcripts_path: Path of the transcription file to read.
        manifest_path: Path of the manifest file to write (overwritten).
        wav_path: Directory, relative to the module-level ``data_dir``, that
            contains one sub-directory per middle part of the file ID
            (the text between its first and last ``-``).

    Raises:
        ValueError: If a non-blank line does not end with a parenthesised
            file ID.
    """
    with open(transcripts_path, 'r') as fin, open(manifest_path, 'w') as fout:
        for line in fin:
            # Strip first so the parsing below does not depend on the line
            # terminator (a missing final newline or CRLF endings).
            line = line.strip()
            if not line:
                continue

            paren = line.find('(')
            if paren == -1 or not line.endswith(')'):
                raise ValueError(f"Malformed transcription line: {line!r}")

            # Lines look like this:
            # <s> transcript </s> (fileID)
            transcript = line[:paren].lower()
            transcript = transcript.replace('<s>', '').replace('</s>', '')
            transcript = transcript.strip()

            file_id = line[paren + 1:-1]  # e.g. "cen4-fash-b"
            audio_path = os.path.join(
                data_dir, wav_path,
                file_id[file_id.find('-') + 1: file_id.rfind('-')],
                file_id + '.wav')

            # Length of the audio as reported by mutagen for the WAV file.
            duration = WAVE(filename=audio_path).info.length

            # Write the metadata to the manifest
            metadata = {
                "audio_filepath": audio_path,
                "duration": duration,
                "text": transcript
            }
            json.dump(metadata, fout)
            fout.write('\n')
                
# Building the manifest files.
# Create the train and test manifests, skipping any that already exist on disk.
print("***Building manifest files***")

# Inputs and outputs for the training split.
train_transcripts = data_dir + '/an4/etc/an4_train.transcription'
train_manifest = data_dir + '/an4/train_manifest.json'

# Inputs and outputs for the test split.
test_transcripts = data_dir + '/an4/etc/an4_test.transcription'
test_manifest = data_dir + '/an4/test_manifest.json'

# (transcription file, manifest file, wav directory relative to data_dir, completion message)
manifest_jobs = (
    (train_transcripts, train_manifest, 'an4/wav/an4_clstk', "Training manifest created at"),
    (test_transcripts, test_manifest, 'an4/wav/an4test_clstk', "Test manifest created at"),
)
for transcripts_file, manifest_file, wav_subdir, done_message in manifest_jobs:
    if os.path.isfile(manifest_file):
        continue
    build_manifest(transcripts_file, manifest_file, wav_subdir)
    print(done_message, manifest_file)

print("***Done***")

Collecting mutagen
  Downloading mutagen-1.46.0-py3-none-any.whl (193 kB)
[K     |████████████████████████████████| 193 kB 14.2 MB/s eta 0:00:01
[?25hInstalling collected packages: mutagen
Successfully installed mutagen-1.46.0
***Building manifest files***
Training manifest created at /home/heh/am_finetuning/an4/train_manifest.json
Test manifest created at /home/heh/am_finetuning/an4/test_manifest.json
***Done***


In [9]:
# Download the background noise dataset if it doesn't already exist in `data_dir`. 
# This will take a few moments...
# We also set `noise_path` which points to the downloaded background noise dataset.

# `noise_path` always ends up pointing at the zip archive inside `data_dir`.
noise_zip = data_dir + '/rirs_noises.zip'
if os.path.exists(noise_zip):
    noise_path = noise_zip
    print("Background noise dataset already exists. Please proceed to the next step.")
else:
    slr28_url = 'https://www.openslr.org/resources/28/rirs_noises.zip'
    noise_path = wget.download(slr28_url, data_dir)
    print("Background noise dataset download complete.")

100% [....................................................................] 1311166223 / 1311166223Background noise dataset download complete.


In [10]:
# Extract noise data
from zipfile import ZipFile
# Unpack the noise archive into `data_dir`. A failure is reported with its real
# cause instead of being presented as "data might already exist".
try:
    with ZipFile(noise_path, "r") as zipObj:
        zipObj.extractall(data_dir)
        print("Extracting noise data complete")
except Exception as err:
    print(f"Extracting noise data failed for {noise_path}: {err!r}")


Extracting noise data complete


In [11]:
import json
# Directory of the isotropic noise recordings inside the extracted archive,
# and the `noise_list` index file that sits in it.
iso_noise_subdir = "RIRS_NOISES/real_rirs_isotropic_noises"
iso_path = os.path.join(data_dir, iso_noise_subdir)
iso_noise_list = os.path.join(iso_path, "noise_list")

# Create the manifest files from noise files
def process_row(row, offset, duration):
    """Build one noise-manifest entry for the audio file described by ``row``.

    The file length is measured with ``soxi -D``. The command is passed as an
    argument list (no shell), so paths containing spaces or shell
    metacharacters are handled as plain file names.

    Args:
        row: Mapping with the keys ``wav_filename`` and ``transcript``.
        offset: Value stored under the ``offset`` key of the entry.
        duration: Accepted for interface compatibility but not used; the
            stored duration is always the one reported by ``soxi``.
            NOTE(review): callers pass 15 here; confirm whether the entry
            was meant to carry that value instead.

    Returns:
        A dict with ``audio_filepath``, ``duration``, ``offset`` and ``text``,
        or ``None`` when the file could not be processed.
    """
    try:
        newfile = row['wav_filename']
        measured = subprocess.check_output(['soxi', '-D', newfile])
        return {
            'audio_filepath': newfile,
            'duration': float(measured),
            'offset': offset,
            'text': row['transcript'],
        }
    except Exception as e:
        newfile = row['wav_filename']
        print(f"Error processing {newfile} file!!!")
        # Surface the cause so a missing `soxi` is distinguishable from a bad file.
        print(f"  reason: {e!r}")
        return None
    
# Manifest entries for the noise files: the training entries use offset 0 and
# the test entries use offset 15.
train_rows = []
test_rows = []

with open(iso_noise_list, "r") as in_f:
    for line in in_f:
        data = line.rstrip().split()
        if not data:
            # Blank line: there is no file name to read.
            continue
        row = {
            # The last whitespace-separated token is the path relative to `data_dir`.
            'wav_filename': os.path.join(data_dir, data[-1]),
            'transcript': "-",
        }
        for rows, offset in ((train_rows, 0), (test_rows, 15)):
            entry = process_row(row, offset, 15)
            # process_row returns None on failure; keep such rows out of the
            # manifest so they are not written as `null` lines.
            if entry is not None:
                rows.append(entry)

# Writing manifest files
def write_manifest(manifest_file, manifest_lines):
    """Serialise each item of ``manifest_lines`` as one JSON line in ``manifest_file``.

    The file is overwritten. A confirmation message is printed once all lines
    have been written.
    """
    with open(manifest_file, 'w') as fout:
        fout.writelines(json.dumps(entry) + '\n' for entry in manifest_lines)
        print("Writing manifest file to", manifest_file, "complete")

# Writing training and test manifest files
# Noise manifests are written directly under `data_dir`: test first, then train.
test_noise_manifest = os.path.join(data_dir, "test_noise.json")
train_noise_manifest = os.path.join(data_dir, "train_noise.json")
for noise_manifest, noise_rows in (
    (test_noise_manifest, test_rows),
    (train_noise_manifest, train_rows),
):
    write_manifest(noise_manifest, noise_rows)

Writing manifest file to /home/heh/am_finetuning/test_noise.json complete
Writing manifest file to /home/heh/am_finetuning/train_noise.json complete


In [9]:
import soundfile as sf

# Inspect one noise recording with both loaders to compare array layouts.
# The path is built from `iso_path` instead of a user-specific absolute path.
audio_file = os.path.join(iso_path, "RVB2014_type1_noise_largeroom1_1.wav")

# librosa, without down-mixing: print the shape and the number of dimensions.
sample, _ = librosa.load(audio_file, mono=False)
print(sample.shape)
print(len(sample.shape))

# soundfile: read the whole file as float32 and print the shape.
with sf.SoundFile(audio_file, 'r') as f:
    dtype = 'float32'
    sample_rate = f.samplerate
    samples = f.read(dtype=dtype)
    print(samples.shape)

(8, 661500)
2
(480000, 8)


In [14]:
# Directory passed to add_noise.py as --out_dir.
final_data_dir = data_dir + '/noise_data'

# Command line executed inside the container: add_noise.py with the AN4 training
# manifest as input, the training noise manifest as noise source, and --snrs 5.
# The trailing backslashes are inside the string literal, so the newlines that
# follow them are not part of the resulting command string.
run = f"python /workspace/nemo/scripts/dataset_processing/add_noise.py \
    --input_manifest={train_manifest} \
    --noise_manifest={train_noise_manifest} \
    --snrs 5 \
    --out_dir={final_data_dir}"

# Container image used to run the script.
nemo_container = 'nvcr.io/nvidia/nemo:22.09'

# `data_dir` is mounted at the same path inside the container, so the absolute
# paths in the manifests and in --out_dir refer to the same files there.
# NOTE(review): `sudo` asks for a password and `-it` requests an interactive
# terminal; confirm this works when launched from a notebook cell.
! sudo docker run --gpus=all --rm -it -v $data_dir:$data_dir --net=host --ipc=host \
--ulimit memlock=-1 --ulimit stack=67108864 $nemo_container $run

[sudo] password for heh: 


In [7]:
from pathlib import Path
import json
def check_data(manifest_file):
    """Report which audio files listed in a JSON-lines manifest are not mono.

    Blank lines in the manifest are ignored. Every referenced file is loaded
    with ``librosa.load(..., mono=False)``; files whose array has more than
    one dimension are printed with their shape, and a final summary line
    gives the count.
    """
    with Path(manifest_file).open("r") as fin:
        stripped_lines = [raw.strip() for raw in fin.readlines()]
    entries = [json.loads(text) for text in stripped_lines if text]

    multi_channel = 0
    for entry in entries:
        path = entry["audio_filepath"]
        audio, _ = librosa.load(path, mono=False)
        if len(audio.shape) <= 1:
            continue
        print(path, audio.shape)
        multi_channel += 1
    print(f"{multi_channel}/{len(entries)} are not mono-channel audios")

In [8]:
# Manifests to scan for multi-channel audio. Only the test noise manifest is
# checked here; `train_noise_manifest`, `train_manifest` and `test_manifest`
# can be appended to the list to scan them as well.
inputs = [test_noise_manifest]
for filepath in inputs:
    print(filepath)
    check_data(filepath)

/home/heh/am_finetuning/test_noise.json
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_1.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_10.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_2.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_3.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_4.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_5.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_6.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/RVB2014_type1_noise_largeroom1_7.wav (8, 661500)
/home/heh/am_finetuning/RIRS_NOISES/real_rirs_isotropic_noises/

In [None]:


# NOTE(review): the lines below are shell commands, not Python; they look like
# terminal notes kept in an unexecuted cell.

# Start an interactive bash shell in the container, mounting the tutorial
# directory and the synth_audio_val directory at the same paths as on the host.
sudo docker run --gpus=all --rm -it -v /home/heh/am_finetuning:/home/heh/am_finetuning --net=host --ipc=host \
-v /media/data/projects/NeMo-vad/project/synth_audio_val:/media/data/projects/NeMo-vad/project/synth_audio_val \
--ulimit memlock=-1 --ulimit stack=67108864 gitlab-master.nvidia.com/heh/nemo_containers:nemo-main-22.09 /bin/bash



# Run add_noise.py on the AN4 training manifest with the training noise manifest at --snrs 5.
python /workspace/nemo/scripts/dataset_processing/add_noise.py \
    --input_manifest=/home/heh/am_finetuning/an4/train_manifest.json \
    --noise_manifest=/home/heh/am_finetuning/train_noise.json \
    --snrs 5 \
    --out_dir=/home/heh/am_finetuning/output


# Same script, with synth_manifest.json given as both the input and the noise manifest.
python /workspace/nemo/scripts/dataset_processing/add_noise.py \
    --input_manifest=/media/data/projects/NeMo-vad/project/synth_audio_val/synth_manifest.json \
    --noise_manifest=/media/data/projects/NeMo-vad/project/synth_audio_val/synth_manifest.json \
    --snrs 5 \
    --out_dir=/home/heh/am_finetuning/output