## Mount Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## Installs

In [None]:
# Install the speechaugs augmentation library into the notebook runtime.
!pip install speechaugs

In [None]:
import speechaugs
import torch, torchaudio
import albumentations as A
import matplotlib.pyplot as plt
import numpy as np
import IPython

In [None]:
# Display the installed speechaugs version as the cell output.
speechaugs.__version__

In [None]:
# Clone the speechaugs source repository into the current working directory.
!git clone https://github.com/waveletdeboshir/speechaugs.git

In [None]:
import torch, torchaudio
import IPython
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import random
import csv
import os

In [None]:
# Select the 'sox_io' backend for torchaudio audio I/O.
# NOTE(review): newer torchaudio releases appear to deprecate global backend
# selection -- confirm against the torchaudio version installed in the runtime.
torchaudio.set_audio_backend('sox_io')

## Audio Augmentation

`SpeechAug` is a wrapper class around the speechaugs library that applies its augmentations to an entire dataset rather than to individual examples. All credit for the actual augmentations goes to the authors of speechaugs (https://github.com/waveletdeboshir/speechaugs/).

In [None]:
class SpeechAug():
  """Apply speechaugs augmentations to every audio file listed in a CSV.

  The input CSV must provide a ``file`` column (audio paths) and a ``text``
  column (transcripts); rows with missing values are dropped on load.
  ``augment()`` copies each original clip into ``outfolder_dir`` together with
  one extra clip per augmentation enabled through the ``add_*`` methods, and
  writes ``<name>_real+augmented_data_train.csv`` listing every produced file
  next to its transcript.

  Args:
    name: Prefix of the output CSV file name.
    file_type: Audio file type; ``augment()`` only accepts 'wav' or 'mp3'.
    csv_dir: Path to the input CSV file.
    outfolder_dir: Folder that receives the audio files and the output CSV.
    sampling_rate: Sampling rate handed to the transforms and used when
      saving audio. Clips loaded at a different rate are resampled to it.
  """

  # Maps every key of ``self.selected`` to the file-name prefix of the clip
  # it produces. Insertion order is the order in which rows are written.
  _OUTPUT_PREFIXES = {
      'bks': 'bkStretch_',
      'fds': 'fdStretch_',
      'ps': 'pitchshift_',
      'vtlp': 'vtlp_',
      'shnoi': 'shortnoise_',
      'amp': 'amplitudeRaise_',
  }

  def __init__(self, name: str, file_type: str, csv_dir: str, outfolder_dir: str, sampling_rate=16000):
    self.SR            = sampling_rate
    self.filetype      = file_type
    self.max_duration  = 55 #seconds
    self.df            = self.load_csv(csv_dir)
    # Every augmentation starts disabled; the add_* methods switch them on.
    self.selected      = dict.fromkeys(self._OUTPUT_PREFIXES, False)
    self.outfolder_dir = outfolder_dir
    self.out_csv       = os.path.join(outfolder_dir, f'{name}_real+augmented_data_train.csv')

  def load_csv(self, csv_path):
    """Read the CSV at ``csv_path`` and drop every row with a missing value."""
    return pd.read_csv(csv_path).dropna()

  def open_audio(self, path):
    """Load the audio at ``path`` and return its waveform at ``self.SR``.

    Errors raised by ``torchaudio.load`` propagate unchanged, so the
    offending path and message stay visible to the caller.
    """
    waveform, source_sr = torchaudio.load(path)
    if source_sr != self.SR:
      # Everything downstream (transforms and torchaudio.save) is told the
      # audio is at self.SR, so bring the samples to that rate first.
      resampler = torchaudio.transforms.Resample(orig_freq=source_sr, new_freq=self.SR)
      waveform = resampler(waveform)
    return waveform

  def _build_transform(self, key):
    """Return the speechaugs transform registered under ``key``.

    Every transform is created with ``p=1.``: ``p`` is the probability that
    the transform is applied, so any lower value would let some files be
    saved under an augmentation prefix without having been altered.
    """
    factories = {
        'bks': lambda: speechaugs.TimeStretchLibrosa(p=1., max_duration=self.max_duration, sr=self.SR),
        'fds': lambda: speechaugs.ForwardTimeShift(p=1., max_duration=self.max_duration, sr=self.SR),
        'ps': lambda: speechaugs.PitchShiftLibrosa(p=1., sr=self.SR),
        'vtlp': lambda: speechaugs.VTLP(p=1., sr=self.SR),
        'shnoi': lambda: speechaugs.ShortNoises(p=1., max_n_noises=7),
        'amp': lambda: speechaugs.LoudnessChange(p=1.),
    }
    return factories[key]()

  def _save_clip(self, writer, prefix, filename, waveform, text):
    """Save ``waveform`` as ``<prefix><filename>`` and record it in the CSV."""
    audio_path = os.path.join(self.outfolder_dir, prefix + filename)
    # Save before writing the row so the CSV never lists a file that failed
    # to be written.
    torchaudio.save(audio_path, waveform, self.SR)
    writer.writerow([audio_path, text])

  def augment(self):
    """Write the original and augmented clips plus the combined CSV.

    Prints a message and exits via ``sys.exit()`` when the configured file
    type is neither 'wav' nor 'mp3'.
    """
    if self.filetype not in ['wav', 'mp3']:
      print("Wrong audio file type. Must be wav or mp3 for processing. Exiting.")
      sys.exit()
    if self.outfolder_dir:
      # The CSV and every clip are written here; create it if missing.
      os.makedirs(self.outfolder_dir, exist_ok=True)
    # Build each enabled transform once; they do not depend on the clip.
    transforms = [(prefix, self._build_transform(key))
                  for key, prefix in self._OUTPUT_PREFIXES.items()
                  if self.selected[key]]
    count = 0
    # newline='' lets the csv module control line endings itself.
    with open(self.out_csv, 'w', encoding='utf-8', newline='') as outfile:
      writer = csv.writer(outfile)
      # Write headers
      writer.writerow(['file', 'text'])
      for file, text in zip(self.df['file'], self.df['text']):
        filename = os.path.basename(file)
        actual_wav = self.open_audio(file)
        self._save_clip(writer, 'original_', filename, actual_wav, text)
        count += 1
        for prefix, transform in transforms:
          augmented = transform(waveform=actual_wav)['waveform']
          self._save_clip(writer, prefix, filename, augmented, text)
          count += 1
    print(f"Finished. Total files added: {count}")

  def add_back_stretch(self):
    """Enable the time-stretch augmentation (``bkStretch_`` files)."""
    self.selected['bks'] = True

  def add_forward_stretch(self):
    """Enable the forward time-shift augmentation (``fdStretch_`` files)."""
    self.selected['fds'] = True

  def add_pitch_shift(self):
    """Enable the pitch-shift augmentation (``pitchshift_`` files)."""
    self.selected['ps'] = True

  def add_vtlp(self):
    """Enable the VTLP augmentation (``vtlp_`` files)."""
    self.selected['vtlp'] = True

  def add_short_noises(self):
    """Enable the short-noise augmentation (``shortnoise_`` files)."""
    self.selected['shnoi'] = True

  def add_amplitude_change(self):
    """Enable the loudness-change augmentation (``amplitudeRaise_`` files)."""
    self.selected['amp'] = True

**Arguments to SpeechAug Class:**

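*   name -> prefix of the output CSV file, written as `<name>_real+augmented_data_train.csv` inside outfolder_dir
*   file_type -> type of audio file; must be `wav` or `mp3`
*   csv_dir -> path to the CSV file listing audio file paths (`file` column) and transcripts (`text` column)
*   outfolder_dir -> path to which the new augmented audio files will be written
*   sampling_rate -> (optional) sampling rate used when processing and saving audio; defaults to 16000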



In [None]:
# Placeholder locations: point both at real Drive paths before running.
csv_dir   = '/content/drive/MyDrive/path/to/your/csv_file.csv'
outfolder = '/content/drive/MyDrive/outfolder/for/augmented/audio/files'

aug = SpeechAug('outfile-name', 'wav', csv_dir, outfolder)
# Switch on every available augmentation; drop an entry to skip it.
for enable in (
    aug.add_back_stretch,
    aug.add_forward_stretch,
    aug.add_pitch_shift,
    aug.add_vtlp,
    aug.add_short_noises,
    aug.add_amplitude_change,
):
  enable()
# Write the original + augmented clips and the combined CSV.
aug.augment()