In [3]:
# Package installation

# !pip install aiortc pyee
!pip install noisereduce pydub
# !apt-get install ffmpeg -y
!pip install webrtcvad

Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, noisereduce
Successfully installed noisereduce-3.0.3 pydub-0.25.1
Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp310-cp310-linux_x86_64.whl size=73466 sha256=7b81e54fc0ad1c27bb949950c58df7f0d1c6b98d2b6cb9761623fe1969de0f19
  Stored in directory: /root/.cache/pip/wheels/2a/2b/84/ac7bacfe8c68a87c1ee3dd3c66818a5

In [7]:
# Package imports
import torch
import os
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torchaudio
import time
import pandas as pd
import numpy as np
import webrtcvad as wb
import matplotlib.pyplot as plt
import kagglehub as kh
from scipy.signal import stft, istft
from scipy.signal.windows import hamming
from scipy.io import wavfile
from pydub import AudioSegment
from skimage.exposure import match_histograms


path = kh.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")
print(path)
# from aiortc.contrib.media import MediaPlayer, MediaRecorder
# import noisereduce as nr
# from pydub import AudioSegment
# import librosa  # To load audio as NumPy array

/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1


In [8]:
def noise_removal(signal, sampling_rate):
  vad = wb.Vad(2)
  frame_size = int(sampling_rate * 0.02)
  frames = []
  #splitting audio into frames
  for i in range(0, len(signal), frame_size):
      frames.append(signal[i:i+frame_size].astype(np.int16))

  #performing de-noising
  filtered_audio = []
  count = 0
  for frame in frames:
      if len(frame) < frame_size:
        padding = np.zeros(frame_size - len(frame), dtype=np.int16)
        frame = np.concatenate((frame, padding))
      if vad.is_speech(frame.tobytes(), sampling_rate):
          filtered_audio.append(frame)
      #     print("Speech detected")
      # else:
      #     print("Speech not detected")
      # print(count)
      count += 1
  filtered_audio = np.array(filtered_audio)

  filtered_audio = np.concatenate(filtered_audio).astype(np.int16)
  return filtered_audio

In [9]:
def export_audio_soundfile(filtered_audio, sample_rate, output_file):
    # Save the numpy array as a WAV file
    wavfile.write(output_file, sample_rate, filtered_audio)

In [11]:
audio_data_paths = []

# frame_size = int(sample_rate
for actor in os.listdir(path + '/audio_speech_actors_01-24'):
    count = 0
    for audio_clip in os.listdir(path + '/audio_speech_actors_01-24/' + actor):
        aud_path = path + '/audio_speech_actors_01-24/' + actor + '/' + audio_clip
        audio_data_paths.append(aud_path)

In [12]:
import random
random.seed(42)
random.shuffle(audio_data_paths)

print(len(audio_data_paths))

1440


In [13]:
train_index = int(0.6 * len(audio_data_paths))
val_index = int(0.75 * len(audio_data_paths))

In [14]:
train_files = audio_data_paths[:train_index]
val_files = audio_data_paths[train_index:val_index]
test_files = audio_data_paths[val_index:]

In [15]:
print(len(train_files))
print(len(val_files))
print(len(test_files))
print(train_files[0])

864
216
360
/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/audio_speech_actors_01-24/Actor_10/03-01-04-01-01-01-10.wav


In [16]:
os.mkdir('/content/training')
os.mkdir('/content/validation')
os.mkdir('/content/testing')

In [17]:
def resample_audio_and_denoise_and_save(audio_paths, dir='training'):
    resample_rate = 16000
    dir = '/content/' + dir + '/'
    out = []
    for i, aud_path in enumerate(audio_paths):
        file_name = aud_path.split('/')[-1]
        if i % 100 == 0:
            print(i)
        audio = AudioSegment.from_file(aud_path) # Get wav segment
        audio_16kHz = audio.set_frame_rate(resample_rate) # Resample audio

        signal = np.array(audio_16kHz.get_array_of_samples())
        filtered_audio = noise_removal(signal, resample_rate) # remove noise

        export_audio_soundfile(filtered_audio, resample_rate, dir + file_name)
        out.append(dir + file_name)

    return out

train_data_paths = resample_audio_and_denoise_and_save(train_files, dir='training')
val_data_paths = resample_audio_and_denoise_and_save(val_files, dir='validation')
test_data_paths = resample_audio_and_denoise_and_save(test_files, dir='testing')

0
100
200
300
400
500
600
700
800
0
100
200
0
100
200
300


In [None]:
def data_augmentation(audio_orig, sample_rate):
    # Create some data augmented versions
    # Add some guassian noise
    audio_augmented = []
    audio_augmented.append(audio_orig + 0.01 * torch.randn(audio_orig.shape)) # Add some gaussian noise
    pitch_shift = torchaudio.transforms.PitchShift(sample_rate, 3)
    audio_augmented.append(pitch_shift(audio_orig)) # Shift pitch up
    pitch_shift = torchaudio.transforms.PitchShift(sample_rate, -3)
    audio_augmented.append(pitch_shift(audio_orig)) # Shift pitch down

    return audio_augmented

def add_augmented_data(audio_paths):
    sr = 16000
    count = 0
    out = []

    while count < len(audio_paths):
        list_of_auds = []
        list_of_lengths = []
        list_of_file_names = []
        while count < len(audio_paths):
            aud_path = audio_paths[count]
            file_name = aud_path.split('/')[-1]
            audio, s = torchaudio.load(aud_path) # Want to normalize it for adding noise
            list_of_auds.append(audio.T)
            list_of_lengths.append(audio.shape[1])
            list_of_file_names.append(file_name)
            if count != 0 and count % 100 == 0:
                count += 1
                break
            count += 1


        # Pad items in sequence for batch shifting (Is much faster)
        print(len(list_of_auds))
        X = pad_sequence(list_of_auds, padding_value=0).permute(1, 2, 0)

        # Unpad sequence
        X_aug = data_augmentation(X, sr)
        print("Done Getting Augmented", "Count", count)
        for i in range(len(X_aug)):
            for j in range(len(list_of_auds)):
                file_name = '/content/training/' + list_of_file_names[j].split('.wav')[0] + '_aug'
                x_aug = X_aug[i][j][:, :list_of_lengths[j]]
                f = file_name + str(i) + '.wav'
                torchaudio.save(f, x_aug, sr)
                out.append(f)

    return out

train_data_paths.extend(add_augmented_data(train_data_paths))

101
Done Getting Augmented Count 101
100
Done Getting Augmented Count 201
100
Done Getting Augmented Count 301
100
Done Getting Augmented Count 401
100
Done Getting Augmented Count 501
100
Done Getting Augmented Count 601
100
Done Getting Augmented Count 701
100
Done Getting Augmented Count 801
63
Done Getting Augmented Count 864


In [1]:
augs = [file for file in os.listdir('/content/training/') if "aug" in file]
print(len(augs))

NameError: name 'os' is not defined

In [None]:
Audio('/content/training/' + augs[8])

In [None]:
Audio('/content/training/03-01-01-02-02-02-08_aug2.wav')

In [None]:
from IPython.display import Audio

In [None]:
print(len(train_data_paths))

3456


In [None]:
# Download training, test, and validation sets
!zip -r /content/training.zip /content/training
!zip -r /content/validation.zip /content/validation
!zip -r /content/testing.zip /content/testing

  adding: content/training/ (stored 0%)
  adding: content/training/03-01-07-01-02-01-02_aug2.wav (deflated 33%)
  adding: content/training/03-01-03-02-02-01-24_aug2.wav (deflated 25%)
  adding: content/training/03-01-07-02-01-02-23_aug0.wav (deflated 18%)
  adding: content/training/03-01-04-02-01-01-17_aug0.wav (deflated 16%)
  adding: content/training/03-01-05-02-02-02-20_aug2.wav (deflated 26%)
  adding: content/training/03-01-02-01-01-01-20.wav (deflated 43%)
  adding: content/training/03-01-05-02-02-02-15_aug2.wav (deflated 18%)
  adding: content/training/03-01-01-01-02-01-24.wav (deflated 31%)
  adding: content/training/03-01-06-01-01-02-08.wav (deflated 27%)
  adding: content/training/03-01-05-01-01-02-05_aug0.wav (deflated 17%)
  adding: content/training/03-01-06-02-01-01-16_aug2.wav (deflated 20%)
  adding: content/training/03-01-06-01-02-01-11_aug2.wav (deflated 28%)
  adding: content/training/03-01-03-02-02-02-18_aug1.wav (deflated 21%)
  adding: content/training/03-01-07-01-