<a href="https://colab.research.google.com/github/riyag283/Federated-Learning/blob/main/SpeechRecogFL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install syft

Collecting syft
[?25l  Downloading https://files.pythonhosted.org/packages/1c/73/891ba1dca7e0ba77be211c36688f083184d8c9d5901b8cd59cbf867052f3/syft-0.2.9-py3-none-any.whl (433kB)
[K     |████████████████████████████████| 440kB 6.1MB/s 
[?25hCollecting torch~=1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 22kB/s 
[?25hCollecting notebook==5.7.8
[?25l  Downloading https://files.pythonhosted.org/packages/f6/36/89ebfffc9dd8c8dbd81c1ffb53e3d4233ee666414c143959477cb07cc5f5/notebook-5.7.8-py2.py3-none-any.whl (9.0MB)
[K     |████████████████████████████████| 9.0MB 32.2MB/s 
Collecting tornado==4.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/e3/7b/e29ab3d51c8df66922fea216e2bddfcb6430fb29620e5165b16a216e0d3c/tornado-4.5.3.tar.gz (484kB)
[K     |████████████████████████████████| 491k

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# Note that this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Reload imported modules automatically before each cell runs (mode 2 = all modules).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook.
%matplotlib inline

In [3]:
import torch
from torchvision.transforms import Compose
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import math
import time
from tqdm import *
import os
import librosa
import numpy as np
import random
import shutil

In [4]:
# Make CUDA float tensors the default only when a GPU is actually available.
# Calling this unconditionally fails on a CPU-only runtime and stops the
# notebook; with the guard the default CPU float tensor type is kept instead.
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

In [5]:
# Label vocabulary. A label's position in this list is its integer target
# (so 'unknown' is 0 and 'silence' is 1).
CLASSES = ['unknown', 'silence', 'yes', 'no', 'left', 'right']

In [6]:
# prepare datasets

# create directory if not exist
# we put data on datasets directory
if os.path.isdir('./datasets') is False:
    try:
        os.mkdir('./datasets')
    except OSError:
        print ("Creation of the directory datasets failed")

if os.path.isdir('./datasets/speech_commands') is True:
    print("datasets seems to exists.")
else :
    # download data
    ! wget -O datasets/speech_commands_v0.01.tar.gz http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
    
    # create directory
    os.mkdir('./datasets/speech_commands')
    
    # create audio directory
    if os.path.isdir('./datasets/speech_commands/audio') is False:
        try:
            os.mkdir('./datasets/speech_commands/audio')
        except OSError:
            print ("Creation of the directory datasets/speech_commands/audio failed")
        

    # untar files.
    ! tar -xzf datasets/speech_commands_v0.01.tar.gz -C datasets/speech_commands/audio

--2020-10-16 16:11:38--  http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 66.102.1.128, 2a00:1450:400c:c06::80
Connecting to download.tensorflow.org (download.tensorflow.org)|66.102.1.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489096277 (1.4G) [application/gzip]
Saving to: ‘datasets/speech_commands_v0.01.tar.gz’


2020-10-16 16:12:15 (39.4 MB/s) - ‘datasets/speech_commands_v0.01.tar.gz’ saved [1489096277/1489096277]



In [7]:
def move_files(src_folder, to_folder, list_file):
    """Move the files named in ``list_file`` from ``src_folder`` to ``to_folder``.

    Each non-blank line of ``list_file`` is a path relative to ``src_folder``.
    The file is moved into the same relative sub-directory below ``to_folder``;
    missing destination directories are created on demand.

    Args:
        src_folder: Directory the relative paths are resolved against.
        to_folder: Directory that receives the files.
        list_file: Text file with one relative path per line.
    """
    with open(list_file) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line would resolve to src_folder itself, and
                # shutil.move would then relocate the whole source tree.
                continue
            dest = os.path.join(to_folder, os.path.dirname(line))
            # makedirs also copes with nested sub-directories and a missing to_folder.
            os.makedirs(dest, exist_ok=True)
            shutil.move(os.path.join(src_folder, line), dest)

In [8]:
def prepare_dataset():
    """Split the extracted audio into ``test``, ``valid`` and ``train`` folders.

    Files named in ``testing_list.txt`` and ``validation_list.txt`` are moved
    out of the ``audio`` folder into ``test`` and ``valid``. Whatever is left in
    ``audio`` becomes the training set by renaming the folder to ``train``.
    """
    audio_folder = "datasets/speech_commands/audio"

    # split name -> (destination folder, file listing the members of the split)
    held_out = {
        "valid": ("datasets/speech_commands/valid",
                  "datasets/speech_commands/audio/validation_list.txt"),
        "test": ("datasets/speech_commands/test",
                 "datasets/speech_commands/audio/testing_list.txt"),
    }

    for split in ("valid", "test"):
        folder = held_out[split][0]
        if not os.path.isdir(folder):
            os.mkdir(folder)

    for split in ("test", "valid"):
        folder, list_path = held_out[split]
        move_files(audio_folder, folder, list_path)

    # Everything that was not moved out is training data.
    os.rename(audio_folder, "datasets/speech_commands/train")

In [9]:
# The train folder only exists once the split has been made, so its absence
# means prepare_dataset() still has to run.
if not os.path.isdir('./datasets/speech_commands/train'):
    prepare_dataset()

In [10]:
import IPython.display
# Sanity check: one clip of the 'right' class from the training split.
example_path = "datasets/speech_commands/train/right/9f4098cb_nohash_0.wav"

# Last expression of the cell, so the notebook displays the Audio object.
IPython.display.Audio(example_path)

In [11]:
def should_apply_transform(prob=0.5):
    """Return True with probability ``prob``; used to apply augmentations at random."""
    draw = random.random()
    return draw < prob

In [12]:
class ChangeAmplitude(object):
    """Randomly rescales the amplitude of the audio samples."""

    def __init__(self, amplitude_range=(0.7, 1.1)):
        # (low, high) bounds of the uniformly drawn gain factor.
        self.amplitude_range = amplitude_range

    def __call__(self, data):
        """Multiply ``data['samples']`` by a random gain, or return ``data`` untouched."""
        if should_apply_transform():
            samples = data['samples']
            gain = random.uniform(*self.amplitude_range)
            data['samples'] = samples * gain
        return data

In [15]:
class ChangeSpeedAndPitchAudio(object):
    """Change the speed of an audio. This transform also changes the pitch of the audio."""

    def __init__(self, max_scale=0.2):
        # The speed scale is drawn uniformly from [-max_scale, max_scale].
        self.max_scale = max_scale

    def __call__(self, data):
        """Resample ``data['samples']`` at a random speed, or return ``data`` untouched.

        The number of samples changes; a later FixAudioLength step restores a fixed length.
        """
        if not should_apply_transform():
            return data

        samples = data['samples']
        n_samples = len(samples)
        scale = random.uniform(-self.max_scale, self.max_scale)
        speed_fac = 1.0 / (1 + scale)
        # Linear interpolation of the signal read every `speed_fac` samples:
        # a step below 1 yields more output samples, a step above 1 fewer.
        new_positions = np.arange(0, n_samples, speed_fac)
        data['samples'] = np.interp(new_positions, np.arange(0, n_samples), samples).astype(np.float32)
        return data

In [16]:
class FixAudioLength(object):
    """Pads with trailing zeros or truncates the samples to a fixed duration."""

    def __init__(self, time=1):
        # Target duration; multiplied by the sample rate to get the sample count.
        self.time = time

    def __call__(self, data):
        """Bring ``data['samples']`` to ``int(time * sample_rate)`` samples."""
        samples = data['samples']
        target_len = int(self.time * data['sample_rate'])
        missing = target_len - len(samples)
        if missing < 0:
            # Too long: keep the leading part only.
            data['samples'] = samples[:target_len]
        elif missing > 0:
            # Too short: append zeros.
            data['samples'] = np.pad(samples, (0, missing), "constant")
        return data

In [17]:
class ToSTFT(object):
    """Applies on an audio the short time fourier transform."""

    def __init__(self, n_fft=2048, hop_length=512):
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, data):
        """Add the STFT of ``data['samples']`` and its parameters to ``data``.

        Writes the keys ``'n_fft'``, ``'hop_length'``, ``'stft'`` and
        ``'stft_shape'``. The shape is recorded so that later steps can restore
        it after changing the number of frames.
        """
        samples = data['samples']
        data['n_fft'] = self.n_fft
        data['hop_length'] = self.hop_length
        data['stft'] = librosa.stft(samples, n_fft=self.n_fft, hop_length=self.hop_length)
        data['stft_shape'] = data['stft'].shape
        return data

In [18]:
class StretchAudioOnSTFT(object):
    """Stretches an audio on the frequency domain."""

    def __init__(self, max_scale=0.2):
        # The stretch rate is 1 + scale, with scale drawn from [-max_scale, max_scale].
        self.max_scale = max_scale

    def __call__(self, data):
        """Replace ``data['stft']`` by a randomly time-stretched version, or return ``data`` untouched.

        The number of STFT frames changes; a later FixSTFTDimension step
        restores the original shape.
        """
        if not should_apply_transform():
            return data

        stft = data['stft']
        hop_length = data['hop_length']
        scale = random.uniform(-self.max_scale, self.max_scale)
        # `rate` is passed by keyword: recent librosa releases accept it as a
        # keyword argument only, and older releases accept the keyword form too.
        data['stft'] = librosa.core.phase_vocoder(stft, rate=1 + scale, hop_length=hop_length)
        return data

In [19]:
class TimeshiftAudioOnSTFT(object):
    """Shifts the STFT along the time axis by a random number of frames, filling with zeros."""

    def __init__(self, max_shift=8):
        # The shift is an integer drawn from [-max_shift, max_shift] (inclusive).
        self.max_shift = max_shift

    def __call__(self, data):
        """Shift ``data['stft']`` in time, or return ``data`` untouched."""
        if not should_apply_transform():
            return data

        stft = data['stft']
        shift = random.randint(-self.max_shift, self.max_shift)
        # Negative shift: zero frames are prepended and the tail is dropped.
        # Positive shift: zero frames are appended and the head is dropped.
        pad_left = max(0, -shift)
        pad_right = max(0, shift)
        padded = np.pad(stft, ((0, 0), (pad_left, pad_right)), "constant")
        if pad_left:
            data['stft'] = padded[:, :-pad_left]
        else:
            data['stft'] = padded[:, pad_right:]
        return data

In [20]:
class FixSTFTDimension(object):
    """Restores the original number of time frames of the STFT by truncating or zero-padding.

    Meant to run after transforms that change the frame count (stretching, time shifting, ...).
    """

    def __call__(self, data):
        """Make ``data['stft']`` as wide as recorded in ``data['stft_shape']``."""
        stft = data['stft']
        current = stft.shape[1]
        expected = data['stft_shape'][1]
        if current > expected:
            stft = stft[:, :expected]
        elif current < expected:
            stft = np.pad(stft, ((0, 0), (0, expected - current)), "constant")
        data['stft'] = stft
        return data

In [21]:
# Augmentation pipeline. The order matters: the waveform-level steps run first,
# FixAudioLength restores a fixed sample count before the STFT is taken, and
# FixSTFTDimension restores the STFT shape after the STFT-level steps.
data_aug_transform = Compose([
    ChangeAmplitude(), 
    ChangeSpeedAndPitchAudio(), 
    FixAudioLength(), 
    ToSTFT(), 
    StretchAudioOnSTFT(), 
    TimeshiftAudioOnSTFT(), 
    FixSTFTDimension()])

In [22]:
class BackgroundNoiseDataset(Dataset):
    """Dataset for silence / background noise.

    Every ``.wav`` file directly inside ``folder`` is loaded at ``sample_rate``.
    The recordings are concatenated and cut into consecutive, non-overlapping
    clips of ``sample_length`` seconds. Trailing samples that do not fill a
    whole clip are dropped.
    """

    def __init__(self, folder, transform=None, sample_rate=16000, sample_length=1):
        """
        Args:
            folder: Directory containing the background-noise ``.wav`` files.
            transform: Optional callable applied to every item dict.
            sample_rate: Rate the recordings are loaded at.
            sample_length: Clip duration in seconds.

        Raises:
            ValueError: If ``folder`` contains no ``.wav`` file.
        """
        audio_files = [d for d in os.listdir(folder) if os.path.isfile(os.path.join(folder, d)) and d.endswith('.wav')]
        if not audio_files:
            # Fail with a clear message instead of the opaque error np.hstack
            # raises on an empty list.
            raise ValueError("No .wav files found in background noise folder: %s" % folder)

        samples = []
        for f in audio_files:
            path = os.path.join(folder, f)
            # `sr` is passed by keyword: recent librosa releases accept it as a
            # keyword argument only.
            s, _ = librosa.load(path, sr=sample_rate)
            samples.append(s)

        samples = np.hstack(samples)
        c = int(sample_rate * sample_length)  # samples per clip
        r = len(samples) // c                 # number of whole clips
        self.samples = samples[:r*c].reshape(-1, c)
        self.sample_rate = sample_rate
        self.classes = CLASSES
        self.transform = transform
        self.path = folder

    def __len__(self):
        """Number of clips."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return the item dict for clip ``index``, passed through ``transform`` if one is set."""
        # target 1 is the position of 'silence' in CLASSES.
        data = {'samples': self.samples[index], 'sample_rate': self.sample_rate, 'target': 1, 'path': self.path}

        if self.transform is not None:
            data = self.transform(data)

        return data

In [23]:
# Background-noise recordings shipped with the dataset (kept in the train split).
background_noise_dir = "./datasets/speech_commands/train/_background_noise_"
# The noise clips go through the same augmentation pipeline, so each item carries an 'stft' entry.
bg_dataset = BackgroundNoiseDataset(background_noise_dir, data_aug_transform)

In [24]:
class AddBackgroundNoiseOnSTFT(object):
    """Adds a random background noise on the frequency domain.

    This is a transform (a callable applied to an item dict), not a dataset,
    so it derives from ``object`` like the other transforms.
    """

    def __init__(self, bg_dataset, max_percentage=0.45):
        """
        Args:
            bg_dataset: Indexable collection whose items provide an ``'stft'`` entry.
            max_percentage: Upper bound of the randomly drawn noise fraction.
        """
        self.bg_dataset = bg_dataset
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Blend a random noise STFT into ``data['stft']``, or return ``data`` untouched."""
        if not should_apply_transform():
            return data

        # NOTE(review): assumes the noise STFT has the same shape as data['stft'].
        noise = random.choice(self.bg_dataset)['stft']
        percentage = random.uniform(0, self.max_percentage)
        data['stft'] = data['stft'] * (1 - percentage) + noise * percentage
        return data

In [25]:
# Transform instance that mixes clips from bg_dataset into the training samples.
add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset)

In [26]:
class ToMelSpectrogramFromSTFT(object):
    """Creates the mel spectrogram from the short time fourier transform of a file. The result is a 32x32 matrix."""

    def __init__(self, n_mels=32):
        # Number of mel bands, i.e. rows of the resulting spectrogram.
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram of ``data['stft']`` under ``'mel_spectrogram'``."""
        stft = data['stft']
        sample_rate = data['sample_rate']
        n_fft = data['n_fft']
        # Keyword arguments: recent librosa releases accept these parameters as
        # keywords only, and older releases accept the keyword form as well.
        mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels)
        # Project the power spectrum onto the mel filter bank.
        s = np.dot(mel_basis, np.abs(stft)**2.0)
        data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
        return data

In [27]:
class DeleteSTFT(object):
    """Drops the complex STFT from the item once the mel spectrogram has been computed.

    Pytorch doesn't like complex numbers, so the entry is removed before tensors are built.
    """

    def __call__(self, data):
        """Remove the ``'stft'`` key (raises KeyError if it is absent) and return ``data``."""
        data.pop('stft')
        return data

In [28]:
class ToTensor(object):
    """Stores a float tensor built from one item entry under another key."""

    def __init__(self, np_name, tensor_name, normalize=None):
        # np_name: key of the array to convert; tensor_name: key the tensor is
        # stored under; normalize: optional (mean, std) pair.
        self.np_name, self.tensor_name, self.normalize = np_name, tensor_name, normalize

    def __call__(self, data):
        """Convert ``data[np_name]`` to a FloatTensor, optionally normalized in place."""
        tensor = torch.FloatTensor(data[self.np_name])
        if self.normalize is None:
            data[self.tensor_name] = tensor
            return data

        mean, std = self.normalize
        # In-place (x - mean) / std.
        tensor.sub_(mean).div_(std)
        data[self.tensor_name] = tensor
        return data

In [29]:
# Number of mel bands; shared by the training and validation feature pipelines.
n_mels = 32

# Training features: STFT -> mel spectrogram in dB -> drop the complex STFT
# -> float tensor stored under the 'input' key.
train_feature_transform = Compose([
    ToMelSpectrogramFromSTFT(n_mels=n_mels), 
    DeleteSTFT(), 
    ToTensor('mel_spectrogram', 'input')])

In [30]:
class LoadAudio(object):
    """Loads an audio into a numpy array."""

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def __call__(self, data):
        """Fill ``data['samples']`` and ``data['sample_rate']`` from ``data['path']``.

        An empty path denotes a silence item: it yields ``sample_rate`` zeros
        (float32) instead of reading a file.
        """
        path = data['path']
        if path:
            # `sr` is passed by keyword: recent librosa releases accept it as a
            # keyword argument only, and older releases accept the keyword form too.
            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
        else:
            # silence
            sample_rate = self.sample_rate
            samples = np.zeros(sample_rate, dtype=np.float32)
        data['samples'] = samples
        data['sample_rate'] = sample_rate
        return data

In [31]:
from random import shuffle
from random import randrange

class SpeechCommandsDataset(Dataset):
    """Google speech commands dataset restricted to the labels in ``classes``.

    Only sub-directories of ``folder`` whose name appears in ``classes``
    contribute samples. All other directories map to target 0 ('unknown' with
    the default ``classes``), but their files are currently not included.
    Synthetic silence items (empty path) are appended as a fraction of the
    real samples.
    See for more information: https://www.kaggle.com/c/tensorflow-speech-recognition-challenge
    """

    def __init__(self, folder, transform=None, classes=CLASSES, silence_percentage=0.1, use_rate=1.0):
        """
        Args:
            folder: Directory with one sub-directory per spoken word.
            transform: Optional callable applied to every item dict; it must
                produce an ``'input'`` entry.
            classes: Label names; a label's position is its integer target.
                Must contain ``'silence'``.
            silence_percentage: Number of silence items to add, as a fraction
                of the number of real samples kept.
            use_rate: Fraction of the shuffled real samples to keep.
        """
        # Directories starting with '_' (e.g. the background noise folder) are not classes.
        all_classes = [d for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d)) and not d.startswith('_')]

        class_to_idx = {name: idx for idx, name in enumerate(classes)}
        for c in all_classes:
            if c not in class_to_idx:
                class_to_idx[c] = 0

        # Only directories named in `classes` are used.
        data = []
        for c in all_classes:
            if c not in classes:
                continue
            d = os.path.join(folder, c)
            target = class_to_idx[c]
            for f in os.listdir(d):
                data.append((os.path.join(d, f), target))

        shuffle(data)
        if use_rate != 1.0:
            sample_count = int(len(data) * use_rate)
            data = data[:sample_count]

        # add silence: an empty path tells LoadAudio to generate zeros
        target = class_to_idx['silence']
        data += [('', target)] * int(len(data) * silence_percentage)

        self.classes = classes
        self.data = data
        self.transform = transform

    def __len__(self):
        """Number of items, silence items included."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input, target)`` for item ``index``."""
        path, target = self.data[index]
        data = {'path': path, 'target': target}

        if self.transform is not None:
            data = self.transform(data)

        return data['input'], target

    def make_weights_for_balanced_classes(self):
        """Return one sampling weight per item, inversely proportional to its class frequency.

        adopted from https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
        """
        nclasses = len(self.classes)
        targets = np.array([target for _, target in self.data], dtype=int)
        count = np.zeros(nclasses)
        for target in targets:
            count[target] += 1

        total = float(count.sum())
        # Classes without any item keep weight 0, which avoids dividing by
        # zero; no item refers to them, so the returned weights are unaffected.
        weight_per_class = np.zeros(nclasses)
        present = count > 0
        weight_per_class[present] = total / count[present]
        return weight_per_class[targets]

In [32]:
# Fraction of the listed training files to keep (1.0 = all of them).
use_rate = 1.0

train_dataset_dir = "./datasets/speech_commands/train"
# Training pipeline: load audio -> waveform/STFT augmentation ->
# background-noise mixing -> mel-spectrogram features.
train_dataset = SpeechCommandsDataset(train_dataset_dir,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         add_bg_noise,
                                         train_feature_transform]), use_rate=use_rate)

In [33]:
# Number of training items, including the appended silence entries.
len(train_dataset)

8144

In [34]:
class ToMelSpectrogram(object):
    """Creates the mel spectrogram from an audio. The result is a 32x32 matrix."""

    def __init__(self, n_mels=32):
        # Number of mel bands, i.e. rows of the resulting spectrogram.
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram of ``data['samples']`` under ``'mel_spectrogram'``."""
        samples = data['samples']
        sample_rate = data['sample_rate']
        # The audio is passed as `y=`: recent librosa releases accept it as a
        # keyword argument only, and older releases accept the keyword form too.
        s = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=self.n_mels)
        data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
        return data

In [35]:
# Validation features: mel spectrogram computed directly from the waveform
# (no augmentation), then a float tensor stored under the 'input' key.
valid_feature_transform = Compose([
    ToMelSpectrogram(n_mels=n_mels), 
    ToTensor('mel_spectrogram', 'input')])

In [36]:
valid_dataset_dir = "./datasets/speech_commands/valid"
# Validation pipeline: load audio -> fix the length -> mel-spectrogram features.
# No random augmentation is applied here.
valid_dataset = SpeechCommandsDataset(valid_dataset_dir,
                                Compose([LoadAudio(),
                                         FixAudioLength(),
                                         valid_feature_transform]))

In [37]:
# Number of validation items, including the appended silence entries.
len(valid_dataset)

1137

In [38]:
# Define the dataloaders.

# Batch size shared by the validation loader and the federated training loader.
batch_size = 64

# The training dataloader is defined later, right after importing PySyft, a library for privacy-preserving deep learning.

# The validation dataloader is a regular (non-federated) DataLoader.
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [39]:
import syft as sy  # import the PySyft library
hook = sy.TorchHook(torch)  # hook PyTorch, i.e. add extra functionality to support federated learning
bob = sy.VirtualWorker(hook, id="bob")  # define the remote worker bob
alice = sy.VirtualWorker(hook, id="alice")  # and the remote worker alice

In [41]:
# Define the federated dataloader.
# This takes time; be patient.
# NOTE(review): `.federate((bob, alice))` presumably distributes the training
# set between the two workers — confirm against the PySyft documentation.
federated_train_loader = sy.FederatedDataLoader(
    train_dataset.federate((bob, alice)),
    batch_size=batch_size,
)

In [42]:
# Marker printed once the cells above have finished running.
print('done')

done
