spokenlanguage · egpbos · May 18, 2021 · May 17, 2021 · cwmeijer · May 18, 2021
diff --git a/platalea/asr.py b/platalea/asr.py
@@ -82,6 +82,7 @@ def cost(self, item):
 
 def experiment(net, data, config, slt=False):
     _device = platalea.hardware.device()
+
     def val_loss():
         with torch.no_grad():
             net.eval()
@@ -154,6 +155,7 @@ def val_loss():
         torch.save(net, 'net.best.pt')
     return results
 
+
 def get_default_config(hidden_size_factor=1024):
     fd = D.Flickr8KData
     hidden_size = hidden_size_factor * 3 // 4

diff --git a/platalea/attention.py b/platalea/attention.py
@@ -18,6 +18,7 @@ def forward(self, input):
         # return the resulting embedding
         return x
 
+
 class MeanPool(nn.Module):
     def __init__(self):
         super(MeanPool, self).__init__()
@@ -75,7 +76,7 @@ def __init__(self, in_size_enc, in_size_state, hidden_size):
         self.U_a = nn.Linear(in_size_enc, hidden_size, bias=False)
         self.W_a = nn.Linear(in_size_state, hidden_size, bias=False)
         self.v_a = nn.Linear(hidden_size, 1, bias=True)
-        self.prev_enc_out= None
+        self.prev_enc_out = None
 
     def forward(self, hidden, encoder_outputs):
         # Calculate energies for each encoder output

diff --git a/platalea/audio/features.py b/platalea/audio/features.py
@@ -5,38 +5,40 @@
 
 @author: danny
 """
-from platalea.audio.preproc import four,pad,preemph, hamming, notch
-from platalea.audio.filters import apply_filterbanks,filter_centers, create_filterbanks
+from platalea.audio.preproc import four, pad, preemph, hamming, notch
+from platalea.audio.filters import apply_filterbanks, filter_centers, create_filterbanks
 from scipy.fftpack import dct
 import numpy
 import math
 
 # this file contains the main bulk of the actuall feature creation functions
 
-def delta (data, N):
-# calculate delta features, n is the number of frames to look forward and backward
+
+def delta(data, N):
+    # calculate delta features, N is the number of frames to look forward and backward
 
     # create a delta array of the right shape
     dt = numpy.zeros(data.shape)
     # pad data with first and last frame for size of n
-    for n in range (N):
-        data = numpy.row_stack((data[0,:],data, data[-1,:]))
+    for n in range(N):
+        data = numpy.row_stack((data[0, :], data, data[-1, :]))
     # calc n*c[x+n] + c[x-n] for n in Nand sum them
-    for n in range (1, N + 1):
-       dt += numpy.array([n * (data[x+n,:] - data[x-n,:]) for x  in range (N, len(data) - N)])
+    for n in range(1, N + 1):
+        dt += numpy.array([n * (data[x+n, :] - data[x-n, :]) for x in range(N, len(data) - N)])
     # normalise the deltas for the size of N
-    normalise = 2* sum([numpy.power(x,2) for x in range (1, N+1)])
+    normalise = 2 * sum([numpy.power(x, 2) for x in range(1, N+1)])
 
     dt = dt/normalise
 
     return (dt)
 
+
 def raw_frames(data, frame_shift, window_size):
-# this function cuts the data into frames and calculates each frames' accuracy
+    # this function cuts the data into frames and calculates each frame's log energy
 
-    #determine the number of frames to be extracted
+    # determine the number of frames to be extracted
     nframes = math.floor(data.size/frame_shift)
-    #apply notch filter
+    # apply notch filter
     notched_data = notch(data)
     # pad the data
     data = pad(notched_data, window_size, frame_shift)
@@ -46,8 +48,8 @@ def raw_frames(data, frame_shift, window_size):
     frames = []
     energy = []
 
-    for f in range (0, nframes):
-        frame = data[f * frame_shift : f * frame_shift + window_size]
+    for f in range(0, nframes):
+        frame = data[(f * frame_shift):(f * frame_shift + window_size)]
         energy.append(numpy.log(numpy.sum(numpy.square(frame), 0)))
         frames.append(frame)
 
@@ -59,43 +61,46 @@ def raw_frames(data, frame_shift, window_size):
 
     return (frames, energy)
 
+
 def get_freqspectrum(frames, alpha, fs, window_size):
-# this function prepares the raw frames for conversion to frequency spectrum
-# and applies fft
+    # this function prepares the raw frames for conversion to frequency spectrum
+    # and applies fft
 
     # apply preemphasis
     frames = preemph(frames, alpha)
     # apply hamming windowing
     frames = hamming(frames)
     # apply fft
-    freq_spectrum = four(frames,fs,window_size)
+    freq_spectrum = four(frames, fs, window_size)
 
     return freq_spectrum
 
+
 def get_fbanks(freq_spectrum, nfilters, fs):
-#  this function calculates the filters and creates filterbank features from
-#  the fft features
+    #  this function calculates the filters and creates filterbank features from
+    #  the fft features
 
     # get the frequencies corresponding to the bins returned by the fft
     xf = numpy.linspace(0.0, fs/2, numpy.shape(freq_spectrum)[1])
     # get the filter frequencies
-    fc = filter_centers (nfilters,fs,xf)
+    fc = filter_centers(nfilters, fs, xf)
     # create filterbanks
     filterbanks = create_filterbanks(nfilters, xf, fc)
     # apply filterbanks
     fbanks = apply_filterbanks(freq_spectrum, filterbanks)
 
     return fbanks
 
+
 def get_mfcc(fbanks):
-# this function creates mfccs from the fbank features
+    # this function creates mfccs from the fbank features
 
     # apply discrete cosine transform to get mfccs. According to convention,
     # we discard the first filterbank (which is roughly equal to the method
     # where we only space filters from 1000hz onwards)
-    mfcc = dct(fbanks[:,1:])
+    mfcc = dct(fbanks[:, 1:])
     # discard the first coefficient of the mffc as well and take the next 13
     # coefficients.
-    mfcc = mfcc[:,1:13]
+    mfcc = mfcc[:, 1:13]
 
     return mfcc
diff --git a/platalea/audio/filters.py b/platalea/audio/filters.py
@@ -9,24 +9,25 @@
 from platalea.audio.melfreq import freq2mel, mel2freq
 import numpy
 
-def create_filterbanks (nfilters,freqrange,fc):
+
+def create_filterbanks(nfilters, freqrange, fc):
     # function to create filter banks. takes as input
     # the number of filters to be created, the frequency range and the
     # filter centers
     filterbank = []
     # for the desired # of filters do
-    for n in range (0,nfilters):
+    for n in range(0, nfilters):
         # set the begin center and end frequency of the filters
         begin = fc[n]
-        center= fc[n+1]
+        center = fc[n+1]
         end = fc[n+2]
         f = []
         # create triangular filters
         for x in freqrange:
             # 0 for f outside the filter
             if x < begin:
                 f.append(0)
-            #increasing to 1 towards the center
+            # increasing to 1 towards the center
             elif begin <= x and x <= center:
                 f.append((x-begin)/(center-begin))
             # decreasing to 0 upwards from the center
@@ -36,27 +37,29 @@ def create_filterbanks (nfilters,freqrange,fc):
             elif x > end:
                 f.append(0)
         filterbank.append(f)
-        
+
     return filterbank
-
+
+
 def filter_centers(nfilters, fs, xf):
     # calculates the center frequencies for the mel filters
-    
-    #space the filters equally in mels
+
+    # space the filters equally in mels
     spacing = numpy.linspace(0, freq2mel(fs/2), nfilters+2)
-    #back from mels to frequency
+    # back from mels to frequency
     spacing = mel2freq(spacing)
     # round the filter frequencies to the nearest availlable fft bin frequencies
-    # and return the centers for the filters.  
-    filters = [xf[numpy.argmin(numpy.abs(xf-x))] for x in spacing]    
-    
+    # and return the centers for the filters.
+    filters = [xf[numpy.argmin(numpy.abs(xf-x))] for x in spacing]
+
     return filters
-
+
+
 def apply_filterbanks(data, filters):
     # function to apply the filterbanks and take the log of the filterbanks
-    filtered_freq = numpy.log(numpy.dot(data, numpy.transpose(filters)))  
+    filtered_freq = numpy.log(numpy.dot(data, numpy.transpose(filters)))
     # same as with energy, taking the log of a filter bank with 0 power results in -inf
     # we approximate 0 power with -50 the log of 2e-22
-    filtered_freq[filtered_freq == numpy.log(0)] = -50     
-    
+    filtered_freq[filtered_freq == numpy.log(0)] = -50
+
     return filtered_freq
diff --git a/platalea/audio/melfreq.py b/platalea/audio/melfreq.py
@@ -6,14 +6,16 @@
 @author: danny
 """
 import numpy
-#provides simple functions to convert a frequency to mel and vice versa
+# provides simple functions to convert a frequency to mel and vice versa
+
 
 def freq2mel(f):
-    #converts a frequency to mel
-    mel=1125*numpy.log(1+f/700)
+    # converts a frequency to mel
+    mel = 1125*numpy.log(1+f/700)
     return (mel)
 
+
 def mel2freq(m):
-    #converts mel to frequency
-    f=700*(numpy.exp(m/1125)-1)
-    return (f)
+    # converts mel to frequency
+    f = 700*(numpy.exp(m/1125)-1)
+    return f
diff --git a/platalea/audio/preproc.py b/platalea/audio/preproc.py
@@ -10,63 +10,67 @@
 import numpy
 # provides some basic preprocessing functions for audio files, such as
 # padding the frames, hammingwindow for the  frames, data preemphasis and fourrier
-# transform 
+# transform
 
 
 def four(frames, fs, windowsize):
-   # fft works on frames of size 2^x, first find the appropriate padsize for 
-   # our framesize.
-   exp = 1
-   while True:
-       if numpy.power(2,exp) - windowsize >= 0:
-           padsize= numpy.power(2,exp) - windowsize
-           break
-       else:
-           exp += 1
-   # pad frames to be of size 2^x        
-   frames = numpy.pad(frames, [(0,0), (0,padsize)], 'constant', constant_values = 0)
-   # set cutoff at the half the frame size (+1 to keep the bin around 
-   # which the spectrum is mirrored)
-   cutoff = int((windowsize+padsize)/2)+1
-   # perform fast fourier transform
-   Y = fft(frames)    
-   # take absolute power and collapse spectrum. Normalise the power for the
-   # amount of bins but multiply by 2 to make up for the collapse of the spectrum
-   Yamp = 2/(windowsize+padsize)* numpy.abs(Y[:, 0:cutoff])
-   # first amp (dc component) and nyquist freq bin are not to be doubled (as they
-   # are not mirrored in the fft)
-   Yamp[:,0] = Yamp[:,0]/2
-   Yamp[:,-1] = Yamp[:,-1]/2
-   return (Yamp)
+    # fft works on frames of size 2^x, first find the appropriate padsize for
+    # our framesize.
+    exp = 1
+    while True:
+        if numpy.power(2, exp) - windowsize >= 0:
+            padsize = numpy.power(2, exp) - windowsize
+            break
+        else:
+            exp += 1
+    # pad frames to be of size 2^x
+    frames = numpy.pad(frames, [(0, 0), (0, padsize)], 'constant', constant_values=0)
+    # set cutoff at half the padded frame size (+1 to keep the bin around
+    # which the spectrum is mirrored)
+    cutoff = int((windowsize+padsize)/2)+1
+    # perform fast fourier transform
+    Y = fft(frames)
+    # take the magnitude and collapse the spectrum. Normalise for the number
+    # of bins but multiply by 2 to make up for the collapse of the spectrum
+    Yamp = 2/(windowsize+padsize) * numpy.abs(Y[:, 0:cutoff])
+    # first amp (dc component) and nyquist freq bin are not to be doubled (as they
+    # are not mirrored in the fft)
+    Yamp[:, 0] = Yamp[:, 0]/2
+    Yamp[:, -1] = Yamp[:, -1]/2
+    return (Yamp)
+
 
 def notch(data):
-# apply a notch filter to remove the DC offset
+    # apply a notch filter to remove the DC offset
     b, a = iirnotch(0.001, 3.5)
     notched = lfilter(b, a, data)
     return notched
-
-def pad (data,window_size, frame_shift):
+
+
+def pad(data, window_size, frame_shift):
     # function to pad the audio file to fit the frameshift
     context_size = (window_size-frame_shift)/2
-    pad_size = context_size - numpy.mod(data.size, frame_shift) 
+    pad_size = context_size - numpy.mod(data.size, frame_shift)
     # if needed add padding to the end of the data
     if pad_size > 0:
         data = numpy.append(data, numpy.zeros(int(numpy.ceil(pad_size))))
-    #always add padding to the front of the data
+    # always add padding to the front of the data
     data = numpy.append(numpy.zeros(int(context_size)), data)
     return(data)
-
+
+
 def preemph(data, alpha):
     # preemphasises the data: x(preemph) = X(t) - X(t-1)*alpha
     xt = data
     xtminus1 = data*alpha
-    xtminus1 = numpy.insert(xtminus1,0,0,1)[:,:-1]
-    data_preemph = xt-xtminus1  
+    xtminus1 = numpy.insert(xtminus1, 0, 0, 1)[:, :-1]
+    data_preemph = xt-xtminus1
     return data_preemph
-
+
+
 def hamming(data):
     # apply hamming windowing to a frame of data
     L = numpy.shape(data)[1]
     hammingwindow = 0.54-(0.46*numpy.cos(2*numpy.pi*numpy.arange(L)/(L-1)))
-    data = numpy.multiply(data,hammingwindow)
+    data = numpy.multiply(data, hammingwindow)
     return data