Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix flake8 layout warnings #91

Merged
merged 1 commit into from
May 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions platalea/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def cost(self, item):

def experiment(net, data, config, slt=False):
_device = platalea.hardware.device()

def val_loss():
with torch.no_grad():
net.eval()
Expand Down Expand Up @@ -154,6 +155,7 @@ def val_loss():
torch.save(net, 'net.best.pt')
return results


def get_default_config(hidden_size_factor=1024):
fd = D.Flickr8KData
hidden_size = hidden_size_factor * 3 // 4
Expand Down
3 changes: 2 additions & 1 deletion platalea/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def forward(self, input):
# return the resulting embedding
return x


class MeanPool(nn.Module):
def __init__(self):
super(MeanPool, self).__init__()
Expand Down Expand Up @@ -75,7 +76,7 @@ def __init__(self, in_size_enc, in_size_state, hidden_size):
self.U_a = nn.Linear(in_size_enc, hidden_size, bias=False)
self.W_a = nn.Linear(in_size_state, hidden_size, bias=False)
self.v_a = nn.Linear(hidden_size, 1, bias=True)
self.prev_enc_out= None
self.prev_enc_out = None

def forward(self, hidden, encoder_outputs):
# Calculate energies for each encoder output
Expand Down
51 changes: 28 additions & 23 deletions platalea/audio/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,40 @@

@author: danny
"""
from platalea.audio.preproc import four,pad,preemph, hamming, notch
from platalea.audio.filters import apply_filterbanks,filter_centers, create_filterbanks
from platalea.audio.preproc import four, pad, preemph, hamming, notch
from platalea.audio.filters import apply_filterbanks, filter_centers, create_filterbanks
from scipy.fftpack import dct
import numpy
import math

# this file contains the bulk of the actual feature creation functions

def delta(data, N):
    """Compute delta (differential) features over a window of N frames.

    For every frame x the result is::

        sum_{n=1..N} n * (data[x+n] - data[x-n]) / (2 * sum_{n=1..N} n**2)

    Frames requested before the first or after the last frame are replaced
    by copies of the first / last frame.

    Args:
        data: 2-D array indexed as [frame, feature].
        N: number of frames to look forward and backward; must be >= 1.

    Returns:
        Float array with the same shape as ``data``.

    Raises:
        ValueError: if N < 1, because the normalisation term would be zero
            and every output value would become NaN.
    """
    if N < 1:
        raise ValueError("N must be a positive integer")

    data = numpy.asarray(data)
    nframes = data.shape[0]

    # pad with N copies of the first and of the last frame; slicing with
    # [:1] / [-1:] keeps the arrays 2-D and also works for zero frames
    padded = numpy.concatenate((numpy.repeat(data[:1], N, axis=0),
                                data,
                                numpy.repeat(data[-1:], N, axis=0)))

    # accumulate n * (c[x+n] - c[x-n]) for every n, using shifted views of
    # the padded array instead of a Python loop over frames
    dt = numpy.zeros(data.shape)
    for n in range(1, N + 1):
        ahead = padded[N + n:N + n + nframes]
        behind = padded[N - n:N - n + nframes]
        dt += n * (ahead - behind)

    # normalise the deltas for the size of N
    normalise = 2 * sum(n ** 2 for n in range(1, N + 1))
    return dt / normalise


def raw_frames(data, frame_shift, window_size):
# this function cuts the data into frames and calculates each frames' accuracy
# this function cuts the data into frames and calculates each frames' accuracy

#determine the number of frames to be extracted
# determine the number of frames to be extracted
nframes = math.floor(data.size/frame_shift)
#apply notch filter
# apply notch filter
notched_data = notch(data)
# pad the data
data = pad(notched_data, window_size, frame_shift)
Expand All @@ -46,8 +48,8 @@ def raw_frames(data, frame_shift, window_size):
frames = []
energy = []

for f in range (0, nframes):
frame = data[f * frame_shift : f * frame_shift + window_size]
for f in range(0, nframes):
frame = data[(f * frame_shift):(f * frame_shift + window_size)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is interesting to me. Is it better with brackets? Is that a personal thing or did some linter suggest this?
It's not that I disagree; either way is fine with me, just curious.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it was interesting. The linter complained about the space in front of the colon. Not about the one behind the colon, strangely. However, when you write it without the parentheses, it becomes hard to read, because the colon is hard to notice, so it looks more like a single index with a long, complex calculation, than a range-index. I think the underlying issue here is that for readability it would be better to split into three lines: begin = f * frame_shift, end = f * frame_shift + window_size and frame = data[begin:end]. But not hugely important, I would say.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or actually just data[begin:begin+window_size].

energy.append(numpy.log(numpy.sum(numpy.square(frame), 0)))
frames.append(frame)

Expand All @@ -59,43 +61,46 @@ def raw_frames(data, frame_shift, window_size):

return (frames, energy)


def get_freqspectrum(frames, alpha, fs, window_size):
    """Prepare raw frames and convert them to a frequency spectrum.

    The frames are passed through ``preemph`` (using ``alpha``) and then
    ``hamming``; the windowed frames are handed to ``four`` together with
    ``fs`` and ``window_size``, and its result is returned unchanged.
    """
    emphasised = preemph(frames, alpha)
    windowed = hamming(emphasised)
    return four(windowed, fs, window_size)


def get_fbanks(freq_spectrum, nfilters, fs):
    """Create filterbank features from an fft spectrum.

    Args:
        freq_spectrum: 2-D array; its second dimension is the number of
            fft bins.
        nfilters: number of filters, forwarded to ``filter_centers`` and
            ``create_filterbanks``.
        fs: sampling frequency; bin frequencies run from 0 to ``fs/2``.

    Returns:
        Whatever ``apply_filterbanks`` returns for the spectrum and the
        freshly created filterbanks.
    """
    # frequencies corresponding to the bins returned by the fft
    nbins = numpy.shape(freq_spectrum)[1]
    bin_freqs = numpy.linspace(0.0, fs/2, nbins)
    # filter frequencies, then the filters themselves
    centers = filter_centers(nfilters, fs, bin_freqs)
    banks = create_filterbanks(nfilters, bin_freqs, centers)
    return apply_filterbanks(freq_spectrum, banks)


def get_mfcc(fbanks):
    """Create mfccs from filterbank features.

    The first filterbank column is dropped before the discrete cosine
    transform (by convention; roughly equivalent to only spacing filters
    from 1000hz onwards). Of the transform output, column 0 is dropped as
    well and columns 1 through 12 are returned.

    NOTE(review): the earlier comment here said "take the next 13
    coefficients", but the slice ``1:13`` keeps 12 columns - confirm which
    one is intended.
    """
    without_first_bank = fbanks[:, 1:]
    coefficients = dct(without_first_bank)
    return coefficients[:, 1:13]
35 changes: 19 additions & 16 deletions platalea/audio/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,25 @@
from platalea.audio.melfreq import freq2mel, mel2freq
import numpy

def create_filterbanks (nfilters,freqrange,fc):

def create_filterbanks(nfilters, freqrange, fc):
# function to create filter banks. takes as input
# the number of filters to be created, the frequency range and the
# filter centers
filterbank = []
# for the desired # of filters do
for n in range (0,nfilters):
for n in range(0, nfilters):
# set the begin center and end frequency of the filters
begin = fc[n]
center= fc[n+1]
center = fc[n+1]
end = fc[n+2]
f = []
# create triangular filters
for x in freqrange:
# 0 for f outside the filter
if x < begin:
f.append(0)
#increasing to 1 towards the center
# increasing to 1 towards the center
elif begin <= x and x <= center:
f.append((x-begin)/(center-begin))
# decreasing to 0 upwards from the center
Expand All @@ -36,27 +37,29 @@ def create_filterbanks (nfilters,freqrange,fc):
elif x > end:
f.append(0)
filterbank.append(f)

return filterbank



def filter_centers(nfilters, fs, xf):
    """Calculate the center frequencies for the mel filters.

    ``nfilters + 2`` points are spaced equally in mels between 0 and
    ``freq2mel(fs/2)``, converted back to frequency with ``mel2freq``, and
    each one is replaced by the entry of ``xf`` (the available fft bin
    frequencies) closest to it.

    Returns:
        A list with ``nfilters + 2`` entries taken from ``xf``.
    """
    mel_points = numpy.linspace(0, freq2mel(fs/2), nfilters+2)
    centers = []
    for target in mel2freq(mel_points):
        # index of the fft bin frequency nearest to this target
        nearest = numpy.argmin(numpy.abs(xf-target))
        centers.append(xf[nearest])
    return centers



def apply_filterbanks(data, filters):
    """Apply the filterbanks and take the log of the filterbank outputs.

    Args:
        data: array whose dot product with the transposed ``filters`` is
            defined (spectrum rows against filter rows).
        filters: the filterbank, one filter per row.

    Returns:
        ``log(data . filters^T)`` with every -inf entry replaced by -50.
    """
    # a filter bank with 0 power gives log(0) = -inf; that is expected
    # here, so silence numpy's divide-by-zero warning for this step only
    with numpy.errstate(divide='ignore'):
        filtered_freq = numpy.log(numpy.dot(data, numpy.transpose(filters)))
    # approximate 0 power with -50 (exp(-50) is roughly 2e-22); isneginf
    # finds the -inf entries without evaluating log(0) a second time
    filtered_freq[numpy.isneginf(filtered_freq)] = -50
    return filtered_freq
14 changes: 8 additions & 6 deletions platalea/audio/melfreq.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@
@author: danny
"""
import numpy
#provides simple functions to convert a frequency to mel and vice versa
# provides simple functions to convert a frequency to mel and vice versa


def freq2mel(f):
    """Convert a frequency to mel: ``1125 * ln(1 + f / 700)``."""
    ratio = 1 + f/700
    return 1125*numpy.log(ratio)


def mel2freq(m):
    """Convert mel to frequency: ``700 * (exp(m / 1125) - 1)``."""
    growth = numpy.exp(m/1125)
    return 700*(growth-1)
74 changes: 39 additions & 35 deletions platalea/audio/preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,63 +10,67 @@
import numpy
# provides some basic preprocessing functions for audio files, such as
# padding the frames, Hamming windowing of the frames, data preemphasis and Fourier
# transform
# transform


def four(frames, fs, windowsize):
    """Return the single-sided amplitude spectrum of each frame.

    Every row of ``frames`` is zero-padded on the right by
    ``2**exp - windowsize`` samples, where ``exp`` is the smallest integer
    >= 1 with ``2**exp >= windowsize``. The rows are then transformed with
    ``fft`` and the first ``(windowsize + padsize) / 2 + 1`` bins are kept.

    Args:
        frames: 2-D array, one frame per row.
        fs: not used inside this function.
        windowsize: frame length used to determine the padding.

    Returns:
        2-D array of normalised amplitudes per frame and frequency bin.
    """
    # fft works on frames of size 2^x: find the first power of two that
    # is at least as large as the window
    exp = 1
    while numpy.power(2, exp) < windowsize:
        exp += 1
    padsize = numpy.power(2, exp) - windowsize
    padded_size = windowsize + padsize

    # pad frames to be of size 2^x
    frames = numpy.pad(frames, [(0, 0), (0, padsize)], 'constant', constant_values=0)
    # half the frame size, +1 to keep the bin around which the spectrum
    # is mirrored
    cutoff = int(padded_size/2)+1

    spectrum = fft(frames)
    # normalise for the amount of bins, times 2 to make up for collapsing
    # the mirrored half of the spectrum
    Yamp = 2/padded_size * numpy.abs(spectrum[:, 0:cutoff])
    # the dc component and the nyquist bin are not mirrored in the fft,
    # so undo the doubling for those two
    Yamp[:, 0] = Yamp[:, 0]/2
    Yamp[:, -1] = Yamp[:, -1]/2
    return Yamp


def notch(data):
    """Apply a notch filter to the data, intended to remove the DC offset.

    The filter coefficients come from ``iirnotch(0.001, 3.5)`` and are
    applied with ``lfilter``.
    NOTE(review): assuming these are scipy.signal's functions, 0.001 is the
    normalised notch frequency and 3.5 the quality factor - confirm against
    the import at the top of the file.
    """
    numerator, denominator = iirnotch(0.001, 3.5)
    return lfilter(numerator, denominator, data)

def pad(data, window_size, frame_shift):
    """Pad the audio data with zeros to fit the frame shift.

    ``context_size`` is ``(window_size - frame_shift) / 2``. The front is
    always padded with ``int(context_size)`` zeros. The end is padded with
    ``ceil(context_size - data.size % frame_shift)`` zeros, but only when
    that amount is positive.

    Returns:
        1-D array: leading zeros, the (flattened) data, trailing zeros.
    """
    context_size = (window_size-frame_shift)/2
    shortfall = context_size - numpy.mod(data.size, frame_shift)
    # padding at the end only if needed
    n_back = int(numpy.ceil(shortfall)) if shortfall > 0 else 0
    leading = numpy.zeros(int(context_size))
    trailing = numpy.zeros(n_back)
    return numpy.concatenate((leading, numpy.ravel(data), trailing))



def preemph(data, alpha):
    """Preemphasise the data: ``x_preemph(t) = x(t) - alpha * x(t-1)``.

    ``data`` is indexed as a 2-D array and the shift is applied along its
    second axis; the sample before the first one of every row is taken to
    be 0.
    """
    scaled = data*alpha
    # previous-sample term: the scaled rows shifted one position to the
    # right, with a zero entering at the front
    previous = numpy.zeros_like(scaled)
    previous[:, 1:] = scaled[:, :-1]
    return data-previous



def hamming(data):
    """Apply Hamming windowing to every frame (row) of ``data``.

    The window has one weight per column of ``data``:
    ``0.54 - 0.46 * cos(2 * pi * n / (L - 1))`` for n in 0..L-1, which is
    what ``numpy.hamming(L)`` computes. Using the library function also
    covers frames of length 1, where the hand-written formula divides by
    ``L - 1 == 0`` and yields NaN; ``numpy.hamming(1)`` is ``[1.0]``.

    Args:
        data: 2-D array, one frame per row.

    Returns:
        Array of the same shape with each row multiplied by the window.
    """
    frame_length = numpy.shape(data)[1]
    hammingwindow = numpy.hamming(frame_length)
    return numpy.multiply(data, hammingwindow)
Loading