From 896f0d213441ffbabca467f079d35ae2f28be82b Mon Sep 17 00:00:00 2001
From: "E. G. Patrick Bos"
Date: Mon, 17 May 2021 20:46:23 +0200
Subject: [PATCH] fix flake8 layout warnings

---
 platalea/asr.py                              |  2 +
 platalea/attention.py                        |  3 +-
 platalea/audio/features.py                   | 51 ++++++++------
 platalea/audio/filters.py                    | 35 ++++-----
 platalea/audio/melfreq.py                    | 14 ++--
 platalea/audio/preproc.py                    | 74 +++++++++++---------
 platalea/basicvq.py                          | 25 ++++---
 platalea/dataset.py                          |  1 -
 platalea/encoders.py                         | 11 +--
 platalea/experiments/flickr8k/pip_seq.py     |  2 +-
 platalea/experiments/flickr8k/transformer.py |  8 ++-
 platalea/fix_json.py                         | 20 +++---
 platalea/hardware.py                         |  6 +-
 platalea/ipa.py                              |  9 +--
 platalea/loss.py                             |  8 ++-
 platalea/optimizers.py                       |  2 +-
 platalea/rank_eval.py                        |  6 +-
 platalea/schedulers.py                       |  2 +-
 platalea/score.py                            |  8 +--
 platalea/text_image.py                       |  1 +
 platalea/utils/copy_best.py                  |  6 +-
 platalea/utils/evaluate_net.py               |  2 +
 platalea/utils/extract_transcriptions.py     |  2 +
 platalea/utils/flickr8k_filter_metadata.py   |  3 +
 platalea/vq.py                               |  8 +--
 platalea/vq_encode.py                        | 10 +--
 platalea/xer.py                              |  3 -
 setup.py                                     |  4 +-
 tests/test_experiments.py                    |  3 +-
 29 files changed, 182 insertions(+), 147 deletions(-)

diff --git a/platalea/asr.py b/platalea/asr.py
index b25af66..ebef59e 100644
--- a/platalea/asr.py
+++ b/platalea/asr.py
@@ -82,6 +82,7 @@ def cost(self, item):
 
 def experiment(net, data, config, slt=False):
     _device = platalea.hardware.device()
+
     def val_loss():
         with torch.no_grad():
             net.eval()
@@ -154,6 +155,7 @@ def val_loss():
             torch.save(net, 'net.best.pt')
     return results
 
+
 def get_default_config(hidden_size_factor=1024):
     fd = D.Flickr8KData
     hidden_size = hidden_size_factor * 3 // 4
diff --git a/platalea/attention.py b/platalea/attention.py
index 8534940..abfa352 100644
--- a/platalea/attention.py
+++ b/platalea/attention.py
@@ -18,6 +18,7 @@ def forward(self, input):
         # return the resulting embedding
         return x
 
+
 class MeanPool(nn.Module):
     def __init__(self):
         super(MeanPool, self).__init__()
@@ -75,7 +76,7 @@ def __init__(self, in_size_enc, in_size_state, hidden_size):
         self.U_a = nn.Linear(in_size_enc, hidden_size, bias=False)
         self.W_a = nn.Linear(in_size_state, hidden_size, bias=False)
         self.v_a = nn.Linear(hidden_size, 1, bias=True)
-        self.prev_enc_out= None
+        self.prev_enc_out = None
 
     def forward(self, hidden, encoder_outputs):
         # Calculate energies for each encoder output
diff --git a/platalea/audio/features.py b/platalea/audio/features.py
index b7c6a59..d6eb04c 100644
--- a/platalea/audio/features.py
+++ b/platalea/audio/features.py
@@ -5,38 +5,40 @@
 @author: danny
 """
-from platalea.audio.preproc import four,pad,preemph, hamming, notch
-from platalea.audio.filters import apply_filterbanks,filter_centers, create_filterbanks
+from platalea.audio.preproc import four, pad, preemph, hamming, notch
+from platalea.audio.filters import apply_filterbanks, filter_centers, create_filterbanks
 from scipy.fftpack import dct
 import numpy
 import math
 # this file contains the main bulk of the actual feature creation functions
 
-def delta (data, N):
-# calculate delta features, N is the number of frames to look forward and backward
+
+def delta(data, N):
+    # calculate delta features, N is the number of frames to look forward and backward
     # create a delta array of the right shape
     dt = numpy.zeros(data.shape)
     # pad data with first and last frame for size of N
-    for n in range (N):
-        data = numpy.row_stack((data[0,:],data, data[-1,:]))
+    for n in range(N):
+        data = numpy.row_stack((data[0, :], data, data[-1, :]))
     # calc n*c[x+n] + c[x-n] for n in N and sum them
-    for n in range (1, N + 1):
-        dt += numpy.array([n * (data[x+n,:] - data[x-n,:]) for x in range (N, len(data) - N)])
+    for n in range(1, N + 1):
+        dt += numpy.array([n * (data[x+n, :] - data[x-n, :]) for x in range(N, len(data) - N)])
     # normalise the deltas for the size of N
-    normalise = 2* sum([numpy.power(x,2) for x in range (1, N+1)])
+    normalise = 2 * sum([numpy.power(x, 2) for x in range(1, N+1)])
     dt = dt/normalise
     return (dt)
 
+
 def raw_frames(data, frame_shift, window_size):
-# this function cuts the data into frames and calculates each frame's log energy
+    # this function cuts the data into frames and calculates each frame's log energy
 
-    #determine the number of frames to be extracted
+    # determine the number of frames to be extracted
     nframes = math.floor(data.size/frame_shift)
-    #apply notch filter
+    # apply notch filter
     notched_data = notch(data)
     # pad the data
     data = pad(notched_data, window_size, frame_shift)
@@ -46,8 +48,8 @@ def raw_frames(data, frame_shift, window_size):
     frames = []
     energy = []
 
-    for f in range (0, nframes):
-        frame = data[f * frame_shift : f * frame_shift + window_size]
+    for f in range(0, nframes):
+        frame = data[(f * frame_shift):(f * frame_shift + window_size)]
 
         energy.append(numpy.log(numpy.sum(numpy.square(frame), 0)))
         frames.append(frame)
@@ -59,27 +61,29 @@ def raw_frames(data, frame_shift, window_size):
 
     return (frames, energy)
 
+
 def get_freqspectrum(frames, alpha, fs, window_size):
-# this function prepares the raw frames for conversion to frequency spectrum
-# and applies fft
+    # this function prepares the raw frames for conversion to frequency spectrum
+    # and applies fft
 
     # apply preemphasis
     frames = preemph(frames, alpha)
     # apply hamming windowing
     frames = hamming(frames)
     # apply fft
-    freq_spectrum = four(frames,fs,window_size)
+    freq_spectrum = four(frames, fs, window_size)
 
     return freq_spectrum
 
+
 def get_fbanks(freq_spectrum, nfilters, fs):
-# this function calculates the filters and creates filterbank features from
-# the fft features
+    # this function calculates the filters and creates filterbank features from
+    # the fft features
 
     # get the frequencies corresponding to the bins returned by the fft
     xf = numpy.linspace(0.0, fs/2, numpy.shape(freq_spectrum)[1])
     # get the filter frequencies
-    fc = filter_centers (nfilters,fs,xf)
+    fc = filter_centers(nfilters, fs, xf)
     # create filterbanks
     filterbanks = create_filterbanks(nfilters, xf, fc)
     # apply filterbanks
@@ -87,15 +91,16 @@ def get_fbanks(freq_spectrum, nfilters, fs):
 
     return fbanks
 
+
 def get_mfcc(fbanks):
-# this function creates mfccs from the fbank features
+    # this function creates mfccs from the fbank features
 
     # apply discrete cosine transform to get mfccs. According to convention,
     # we discard the first filterbank (which is roughly equal to the method
     # where we only space filters from 1000 Hz onwards)
-    mfcc = dct(fbanks[:,1:])
+    mfcc = dct(fbanks[:, 1:])
     # discard the first coefficient of the mfcc as well and take the next 13
     # coefficients.
-    mfcc = mfcc[:,1:13]
+    mfcc = mfcc[:, 1:13]
 
     return mfcc
diff --git a/platalea/audio/filters.py b/platalea/audio/filters.py
index e95572f..22b5680 100644
--- a/platalea/audio/filters.py
+++ b/platalea/audio/filters.py
@@ -9,16 +9,17 @@
 from platalea.audio.melfreq import freq2mel, mel2freq
 import numpy
 
-def create_filterbanks (nfilters,freqrange,fc):
+
+def create_filterbanks(nfilters, freqrange, fc):
     # function to create filter banks. takes as input
     # the number of filters to be created, the frequency range and the
     # filter centers
     filterbank = []
     # for the desired number of filters do
-    for n in range (0,nfilters):
+    for n in range(0, nfilters):
         # set the begin, center and end frequency of the filters
         begin = fc[n]
-        center= fc[n+1]
+        center = fc[n+1]
         end = fc[n+2]
         f = []
         # create triangular filters
@@ -26,7 +27,7 @@ def create_filterbanks (nfilters,freqrange,fc):
             # 0 for f outside the filter
             if x < begin:
                 f.append(0)
-            #increasing to 1 towards the center
+            # increasing to 1 towards the center
             elif begin <= x and x <= center:
                 f.append((x-begin)/(center-begin))
             # decreasing to 0 upwards from the center
@@ -36,27 +37,29 @@ def create_filterbanks (nfilters,freqrange,fc):
             elif x > end:
                 f.append(0)
         filterbank.append(f)
-    
+
     return filterbank
-    
+
+
 def filter_centers(nfilters, fs, xf):
     # calculates the center frequencies for the mel filters
-    
-    #space the filters equally in mels
+
+    # space the filters equally in mels
     spacing = numpy.linspace(0, freq2mel(fs/2), nfilters+2)
-    #back from mels to frequency
+    # back from mels to frequency
     spacing = mel2freq(spacing)
     # round the filter frequencies to the nearest available fft bin frequencies
-    # and return the centers for the filters. 
-    filters = [xf[numpy.argmin(numpy.abs(xf-x))] for x in spacing]
-    
+    # and return the centers for the filters.
+    filters = [xf[numpy.argmin(numpy.abs(xf-x))] for x in spacing]
+
     return filters
-    
+
+
 def apply_filterbanks(data, filters):
     # function to apply the filterbanks and take the log of the filterbanks
-    filtered_freq = numpy.log(numpy.dot(data, numpy.transpose(filters))) 
+    filtered_freq = numpy.log(numpy.dot(data, numpy.transpose(filters)))
     # same as with energy, taking the log of a filter bank with 0 power results in -inf
     # we approximate 0 power with -50 the log of 2e-22
-    filtered_freq[filtered_freq == numpy.log(0)] = -50 
-    
+    filtered_freq[filtered_freq == numpy.log(0)] = -50
+
     return filtered_freq
diff --git a/platalea/audio/melfreq.py b/platalea/audio/melfreq.py
index bf2179f..4401bde 100644
--- a/platalea/audio/melfreq.py
+++ b/platalea/audio/melfreq.py
@@ -6,14 +6,16 @@
 @author: danny
 """
 import numpy
-#provides simple functions to convert a frequency to mel and vice versa
+# provides simple functions to convert a frequency to mel and vice versa
+
 
 def freq2mel(f):
-    #converts a frequency to mel
-    mel=1125*numpy.log(1+f/700)
+    # converts a frequency to mel
+    mel = 1125*numpy.log(1+f/700)
 
     return (mel)
 
+
 def mel2freq(m):
-    #converts mel to frequency
-    f=700*(numpy.exp(m/1125)-1)
-    return (f)
\ No newline at end of file
+    # converts mel to frequency
+    f = 700*(numpy.exp(m/1125)-1)
+    return f
diff --git a/platalea/audio/preproc.py b/platalea/audio/preproc.py
index aea6876..d60c4cf 100644
--- a/platalea/audio/preproc.py
+++ b/platalea/audio/preproc.py
@@ -10,63 +10,67 @@
 import numpy
 # provides some basic preprocessing functions for audio files, such as
 # padding the frames, Hamming windowing of the frames, data preemphasis and Fourier
-# transform 
+# transform
 
 def four(frames, fs, windowsize):
-    # fft works on frames of size 2^x, first find the appropriate padsize for
-    # our framesize. 
-    exp = 1
-    while True:
-        if numpy.power(2,exp) - windowsize >= 0:
-            padsize= numpy.power(2,exp) - windowsize
-            break
-        else:
-            exp += 1
-    # pad frames to be of size 2^x
-    frames = numpy.pad(frames, [(0,0), (0,padsize)], 'constant', constant_values = 0)
-    # set cutoff at half the frame size (+1 to keep the bin around
-    # which the spectrum is mirrored)
-    cutoff = int((windowsize+padsize)/2)+1
-    # perform fast Fourier transform
-    Y = fft(frames)
-    # take absolute power and collapse spectrum. Normalise the power for the
-    # number of bins but multiply by 2 to make up for the collapse of the spectrum
-    Yamp = 2/(windowsize+padsize)* numpy.abs(Y[:, 0:cutoff])
-    # first amp (dc component) and Nyquist freq bin are not to be doubled (as they
-    # are not mirrored in the fft)
-    Yamp[:,0] = Yamp[:,0]/2
-    Yamp[:,-1] = Yamp[:,-1]/2
-    return (Yamp)
+    # fft works on frames of size 2^x, first find the appropriate padsize for
+    # our framesize.
+    exp = 1
+    while True:
+        if numpy.power(2, exp) - windowsize >= 0:
+            padsize = numpy.power(2, exp) - windowsize
+            break
+        else:
+            exp += 1
+    # pad frames to be of size 2^x
+    frames = numpy.pad(frames, [(0, 0), (0, padsize)], 'constant', constant_values=0)
+    # set cutoff at half the frame size (+1 to keep the bin around
+    # which the spectrum is mirrored)
+    cutoff = int((windowsize+padsize)/2)+1
+    # perform fast Fourier transform
+    Y = fft(frames)
+    # take absolute power and collapse spectrum. Normalise the power for the
+    # number of bins but multiply by 2 to make up for the collapse of the spectrum
+    Yamp = 2/(windowsize+padsize) * numpy.abs(Y[:, 0:cutoff])
+    # first amp (dc component) and Nyquist freq bin are not to be doubled (as they
+    # are not mirrored in the fft)
+    Yamp[:, 0] = Yamp[:, 0]/2
+    Yamp[:, -1] = Yamp[:, -1]/2
+    return (Yamp)
+
 
 def notch(data):
-# apply a notch filter to remove the DC offset
+    # apply a notch filter to remove the DC offset
     b, a = iirnotch(0.001, 3.5)
     notched = lfilter(b, a, data)
 
     return notched
-    
-def pad (data,window_size, frame_shift):
+
+
+def pad(data, window_size, frame_shift):
     # function to pad the audio file to fit the frameshift
     context_size = (window_size-frame_shift)/2
-    pad_size = context_size - numpy.mod(data.size, frame_shift) 
+    pad_size = context_size - numpy.mod(data.size, frame_shift)
     # if needed add padding to the end of the data
    if pad_size > 0:
         data = numpy.append(data, numpy.zeros(int(numpy.ceil(pad_size))))
-    #always add padding to the front of the data
+    # always add padding to the front of the data
     data = numpy.append(numpy.zeros(int(context_size)), data)
 
     return(data)
-    
+
+
 def preemph(data, alpha):
     # preemphasises the data: x_preemph(t) = x(t) - alpha*x(t-1)
     xt = data
     xtminus1 = data*alpha
-    xtminus1 = numpy.insert(xtminus1,0,0,1)[:,:-1]
-    data_preemph = xt-xtminus1 
+    xtminus1 = numpy.insert(xtminus1, 0, 0, 1)[:, :-1]
+    data_preemph = xt-xtminus1
 
     return data_preemph
-    
+
+
 def hamming(data):
     # apply hamming windowing to a frame of data
     L = numpy.shape(data)[1]
     hammingwindow = 0.54-(0.46*numpy.cos(2*numpy.pi*numpy.arange(L)/(L-1)))
-    data = numpy.multiply(data,hammingwindow)
+    data = numpy.multiply(data, hammingwindow)
 
     return data
diff --git a/platalea/basicvq.py b/platalea/basicvq.py
index 259607e..c749455 100644
--- a/platalea/basicvq.py
+++ b/platalea/basicvq.py
@@ -12,6 +12,7 @@
 from platalea.optimizers import create_optimizer
 from platalea.schedulers import create_scheduler
 
+
 class SpeechImage(nn.Module):
     def __init__(self, config):
         super(SpeechImage, self).__init__()
@@ -27,7 +28,7 @@ def 
cost(self, item): speech_enc = self.SpeechEncoder(item['audio'], item['audio_len']) image_enc = self.ImageEncoder(item['image']) scores = platalea.loss.cosine_matrix(speech_enc, image_enc) - loss = platalea.loss.contrastive(scores, margin=self.config['margin_size']) + loss = platalea.loss.contrastive(scores, margin=self.config['margin_size']) return loss def embed_image(self, images): @@ -49,8 +50,8 @@ def embed_audio(self, audios): audio_e.append(self.SpeechEncoder(a.cuda(), l.cuda()).detach().cpu().numpy()) audio_e = np.concatenate(audio_e) return audio_e - - def code_audio(self, audios, one_hot=False): #FIXME messed up sized ETC + + def code_audio(self, audios, one_hot=False): # FIXME messed up sized ETC audio = torch.utils.data.DataLoader(dataset=audios, batch_size=32, shuffle=False, collate_fn=D.batch_audio) @@ -87,14 +88,14 @@ def val_loss(): with open("result.json", "w") as out: for epoch in range(1, config['epochs']+1): cost = Counter() - for j, item in enumerate(data['train'], start=1): # check reshuffling + for j, item in enumerate(data['train'], start=1): # check reshuffling item = {key: value.cuda() for key, value in item.items()} loss = net.cost(item) optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() - cost += Counter({'cost': loss.item(), 'N':1}) + cost += Counter({'cost': loss.item(), 'N': 1}) average_loss = cost['cost'] / cost['N'] if j % 100 == 0: logging.info("train {} {} {}".format(epoch, j, average_loss)) @@ -109,12 +110,14 @@ def val_loss(): torch.save(net, "net.{}.pt".format(epoch)) return results -DEFAULT_CONFIG = dict(SpeechEncoder=dict(SpeechEncoderBottom=dict(conv=dict(in_channels=39, out_channels=64, kernel_size=6, stride=2, padding=0, bias=False), - rnn= dict(input_size=64, hidden_size=1024, num_layers=2, - bidirectional=True, dropout=0)), + +DEFAULT_CONFIG = dict(SpeechEncoder=dict(SpeechEncoderBottom=dict(conv=dict(in_channels=39, out_channels=64, kernel_size=6, + stride=2, padding=0, bias=False), + rnn=dict(input_size=64, hidden_size=1024, num_layers=2, + bidirectional=True, dropout=0)), VQEmbedding=dict(num_codebook_embeddings=256, embedding_dim=1024, jitter=0.12), - SpeechEncoderTop=dict(rnn= dict(input_size=64, hidden_size=1024, num_layers=2, - bidirectional=True, dropout=0), - att= dict(in_size=2048, hidden_size=128))), + SpeechEncoderTop=dict(rnn=dict(input_size=64, hidden_size=1024, num_layers=2, + bidirectional=True, dropout=0), + att=dict(in_size=2048, hidden_size=128))), ImageEncoder=dict(linear=dict(in_size=2048, out_size=2*1024), norm=True), margin_size=0.2) diff --git a/platalea/dataset.py b/platalea/dataset.py index 8a9646b..47c2ae8 100644 --- a/platalea/dataset.py +++ b/platalea/dataset.py @@ -1,5 +1,4 @@ import json -import logging import numpy as np import pathlib import pickle diff --git a/platalea/encoders.py b/platalea/encoders.py index cf614ce..e66ecc0 100644 --- a/platalea/encoders.py +++ b/platalea/encoders.py @@ -431,7 +431,6 @@ def forward(self, x): # Expecting packed sequence if self.RNN is not None: x, _ = self.RNN(x) - #x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True) return x def introspect(self, input, length): @@ -526,7 +525,8 @@ class SpeechEncoderVQ(nn.Module): def __init__(self, config): super(SpeechEncoderVQ, self).__init__() self.Bottom = SpeechEncoderBottom(config['SpeechEncoderBottom']) - self.Codebook = VQEmbeddingEMA(config['VQEmbedding']['num_codebook_embeddings'], config['VQEmbedding']['embedding_dim'], jitter=config['VQEmbedding']['jitter']) + self.Codebook = 
VQEmbeddingEMA(config['VQEmbedding']['num_codebook_embeddings'], + config['VQEmbedding']['embedding_dim'], jitter=config['VQEmbedding']['jitter']) self.Top = SpeechEncoderTop(config['SpeechEncoderTop']) def forward(self, input, length): @@ -549,13 +549,14 @@ class SpeechEncoderVQ2(nn.Module): def __init__(self, config): super(SpeechEncoderVQ2, self).__init__() self.Bottom = SpeechEncoderBottom(config['SpeechEncoderBottom']) - self.Codebook1 = VQEmbeddingEMA(config['VQEmbedding1']['num_codebook_embeddings'], config['VQEmbedding1']['embedding_dim'], jitter=config['VQEmbedding1']['jitter']) + self.Codebook1 = VQEmbeddingEMA(config['VQEmbedding1']['num_codebook_embeddings'], + config['VQEmbedding1']['embedding_dim'], jitter=config['VQEmbedding1']['jitter']) self.Middle = SpeechEncoderMiddle(config['SpeechEncoderMiddle']) - self.Codebook2 = VQEmbeddingEMA(config['VQEmbedding2']['num_codebook_embeddings'], config['VQEmbedding2']['embedding_dim'], jitter=config['VQEmbedding2']['jitter']) + self.Codebook2 = VQEmbeddingEMA(config['VQEmbedding2']['num_codebook_embeddings'], + config['VQEmbedding2']['embedding_dim'], jitter=config['VQEmbedding2']['jitter']) self.Top = SpeechEncoderTop(config['SpeechEncoderTop']) def forward(self, input, length): - #return self.Top(self.Codebook(self.Bottom(input, length))['quantized']) return self.Top(self.Codebook2(self.Middle(self.Codebook1(self.Bottom(input, length))['quantized']))['quantized']) def introspect(self, input, length): diff --git a/platalea/experiments/flickr8k/pip_seq.py b/platalea/experiments/flickr8k/pip_seq.py index 0553c0f..635d65a 100644 --- a/platalea/experiments/flickr8k/pip_seq.py +++ b/platalea/experiments/flickr8k/pip_seq.py @@ -12,7 +12,7 @@ from platalea.experiments.config import get_argument_parser -args = get_argument_parser()# import cProfile +args = get_argument_parser() # Parsing arguments args.add_argument( diff --git a/platalea/experiments/flickr8k/transformer.py b/platalea/experiments/flickr8k/transformer.py index 2e4f14e..9872f40 100644 --- a/platalea/experiments/flickr8k/transformer.py +++ b/platalea/experiments/flickr8k/transformer.py @@ -9,7 +9,7 @@ from platalea.experiments.config import get_argument_parser -args = get_argument_parser()# Parsing arguments +args = get_argument_parser() # Parsing arguments args.add_argument('--batch_size', default=32, type=int, help='How many samples per batch to load.') args.add_argument('--conv_stride', default=2, type=int, @@ -23,6 +23,7 @@ args.add_argument('--trafo_feedforward_dim', default=1024, type=int, help='TRANSFORMER: Dimensionality of feedforward layer at the end of the transformer layer stack.') + class unit_float(float): def __new__(cls, value): value = float(value) @@ -31,6 +32,7 @@ def __new__(cls, value): else: raise ValueError(f"{value} is not a proper unit_float, because it is not between 0 and 1") + args.add_argument('--trafo_dropout', default=0, type=unit_float, help='TRANSFORMER: Dropout factor, used for regularization.') @@ -50,11 +52,11 @@ def __new__(cls, value): data = dict( train=D.flickr8k_loader( args.flickr8k_root, args.flickr8k_meta, args.flickr8k_language, - args.audio_features_fn, split='train', batch_size=args.batch_size, shuffle=True, + args.audio_features_fn, split='train', batch_size=args.batch_size, shuffle=True, downsampling_factor=args.downsampling_factor), val=D.flickr8k_loader( args.flickr8k_root, args.flickr8k_meta, args.flickr8k_language, - args.audio_features_fn, split='val', batch_size=args.batch_size, shuffle=False) + args.audio_features_fn, 
split='val', batch_size=args.batch_size, shuffle=False) ) diff --git a/platalea/fix_json.py b/platalea/fix_json.py index 2cc9dc9..4a03c84 100644 --- a/platalea/fix_json.py +++ b/platalea/fix_json.py @@ -7,25 +7,27 @@ logging.basicConfig(level=logging.INFO) + def fix(): paths = glob.glob("experiments/*/result.json") for path in paths: logging.info("Fixing {}".format(path)) copyfile(path, path + ".orig") with open(path, 'w') as out: - data = [ eval(line) for line in open(path + ".orig") ] + data = [eval(line) for line in open(path + ".orig")] for datum in data: print(json.dumps(datum), file=out) - + def load_results(): - tables = [] - for file in glob.glob("experiments/vq*/result.json"): - data = [ flat(json.loads(line)) for line in open(file) ] - table = pd.read_json(io.StringIO(json.dumps(data)), orient='records') - table['path']=file - tables.append(table) - return tables + tables = [] + for file in glob.glob("experiments/vq*/result.json"): + data = [flat(json.loads(line)) for line in open(file)] + table = pd.read_json(io.StringIO(json.dumps(data)), orient='records') + table['path'] = file + tables.append(table) + return tables + def flat(rec): return dict(epoch=rec['epoch'], diff --git a/platalea/hardware.py b/platalea/hardware.py index d0e5f57..303a78f 100644 --- a/platalea/hardware.py +++ b/platalea/hardware.py @@ -4,20 +4,22 @@ _device = None + def set_device(device: Optional[str]): global _device _device = device + def device(ordinal: Optional[int] = None): """Return a device. - + By default, if available, it returns a GPU id string. Optionally, the user can specify the ordinal identifying a specific GPU. If GPUs are not available, it will return a CPU string. It is also possible to use set_device to set a custom device string. If set, i.e. if not None (the default value), this value is used. - + This function can only be used by models that run on a single device. """ global _device diff --git a/platalea/ipa.py b/platalea/ipa.py index d2a8955..3975f1f 100644 --- a/platalea/ipa.py +++ b/platalea/ipa.py @@ -1,5 +1,5 @@ import logging -PHONEMES="""arpabet ipa class +PHONEMES = """arpabet ipa class aa ɑ vowel ae æ vowel ah ə vowel @@ -40,21 +40,22 @@ z z fricative zh ʒ fricative""" + def parseipa(): - mapping = {} + mapping = {} lines = PHONEMES.split("\n") for line in lines[1:]: arpa, ipa, _ = line.split() mapping[arpa] = ipa return mapping + _arpa2ipa = parseipa() + def arpa2ipa(arpa, default=None): try: return _arpa2ipa[arpa] except KeyError: logging.warning("Key not found: {}".format(arpa)) return default - - diff --git a/platalea/loss.py b/platalea/loss.py index 8a4890e..4af8912 100644 --- a/platalea/loss.py +++ b/platalea/loss.py @@ -1,14 +1,16 @@ import torch -def contrastive(M, margin=0.2): - "Returns contrastive margin loss over similarity matrix M." + +def contrastive(M, margin=0.2): + "Returns contrastive margin loss over similarity matrix M." E = - M D = torch.diag(E) C_c = torch.clamp(margin - E + D, min=0) - C_r = torch.clamp(margin - E + D.view(-1,1), min=0) + C_r = torch.clamp(margin - E + D.view(-1, 1), min=0) C = C_c + C_r return (C.sum() - torch.diag(C).sum())/C.size(0)**2 + def cosine_matrix(U, V): "Returns the matrix of cosine similarity between each row of U and each row of V." 
U_norm = U / U.norm(2, dim=1, keepdim=True) diff --git a/platalea/optimizers.py b/platalea/optimizers.py index 6cc5050..7adac53 100644 --- a/platalea/optimizers.py +++ b/platalea/optimizers.py @@ -13,4 +13,4 @@ def create_optimizer(config, net_parameters): else: optimizer = optim.Adam(net_parameters, lr=1, weight_decay=config['l2_regularization']) optimizer.zero_grad() - return optimizer \ No newline at end of file + return optimizer diff --git a/platalea/rank_eval.py b/platalea/rank_eval.py index c763d35..4cc02dc 100644 --- a/platalea/rank_eval.py +++ b/platalea/rank_eval.py @@ -3,18 +3,20 @@ import numpy from scipy.spatial.distance import cdist + def cosine(x, y): return cdist(x, y, metric='cosine') + def ranking(candidates, references, correct, metric=cosine, ns=(1, 5, 10)): """Rank `candidates` in order of similarity for each vector and return evaluation metrics. `correct[i][j]` indicates whether for reference item i the candidate j is correct. """ distances = cdist(references, candidates) - result = {'ranks' : [] , 'recall' : {} } + result = {'ranks': [], 'recall': {}} for n in ns: - result['recall'][n] = [] + result['recall'][n] = [] for j, row in enumerate(distances): ranked = numpy.argsort(row) id_correct = numpy.where(correct[j][ranked])[0] diff --git a/platalea/schedulers.py b/platalea/schedulers.py index d551d53..40d8a4b 100644 --- a/platalea/schedulers.py +++ b/platalea/schedulers.py @@ -66,7 +66,7 @@ def create_scheduler(config, optimizer, data): if configured_scheduler is None or configured_scheduler == 'cyclic': scheduler = cyclic(optimizer, len(data['train']), max_lr=config['max_lr'], - min_lr=config['min_lr']) + min_lr=config['min_lr']) elif configured_scheduler == 'noam': scheduler = noam(optimizer, config['d_model']) elif configured_scheduler == 'constant': diff --git a/platalea/score.py b/platalea/score.py index fd6594f..3900f24 100644 --- a/platalea/score.py +++ b/platalea/score.py @@ -1,5 +1,4 @@ import numpy as np -import platalea.dataset as D import platalea.rank_eval as E import platalea.xer as xer import torch @@ -14,7 +13,7 @@ def score(net, dataset): return dict(medr=np.median(result['ranks']), recall={1: np.mean(result['recall'][1]), 5: np.mean(result['recall'][5]), - 10: np.mean(result['recall'][10])}) + 10: np.mean(result['recall'][10])}) def score_text_image(net, dataset): @@ -26,7 +25,7 @@ def score_text_image(net, dataset): return dict(medr=np.median(result['ranks']), recall={1: np.mean(result['recall'][1]), 5: np.mean(result['recall'][5]), - 10: np.mean(result['recall'][10])}) + 10: np.mean(result['recall'][10])}) def score_speech_text(net, dataset): @@ -38,7 +37,7 @@ def score_speech_text(net, dataset): return dict(medr=np.median(result['ranks']), recall={1: np.mean(result['recall'][1]), 5: np.mean(result['recall'][5]), - 10: np.mean(result['recall'][10])}) + 10: np.mean(result['recall'][10])}) def score_asr(net, dataset, beam_size=None): @@ -57,6 +56,7 @@ def bleu_score(references, hypotheses): bleu[i] = sentence_bleu([references[i]], hypotheses[i]) return bleu.mean() + def score_slt(net, dataset, beam_size=None): data = dataset.evaluation() trn = net.transcribe(data['audio'], beam_size=beam_size) diff --git a/platalea/text_image.py b/platalea/text_image.py index 443bf67..936430c 100644 --- a/platalea/text_image.py +++ b/platalea/text_image.py @@ -64,6 +64,7 @@ def embed_text(self, texts): def experiment(net, data, config): _device = platalea.hardware.device() + def val_loss(): net.eval() result = [] diff --git a/platalea/utils/copy_best.py 
b/platalea/utils/copy_best.py index 11a276f..f8e423c 100755 --- a/platalea/utils/copy_best.py +++ b/platalea/utils/copy_best.py @@ -40,17 +40,17 @@ def copy_best(exp_path=['.'], result_fname='result.json', save_fname='net.best.p 'exp_path', help='Path to the experiment', default=['.'], nargs='*') parser.add_argument( '--result', - help='Path to the JSON file containing the results'\ + help='Path to the JSON file containing the results' ' (default=result.json).', type=str, default='result.json') parser.add_argument( '--save', - help='Path where the corresponding net should be saved'\ + help='Path where the corresponding net should be saved' ' (default=net.best.pt).', type=str, default='net.best.pt') parser.add_argument( '--experiment_type', dest='experiment_type', - help='Type of experiment. Determines which metric is used'\ + help='Type of experiment. Determines which metric is used' ' (default=retrieval).', type=str, choices=['retrieval', 'asr', 'mtl', 'slt'], default='retrieval') diff --git a/platalea/utils/evaluate_net.py b/platalea/utils/evaluate_net.py index 9cfb649..3074cc8 100755 --- a/platalea/utils/evaluate_net.py +++ b/platalea/utils/evaluate_net.py @@ -16,6 +16,8 @@ args = get_argument_parser() + + def get_score_fn_speech_transcriber(is_slt, use_beam_decoding): if is_slt: score_fn = platalea.score.score_slt diff --git a/platalea/utils/extract_transcriptions.py b/platalea/utils/extract_transcriptions.py index e9f38a9..0c5d7db 100755 --- a/platalea/utils/extract_transcriptions.py +++ b/platalea/utils/extract_transcriptions.py @@ -8,6 +8,8 @@ args = get_argument_parser() + + def extract_trn(net, dataset, use_beam_decoding=False): d = dataset.evaluation() ref = d['text'] diff --git a/platalea/utils/flickr8k_filter_metadata.py b/platalea/utils/flickr8k_filter_metadata.py index f810ca2..4895e4a 100644 --- a/platalea/utils/flickr8k_filter_metadata.py +++ b/platalea/utils/flickr8k_filter_metadata.py @@ -2,6 +2,7 @@ M = json.load(open('dataset_multilingual_human.json')) + def filter(M): I = [] for i, m in enumerate(M['images']): @@ -16,6 +17,7 @@ def filter(M): I.append(m) M['images'] = I + def count_sent(M): cntr = 0 for m in M['images']: @@ -23,6 +25,7 @@ def count_sent(M): cntr += 1 return cntr + filter(M) count_sent(M) json.dump(M, open('dataset_multilingual_human_only.json', 'w')) diff --git a/platalea/vq.py b/platalea/vq.py index 9d6ca4c..5585a06 100644 --- a/platalea/vq.py +++ b/platalea/vq.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from torch.distributions import Categorical + class Jitter(nn.Module): def __init__(self, p): super().__init__() @@ -26,6 +27,7 @@ def forward(self, x): x = torch.gather(x, 1, index.unsqueeze(-1).expand(-1, -1, channels)) return x + class VQEmbeddingEMA(nn.Module): def __init__(self, num_embeddings, embedding_dim, commitment_cost=0.25, decay=0.999, epsilon=1e-5, jitter=0.12): super(VQEmbeddingEMA, self).__init__() @@ -40,25 +42,23 @@ def __init__(self, num_embeddings, embedding_dim, commitment_cost=0.25, decay=0. 
self.register_buffer("ema_count", torch.zeros(num_embeddings)) self.register_buffer("ema_weight", self.embedding.clone()) self.jitter = Jitter(jitter) if jitter > 0 else None - + def forward(self, x): M, D = self.embedding.size() # unpack packed_sequence x, l = nn.utils.rnn.pad_packed_sequence(x, batch_first=True) x_flat = x.detach().reshape(-1, D) - + distances = torch.addmm(torch.sum(self.embedding ** 2, dim=1) + torch.sum(x_flat ** 2, dim=1, keepdim=True), x_flat, self.embedding.t(), alpha=-2.0, beta=1.0) - #distances = ((self.embedding - x_flat.unsqueeze(dim=1))**2).sum(dim=2) indices = torch.argmin(distances.float(), dim=-1) encodings = F.one_hot(indices, M).float() quantized = F.embedding(indices, self.embedding) quantized = quantized.view_as(x) - if self.training: self.ema_count = self.decay * self.ema_count + (1 - self.decay) * torch.sum(encodings, dim=0) diff --git a/platalea/vq_encode.py b/platalea/vq_encode.py index 0764262..18c8367 100644 --- a/platalea/vq_encode.py +++ b/platalea/vq_encode.py @@ -6,14 +6,12 @@ import os.path import numpy as np import torch -from pathlib import Path config = dict(type='mfcc', delta=True, alpha=0.97, n_filters=40, window_size=0.025, frame_shift=0.010) - def encode(net, datadir, outdir): - paths = glob.glob(datadir + "/*.wav") + paths = glob.glob(datadir + "/*.wav") assert len(paths) > 0 try: feat = torch.load(datadir + "_audiofeat.pt") @@ -29,9 +27,11 @@ def encode(net, datadir, outdir): out = outdir + '/' + filename + ".txt" assert code.shape[0] > 0 np.savetxt(out, code.astype(int), fmt='%d') - + + def encode_zerospeech(net, outdir='.'): - encode(net, "/roaming/gchrupal/verdigris/platalea.vq/data/2020/2019/english/test/") + encode(net, "/roaming/gchrupal/verdigris/platalea.vq/data/2020/2019/english/test/") + def evaluate_zerospeech(net, outdir='.'): encode_zerospeech(net, outdir=outdir) diff --git a/platalea/xer.py b/platalea/xer.py index 12e5ea5..3e9c979 100644 --- a/platalea/xer.py +++ b/platalea/xer.py @@ -1,6 +1,3 @@ - - - def nbeditops(s1, s2): import Levenshtein as L d = 0 diff --git a/setup.py b/setup.py index e336551..3f9eceb 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from setuptools import setup, find_packages +from setuptools import setup setup(name='platalea', description='Understanding visually grounded spoken language via multi-tasking', @@ -24,4 +24,4 @@ 'python-Levenshtein>=0.12.0'], use_scm_version=True, setup_requires=['setuptools_scm'], -) + ) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 6afd3b2..338fd06 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -17,7 +17,7 @@ def test_config(): assert args.epochs == 2 assert args.flickr8k_meta == 'thisandthat.json' - assert args.verbose == True + assert args.verbose assert args.lr_scheduler == 'noam' @@ -179,7 +179,6 @@ def test_pip_ind_experiment(): _assert_nested_almost_equal(result, expected) - def test_pip_seq_experiment(): expected = [{'medr': 1.5, 'recall': {1: 0.5, 5: 1.0, 10: 1.0}, 'average_loss': 0.3918714001774788,