## What's the plan?

Sentence ----> Speech

- Regression problem
- Sentence can be encoded to a n dimensional vector
- Speech is basically a m dimensional array of numbers, depends on 
        - sampling frequency
        - duration of the audio
- To make m fixed throughout the training,fix the audio length and do padding to samples under length m seconds
- Fit a Seq-Seq model to map  n dim vector to m dim vector

In [1]:
import numpy as np
from scipy.io import wavfile
import wavio
from librosa import load
from pydub import AudioSegment
import glob
import os
from tqdm import tqdm

In [2]:
%config Completer.use_jedi = False

## Preprocess Audio
Here I am trying to pad the audio date to a fixed length

In [31]:
def pad_audio_files(path):
    ps_max=0
    if not os.path.exists('Data/spoken_numbers/padded_recordings'):
        os.mkdir('Data/spoken_numbers/padded_recordings')
    for file in glob.glob(f'{path}*.wav'):
        
        audio = AudioSegment.from_file(file)
        if ps_max < len(audio):
            ps_max = len(audio)
            
    for file in tqdm(glob.glob(f'{path}*.wav')):    
        
        audio = AudioSegment.from_file(file)
        silence = AudioSegment.silent(ps_max-len(audio)+1)
        padded = audio + silence
        newfile = file.split('/')[-1]
        padded.export(f'Data/spoken_numbers/padded_recordings/{newfile}',format='wav')
        
        
pad_audio_files("Data/spoken_numbers/recordings/")    

100%|██████████| 1000/1000 [01:04<00:00, 15.59it/s]


In [19]:
AudioSegment.from_file("Data/spoken_numbers/padded_recordings/one.wav")

In [92]:
wavfile.read("Data/spoken_numbers/padded_recordings/one.wav")

(22050, array([0, 0, 0, ..., 0, 0, 0], dtype=int16))

In [93]:
22050*2

44100

## Vectorize Input Text data
Here I will vectorize the input sequences

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from collections import Counter
import pandas as pd
from torchtext import vocab
from torch.utils.data import Dataset,DataLoader
import torch
from torch import nn 
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence
from torchtext.legacy.data import BucketIterator
from torch.optim import Adam

In [4]:
config = {
    'num_epochs':5,
    'bs':8,
    'lr':1e-3
}

In [5]:
df = pd.read_csv("Data/spoken_numbers/data.csv")

In [6]:
vocabulary = vocab.build_vocab_from_iterator(df.text.str.split().values,min_freq=0,specials=['<pad>','<unk>'])

In [7]:
vectors = vocab.Vectors(os.path.join("/Users/shahules/AMPLYFI/MACHINELEARNING/base-taxonomy-model/.vector_cache","glove.6B.100d.txt"))

In [8]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocabulary.get_itos())


In [9]:
pretrained_embedding.shape

torch.Size([32, 100])

## Build Dataset

In [10]:
class SynthesisDataset(Dataset):
    
    def __init__(self,df):
        
        self.texts = df.text.str.split()
        self.tokens_array = pad_sequence([torch.tensor(vocabulary(text)) for text in self.texts],batch_first=True)
        self.audiopaths = df.filename.values
        self.ROOT = "Data/spoken_numbers/padded_recordings"
        
    def __getitem__(self,idx):
        
        text = self.tokens_array[idx]
        samplefreq, audio = wavfile.read(os.path.join(self.ROOT,self.audiopaths[idx]))
        
        return {'text':torch.LongTensor(text),
               'audio':torch.FloatTensor(audio[:44100])}
    
    def __len__(self):
        return len(self.texts)
        
        
        

## Model

In [11]:
class lstm_encoder(nn.Module):
    
    def __init__(self,emb_dim,hidden_size,layers=1,bidirectional=False):
        super(lstm_encoder, self).__init__()
        
        self.embedding = nn.Embedding(len(vocabulary),emb_dim)
        self.lstm = nn.LSTM(input_size = emb_dim,
                            hidden_size = hidden_size,
                            num_layers = layers,
                            batch_first=True,
                            bidirectional= bidirectional)
                  
        
    def forward(self,x):
        
        emb = self.embedding(x)
        out,hidden = self.lstm(emb)
        
        
        return out,hidden
    
    
class lstm_decoder(nn.Module):
    
    def __init__(self,hidden_size,layers=1,bidirectional=False):
        super(lstm_decoder,self).__init__()
    
        self.lstm = nn.LSTM(input_size=1,
                           hidden_size=hidden_size,
                           num_layers = layers,
                           batch_first=True,
                           bidirectional=bidirectional)
        
        
    def forward(self,x_input,hidden):
        
        
        out,hidden = self.lstm(x_input,hidden)
        
        return out[:,:,0],hidden
    
    
    
        

In [12]:
class SeqModel(nn.Module):
    
    def __init__(self,hidden_size=100,teacher_forcing=0):
        super(SeqModel,self).__init__()
        
        self.embed_size = pretrained_embedding.shape[1]
        self.hidden_size = hidden_size
        self.sequence_len = 44100
        self.features = 1
        self.teacher_forcing = teacher_forcing
        
        
        
        self.encoder = lstm_encoder(self.embed_size,self.hidden_size)
        self.decoder = lstm_decoder(self.hidden_size)
        
        
    def forward(self,x_input,targets):
        
        encoder,context = self.encoder(x_input)
        predictions = torch.zeros(x_input.shape[0],self.sequence_len,1)
        
        past_input = targets[:,0].unsqueeze(-1).unsqueeze(-1)
     
        for index in range(len(targets)):
            
            prediction,context = self.decoder(past_input,context)
            #print(predictions.shape,prediction.shape)
            
            predictions[:,index,:] = prediction
            
            if np.random.uniform() < self.teacher_forcing:
                past_input = targets[:,index].unsqueeze(-1)
            else:
                past_input = prediction.unsqueeze(-1)
                
         
        return predictions.squeeze(-1)
            
            
    

## Train Loop

In [13]:
def train_one_batch(batch,model,optimizer,criterion):
    
    model.train()
    optimizer.zero_grad()
    
    out = model(batch['text'], batch['audio'])
    loss = criterion(batch['audio'],out)
    loss.backward()
    optimizer.step()
    
    return model,loss
    

In [14]:
def train_and_eval():
    
    model = SeqModel()
    train_dataset = DataLoader(SynthesisDataset(df),
                                  batch_size=config['bs'],
                                  shuffle=True,
                                  )

    criterion = nn.MSELoss() 
    optimizer = Adam(model.parameters(),lr=config['lr'])
    
    print(len(train_dataset))
    
    for epoch in range(config['num_epochs']):
        epoch_loss = 0.0
        for batchid,batch in enumerate(train_dataset):

            model,loss = train_one_batch(batch,model,optimizer,criterion)
            
            epoch_loss += loss.item()
            
        print(f"loss for epoch {epoch_loss/len(train_dataset)}")
        
    torch.save(model.state_dict(),"spoken_digits_baseline.pt") 


train_and_eval()

125


  app.launch_new_instance()


loss for epoch 14343321.168
loss for epoch 14343321.184
loss for epoch 14343321.152
loss for epoch 14343321.224
loss for epoch 14343321.152


## Inference

In [20]:
def inference(model,test_df):
    samplerate = 21050
    outputs=[]
    model.eval()
    test_dataset = SynthesisDataset(test_df)
    
    for i,data in enumerate(test_dataset):
        
        name = test_df.iloc[i]['text']
        out = model(data['text'].unsqueeze(0),data['audio'].unsqueeze(0))
        out = out.float().detach().numpy()
        #wavfile.write(f"{'_'.join(name.split())}.wav",data = out,rate=samplerate)
        outputs.append(out)
        
    return outputs

In [21]:
model = SeqModel()
model.load_state_dict(torch.load("spoken_digits_baseline.pt"))
out=inference(model,df.sample(5))

In [22]:
import IPython.display as ipd

In [27]:
ipd.Audio(out[4],rate=21050)