In [1]:
import glob 
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset,DataLoader
import torchaudio
import librosa
from torchaudio import transforms
from wavenet import WaveNet

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display
import numpy as np
import random

In [3]:
class VCTK(Dataset):
    """Single-speaker view of a VCTK-style corpus, one utterance per item.

    Each item is loaded with ``librosa.load`` (mono, resampled to ``sr``),
    passed through ``librosa.effects.trim`` with ``top_db``, clipped to
    [-1, 1] and returned as a tensor with a trailing singleton axis.

    Args:
        path: corpus root. It is concatenated directly with ``speaker``, so it
            must end with a path separator.
        speaker: sub-directory of ``path`` whose ``*.wav`` files are served.
        transform: optional callable applied to every waveform tensor.
        sr: sample rate handed to ``librosa.load``.
        top_db: threshold handed to ``librosa.effects.trim``.
    """

    def __init__(self,path='./VCTK/',speaker='p225',transform=None,sr=16000,top_db=10):
        # Every wav file of the selected speaker (glob order, not sorted).
        self.wav_list = glob.glob(path + speaker +'/*.wav')
        # Sorted names of all entries directly under `path`
        # (presumably one directory per speaker -- verify against the corpus layout).
        self.wav_ids = sorted([f.split('/')[-1] for f in glob.glob(path+'*')])
        self.transform = transform
        self.sr = sr
        self.top_db = top_db

    def __getitem__(self, index):
        """Load, trim, clip and (optionally) transform the utterance at ``index``.

        Returns:
            The waveform as a tensor with a trailing singleton axis when no
            transform is set, otherwise whatever ``self.transform`` returns.
        """
        f = self.wav_list[index]
        audio,_ = librosa.load(f,sr=self.sr,mono=True)
        audio,_ = librosa.effects.trim(audio, top_db=self.top_db, frame_length=2048)
        # Clip to [-1, 1] before any downstream encoding sees the samples.
        audio = np.clip(audio,-1,1)
        wav_tensor = torch.from_numpy(audio).unsqueeze(1)
        # The former `wav_id = f.split('/')[3]` was never used and raised
        # IndexError for paths with fewer than four '/'-separated components.
        if self.transform is not None:
            wav_tensor = self.transform(wav_tensor)

        return wav_tensor

    def __len__(self):
        """Number of wav files found for the selected speaker."""
        return len(self.wav_list)

In [4]:
# Per-item preprocessing handed to the dataset: mu-law encode the waveform
# (the training loop below treats the result as 256 classes), then apply
# LC2CL so that axis 1 is the one the collate function crops along.
t = transforms.Compose(
    [
        transforms.MuLawEncoding(),
        transforms.LC2CL(),
    ]
)

def collate_fn_(batch_data, max_len=40000):
    """Collate a batch by returning its first clip, randomly cropped if long.

    Only ``batch_data[0]`` is used; any further items are ignored. A clip
    whose axis-1 length exceeds ``max_len`` is cut to a uniformly chosen
    window of exactly ``max_len`` positions; shorter clips pass through
    untouched.
    """
    clip = batch_data[0]
    surplus = clip.size(1) - max_len
    if surplus <= 0:
        return clip
    # randint is inclusive at both ends, so the window may end on the last sample.
    start = random.randint(0, surplus)
    return clip[:, start:start + max_len]

In [5]:

# Dataset over speaker p225 at 16 kHz, preprocessed with `t`.
vctk = VCTK(transform=t, speaker='p225', sr=16000)
# One utterance per step; collate_fn_ picks the clip and crops it if too long.
training_data = DataLoader(
    vctk,
    collate_fn=collate_fn_,
    shuffle=True,
    batch_size=1,
)

In [6]:

# Build the network with its default arguments and move it to the GPU.
model = WaveNet().cuda()
# Adam optimizer over all model parameters (despite its name, `train_step`
# is the optimizer object, not a function).
train_step = optim.Adam(
    model.parameters(),
    eps=1e-4,
    lr=2e-2,
)

In [7]:
# Halve the learning rate at each of the listed epoch milestones.
scheduler = optim.lr_scheduler.MultiStepLR(
    train_step,
    gamma=0.5,
    milestones=[50, 150, 250],
)

In [8]:
# Train for 300 epochs: checkpoint every 20 epochs, report the mean loss every 5.
for epoch in range(300):
    loss_= []
    scheduler.step()
    for data in training_data:

        data = Variable(data).cuda()
        # Input is every sample but the last.
        x = data[:,:-1]
        logits = model(x)
        # Targets are the trailing samples, as many as the model produced outputs.
        y = data[:,-logits.size(2):]
        # Flatten to (positions, 256 classes) against a flat vector of class ids.
        loss = F.cross_entropy(logits.transpose(1,2).contiguous().view(-1,256), y.view(-1))
        train_step.zero_grad()
        loss.backward()
        train_step.step()
        # `loss.data[0]` only works on pre-0.4 PyTorch; later releases make the
        # loss a 0-dim tensor that must be read with `.item()`.
        loss_.append(loss.item() if hasattr(loss, 'item') else loss.data[0])
    if (epoch+1)%20 == 0:
        torch.save(model.state_dict(),'model_%s.pth'%(str(epoch+1)))
    if epoch%5 == 0:
        # Formatted string instead of the Python 2 print statement: identical
        # text on Python 2, and valid syntax on Python 3.
        print('%s %s' % (epoch, np.mean(loss_)))

0 5.54576349258
5 5.17097759247
10 4.68942022324
15 4.11701393127
20 3.97902035713
25 2.99311327934
30 2.1784787178
35 1.17133200169
40 0.672031521797
45 0.0956522896886
50 0.0156723242253
55 0.00487605528906
60 0.00221115117893
65 0.00123464234639
70 0.000777395616751
75 0.000537805375643
80 0.000402514386224
85 0.000320124527207
90 0.000266821269179
95 0.000230616744375
100 0.0002049238974
105 0.000185854369192
110 0.000171155334101
115 0.000159416595125
120 0.000149726387463
125 0.000141529060784
130 0.000134454618092
135 0.000128221974592
140 0.00012267632701
145 0.000117670024338
150 0.000113103778858
155 0.000110973996925
160 0.000108949359856
165 0.000107010884676
170 0.000105135623016
175 0.000103312275314
180 0.000101556099253
185 9.9858691101e-05
190 9.81919220067e-05
195 9.65765866567e-05
200 9.49896129896e-05
205 9.34484269237e-05
210 9.19663652894e-05
215 9.04959524632e-05
220 8.90813025762e-05
225 8.76989797689e-05
230 8.63427339937e-05
235 8.50333526614e-05
240 8.3743325

In [11]:
# Display the logits left over from the last training step. The training loop
# treats axis 1 as the 256 classes and axis 2 as the output positions.
logits

Variable containing:
(  0  ,.,.) = 
 -3.2650e+01 -3.1655e+01 -3.4533e+01  ...  -4.2706e+01 -4.0256e+01 -3.6308e+01
 -3.1576e+01 -3.0497e+01 -3.8252e+01  ...  -4.3902e+01 -3.7529e+01 -3.8794e+01
 -3.3516e+01 -3.3208e+01 -3.5913e+01  ...  -3.9251e+01 -4.2566e+01 -3.7099e+01
                 ...                   ⋱                   ...                
 -3.2975e+01 -3.3029e+01 -3.4870e+01  ...  -4.1254e+01 -3.7543e+01 -3.8281e+01
 -3.3611e+01 -3.5202e+01 -3.5310e+01  ...  -4.1819e+01 -4.1373e+01 -3.6366e+01
 -3.0597e+01 -3.0185e+01 -3.1446e+01  ...  -4.1941e+01 -4.1382e+01 -3.5701e+01
[torch.cuda.FloatTensor of size 1x256x15872 (GPU 0)]

In [None]:
# (Stray scratch input `h` removed: the name is never defined in this notebook,
# so the cell raised NameError on a clean Restart & Run All.)

In [None]:
# Persist the current weights (state dict only) to model.pth.
torch.save(obj=model.state_dict(), f='model.pth')

In [None]:

# Load one utterance to prime generation, using the same preprocessing as the
# VCTK dataset: 16 kHz mono, trim with top_db=10, clip to [-1, 1], mu-law encode.
audio,_ = librosa.load('./VCTK/p225/p225_001.wav',sr=16000,mono=True)
audio,_ = librosa.effects.trim(audio, top_db=10, frame_length=2048)
# The training pipeline clips before encoding; do the same here so the priming
# audio is encoded exactly like the training data.
audio = np.clip(audio,-1,1)
wav_tensor = torch.from_numpy(audio).unsqueeze(1)
# transpose(0, 1) puts the singleton axis first, so axis 1 indexes samples.
wav_tensor = transforms.MuLawEncoding()(wav_tensor).transpose(0,1)

In [2]:
# Autoregressive sampling. Needs the imports cell, a trained `model`, and
# `wav_tensor` from the preceding cell to have been run in this kernel.
recp_field=5116          # number of trailing samples fed to the model each step
sample_len = 16000*3     # samples to generate (3 s at 16 kHz)
# Prime generation with the first `recp_field` samples of the real utterance.
sample = Variable(wav_tensor[:,:recp_field]).cuda()
for i in range(sample_len):
    logits = model(sample[:,-recp_field:])
    # Build the distribution from the newest time step only. Flattening the
    # whole output would merge several time steps into a single categorical
    # if the model ever returned more than one position.
    m = torch.distributions.Categorical(F.softmax(logits[0,:,-1],dim=0))
    new = m.sample().view(1,-1)
    sample = torch.cat((sample,new),dim=1)
    # Progress report once per generated second. The per-step size print was
    # debug output that flooded the cell with one line per sample.
    if i % 16000 == 0:
        print(i)

NameError: name 'Variable' is not defined

In [16]:
# Check how long the generated sequence is: axis 1 is the priming samples plus
# one new entry per completed sampling step.
sample.size()

torch.Size([1, 5117])