In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.rcParams['figure.figsize'] = [10,5]

from tqdm import tqdm_notebook
import numpy as np
import torch
import copy
import math
from torch.utils.data import DataLoader
from patter import ModelFactory
from patter.data import AudioDataset, BucketingSampler, audio_seq_collate_fn
from patter.decoder import GreedyCTCDecoder
from patter.data.features import PerturbedSpectrogramFeaturizer
from patter.evaluator import validate
from patter.models import SpeechModel

In [2]:
# Where the fine-tuned model will be written once it beats the baseline WER.
new_model_path = "/data/users/ryan/models/deepspeech/an4_transferred.pt"
# Pretrained checkpoint used as the starting point for transfer learning.
seed_model_path = "/data/users/ryan/models/deepspeech/librispeech_pretrained_patter.pt"

# Load the seed model together with its serialized package.
seed_model, seed_package = ModelFactory.load(
    seed_model_path,
    include_package=True,
)

In [3]:
# Make an independent copy of the seed model.
# A deep copy is required: a shallow copy would share the submodules and
# parameter tensors with `seed_model`, so replacing the output layer or
# freezing parameters on `model` would silently modify `seed_model` as well.
model = copy.deepcopy(seed_model)

In [4]:
# Choose the label set for the transferred model.
# This experiment goes from English to English, so the seed model's labels
# are reused unchanged.
# NB: the first label MUST represent the CTC blank label (canonically '\xa0').
#
# Whatever label set is chosen, it has to be attached to the new model too.
model.labels = labels = seed_model.labels
print("New Labels:", labels)

New Labels: ['\xa0', "'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' ']


In [5]:
# Build a brand-new output projection sized for the new label set ...
model.output[1] = torch.nn.Linear(
    in_features=800,
    out_features=len(labels),
    bias=False,
)

# ... and give every parameter of that layer a Xavier-uniform initialization.
for layer_param in model.output[1].parameters():
    torch.nn.init.xavier_uniform_(layer_param)

In [6]:
# Freeze the convolutional and recurrent stacks so that only the output
# layers receive gradient updates.
for frozen_stack in (model.conv, model.rnn):
    for stack_param in frozen_stack.parameters():
        stack_param.requires_grad_(False)

In [7]:
# Build a featurizer from the seed model's input configuration.
# Perturbations for the training data may optionally be supplied; none are used here.
featurizer = PerturbedSpectrogramFeaturizer.from_config(
    seed_model.input_cfg,
    perturbation_configs=None,
)

# Manifests for the train and dev sets the model is being transferred to.
train_manifest_path = "/home/ryan/data/patter_data/an4-jl/an4_train_manifest.jl"
val_manifest_path = "/home/ryan/data/patter_data/an4-jl/an4_val_manifest.jl"

# Both corpora use the same label set, featurizer and duration limits.
train_corpus = AudioDataset(
    train_manifest_path, labels, featurizer,
    max_duration=17.0, min_duration=1.0,
)
val_corpus = AudioDataset(
    val_manifest_path, labels, featurizer,
    max_duration=17.0, min_duration=1.0,
)

Dataset loaded with 0.70 hours. Filtered 0.00 hours.
Dataset loaded with 0.10 hours. Filtered 0.00 hours.


In [8]:
# set up data loaders
batch_size = 32
num_workers = 4
cuda = True

# Training batches come from the bucketing sampler, which is reshuffled
# at the start of every epoch by the training loop.
train_sampler = BucketingSampler(train_corpus, batch_size=batch_size)
train_loader = DataLoader(
    train_corpus,
    num_workers=num_workers,
    collate_fn=audio_seq_collate_fn,
    pin_memory=cuda,
    batch_sampler=train_sampler,
)

# The evaluation loader uses the same `num_workers` setting as the training
# loader (previously hard-coded to 4, which ignored the setting above).
eval_loader = DataLoader(
    val_corpus,
    num_workers=num_workers,
    collate_fn=audio_seq_collate_fn,
    pin_memory=cuda,
    batch_size=batch_size,
)

In [9]:
# Move the model onto the GPU when CUDA use is enabled; otherwise leave it as is.
model = model.cuda() if cuda else model

In [10]:
# set up optimizer
lr = 3e-4
momentum = 0.9   # only relevant if switching to SGD (see note below)
annealing = 1.01  # only relevant if switching to SGD + StepLR (see note below)

# Collect the parameters that are still trainable. A list is used rather than
# a set because optimizers expect collections with a deterministic ordering
# that is consistent between runs.
trainable_params = [p for p in model.parameters() if p.requires_grad]


class NoOpScheduler(object):
    """Stand-in learning-rate scheduler whose `step()` does nothing.

    Lets the training loop call `scheduler.step()` unconditionally even when
    no learning-rate schedule is wanted (as with Adam below).
    """

    def __init__(self):
        pass

    def step(self):
        """Do nothing; present only to satisfy the scheduler interface."""
        pass


# Adam is used without a learning-rate schedule.
# To train with SGD instead, construct torch.optim.SGD with `lr`, `momentum`
# and nesterov=True, and pair it with torch.optim.lr_scheduler.StepLR using
# step_size=1 and gamma=1/annealing.
optimizer = torch.optim.Adam(trainable_params, lr=lr, amsgrad=True)
scheduler = NoOpScheduler()

In [11]:
# Measure how the freshly modified model performs on the new validation set
# before any training; this WER is the baseline that later epochs must beat.
err = validate(eval_loader, model, tqdm=False)
best_wer = err.wer

print("WER: {0:.3f}, CER: {1:.3f}".format(err.wer, err.cer))

WER: 101.175, CER: 260.417


In [12]:
# Stage 1: train only the parameters left trainable (the new output layer).
num_epochs = 40
for epoch in range(num_epochs):
    train_sampler.shuffle()

    model.train()
    # Wrap the loader in a fresh progress bar each epoch under its own name.
    # Rebinding `train_loader` itself would nest one more wrapper per epoch.
    progress = tqdm_notebook(train_loader, desc="Epoch {}".format(epoch+1))
    for feat, target, feat_len, target_len in progress:
        if cuda:
            # `non_blocking` replaces the old `async` keyword argument, which
            # is a reserved word (and therefore a SyntaxError) on Python 3.7+.
            feat = feat.cuda(non_blocking=True)

        optimizer.zero_grad()

        output, output_len = model(feat, feat_len)
        loss = model.loss(output, target, output_len.squeeze(0), target_len)

        # Per-utterance loss for this batch; used to detect diverged batches.
        scalar_loss = loss.item()/feat.size(0)
        if not math.isfinite(scalar_loss):
            # An inf/NaN loss would produce non-finite gradients and corrupt
            # the weights, so skip the update for this batch entirely.
            continue

        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, 400)
        optimizer.step()

    # Advance the learning-rate schedule once per epoch, after the optimizer
    # updates for that epoch have been applied.
    scheduler.step()

    model.eval()
    err = validate(eval_loader, model, tqdm=False)
    print("Epoch {0} :: WER: {1:.3f}, CER: {2:.3f}".format(epoch+1, err.wer, err.cer))

    # Checkpoint only when the validation WER improves on the best seen so far.
    if err.wer < best_wer:
        best_wer = err.wer
        torch.save(SpeechModel.serialize(model), new_model_path)


Epoch 1 :: WER: 124.674, CER: 106.486



Epoch 2 :: WER: 98.564, CER: 58.176



Epoch 3 :: WER: 93.342, CER: 59.277



Epoch 4 :: WER: 93.603, CER: 61.124



Epoch 5 :: WER: 96.475, CER: 60.181



Epoch 6 :: WER: 95.822, CER: 48.349



Epoch 7 :: WER: 91.123, CER: 38.601



Epoch 8 :: WER: 83.290, CER: 30.739



Epoch 9 :: WER: 73.890, CER: 21.934



Epoch 10 :: WER: 66.710, CER: 19.182



Epoch 11 :: WER: 58.877, CER: 15.802



Epoch 12 :: WER: 53.525, CER: 13.758



Epoch 13 :: WER: 48.433, CER: 11.439



Epoch 14 :: WER: 46.475, CER: 11.046



Epoch 15 :: WER: 42.037, CER: 9.552



Epoch 16 :: WER: 36.031, CER: 8.373



Epoch 17 :: WER: 33.159, CER: 8.333



Epoch 18 :: WER: 29.504, CER: 7.665



Epoch 19 :: WER: 25.457, CER: 6.722



Epoch 20 :: WER: 24.282, CER: 6.486



Epoch 21 :: WER: 22.063, CER: 5.857



Epoch 22 :: WER: 20.888, CER: 5.621



Epoch 23 :: WER: 19.060, CER: 5.189



Epoch 24 :: WER: 17.885, CER: 4.992



Epoch 25 :: WER: 17.102, CER: 4.638



Epoch 26 :: WER: 17.102, CER: 4.638



Epoch 27 :: WER: 16.841, CER: 4.599



Epoch 28 :: WER: 16.057, CER: 4.403



Epoch 29 :: WER: 16.188, CER: 4.363



Epoch 30 :: WER: 15.144, CER: 3.813



Epoch 31 :: WER: 14.491, CER: 4.088



Epoch 32 :: WER: 13.577, CER: 3.852



Epoch 33 :: WER: 14.883, CER: 4.088



Epoch 34 :: WER: 12.924, CER: 3.695



Epoch 35 :: WER: 14.099, CER: 3.970



Epoch 36 :: WER: 13.316, CER: 3.656



Epoch 37 :: WER: 12.924, CER: 3.734



Epoch 38 :: WER: 12.533, CER: 3.616



Epoch 39 :: WER: 12.402, CER: 3.577



Epoch 40 :: WER: 11.880, CER: 3.459


In [13]:
# reload the previously best found model
model = ModelFactory.load(new_model_path)
if cuda:
    model = model.cuda()

err = validate(eval_loader, model, tqdm=False)
print("WER: {0:.3f}, CER: {1:.3f}".format(err.wer, err.cer))

# add the rnns for additional fine tuning: only the convolutional stack is
# frozen on the reloaded model, every other parameter is left as loaded
for x, y in model.conv.named_parameters():
    y.requires_grad_(False)

# Use a list (not a set) so the parameter ordering handed to the optimizer is
# deterministic and consistent between runs.
trainable_params = [x for x in model.parameters() if x.requires_grad]

# Fresh Adam optimizer over the enlarged parameter set; no learning-rate
# scheduler is created for this stage.
optimizer = torch.optim.Adam(trainable_params, lr=3e-4, amsgrad=True)

WER: 11.880, CER: 3.459


In [14]:
# run another 20 epochs of training
# (stage 2: fine-tune with the optimizer currently bound to `optimizer`;
# no learning-rate scheduler is stepped in this loop)
num_epochs = 20
for epoch in range(num_epochs):
    train_sampler.shuffle()

    model.train()
    # Wrap the loader in a fresh progress bar each epoch under its own name.
    # Rebinding `train_loader` itself would nest one more wrapper per epoch.
    progress = tqdm_notebook(train_loader, desc="Epoch {}".format(epoch+1))
    for feat, target, feat_len, target_len in progress:
        if cuda:
            # `non_blocking` replaces the old `async` keyword argument, which
            # is a reserved word (and therefore a SyntaxError) on Python 3.7+.
            feat = feat.cuda(non_blocking=True)

        optimizer.zero_grad()

        output, output_len = model(feat, feat_len)
        loss = model.loss(output, target, output_len.squeeze(0), target_len)

        # Per-utterance loss for this batch; used to detect diverged batches.
        scalar_loss = loss.item()/feat.size(0)
        if not math.isfinite(scalar_loss):
            # An inf/NaN loss would produce non-finite gradients and corrupt
            # the weights, so skip the update for this batch entirely.
            continue

        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, 400)
        optimizer.step()

    model.eval()
    err = validate(eval_loader, model, tqdm=False)
    print("Epoch {0} :: WER: {1:.3f}, CER: {2:.3f}".format(epoch+1, err.wer, err.cer))

    # Checkpoint only when the validation WER improves on the best seen so far.
    if err.wer < best_wer:
        best_wer = err.wer
        torch.save(SpeechModel.serialize(model), new_model_path)


Epoch 1 :: WER: 3.525, CER: 0.983



Epoch 2 :: WER: 2.350, CER: 0.825



Epoch 3 :: WER: 2.480, CER: 0.825



Epoch 4 :: WER: 2.089, CER: 0.668



Epoch 5 :: WER: 2.089, CER: 0.708



Epoch 6 :: WER: 1.828, CER: 0.550



Epoch 7 :: WER: 1.958, CER: 0.668



Epoch 8 :: WER: 1.958, CER: 0.668



Epoch 9 :: WER: 1.958, CER: 0.668



Epoch 10 :: WER: 2.089, CER: 0.708



Epoch 11 :: WER: 1.828, CER: 0.629



Epoch 12 :: WER: 2.089, CER: 0.708



Epoch 13 :: WER: 2.219, CER: 0.747



Epoch 14 :: WER: 2.219, CER: 0.747



Epoch 15 :: WER: 1.958, CER: 0.668



Epoch 16 :: WER: 2.219, CER: 0.747



Epoch 17 :: WER: 2.089, CER: 0.708



Epoch 18 :: WER: 2.089, CER: 0.708



Epoch 19 :: WER: 2.219, CER: 0.747



Epoch 20 :: WER: 2.350, CER: 0.786
