In [1]:
import numpy as np
import torch
from patter import ModelFactory
from patter.data import AudioSegment
from patter.decoder import BeamCTCDecoder
from patter.data.features import PerturbedSpectrogramFeaturizer

WARN: CTCLoss not imported. Use only for inference.


### Paths to Models/Data

In [2]:
# Audio file (WAV) that will be transcribed.
audio_path = "../../data/sample/1089-134691-0003.wav"
# Serialized, pretrained acoustic model checkpoint.
model_path = "../../models/librispeech_pretrained_patter.pt"
# Language model file passed to the beam decoder.
lm_path = "../../models/lm/3-gram.pruned.1e-7.bin"

### Load model
The ModelFactory class is responsible for reading a serialized model file and initializing an instance
of the correct type of model (e.g. DeepSpeech2 or its variants, Wav2Letter, etc). When the optional keyword
argument `include_package` is True, `ModelFactory.load` also returns a second object: a dictionary of additional
model metadata. This metadata is not necessary when using the model for training or decoding. The contents of the
dictionary are only useful for introspection into how the model was created and its performance.

In [3]:
# Load the checkpoint named by `model_path` (defined in the paths cell) rather than
# repeating the path literal, so changing the model is a one-line edit in one place.
# `include_package=True` additionally returns the metadata dictionary; when only the
# model is needed, `model = ModelFactory.load(model_path)` is sufficient.
model, package = ModelFactory.load(model_path, include_package=True)

# put model in evaluation mode (crucial!): the network contains batch-norm layers,
# which behave differently in training mode.
model.eval()

In [4]:
# Print the module tree of the loaded network (layer types and their hyperparameters).
print(model)

DeepSpeechOptim(
  (conv): Sequential(
    (0.cnn): Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(0, 10))
    (0.batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
    (0.act): Hardtanh(min_val=0, max_val=20)
    (1.cnn): Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1))
    (1.batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
    (1.act): Hardtanh(min_val=0, max_val=20)
  )
  (rnn): DeepBatchRNN(
    (batch_norm): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True)
    (rnns): Sequential(
      (0): BatchRNN(
        (rnn): GRU(672, 800, bias=False, bidirectional=True)
      )
      (1): BatchRNN(
        (batch_norm): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True)
        (rnn): GRU(800, 800, bias=False, bidirectional=True)
      )
      (2): BatchRNN(
        (batch_norm): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True)
        (rnn): GRU(800, 800, bias=False, bidirectional=True)
      )
      (3): BatchRNN(
        

### Set up Featurizer
The model configuration specifies the featurization required to convert audio into a format the model accepts.
A patter featurizer is responsible for reading that configuration, then reading in audio from its on-disk format
and returning a tensor ready for the model.


In [5]:
# Build the featurizer from the input configuration stored on the loaded model.
input_cfg = model.input_cfg
featurizer = PerturbedSpectrogramFeaturizer.from_config(input_cfg)

### Set up Decoder
Set up a beam decoder with a language model that can be used to convert the acoustic model outputs to the final transcription.

In [6]:
# Beam CTC decoder over the model's label set, using the language model at `lm_path`.
# NOTE(review): alpha/beta are presumably the language-model weight and the
# word-insertion bonus, tuned for this model/LM pair -- confirm against BeamCTCDecoder.
decoder = BeamCTCDecoder(
    model.labels,
    lm_path=lm_path,
    alpha=2.15,
    beta=0.85,
    beam_width=100,
    blank_index=0,
)

## Transcribe audio
To transcribe audio, it must be loaded from disk and featurized. The input features are then passed through the
acoustic model. The output of the acoustic model must then be decoded using a properly initialized decoder.

In [9]:
# Read the audio file at `audio_path` and turn it into the feature tensor for the
# model, using the featurizer built from the model's input configuration.
features = featurizer.process(audio_path)

In [10]:
# Prepend two singleton dimensions so the feature tensor becomes 4-D; the last
# axis (index 3) is then the sequence length.
# NOTE(review): the two leading axes are presumably (batch_size, channel) given the
# single-input-channel Conv2d front end -- confirm against the model's forward().
# Re-running this cell without re-running the featurization cell adds yet more axes.
features = features.unsqueeze(0).unsqueeze(0)
seq_len = torch.IntTensor([features.size(3)])

# Wrap the inputs as volatile Variables (inference only, no autograd graph is kept).
# NOTE(review): `volatile` belongs to the pre-0.4 PyTorch API this notebook was run with.
feature_var = torch.autograd.Variable(features, volatile=True)
length_var = torch.autograd.Variable(seq_len, volatile=True)

# run model
output, output_len = model(feature_var, length_var)

In [11]:
### Decode acoustic model output
# The decoder requires [batch_size, seq_len, character_classes], so swap the first two
# axes of the model output. The transposed view gets its own name instead of
# overwriting `output`: re-running this cell would otherwise transpose `output`
# back and hand the decoder the wrong layout.
decoder_input = output.transpose(0, 1)
transcript, offsets, scores = decoder.decode(decoder_input.data, output_len.data, num_results=1)

In [12]:
# Show the first hypothesis for the first utterance.
# NOTE(review): presumably indexed as transcript[utterance][rank] -- confirm with decoder.decode.
print("Top-rated final transcript for first utterance in batch:")
best_transcript = transcript[0][0]
print(best_transcript)

Top-rated final transcript for first utterance in batch:
THE UNIVERSITY


### Appendix
**Note:** If you want to run the above on a GPU, the model and Variables must be moved to the GPU, i.e.:

```python
model = model.cuda()

features = torch.autograd.Variable(features, volatile=True).cuda()
seq_len = torch.autograd.Variable(seq_len, volatile=True).cuda()
```

and the output will have to be moved back to the CPU prior to decoding, i.e.:

```python
output = output.cpu()
```