-
Notifications
You must be signed in to change notification settings - Fork 5
/
transcribe_audio_file.py
125 lines (102 loc) · 5.06 KB
/
transcribe_audio_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import argparse
import tensorflow as tf
import librosa
from joblib import dump
import utils
from model_helper import las_model_fn
from preprocess_all import calculate_acoustic_features
# Target sampling rate in Hz; librosa resamples the input waveform to this rate in main().
SAMPLE_RATE = 16000
def parse_args(argv=None):
    """Parse command-line options for the transcription script.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case argparse reads from sys.argv (preserves original behavior).

    Returns:
        argparse.Namespace with all options below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--waveform', type=str, required=True,
        help='Acoustic file')
    parser.add_argument('--vocab', type=str, required=True,
        help='vocabulary table, listing vocabulary line by line')
    parser.add_argument('--norm', type=str, default=None,
        help='normalization params')
    parser.add_argument('--mapping', type=str,
        help='additional mapping when evaluation')
    parser.add_argument('--model_dir', type=str, required=True,
        help='path of imported model')
    parser.add_argument('--beam_width', type=int, default=0,
        help='number of beams (default 0: using greedy decoding)')
    parser.add_argument('--delimiter', help='Symbols delimiter. Default: " "', type=str, default=' ')
    parser.add_argument('--binf_map', type=str, default='misc/binf_map.csv',
        help='Path to CSV with phonemes to binary features map')
    # Fixed help-string typo: "User phonemes" -> "Use phonemes".
    parser.add_argument('--use_phones_from_binf', help='Use phonemes decoded from binary features decoder outputs.',
        action='store_true')
    parser.add_argument('--feature_type', help='Acoustic feature type.', type=str,
        choices=['mfe', 'mfcc', 'lyon'], default='mfcc')
    parser.add_argument('--backend', help='Library for calculating acoustic features.', type=str,
        choices=['speechpy', 'librosa'], default='librosa')
    parser.add_argument('--n_mfcc', help='Number of MFCC coeffs.', type=int, default=13)
    parser.add_argument('--n_mels', help='Number of mel-filters.', type=int, default=40)
    parser.add_argument('--window', help='Analysis window length in ms.', type=int, default=20)
    parser.add_argument('--step', help='Analysis window step in ms.', type=int, default=10)
    parser.add_argument('--deltas', help='Calculate deltas and double-deltas.', action='store_true')
    parser.add_argument('--energy', help='Compute energy.', action='store_true')
    parser.add_argument('--output_file', help='location for saving predictions')
    return parser.parse_args(argv)
def input_fn(features, vocab_filename, norm_filename=None):
    """Build an inference tf.data.Dataset from in-memory feature arrays.

    Args:
        features: List of 2-D acoustic feature arrays (time x feature_dim).
        vocab_filename: Path to the vocabulary file for the lookup table.
        norm_filename: Optional path to normalization params (means/stds).

    Returns:
        A dataset processed by utils.process_dataset in inference mode.
    """
    feature_dim = features[0].shape[-1]

    def feature_generator():
        yield from features

    dataset = tf.data.Dataset.from_generator(
        feature_generator, tf.float32, tf.TensorShape([None, feature_dim]))
    vocab_table = utils.create_vocab_table(vocab_filename)
    means = stds = None
    if norm_filename is not None:
        means, stds = utils.load_normalization(norm_filename)
    return utils.process_dataset(dataset, vocab_table, utils.SOS, utils.EOS,
                                 means, stds, 1, 1, is_infer=True)
def to_text(vocab_list, sample_ids, delimiter=None):
    """Convert a sequence of symbol ids to delimited text, truncating at EOS.

    Args:
        vocab_list: List mapping symbol id -> symbol string.
        sample_ids: Iterable of integer symbol ids.
        delimiter: Symbol separator. Defaults to None, which falls back to the
            module-global `args.delimiter` for backward compatibility (the
            original read the global directly, which fails when this module is
            imported rather than run as a script).

    Returns:
        Symbols joined by the delimiter, up to (excluding) the first EOS.
    """
    if delimiter is None:
        delimiter = args.delimiter  # global set in the __main__ guard
    # Append EOS so index() always succeeds even if sample_ids lacks one.
    sym_list = [vocab_list[x] for x in sample_ids] + [utils.EOS]
    return delimiter.join(sym_list[:sym_list.index(utils.EOS)])
def main(args):
    """Transcribe one audio file with a trained LAS Estimator model.

    Loads the waveform, extracts acoustic features, runs Estimator.predict,
    decodes the predicted symbol ids to text, and optionally dumps the raw
    predictions to args.output_file via joblib.
    """
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(args)
    # Let the CLI beam width override whatever the stored hparams contain.
    hparams.decoder.set_hparam('beam_width', args.beam_width)
    vocab_list = utils.load_vocab(args.vocab)
    binf2phone_np = None
    if hparams.decoder.binary_outputs:
        if args.mapping is not None:
            vocab_list, mapping = utils.get_mapping(args.mapping, args.vocab)
            # del+add because HParams cannot overwrite an existing key in place.
            hparams.del_hparam('mapping')
            hparams.add_hparam('mapping', mapping)
        binf2phone = utils.load_binf2phone(args.binf_map, vocab_list)
        binf2phone_np = binf2phone.values
    def model_fn(features, labels, mode, config, params):
        # Closure injecting the phoneme/binary-features map into the LAS model fn.
        return las_model_fn(features, labels, mode, config, params,
            binf2phone=binf2phone_np, transparent_projection=args.use_phones_from_binf)
    model = tf.estimator.Estimator(
        model_fn=model_fn,
        config=config,
        params=hparams)
    audio, _ = librosa.load(args.waveform, sr=SAMPLE_RATE, mono=True)
    features = [calculate_acoustic_features(args, audio)]
    predictions = model.predict(
        input_fn=lambda: input_fn(features, args.vocab, args.norm))
    predictions = list(predictions)
    for p in predictions:
        # Prediction dicts carry keys like 'sample_ids...'; pick the first such key.
        phone_pred_key = next(k for k in p.keys() if k.startswith('sample_ids'))
        beams = p[phone_pred_key].T
        if len(beams.shape) > 1:
            # Beam search output: keep only the best (first) beam.
            i = beams[0]
        else:
            i = beams
        # Append EOS_ID so index() always finds a terminator, then truncate there.
        i = i.tolist() + [utils.EOS_ID]
        i = i[:i.index(utils.EOS_ID)]
        text = to_text(vocab_list, i)
        text = text.split(args.delimiter)
        for k in p.keys():
            print(f'{k}: {p[k].shape}')
        print(text)
    if args.output_file:
        dump(predictions, args.output_file)
        print(f'Predictions are saved to {args.output_file}')
if __name__ == '__main__':
    # tf.logging is TensorFlow 1.x API; this script targets TF1.
    tf.logging.set_verbosity(tf.logging.INFO)
    # NOTE: `args` is intentionally module-global — to_text() reads args.delimiter.
    args = parse_args()
    main(args)