In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
from itertools import zip_longest
from collections import defaultdict
from glob import iglob
from utils import read_wav, extract_feats, read_dataset, batch, decode, list_2d_to_sparse
from IPython.display import Audio
from sklearn.model_selection import train_test_split
from IPython.core.display import HTML
from random import choice

from keras.layers import LSTM, Dense, Convolution1D
from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed, Bidirectional

%matplotlib inline

Using TensorFlow backend.


In [6]:
# Root directories of the TED-LIUM release 2 splits; each is joined with
# "stm" and "sph" sub-directories further down in this notebook.
TRAIN_PATH, TEST_PATH = (
    "/toshiba/TEDLIUM_release2/train/",
    "/toshiba/TEDLIUM_release2/test/",
)

In [3]:
# Character inventory for transcripts: space, apostrophe, the 26 lowercase
# ASCII letters, then three accented characters. A symbol's integer id is
# simply its position in this ordering (0..30).
vocab = {
    symbol: index
    for index, symbol in enumerate(
        [' ', "'"]
        + [chr(code) for code in range(ord('a'), ord('z') + 1)]
        + ['ă', 'ō', 'ť']
    )
}
# Reverse lookup: integer id -> character.
inv_vocab = {index: symbol for symbol, index in vocab.items()}

In [None]:
def batch(generator, batch_size):
    """Group (features, transcript) pairs from ``generator`` into padded batches.

    Args:
        generator: iterator yielding ``(X, y)`` pairs. ``X`` is indexed as a
            2-D array (``len(X)`` rows, ``X.shape[1]`` features per row) and
            ``y`` is an iterable of symbols that are all keys of ``vocab``.
        batch_size: number of samples to collect per yielded batch.

    Yields:
        A 4-tuple ``(X_batch_padded, sequence_lengths, sparse_labels, y_batch)``:
        * ``X_batch_padded`` - array of shape (batch, longest sequence, features),
          shorter sequences are padded at the end with all-zero rows;
        * ``sequence_lengths`` - list with the unpadded length of each sample;
        * ``sparse_labels`` - ``list_2d_to_sparse(y_batch)`` (helper from utils,
          presumably a sparse-tensor representation - confirm against utils);
        * ``y_batch`` - list of label-id lists, one per sample.

    Raises:
        KeyError: if a transcript contains a symbol that is not in ``vocab``.

    The generator ends cleanly once ``generator`` is exhausted; a final,
    shorter batch is yielded if any samples were collected for it.
    """
    num_features = None
    exhausted = False
    while not exhausted:
        X_batch = []
        y_batch = []
        # Keep pulling until the batch is full so skipped samples do not
        # silently shrink it.
        while len(X_batch) < batch_size:
            try:
                X, y = next(generator)
            except StopIteration:
                # Letting StopIteration escape a generator body is turned into
                # RuntimeError (PEP 479), so stop explicitly instead.
                exhausted = True
                break
            # NOTE(review): the length check in this cell was left unfinished.
            # Dropping samples whose transcript is longer than the number of
            # feature rows is an assumption about the intended condition
            # (such samples cannot be aligned frame-by-frame) - confirm.
            if len(y) > len(X):
                continue
            if num_features is None:
                num_features = X.shape[1]
            X_batch.append(X)
            y_batch.append([vocab[symbol] for symbol in y])
        if not X_batch:
            # Nothing collected (source exhausted or batch_size < 1).
            return
        sequence_lengths = [len(sample) for sample in X_batch]
        # zip_longest walks all samples row by row and pads the short ones with
        # zero rows, giving (time, batch, features); transpose to batch-major.
        time_major = list(zip_longest(*X_batch, fillvalue=np.zeros(num_features)))
        X_batch_padded = np.array(time_major).transpose([1, 0, 2])
        yield X_batch_padded, sequence_lengths, list_2d_to_sparse(y_batch), y_batch
        

In [4]:
def tedlium_gen(path, max_opened_files=10):
    """Endlessly yield random (features, transcript) pairs from a TED-LIUM split.

    Transcripts are read from ``<path>/stm/*.stm`` and audio from
    ``<path>/sph/<session>.wav``. At most ``max_opened_files`` decoded wav
    files are kept in memory; once every phrase of a cached session has been
    yielded, that session is evicted and a different one is loaded.

    Args:
        path: directory containing the ``stm`` and ``sph`` sub-directories.
        max_opened_files: upper bound on the number of sessions held in memory.

    Yields:
        ``(extract_feats(rate, samples)[2], text)`` where ``samples`` is the
        slice of the session audio between the phrase's start and stop times
        and ``text`` is the phrase transcript.

    Raises:
        ValueError: if ``max_opened_files`` is smaller than 1 or no STM file
            with at least one phrase is found.
    """
    if max_opened_files < 1:
        raise ValueError("max_opened_files must be at least 1")

    wav_path = os.path.join(path, "sph")
    stm_path = os.path.join(path, "stm")
    print("Reading stms")
    stms = {}
    for stm_file in iglob(os.path.join(stm_path, "*.stm")):
        session_name = os.path.basename(stm_file).split(".stm")[0]
        phrases = []
        with open(stm_file, "r") as fin:
            for line in fin:
                segments = line.strip().split()
                # Blank lines and ";;" comment lines carry no segment data.
                if not segments or segments[0].startswith(";;"):
                    continue
                # Fields 3 and 4 are start/stop times in seconds; everything
                # from field 6 onwards is the transcript.
                start = float(segments[3])
                stop = float(segments[4])
                text = " ".join(segments[6:])
                phrases.append((start, stop, text))
        # A session without phrases could never yield anything and would make
        # the random phrase pick below fail, so it is left out entirely.
        if phrases:
            stms[session_name] = phrases

    if not stms:
        raise ValueError("no STM transcripts with phrases found in " + stm_path)

    print("Building wav cache")

    def load_session(name):
        """Read one session's wav and pair it with the set of unused phrase ids."""
        full_path = os.path.join(wav_path, name + ".wav")
        rate, data = read_wav(full_path)
        free_phrase_ids = set(range(len(stms[name])))
        # Column 0 of the 2-D sample array (presumably the first channel).
        return rate, data[:, 0].astype(np.float32), free_phrase_ids

    session_names = tuple(stms.keys())
    cache_size = min(max_opened_files, len(session_names))
    wav_cache = {}
    # replace=False: sampling with replacement could pick the same session
    # twice, loading its wav twice while filling only one cache slot.
    for session_name in np.random.choice(session_names, size=cache_size, replace=False):
        session_name = str(session_name)
        wav_cache[session_name] = load_session(session_name)

    while True:
        session_name = choice(tuple(wav_cache.keys()))
        rate, data, free_phrase_ids = wav_cache[session_name]
        phrase_id = choice(tuple(free_phrase_ids))
        start, end, text = stms[session_name][phrase_id]
        yield extract_feats(rate, data[int(start * rate):int(end * rate)])[2], text

        free_phrase_ids.remove(phrase_id)
        if not free_phrase_ids:
            del wav_cache[session_name]
            # Only consider sessions that are not cached right now: replacing
            # an existing entry would reset its progress and shrink the cache
            # by one slot. The session just evicted is always a candidate, so
            # this list is never empty.
            candidates = [name for name in session_names if name not in wav_cache]
            new_session_name = choice(candidates)
            wav_cache[new_session_name] = load_session(new_session_name)

In [9]:
import os
from glob import glob
from multiprocessing import Pool
from scipy.io.wavfile import read, write

In [19]:
# One entry per file in the training "stm" directory: the file name with
# ".stm" (and anything following it) stripped off.
session_names = [
    stm_filename.split(".stm")[0]
    for stm_filename in os.listdir(os.path.join(TRAIN_PATH, "stm"))
]

In [11]:
def extract_ted_data(session_name, stm_path, wav_path, write_wav_path):
    """Cut one session's wav into per-phrase wav files described by its STM file.

    Args:
        session_name: base name shared by ``<stm_path>/<session_name>.stm``
            and ``<wav_path>/<session_name>.wav``.
        stm_path: directory holding the STM transcript files.
        wav_path: directory holding the session wav files.
        write_wav_path: directory the phrase wavs are written to; it is
            created if it does not exist yet.

    Returns:
        dict mapping each written file name
        (``"<session_name>_<start>_<stop>.wav"``) to the phrase transcript.
    """
    stm_file = os.path.join(stm_path, session_name + ".stm")
    rate, data = read(os.path.join(wav_path, session_name + ".wav"))
    phrases = []
    with open(stm_file, "r") as fin:
        for line in fin:
            segments = line.strip().split()
            # Blank lines and ";;" comment lines carry no segment data and
            # would otherwise fail on the field lookups below.
            if not segments or segments[0].startswith(";;"):
                continue
            # Fields 3 and 4 are start/stop times in seconds; everything from
            # field 6 onwards is the transcript.
            start = float(segments[3])
            stop = float(segments[4])
            text = " ".join(segments[6:])
            phrases.append((start, stop, text))

    # exist_ok keeps this safe when several worker processes target the same
    # output directory at once.
    os.makedirs(write_wav_path, exist_ok=True)

    result = {}
    for start, stop, text in phrases:
        filename = "{}_{}_{}.wav".format(session_name, start, stop)
        # Seconds -> sample indices at the wav's own sampling rate.
        write(os.path.join(write_wav_path, filename),
              rate, data[int(rate * start):int(rate * stop)])
        result[filename] = text
    return result

In [None]:
# Single-session trial run: split the "AlGore_2008" training talk into
# phrase wavs under /toshiba/tmp and display the filename -> transcript dict.
extract_ted_data(
    session_name="AlGore_2008",
    stm_path=os.path.join(TRAIN_PATH, "stm"),
    wav_path=os.path.join(TRAIN_PATH, "sph"),
    write_wav_path="/toshiba/tmp",
)

In [14]:
class Extractor:
    """Callable that runs ``extract_ted_data`` with fixed directory arguments.

    Instances hold only three path attributes, so they can be handed to
    ``Pool.map`` as the per-session worker function.
    """

    def __init__(self, stm_path, wav_path, write_wav_path):
        """Remember the STM, source-wav and output-wav directories."""
        self.stm_path, self.wav_path, self.write_wav_path = (
            stm_path,
            wav_path,
            write_wav_path,
        )

    def __call__(self, session_name):
        """Extract the phrases of ``session_name``; returns filename -> text."""
        return extract_ted_data(
            session_name,
            self.stm_path,
            self.wav_path,
            self.write_wav_path,
        )

# Worker bound to the training split; phrase wavs go to /toshiba/tedlium_phrases/.
extractor = Extractor(
    stm_path=os.path.join(TRAIN_PATH, "stm"),
    wav_path=os.path.join(TRAIN_PATH, "sph"),
    write_wav_path="/toshiba/tedlium_phrases/",
)

In [20]:
# Run the extraction for every session on 8 worker processes. `result` holds
# one filename -> transcript dict per session, in session_names order.
with Pool(processes=8) as pool:
    result = pool.map(extractor, session_names)

In [22]:
# Number of per-session dicts returned by the pool (one per entry in session_names).
len(result)

1495

In [23]:
# Inspect the first session's mapping of phrase wav file name -> transcript.
result[0]

{'GaryWolf_2010S_119.43_127.66.wav': 'the incredible detailed information that you can get from just one sensor like this this kind of sensor is in the',
 'GaryWolf_2010S_136.57_144.01.wav': "has just that sensor in it you're probably familiar with the nike plus system i just put it up because that little blue dot is the sensor",
 'GaryWolf_2010S_156.1_162.72.wav': 'strap that people use to transmit heart rate data to their nike plus system this is a beautiful new',
 'GaryWolf_2010S_17.94_25.28.wav': 'i got up this morning at six ten am after going to sleep at twelve forty five am i was awakened once during the night',
 'GaryWolf_2010S_171.96_181.68.wav': 'the sensor is just a little strip of metal in that headband there the rest of it is the bedside console just for reference this is a sleep tracking system from just a few years ago i mean really',
 'GaryWolf_2010S_193.61_208.17.wav': 'is a very small gps transceiver which gives you the date and location of an asthma incident giving y

In [24]:
# Flatten the per-session dicts into one filename -> transcript mapping.
# Sessions are visited in order, so a later duplicate key would win.
result_all = {
    filename: text
    for session_result in result
    for filename, text in session_result.items()
}

In [27]:
# Total number of phrase files across all sessions after merging.
len(result_all)

92973

In [29]:
# Persist the phrase index as one "<wav filename>\t<transcript>" line per phrase.
# The encoding is pinned to UTF-8 because transcripts may contain non-ASCII
# characters (the vocabulary includes some) and the platform default encoding
# is not guaranteed to be able to represent them.
with open("/toshiba/tedlium_phrases.txt", "w", encoding="utf-8") as fout:
    for filename, text in result_all.items():
        fout.write(filename + "\t" + text + "\n")