In [1]:
# !pip install python_speech_features
# !pip install librosa

In [3]:
# TO UNTAR FILE
# !tar -xvf openslr_dataset2.tar.gz
# librosa.load('openslr_dataset1/LibriSpeech/dev-clean/1673/143396/1673-143396-0000.flac')

#### Installing Libraries

In [31]:
import os
import re
import sys
import urllib
from pathlib import Path
import pickle

import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import numpy as np
from python_speech_features import mfcc
from scipy.sparse import csr_matrix
from tqdm import tqdm

#### Functions to analyze sound signal

In [32]:
def plot_wave(path):
    samples,sample_rate=librosa.load(path,mono=True,sr=8000)  #16000,8000
    plt.figure(figsize=(15,5))
    librosa.display.waveplot(samples,sr=sample_rate)
    plt.show()
    
def plot_melspectogram(path,m_mels=128):
    samples,sample_rate=librosa.load(path,mono=True,sr=8000)  #16000,8000
    plt.figure(figsize=(15,5))
    S=librosa.feature.melspectrogram(samples,sr=sample_rate,n_mels=n_mels)
    log_S=librosa.power_to_db(S,ref=np.max)
    librosa.display.specshow(log_s)
    plt.show()

#### Preprocesssing of data

In [33]:
def audioToInputVector(audio_filename,numcep,numcontext):
    """Given a WAV audio file at ``audio_filename``, calculates ``numcep`` MFCC features
    at every 0.01s time step with a window length of 0.025s. Appends ``numcontext``
    context frames to the left and right of each time step, and returns this data
    in a numpy array."""
    audio,fs=librosa.load(audio_filename)
    #get mfcc coeficients
    features=mfcc(audio,samplerate=fs,numcep=numcep,nfft=551)  #NUMCEP=26,NFFT=512
    #we only keep every second feature (BiRNN stride=2)
    features=features[::2]            
    #one stride per time step in input
    num_strides=len(features)
    #add empty initial and final context
    empty_context=np.zeros((numcontext,numcep),dtype=features.dtype)
    features=np.concatenate((empty_context,features,empty_context))
    #create a view into the array with overlapping strides of size numcontext (past) + 1(present)+numcontext(future)
    window_size=2*numcontext+1
    train_inputs=np.lib.stride_tricks.as_strided(
                 features,
                 (num_strides,window_size,numcep),
                 (features.strides[0],features.strides[0],features.strides[1]),
                 writeable=False)
    
    #Flatten the second and third dimensions
    train_inputs=np.reshape(train_inputs,[num_strides,-1])
    
    #Copy strided array so that we can write to it safely
    train_inputs = np.copy(train_inputs)
    train_inputs=(train_inputs-np.mean(train_inputs))/np.std(train_inputs)
    
    #RETURN :)
    return train_inputs


In [34]:
def load_data(dir_path):
    """
    Args:
        dir_path: path to the directory with txt and audio files.
        how_many: Integer. Number of directories we want to iterate,
                  that contain the audio files and transcriptions.
    Returns:
        txts: The spoken texts extracted from the .txt files,
              which correspond to the .flac files in audios.
              Text version.
        audios: The .flac file paths corresponding to the
                sentences in txts. Spoken version.
    """
    dir_path = Path(dir_path)
    txt_list = [f for f in dir_path.glob('**/*.txt') if f.is_file()]
    audio_list = [f for f in dir_path.glob('**/*.flac') if f.is_file()]

    print('Number of audio txt paths:', len(txt_list))
    print('Number of audio file paths:', len(audio_list))

    txts = []
    audios = []
    audio_paths = []



    for i, txt in tqdm(enumerate(txt_list)):
        with open(txt) as f:
            for line in f.readlines():
                for audio in audio_list:
                    if audio.stem in line:
                        line = re.sub(r'[^A-Za-z]', ' ', line)
                        line = line.strip()
                        txts.append(line)
                        audios.append(audioToInputVector(audio, 26, 9))
                        audio_paths.append(audio)
                        break
    return txts, audios, audio_paths

In [35]:
txts,audios,audio_paths=load_data('LibriSpeech/dev-clean')

0it [00:00, ?it/s]

Number of audio txt paths: 97
Number of audio file paths: 2703


97it [09:07,  5.64s/it]


In [36]:
def split_txts(txts):
    """
    Args:
        txts: The texts that will be split
              into single characters
    Returns:
        The splitted texts and array of all unique characters
        in those texts.
    """
    txts_splitted = []
    unique_chars = set()      #if 3 sets are consider each of them can be updated by elements in other if used set1.update(set2) for other combinations

    for txt in txts:
        splitted = list(txt)    #list contains alphabets in sent separated by comma
        splitted = [ch if ch != ' ' else '<SPACE>' for ch in splitted]
        txts_splitted.append(splitted)
        unique_chars.update(splitted)
    return txts_splitted, sorted(unique_chars)


In [37]:
def create_lookup_dicts(unique_chars, specials=None):
    """
    Args:
        unique_chars: Set of unique chars appearning in texts.
        specials: Special characters we want to add to the dict,
                  such as <PAD>, <SOS> or <EOS>
    Returns:
        char2ind: look updict from character to index
        ind2char: lookup dict from index to character
    """
    char2ind = {}
    ind2char = {}
    i = 0

    if specials is not None:
        for sp in specials:
            char2ind[sp] = i
            ind2char[i] = sp
            i += 1
    for ch in unique_chars:
        char2ind[ch] = i
        ind2char[i] = ch
        i += 1
    return char2ind, ind2char

def convert_txt_to_inds(txt, char2ind, eos=False, sos=False):
    """
    Args:
        txt: Array of chars to convert to inds.
        char2ind: Lookup dict from chars to inds.
    Returns: The converted chars, i.e. array of ints.
    """
    txt_to_inds = [char2ind[ch] for ch in txt]
    if eos:
        txt_to_inds.append(char2ind['<EOS>'])
    if sos:
        txt_to_inds.insert(0, char2ind['<SOS>'])
    return txt_to_inds

def convert_inds_to_txt(inds, ind2char):
    """
    Args:
        inds: Array of ints to convert to chars
        ind2char: Lookup dict from ind to chars
    Returns: The converted inds, i.e. array of chars.
    """
    inds_to_txt = [ind2char[ind] for ind in inds]
    return inds_to_txt

In [38]:
def process_txts(txts, specials):
    """
    Processes the texts. Calls the functions split_txts,
    create_lookup_dicts and uses convert_txt_to_inds.
    Args:
        txts: Array of strings. Input texts.
        specials: Specials tokens we want to include in the
                  lookup dicts
    Returns:
        txts_splitted: Array of the input texts splitted up into
                       characters
        unique_chars: Set of Unique chars appearing in input texts.
        char2ind: Lookup dict from character to index.
        ind2char: Lookup dict from index to character.
        txts_converted: txts splitted converted to indices of
                        word2ind. i.e. array of arrays of ints.
    """
    txts_splitted, unique_chars = split_txts(txts)
    char2ind, ind2char = create_lookup_dicts(unique_chars, specials)
    txts_converted = [convert_txt_to_inds(txt, char2ind, eos=True, sos=True)
                      for txt in txts_splitted]

    return txts_splitted, unique_chars, char2ind, ind2char, txts_converted



In [39]:
txts_splitted,unique_chars,char2ind,ind2char,txts_converted=process_txts(txts,['<PAD>','<SOS>','<EOS>'])

In [10]:
def sort_by_length(audios,
                   txts,
                   audio_paths,
                   txts_splitted,
                   txts_converted,
                   by_text_length=True):
    """
    Sort texts by text length from shortest to longest.
    To keep everything in order we also sort the rest of the data.
    Args:
        by_text_length: Boolean. Sort either by text lengths or
                        by length of audios.
    Returns:
    """

    # check if that works. if not audios isn't a  numpy array.
    # in that case we could convert beforehand.
    if by_text_length:
        indices = [txt[0] for txt in sorted(enumerate(txts_converted), key=lambda x: len(x[1]))]
    else:
        indices = [a[0] for a in sorted(enumerate(audios), key=lambda x: x[1].shape[0])]
    txts_sorted = np.array(txts)[indices]
    audios_sorted = np.array(audios)[indices]
    audio_paths_sorted = np.array(audio_paths)[indices]
    txts_splitted_sorted = np.array(txts_splitted)[indices]
    txts_converted_sorted = np.array(txts_converted)[indices]

    return txts_sorted, audios_sorted, audio_paths_sorted, txts_splitted_sorted, txts_converted_sorted


In [11]:
txts_sorted,audios_sorted,audio_paths_sorted,txts_splitted_sorted,txts_converted_sorted=sort_by_length(audios,txts,audio_paths,txts_splitted,txts_converted,by_text_length=False)

In [12]:
txts_converted_sorted[0]

[1, 28, 8, 22, 2]

In [13]:
def pad_txt_sequences(sequences,pad_tok):
    """pads the sentences,so that all sentences in a batch have same length"""
    max_length=max(len(x) for x in sequences)
    sequence_padded,sequence_length=[],[]
    for seq in tqdm(sequences):
        seq_ = seq + [pad_tok] * max(max_length - len(seq), 0)

        sequence_padded.append(seq_)
        sequence_length.append(len(seq))

    return np.array(sequence_padded), sequence_length        

In [32]:

def pad_audio_sequences(sequences, tail=True):
    """
    Args:
        sequences: Array of audio sequences
        tail: Boolean. Append silence to end or beginning
    Returns: Padded array with audio sequences, padded with
             silence.
    """

    max_length = max(seq.shape[0] for seq in sequences)
#     print(max_length)
    sequences_padded, sequence_length = [], []

    for seq in tqdm(sequences):
        if tail:
            seq_shape = seq.shape
            pad_vector = [0] * seq_shape[1]
            n_vectors_to_add = max_length - seq_shape[0]

            for _ in range(n_vectors_to_add):
                seq = np.append(seq, [pad_vector], axis=0)

                
        #FOR NUMPY TO SPARSE ONLY FOR XTRAIN,XVAL NOT YTRAIN AND YVAL
        sequences_padded.append(csr_matrix(seq))
        sequence_length.append(seq_shape[0])


    return sequences_padded, sequence_length

In [15]:
txt_sequence_padded, txt_sequence_length=pad_txt_sequences(txts_converted_sorted,char2ind['<PAD>'])

100%|██████████| 2703/2703 [00:00<00:00, 84315.30it/s]


In [16]:
audio_sequence_padded, audio_sequence_length=pad_audio_sequences(audios_sorted)

100%|██████████| 2703/2703 [36:32<00:00,  1.23it/s]


In [46]:
import pickle

def save_as_pickled_object(obj, filepath):
    """
    This is a defensive way to write pickle.write, allowing for very large files on all platforms
    https://stackoverflow.com/questions/31468117/python-3-can-pickle-handle-byte-objects-larger-than-4gb
    """
    max_bytes = 2 ** 31 - 1
    bytes_out = pickle.dumps(obj)
    n_bytes = sys.getsizeof(bytes_out)
    with open(filepath, 'wb') as f_out:
        for idx in range(0, n_bytes, max_bytes):
            f_out.write(bytes_out[idx:idx + max_bytes])
            
def read_as_pickled_object(filepath):
    bytes_in = bytearray(0)
    input_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f_in:
        for _ in range(0, input_size, max_bytes):
            bytes_in += f_in.read(max_bytes)
    data2 = pickle.loads(bytes_in)
    
            
def write_pkl(path, data):
    """
    Writes the given data to .pkl file.
    """
    with open(path, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_pkl(path):
    """
    Loads data from given path to .pkl file.
    """
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

In [19]:
save_as_pickled_object(audio_sequence_padded,'processed_numpy/audio')
# write_pkl('processed_numpy/unique_chars', unique_chars)
write_pkl('processed_numpy/char2ind', char2ind)
write_pkl('processed_numpy/ind2char', ind2char)

In [3]:
np.save('processed_numpy/txtx',txt_sequence_padded)
np.save('processed_numpy/txt_length',txt_sequence_length)