In [19]:
import errno
import os
import tables
from os import path
import sys
import tarfile
import fnmatch
import pandas as pd
import subprocess
from mapping import phone_maps
import python_speech_features as psf
import scipy.io.wavfile as wav
import numpy as np

In [4]:
# Phone-symbol mapping table built from the "kaldi_60_48_39.map" file.
# read_transcript() calls its map_symbol_reduced() on every transcript label.
timit_phone_map = phone_maps(mapping_file="kaldi_60_48_39.map")

In [5]:
def clean(word):
    """Return ``word`` lower-cased with unwanted punctuation removed.

    The characters dropped are: ``. , ; " ! ? : -``. Every other character
    (including apostrophes and whitespace) is kept as-is.
    """
    unwanted = '.,;"!?:-'
    lowered = word.lower()
    return "".join(ch for ch in lowered if ch not in unwanted)

In [6]:
def compute_mfcc(wav_file, n_delta=0):
    """Compute MFCC features, optionally stacked with their time derivatives.

    Parameters
    ----------
    wav_file : array-like
        Audio samples, handed unchanged to ``psf.mfcc`` (called with the
        library's default parameters).
    n_delta : int, optional
        Number of derivative orders to append column-wise:
        0 -> MFCC only, 1 -> MFCC + delta, 2 -> MFCC + delta + delta-delta.

    Returns
    -------
    numpy.ndarray
        One row per frame; columns are the MFCCs followed by the requested
        delta blocks.

    Raises
    ------
    ValueError
        If ``n_delta`` is not 0, 1 or 2 (previously the integer 0 was returned
        silently, which only failed later in the caller).
    """
    if n_delta not in (0, 1, 2):
        raise ValueError("n_delta must be 0, 1 or 2, got {!r}".format(n_delta))

    mfcc_feat = psf.mfcc(wav_file)
    if n_delta == 0:
        return mfcc_feat

    # The second argument of psf.delta is the window half-width N (frames on
    # each side), not the derivative order.
    delta_feat = psf.delta(mfcc_feat, 1)
    if n_delta == 1:
        return np.hstack((mfcc_feat, delta_feat))

    # Delta-delta is the delta of the delta features. The earlier
    # psf.delta(mfcc_feat, 2) was just a wider first-order delta of the MFCCs.
    delta_delta_feat = psf.delta(delta_feat, 1)
    return np.hstack((mfcc_feat, delta_feat, delta_delta_feat))

In [7]:
def read_transcript(full_wav):
    """Read the phone-level transcript that accompanies a converted wav file.

    The transcript path is derived by dropping the last 8 characters of
    ``full_wav`` (the length of the "_rif.wav" suffix written by the
    conversion step) and appending ".PHN".

    Each non-blank line is expected to look like ``<start> <end> <phone>``,
    whitespace-separated. Blank lines are skipped.

    Parameters
    ----------
    full_wav : str
        Path of the converted wav file (ending in "_rif.wav").

    Returns
    -------
    (list, list)
        ``trans``: one entry per line, the last field of the line passed
        through ``timit_phone_map.map_symbol_reduced``.
        ``durations_int``: one ``[start, end]`` pair of ints per line, in the
        same order as ``trans``.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than three fields or non-integer bounds.
    """
    trans_file = full_wav[:-8] + ".PHN"
    trans = []
    durations_int = []
    with open(trans_file, "r") as file:
        for line_no, line in enumerate(file, start=1):
            # split() with no argument tolerates tabs and repeated spaces.
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 3:
                raise ValueError(
                    "{}:{}: expected '<start> <end> <phone>', got {!r}".format(
                        trans_file, line_no, line
                    )
                )
            durations_int.append([int(fields[0]), int(fields[1])])
            trans.append(timit_phone_map.map_symbol_reduced(symbol=fields[-1]))
    return trans, durations_int


## Converting train dataset sphere files into wav files

In [16]:
def convert_train_rifwav(args):
    """Convert the TIMIT training set from NIST SPHERE to RIFF wav files.

    Walks ``<datapath>/TIMIT/TRAIN``, runs ``sph2pipe -f wav`` on every
    ``*.WAV`` file to create a sibling ``*_rif.wav`` file, and appends the path
    of each successfully converted file (followed by a comma) to the
    ``train_wavs`` list file in the current working directory.

    Parameters
    ----------
    args : dict
        ``args["datapath"]``: directory that contains the ``TIMIT`` folder.
        ``args["preprocessed"]``: when truthy, nothing is converted and only a
        status message is printed.

    Notes
    -----
    Data is assumed to come from LDC - https://catalog.ldc.upenn.edu/ldc93s1
    ``sph2pipe`` must be available on the PATH.
    """
    target = args["datapath"]
    preprocessed = args["preprocessed"]
    print("Preprocessing data")
    print(preprocessed)
    if preprocessed:
        print("Train data is already preprocessed, now compute train features")
        return

    # Join the components separately so the path is valid on every platform.
    train_dir = os.path.join(target, "TIMIT", "TRAIN")
    print(train_dir)
    # The context manager guarantees the list file is flushed and closed.
    with open('train_wavs', 'a+') as list_file:
        for root, dirnames, filenames in os.walk(train_dir):
            for filename in fnmatch.filter(filenames, "*.WAV"):
                # fnmatch is case-insensitive on Windows, so previously
                # converted "*_rif.wav" outputs would match "*.WAV" too.
                if filename.lower().endswith("_rif.wav"):
                    continue
                # Use full paths instead of slicing a fixed number of
                # characters off ``root``; that only worked for one datapath.
                sph_file = os.path.join(root, filename)
                wav_file = sph_file[:-4] + "_rif.wav"
                print("converting {} to {}".format(sph_file, wav_file))
                # An argument list avoids shell-style splitting, so paths
                # containing spaces are passed through intact.
                status = subprocess.call(["sph2pipe", "-f", "wav", sph_file, wav_file])
                if status != 0:
                    print("sph2pipe failed (exit code {}) for {}".format(status, sph_file))
                    continue
                list_file.write(wav_file + ",")
    print("Preprocessing Complete")

## Converting test dataset sphere files into wav files

In [17]:
def convert_test_rifwav(args):
    """Convert the TIMIT test set from NIST SPHERE to RIFF wav files.

    Walks ``<datapath>/TIMIT/TEST``, runs ``sph2pipe -f wav`` on every
    ``*.WAV`` file to create a sibling ``*_rif.wav`` file, and appends the path
    of each successfully converted file (followed by a comma) to the
    ``test_wavs`` list file in the current working directory.

    Parameters
    ----------
    args : dict
        ``args["datapath"]``: directory that contains the ``TIMIT`` folder.
        ``args["preprocessed"]``: when truthy, nothing is converted and only a
        status message is printed.

    Notes
    -----
    Data is assumed to come from LDC - https://catalog.ldc.upenn.edu/ldc93s1
    ``sph2pipe`` must be available on the PATH.
    """
    target = args["datapath"]
    preprocessed = args["preprocessed"]
    print("Preprocessing data")
    print(preprocessed)
    if preprocessed:
        print("Test data is already preprocessed, now compute test features")
        return

    # Join the components separately so the path is valid on every platform.
    test_dir = os.path.join(target, "TIMIT", "TEST")
    print(test_dir)
    # The context manager guarantees the list file is flushed and closed.
    with open('test_wavs', 'a+') as list_file:
        for root, dirnames, filenames in os.walk(test_dir):
            for filename in fnmatch.filter(filenames, "*.WAV"):
                # fnmatch is case-insensitive on Windows, so previously
                # converted "*_rif.wav" outputs would match "*.WAV" too.
                if filename.lower().endswith("_rif.wav"):
                    continue
                # Use full paths instead of slicing a fixed number of
                # characters off ``root``; that only worked for one datapath.
                sph_file = os.path.join(root, filename)
                wav_file = sph_file[:-4] + "_rif.wav"
                print("converting {} to {}".format(sph_file, wav_file))
                # An argument list avoids shell-style splitting, so paths
                # containing spaces are passed through intact.
                status = subprocess.call(["sph2pipe", "-f", "wav", sph_file, wav_file])
                if status != 0:
                    print("sph2pipe failed (exit code {}) for {}".format(status, sph_file))
                    continue
                list_file.write(wav_file + ",")
    print("Preprocessing Complete")

## Computing train and test features and dumping them into HDF files

In [20]:
def compute_features(args):
    """Extract frame-level MFCC features with phone labels and save them as HDF.

    The list of wav files is read from the first line of ``train_wavs`` (when
    ``args["dataset"] == "train"``) or ``test_wavs`` (any other value). That
    line is a comma-separated list of paths with a trailing comma.

    For every wav file the phone transcript is read with ``read_transcript``;
    each ``[start, end]`` sample range is cut out of the signal, turned into
    feature frames with ``compute_mfcc``, and every frame of that segment is
    labelled with the phone belonging to the same transcript line.

    The result is a DataFrame with a ``features`` column (one feature vector
    per frame) and a ``labels`` column, stored under key ``"timit"`` in
    ``./<train|test>_features/<mfcc|mfcc_delta|mfcc_delta_delta>/timit.hdf``.
    The output directory is created if it does not exist.

    Parameters
    ----------
    args : dict
        ``args["n_delta"]``: 0, 1 or 2, forwarded to ``compute_mfcc`` and used
        to pick the output sub-directory.
        ``args["dataset"]``: ``"train"`` for the training split; anything else
        selects the test split.

    Raises
    ------
    ValueError
        If ``n_delta`` is not 0, 1 or 2.
    """
    n_delta = int(args["n_delta"])
    dataset = args["dataset"]

    feature_dirs = {0: "mfcc", 1: "mfcc_delta", 2: "mfcc_delta_delta"}
    if n_delta not in feature_dirs:
        raise ValueError("n_delta must be 0, 1 or 2, got {!r}".format(n_delta))

    # Both splits share one code path; only names and messages differ.
    if dataset == "train":
        split, list_name, done_message = "train", "train_wavs", "training features extracted"
    else:
        split, list_name, done_message = "test", "test_wavs", "testing features extracted"

    print("Building CSVs")

    mfcc_features = []
    mfcc_labels = []

    with open(list_name, "r") as file:
        lines = file.readlines()
    # Only the first line is used; the final element is the empty string
    # after the trailing comma, so it is dropped.
    full_wavs = lines[0].strip().split(",")[:-1]

    for full_wav in full_wavs:
        print("Computing for file: ", full_wav)
        trans, durations = read_transcript(full_wav=full_wav)
        (sample_rate, wav_file) = wav.read(full_wav)
        # Pair each phone with its own sample range. The earlier loop used
        # enumerate(durations[1:]) and so labelled segment k with phone k-1.
        for phone, (start, end) in zip(trans, durations):
            segment_feats = compute_mfcc(wav_file[start:end], n_delta=n_delta)
            mfcc_features.extend(segment_feats)
            mfcc_labels.extend([phone] * len(segment_feats))

    #Possibly separate features phone-wise and dump them? (np.where() could be used)
    timit_df = pd.DataFrame()
    timit_df["features"] = mfcc_features
    timit_df["labels"] = mfcc_labels

    out_path = "./{}_features/{}/timit.hdf".format(split, feature_dirs[n_delta])
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Pass the key by keyword: newer pandas deprecates the positional form.
    timit_df.to_hdf(out_path, key="timit")
    print(done_message)

In [23]:
if __name__ == "__main__":
    # "preprocessed" is set, so both conversion calls below only print a
    # status message and convert nothing.
    args1 = dict(datapath="C:\\Users\\Shivangi Singh\\ASRproject", preprocessed=True)
    convert_test_rifwav(args1)
    convert_train_rifwav(args1)

    # One argument dict per (dataset, n_delta) combination accepted by
    # compute_features.
    arg_train_delta0 = dict(n_delta=0, dataset="train")
    arg_train_delta1 = dict(n_delta=1, dataset="train")
    arg_train_delta2 = dict(n_delta=2, dataset="train")

    arg_test_delta0 = dict(n_delta=0, dataset="test")
    arg_test_delta1 = dict(n_delta=1, dataset="test")
    arg_test_delta2 = dict(n_delta=2, dataset="test")

    # The features were already computed by running these calls once;
    # uncomment them to regenerate the HDF files.
    #compute_features(arg_train_delta0)
    #compute_features(arg_train_delta1)
    #compute_features(arg_train_delta2)
    #compute_features(arg_test_delta0)
    #compute_features(arg_test_delta1)
    #compute_features(arg_test_delta2)
    print("Completed")
    

Preprocessing data
True
Test data is already preprocessed, now compute test features
Preprocessing data
True
Train data is already preprocessed, now compute train features
Completed
