# Speech Prominence Detection
Speech Prominence Detection is the process of identifying the most important or prominent parts of speech in an audio signal. Prominence refers to the degree of emphasis or attention that a particular word or phrase receives in spoken language, which is often conveyed through variations in pitch, loudness, and timing. Speech Prominence Detection is an essential task in speech processing and natural language understanding, with a wide range of applications including speech recognition, sentiment analysis, and language translation. The goal of this project is to develop a machine learning model that can accurately identify the prominent parts of speech in a given audio signal. This project will involve feature extraction, model training, and evaluation, with the aim of achieving high accuracy and generalizability on a diverse range of speech datasets. The results of this project could have significant implications for improving speech recognition and understanding systems in real-world applications.

In [3]:
import numpy as np
import wave
import struct
import math
from scipy.signal import butter, lfilter
import librosa

In [4]:
#Auxilliary functions for feature computation
def spectral_selection(x, n):                              #out of the 19 subband energies computed, this function selects the n energies with the highest values
    y = x.shape
    row = y[0]
    col = y[1]
    xx = []
    for i in range(0,col,1):
        v = x[:,i]                                           # his line selects the i-th column of x and assigns it to the variable v
        v = np.array([v])
        v = v.T                 
        t = np.array(np.arange(1,row+1)).reshape(-1,1)  #This line generates a column vector t containing values from 1 to row, representing the row numbers.

        v = np.hstack((v, t))                      #This line horizontally stacks v and t, resulting in a matrix where the first column contains the values of v and the second column contains the row numbers.
        v_sort = v[v[:,0].argsort(),]                 #This line sorts v based on the values in the first column, resulting in v_sort.
        v_sort_sel = v_sort[row-n:row, :]             #This line selects the last n rows from v_sort and assigns the result to v_sort_sel.
        vv = v_sort_sel[v_sort_sel[:,1].argsort(),]   #This line sorts v_sort_sel based on the values in the second column, resulting in vv
        #tt = numpy.array([vv[:,0]])                  #The subsequent code block handles the concatenation of vv[:, 0] (the first column of vv) with the previous iterations' results stored in xx
        if i!=0:
            if i==1:
                pp = np.array([xx])
                pp = pp.T
            else:
                pp = xx
            pp2 = np.array([vv[:,0]])
            pp2 = pp2.T
            xx = np.hstack((pp, pp2))
        else:
            xx = np.concatenate((xx, vv[:,0]))
    return xx                                         # xx is a matrix containing the selected energies for all the frames with size n x col


def temp_vec_corr(x2, t_sigma):
    from scipy.stats import norm
    y = x2.shape
    row = y[0]
    col = y[1]
    wn = norm.pdf(np.arange(1,col+1,1), (col+1)/2, t_sigma)              # pdf function with mean = (col+1)/2 and std = t_sigma as window
    # NOTE: if we use continue to manipulate the variable x2, (the function argument), then it gets reflected back in
    # in the parent function. (No idea why). So create a copy of x2 and work with that.
    x3 = np.zeros((row,col))
    for i in range(0,row,1):
        x3[i,:] = np.multiply(x2[i,:],wn)                                # windowing the frame energies
    s=0
    for i in range(0,col-1,1):
        for j in range(i+1,col,1):
            s+= np.multiply(x3[:,i], x3[:,j])                             # computing the correlation between the consecutive frames 
    if col!=1:
        s = np.sqrt(np.divide(s, (col-1)*col/2))
    else:
        s = x3                                                             
    return s

def temporal_corr(x, win, t_sigma):
    hwin = (win-1)/2           # hwin is the half window size
    yy = x.shape
    row = yy[0]
    col = yy[1]

    row = int(row)
    hwin = int(hwin)

    x = np.array([np.concatenate((np.zeros((row,hwin)), x, np.zeros((row, hwin))), axis = 1)])              # zero padding the input matrix, hwin zeros on each sides of the columns
    y = []
    for i in range(hwin,col+hwin,1):
        temp2 = x[0,:,i-hwin:i+hwin+1]
        z = temp_vec_corr(temp2, t_sigma)
        z = np.array([z]).T
        if i==hwin:
            y = np.concatenate((y, z[:,0]))
        else:
            if i==hwin+1:
                y = np.array([y]).T
            y = np.hstack((y, z))
    return y

def spectral_corr(x):
    yy = x.shape
    row = yy[0]
    col = yy[1]

    s = np.zeros((1, col))
    for i in range(0, row-1, 1):
        for j in range(i+1, row, 1):
            s = s+np.multiply(x[i,:], x[j,:])

    if row!=1:
        s = np.sqrt(np.divide(s, (row*(row-1)/2)))
    else:
        s = x
    return s

def statFunctions_Syl(t):
    from scipy.stats.mstats import gmean
    if np.min(t)<0:
        t = np.subtract(t,min(t[0]))
        #out = []
        #return out
    out = np.array([np.median(t[0]), np.mean(t[0]), gmean(np.absolute(t[0])), np.max(t[0])-np.min(t[0]), np.std(t[0])])
    out = np.array([out]).T
    t = np.subtract(t,np.min(t[0]))
    t = np.divide(t, np.sum(t[0]))
    tempArr = np.array([np.arange(1,len(t[0])+1)])
    temporalMean = np.sum(np.multiply(tempArr,t)[0])
    temporalStd = np.sqrt(np.sum(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),2),t[0])))
    temporalSkewness = np.sum(np.divide(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),3),t[0]),np.power(temporalStd,3)))
    temporalKurthosis = np.sum(np.divide(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),4),t[0]),np.power(temporalStd,4)))
    arr1 = np.array([np.array([temporalStd, temporalSkewness, temporalKurthosis])]).T
    out = np.vstack((out,arr1))
    return out

def statFunctions_Vwl(t):
    if np.min(t)<0:
        t = np.subtract(t,min(t[0]))
        #out = []
        #eturn out
    out = np.array([np.median(t[0]), np.mean(t[0]), np.max(t[0])-np.min(t[0]), np.std(t[0])])
    out = np.array([out]).T
    t = np.subtract(t,np.min(t[0]))
    t = np.divide(t, np.sum(t[0]))
    tempArr = np.array([np.arange(1,len(t[0])+1)])
    temporalMean = np.sum(np.multiply(tempArr,t)[0])
    temporalStd = np.sqrt(np.sum(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),2),t[0])))
    temporalSkewness = np.sum(np.divide(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),3),t[0]),np.power(temporalStd,3)))
    temporalKurthosis = np.sum(np.divide(np.multiply(np.power(np.subtract(np.array([np.arange(1,len(t[0])+1)]),temporalMean),4),t[0]),np.power(temporalStd,4)))
    arr1 = np.array([np.array([temporalStd, temporalSkewness, temporalKurthosis])]).T
    out = np.vstack((out,arr1))
    return out

def smooth(t_cor, swin, sigma):
    from scipy.stats import norm
    ft = norm.pdf(np.arange(1,swin+1), (swin+1)/2, sigma)
    ft = np.array([ft])
    t_cor = np.array([t_cor])
    convRes = np.zeros((1, t_cor.shape[2]+ft.shape[1]-1))
    convRes = np.convolve(t_cor[0,0,:], ft[0,:])
    y = convRes[np.arange((swin+1)//2-1, len(convRes)-(swin-1)//2, 1)]
    return y

def get_labels(lab_list,fa,fileName):
        L=[]; fb=fa; filenm=[];
        
        for num in range(0,len(lab_list)):
            if str((lab_list[num][0].tolist())[0]) == str('P'):
                L.append(1)
                filenm.append(fileName)           
            else:
                L.append(0)
                filenm.append(fileName)
        fb = np.vstack((fa,L))
#        fb = np.vstack((fb,np.asarray(filenm,object)))
        return fb,filenm

def get_labels_seq2seq(lab_list):
        L=[];# filenm=[];
        
        for num in range(0,len(lab_list)):
            if str((lab_list[num][0].tolist())[0]) == str('P'):
                L.append(1)
#                filenm.append(fileName)           
            else:
                L.append(0)
                #filenm.append(fileName)
        #fb = np.vstack((fa,L))
#        fb = np.vstack((fb,np.asarray(filenm,object)))
        return L
    
def vocoder_func(wavPath):

    # FILTER DEFINITIONS

    def butter_bandpass(lowcut, highcut, fs, order):
        nyq = 0.5*fs
        low = float(lowcut) / nyq
        high = float(highcut) / nyq
        b, a = butter(order, [low, high], btype='band')
        return b, a

    def butter_lowpass(lowcut, fs, order):
        nyq = 0.5*fs
        low = float(lowcut) / nyq
        b ,a = butter(order, low, btype='lowpass')
        return b, a

    def butter_bandpass_filter(data, lowcut, highcut, fs, order):
        b, a = butter_bandpass(lowcut, highcut, fs, order=order)
        y = lfilter(b, a, data)
        return y

    def butter_lowpass_filter(data, lowcut, fs, order):
        b, a = butter_lowpass(lowcut, fs, order=order)
        y = lfilter(b, a, data)
        return y

    # FUNCTION TO READ A .wav FILE MATLAB STYLE

    def readWav(wavPath):
        waveFile = wave.open(wavPath)
        fs = waveFile.getframerate()
        length = waveFile.getnframes()
        data = []
        for i in range(0, length):
            waveData = waveFile.readframes(1)
            data.append(struct.unpack("<h", waveData))
        waveFile.close()
        data = np.array([data])
        data = data.astype(float)/np.max(np.abs(data))
        data = data[0]
        return data, fs, length

    # BUFFER FUNCTION AS DEFINED IN MATLAB

    def buffer(x, n, p=0, opt=None):
        import numpy
        if p >= n:
            raise ValueError('p ({}) must be less than n ({}).'.format(p,n))
        cols = int(numpy.ceil(len(x)/float(n-p)))+1
        if opt == 'nodelay':
            cols += 1
        elif opt != None:
            raise SystemError('Only `None` (default initial condition) and '
                              '`nodelay` (skip initial condition) have been '
                              'implemented')
        b = numpy.zeros((n, cols))
        j = 0
        for i in range(cols):
            if i == 0 and opt == 'nodelay':
                b[0:n,i] = x[0:n]
                continue
            elif i != 0 and p != 0:
                b[:p, i] = b[-p:, i-1]
            else:
                b[:p, i] = 0
            k = j + n - p
            n_end = p+len(x[j:k])
            b[p:n_end,i] = x[j:k,0]
            j = k
        return b

    fltcF= np.array([240,360,480,600,720,840,1000,1150,1300,1450,1600,1800,2000,2200,2400,2700,3000,3300,3750])
    fltBW= np.array([120,120,120,120,120,120,150,150,150,150,150,200,200,200,200,300,300,300,500])

    fltFc= np.array([np.subtract(fltcF,np.divide(fltBW,2)),np.add(fltcF,np.divide(fltBW,2))])
    fltLpFc= 50

    sig, Fs, length = readWav(wavPath)
    # print("sig.shape: ", sig.shape)
    # print("Fs: ", Fs)

    # Saving the audio in a txt file
    xx = np.append(Fs,sig)                       # sig is the amplitude of the audio signal

    nWndw = int(round(Fs*0.02))
    # print("nWndw is: ", nWndw)

    nOverlap = int(round(Fs*0.01))
    # print("nOverlap is: ", nOverlap)

    sig = 0.99*sig/max(abs(sig))                 # Normalizing the signal
    
    # Windowing first and filtering next
    sigFrames= buffer(sig*32768,nWndw,nOverlap)          # sig Frames is a 2D array where each column represents an analysis frame 
    subBandEnergies= np.zeros([19,sigFrames.shape[1]])   # 2D array with 19 rows and number of columns equal to number of frames

    for j in range(0,sigFrames.shape[1]): 
        currFrame = np.array([sigFrames[:,j]])                  # 1D array with the selected frame
        for i in range(0,fltFc.shape[1]):
            fltFrame = butter_bandpass_filter(currFrame[0], fltFc[0][i], fltFc[1][i], Fs, 2); fltFrame = fltFrame.T  # this line applies the bandpass filter to the current frame with fltFc[0][i] as lowcut and fltFc[1][i] as highcut
            rectFrame = np.abs(fltFrame[0:nWndw])
            lpFltFrame = butter_lowpass_filter(rectFrame, float(fltLpFc), Fs, 2)
            currEnergy = lpFltFrame[nWndw-1]
            if currEnergy < 1:
                currEnergy = 0.5
            subBandEnergies[i,j] = math.exp(2*math.log(currEnergy)/math.log(10))
    subBandEnergies = np.concatenate((np.exp(0.5*np.ones((19,1))),subBandEnergies[:,0:-2]),axis=1).T

    return Fs, subBandEnergies, xx

# so you have the signal in frames. Each column is a frame and the frames and are obtained by windowing the original signal and the frames have a 50% overlap.
# you calculate the subbandenergies frame by frame 
# for each frame you bandpass filter it through the 19 subbandenergies and take log of the last value of the filtered signal as the sub-band energy value.
# you concatenate a 0.5 to the first column of the subbandenergies matrix and remove the last two columns of the subbandenergies matrix.

In [5]:
# imports
import numpy as np
import pandas as pd
import re
import os
import collections
import scipy
from scipy.signal import medfilt
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

In [6]:
ger_train_dir = "/libri-360-train_wav/wav_matched_files/"
# ita_train_dir = os.path.join(data_dir, "ITA/train/")

phn_dir = "/Harsha/FA_results/"           # phonemes for vowels 
syl_dir = "/Harsha/syllables/"          # syllables 
# word_dir = "/Harsha/words/"             # words


In [7]:
if not os.path.exists(ger_train_dir):
    print("German Train directory does not exist")

# if not os.path.exists(ita_train_dir):
#     print("Italian Train directory does not exist")

if not os.path.exists(phn_dir):
    print("Phoneme directory does not exist")

if not os.path.exists(syl_dir):
    print("Syllable directory does not exist")

# if not os.path.exists(word_dir):
#     print("Word directory does not exist")

German Train directory does not exist
Phoneme directory does not exist
Syllable directory does not exist


In [8]:
ger_train_files = os.listdir(ger_train_dir)
# ita_train_files = os.listdir(ita_train_dir)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/libri-360-train_wav/wav_matched_files/'

In [9]:
# Compute features
twin = 5
t_sigma = 1.4
swin = 7
s_sigma = 1.5
mwin = 13
max_threshold = 25

vwlSB_num = 4
vowelSB = [1, 2, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17]
sylSB_num = 5
sylSB = [1, 2, 3, 4, 5, 6, 13, 14, 15, 16, 17, 18]

startWordFrame_all = []
spurtStartFrame_all = []
spurtEndFrame_all = []
vowelStartFrame_all = []
vowelEndFrame_all = []
eng_full_all = []
spurtStress_all = []

In [10]:
def get_syllables(syl_file):
    syl_array = []
    try:
        fid = open(syl_file, 'r')
        syl_array = np.loadtxt(fid, usecols=(0, 1, 2), dtype={'names': ('a', 'b', 'c'), 'formats': ('i4', 'f4', 'f4')})
        fid.close
    except:
        print('File does not exist')
        return

    labels = syl_array['a']
    spurtStartTimes = syl_array['b']
    spurtEndTimes = syl_array['c']

    return spurtStartTimes, spurtEndTimes, labels

In [11]:
import re

In [12]:
def get_vowels(phn_file):
    phn_array = []
    try:
        fid = open(phn_file, 'r')
        phn_array = np.loadtxt(fid, usecols=(0, 1, 2), dtype={'names': ('a', 'b', 'c'), 'formats': ('f4', 'f4', 'S16')})
        fid.close
    except:
        print('File does not exist')
        return


    phnStartTimes = phn_array['a']
    phnEndTimes = phn_array['b']
    phoneme = phn_array['c']
    # convert labels to string
    phoneme = [x.decode('UTF-8') for x in phoneme]
    
    vowelList = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'eh', 'er', 'ey', 'ih', 'iy', 'ow', 'oy', 'uh', 'uw']
    vowel_start_time = []
    vowel_end_time = []
    vowel = []

            
    for i in range(0, len(phoneme)):
        # convert phoneme[i] to only alphabets 
        phoneme[i] = re.sub('[^A-Za-z]+', '', phoneme[i])
        if phoneme[i].lower() in vowelList:
            vowel_start_time.append(phnStartTimes[i])
            vowel_end_time.append(phnEndTimes[i])
            vowel.append(phoneme[i].lower())

    return vowel_start_time, vowel_end_time, vowel

In [13]:
def get_words(word_file):
    word_array = []
    try:
        fid = open(word_file, 'r')
        word_array = np.loadtxt(fid, usecols=(2, 4), dtype={'names': ('a', 'b'), 'formats': ('f4', 'S16')})
        fid.close
    except:
        print('File does not exist')
        return

    startWordTimes = word_array['a']
    words = word_array['b']
    # convert words to string
    words = [x.decode('UTF-8') for x in words]
    return startWordTimes, words

In [105]:
def get_sylTCSSBC(sylSB, eng_full, sylSB_num, twin, t_sigma, swin, s_sigma, spurtStartTime, vowelStartTime, startWordFrame):
    # TCSSBC computation
    # vowelStartTime = np.squeeze(vowelStartTime)        #NOTE you changed this, not in original code 
    startWordFrame = np.squeeze(startWordFrame)        #NOTE you changed this, not in original code 

    if len(sylSB) > sylSB_num:
        eng = spectral_selection(
            eng_full[np.subtract(sylSB, 1), :], sylSB_num)
    else:
        eng = eng_full[sylSB, :]               # extract only the sub-band energies that are in the sylSB list


    t_cor = temporal_corr(eng, twin, t_sigma)             # calculate correlation spectrally and temporally 

    s_cor = spectral_corr(t_cor)

    sylTCSSBC = smooth(s_cor, swin, s_sigma)                # smooth the correlation
    
    sylTCSSBC = np.array([sylTCSSBC])     
    # sylTCSSBC = np.array(sylTCSSBC)  
    # print("sylTCSSBC.shape: ", sylTCSSBC.shape)           

    start_idx = np.round(spurtStartTime[0]*100).astype(int)             # get the start index of the spurt

    sylTCSSBC = np.array([sylTCSSBC[0][start_idx:-1]])     
    # sylTCSSBC = np.array(sylTCSSBC[start_idx:-1])                  # clip the TCSSBC contour from the spurt start
  
    sylTCSSBC = np.divide(sylTCSSBC, max(sylTCSSBC[0]))                 # normalize the TCSSBC contour
    # sylTCSSBC = np.divide(sylTCSSBC, max(sylTCSSBC))   

    # print("sylTCSSBC.shape: ", sylTCSSBC.shape)
    # print("sylTCSSBC: ", sylTCSSBC)
    
    if len(vowelSB) > vwlSB_num:                                       
        eng = spectral_selection(eng_full[np.subtract(vowelSB, 1), :], vwlSB_num)
    else:
        eng = eng_full[vowelSB, :]                                       # extract only the sub-band energies that are in the vowelSB list

    t_cor = temporal_corr(eng, twin, t_sigma)
    s_cor = spectral_corr(t_cor)                     
    vwlTCSSBC = smooth(s_cor, swin, s_sigma)      

    vwlTCSSBC = np.array([vwlTCSSBC])
    # vwlTCSSBC = np.array(vwlTCSSBC)
    # print("vwlTCSSBC.shape: ", vwlTCSSBC.shape)

    # Modify TCSSBC contour by clipping from the vowel start
    start_idx = np.round(vowelStartTime[0]*100).astype(int)         # get the start index of the vowel
    # vwlTCSSBC = np.array([vwlTCSSBC[0][start_idx:-1]])                 # clip the TCSSBC contour from the vowel start
    vwlTCSSBC = np.array([vwlTCSSBC[0][start_idx:-1]])   
    # print("vwlTCSSBC.shape: ", vwlTCSSBC.shape)
    # print("vwlTCSSBC: ", vwlTCSSBC)

    vwlTCSSBC = np.divide(vwlTCSSBC, max(vwlTCSSBC[0]))                 # normalize the TCSSBC contour
    # vwlTCSSBC = np.divide(vwlTCSSBC, max(vwlTCSSBC))   

    word_duration = np.zeros((1, len(startWordFrame) - 1))
  
    word_Sylsum = np.zeros((1, len(startWordFrame) - 1))
    
    word_Vwlsum = np.zeros((1, len(startWordFrame) - 1))
    
    for j in range(0, len(startWordFrame) - 1):
        temp_start = startWordFrame[j].astype(int)
        temp_end = startWordFrame[j + 1].astype(int) - 1

        if (temp_end >= sylTCSSBC.shape[1]):
           
            temp_end1 = sylTCSSBC.shape[1]-1
            sylTCSSBC[0, np.arange(temp_start, temp_end1)] = medfilt(sylTCSSBC[0, np.arange(temp_start, temp_end1)], 3)
            sylTCSSBC[0, temp_start] = sylTCSSBC[0, temp_start+1]
            sylTCSSBC[0, temp_end1] = sylTCSSBC[0, temp_end1 - 1]                          # median filteringv the TCSSBC contour
            tempArr = sylTCSSBC[0, np.arange(temp_start, temp_end1)]
            word_Sylsum[0, j] = tempArr.sum(axis=0)                                        # calculate the sum of the TCSSBC contour for the word  # need word boundaries for this 
        else:
            sylTCSSBC[0, np.arange(temp_start, temp_end)] = medfilt(
                sylTCSSBC[0, np.arange(temp_start, temp_end)], 3)
            sylTCSSBC[0, temp_start] = sylTCSSBC[0, temp_start+1]
            sylTCSSBC[0, temp_end] = sylTCSSBC[0, temp_end - 1]
            tempArr = sylTCSSBC[0, np.arange(temp_start, temp_end)]
            word_Sylsum[0, j] = tempArr.sum(axis=0)
            
        if (temp_end >= vwlTCSSBC.shape[1]):
            temp_end = vwlTCSSBC.shape[1]-1

        # temp_end = np.min([temp_end,len(vwlTCSSBC)])
        vwlTCSSBC[0, np.arange(temp_start, temp_end)] = medfilt(
            vwlTCSSBC[0, np.arange(temp_start, temp_end)], 3)
        vwlTCSSBC[0, temp_start] = vwlTCSSBC[0, temp_start+1]
        vwlTCSSBC[0, temp_end] = vwlTCSSBC[0, temp_end - 1]

        word_duration[0, j] = temp_end - temp_start + 1                      # calculate the duration of the word in frames ## need word boundaries for this

        tempArr = vwlTCSSBC[0, np.arange(temp_start, temp_end)]
        word_Vwlsum[0, j] = tempArr.sum(axis=0)                              # calculate the sum of the TCSSBC contour for the word  # need word boundaries for this

    sylTCSSBC[np.isnan(sylTCSSBC)] = 0   # Feature vector 1
    vwlTCSSBC[np.isnan(vwlTCSSBC)] = 0   # Feature vector 2
    return sylTCSSBC, vwlTCSSBC, word_Sylsum, word_duration

In [118]:
def feature_contour(i, wav_file):
    # file_name = wav_file[:-4]
    # # print("file_name", file_name)
    # syl_file = syl_dir + file_name + ".txt"
    # # print("syl_file", syl_file)
    # phn_file = phn_dir + file_name + ".txt"
    # # print("phn_file", phn_file)
    # word_file = word_dir + file_name + ".txt" 
    # # print("word_file", word_file)
    
    # if not os.path.exists(phn_file):
    #     print("phn file doesn't exist")
    #     return None, False
    
    # if not os.path.exists(syl_file):    
    #     print("syl file doesn't exist")
    #     return None, False
    
    # if not os.path.exists(word_file):
    #     print("word file doesn't exist")
    #     return None, False
    
    spurtStartTime, spurtEndTime, labels = get_syllables("14-208-0001_syl.txt")
    spurtStartTime = np.array(spurtStartTime)
    spurtEndTime = np.array(spurtEndTime)
    num_syls = len(spurtStartTime) 
    print("num_syls", num_syls)
    # print(spurtStartTime)
    # print(spurtEndTime)
    # print(labels)

    vowelStartTime, vowelEndTime, vowels = get_vowels("14-208-0001_vwl.txt")            
    vowelStartTime = np.array(vowelStartTime)            #NOTE why is this not truncated to two decimal places?
    vowelEndTime = np.array(vowelEndTime)
    if num_syls != len(vowelStartTime):
        print("Number of syllables and vowels don't match")
        return None, False

    # print(vowelStartTime)
    # print(vowelEndTime)
    # print(vowels)

    startWordTime, words = get_words("14-208-0001_word.txt")
    startWordTime = np.array(startWordTime)
    # print(startWordTime)
    # print(words)
    


    # Execute the vocoder [MODIFICATION]: Get the audio file back so that it can be stored in a text file for C code.
    file_dir = ger_train_dir
    Fs, eng_full, xx = vocoder_func("14-208-0001.wav")
    #  Fs, eng_full, xx = vocoder_func(file_dir + wav_file)

    # print("eng_full", eng_full)     # eng_full contains the sub-band energies of the audio file
    # print("xx", xx)                 # xx contains the amplitude of the waveform of the audio file

    eng_full = eng_full.conj().transpose()
    # print("eng_full", eng_full)
    print(eng_full.shape)            #NOTE fix this 
    print(Fs)

    startWordFrame = np.round((np.subtract(np.array(startWordTime, dtype='float'), spurtStartTime[0].astype(float))*100))              #NOTE why is this being done? 
    # endWordFrame = np.round((np.subtract(np.array(endWordTime, dtype='float'), spurtStartTime[0].astype(float))*100) + 1)
    startWordFrame = startWordFrame.astype(int)
     
    spurtStartFrame = np.round((spurtStartTime - spurtStartTime[0]) * 100)
    spurtEndFrame = np.round((spurtEndTime - spurtStartTime[0]) * 100)
    spurtStartFrame = spurtStartFrame.astype(int)
    spurtEndFrame = spurtEndFrame.astype(int)

    vowel_start_time = vowelStartTime.astype(float)
    vowel_end_time = vowelEndTime.astype(float)

    vowelStartFrame = np.round(vowel_start_time*100 - spurtStartTime[0]*100)   # vowelStartFrame contains the start frame of each vowel
    vowelEndFrame = np.round(vowel_end_time*100 - spurtStartTime[0]*100)       # vowelEndFrame contains the end frame of each vowel
    vowelStartFrame = vowelStartFrame.astype(int)
    vowelEndFrame = vowelEndFrame.astype(int)
  

    # print("spurt start time = ", spurtStartTime)
    # print("spurt end time = ", spurtEndTime)

    # print("vowel start time = ", vowel_start_time)
    # print("vowel end time = ", vowel_end_time)

    # print("start word time = ", startWordTime)

    # print("start word frame = ", startWordFrame)
    # print("spurt start frame = ", spurtStartFrame)
    # print("spurt end frame = ", spurtEndFrame)
    # print("vowel start frame = ", vowelStartFrame)
    # print("vowel end frame = ", vowelEndFrame)


    sylTCSSBC, vwlTCSSBC, word_duration, word_Sylsum = get_sylTCSSBC(sylSB, eng_full, sylSB_num, twin, t_sigma, swin, s_sigma, spurtStartTime, vowel_start_time, startWordFrame)         
    
    # print("sylTCSSBC", sylTCSSBC)
    # print(sylTCSSBC.shape)
    # print("vwlTCSSBC", vwlTCSSBC)
    # print(vwlTCSSBC.shape)
    # print("word_duration", word_duration)
    # print("word_Sylsum", word_Sylsum)
  
    tempOut = np.array([[]])
            
    wordIndication = []

    # Generating the features
    for j in range(0, num_syls , 1):
            inds = (startWordFrame <= spurtStartFrame[j]).nonzero()   # finds the word that the syllable belongs to
            word_ind = inds[0][-1]                           # finds the index of the word that the syllable belongs to
            wordIndication.append(word_ind)                  # stores the index of the word that the syllable belongs to
    #       print([0, np.arange(spurtStartFrame[j], spurtEndFrame[j]-1, 1).astype(int)])
            currFtr1SylSeg = sylTCSSBC[0, np.arange(spurtStartFrame[j], spurtEndFrame[j]-1, 1).astype(int)]  # extracts the syllable segment from the TCSSBC contour
            currFtr1SylSeg = np.array([currFtr1SylSeg])
            
            temp = np.multiply(currFtr1SylSeg, len(currFtr1SylSeg[0]) / word_duration[0, word_ind])  # normalizes the syllable segment by the duration of the syllable and the duration of the word            # need word duration for this 

            arrResampled = np.array([librosa.resample(temp[0], Fs, Fs*float(30) / len(temp[0]), 'sinc_best')])       ##change   # resamples the syllable segment to 30 frames
            
            F_new = Fs*float(30) / len(temp[0])      ##change   # resampling frequency
        
            currSylFtrs = statFunctions_Syl(arrResampled)   # calculates the statistical features of the syllable segment
            arr1 = np.array([np.array([np.sum(currFtr1SylSeg) / word_Sylsum[0, word_ind]])]).T     # calculates ratio of the area under the TCSSBC contour of the syllable segment and the area under the TCSSBC contour for the word
            currSylFtrs = np.vstack((currSylFtrs, arr1))     # appends the sum of the TCSSBC contour for the word to the statistical features of the syllable segment
        
            if (j>= len(vowelEndFrame)):
                break
            if (vowelEndFrame[j] >= vwlTCSSBC.shape[1]):
                vowelEndFrame[j] = vwlTCSSBC.shape[1]-1
        
            currFtr1VowelSeg = vwlTCSSBC[0, np.arange(vowelStartFrame[j], vowelEndFrame[j]-1, 1).astype(int)]   # extracts the vowel segment from the TCSSBC contour
            currFtr1VowelSeg = np.array([currFtr1VowelSeg])
            temp = np.multiply(currFtr1VowelSeg, len(currFtr1VowelSeg[0]) / word_duration[0, word_ind])  # normalizes the vowel segment by the duration of the syllable
            if (len(temp[0])==0):
                break
                
            arrResampled = np.array([librosa.resample(temp[0], F_new, F_new*float(20) / len(temp[0]), 'sinc_best')])     ##change  # resamples the vowel segment to 20 frames
            currVowelFtrs = statFunctions_Vwl(arrResampled)      # calculates the statistical features of the vowel segment
            arr1 = np.array([np.array([np.sum(currFtr1VowelSeg) / word_Sylsum[0, word_ind]])]).T        # calculates ratio of the area under the TCSSBC contour of the vowel segment and the area under the TCSSBC contour for the word
            currVowelFtrs = np.vstack((currVowelFtrs, arr1))
            if j == 0:
                tempOut = np.vstack((currSylFtrs, currVowelFtrs, len(currFtr1VowelSeg[0]), len(currFtr1SylSeg[0])))    
            else:
                tempOut = np.hstack((tempOut, np.vstack((currSylFtrs, currVowelFtrs,len(currFtr1VowelSeg[0]), len(currFtr1SylSeg[0])))))                # tempOut columns contain the statistical features of the syllable segment and the vowel segment, the duration of the vowel segment and the duration of the syllable segment

    if (len(temp[0])==0):
            return None, False   ###
    
    # print("tempOut.shape: ", tempOut.shape)
    # print("tempOut: ", tempOut)
    
    sylDurations = spurtEndTime - spurtStartTime
    
    ftrs = tempOut    
    
    wordLabls = np.unique(wordIndication)
    for iterWrd in range(0, len(wordLabls)):
        inds = [i for i, x in enumerate(wordIndication) if x == wordLabls[iterWrd]] #doing argwhere(wordIndication==wordLabls[iterWrd]
        if len(inds)>1 :
            ftrs[-1, inds] = ftrs[-1, inds] / sum(ftrs[-1, inds])
            ftrs[-2, inds] = ftrs[-2, inds] / sum(ftrs[-2, inds])
    end=1

    fa = ftrs

    # print(fa.shape)
    # print("fa")
    # print(fa)

    return fa, True

In [117]:
feature_contour(0, "14-208-0001.wav")

num_syls 64
(19, 1524)
16000
(19, 64)
fa
[[1.44806515e-02 1.01911852e+00 3.58497500e-02 ... 1.20201886e+00
  6.67410493e-01 8.05102557e-01]
 [3.98542419e-02 9.17403377e-01 5.00041690e-02 ... 9.99943380e-01
  7.70145115e-01 2.02640149e+00]
 [0.00000000e+00 0.00000000e+00 3.44470541e-02 ... 6.21569877e-01
  6.38310383e-01 7.64703531e-01]
 ...
 [1.51120980e-03 1.23232024e-01 9.13122061e-03 ... 5.63342574e-02
  1.75777955e-02 1.01736300e-01]
 [1.50000000e-01 7.00000000e-01 1.50000000e-01 ... 4.00000000e+00
  2.85714286e-01 7.14285714e-01]
 [1.71875000e-01 6.71875000e-01 1.56250000e-01 ... 2.40000000e+01
  2.53968254e-01 7.46031746e-01]]


In [None]:
ger_train_files_subset = ger_train_files[4:5]
all_contours = []
all_labels = []

for i, file in enumerate(ger_train_files_subset):
  

    contours, valid = feature_contour(i, file, False)
    if valid:
        # print(contours.shape)
    # print(contours)
        all_contours.extend(contours)

data array [['0.0' '0.61' 'sil']
 ['0.61' '0.79' 'ay']
 ['0.79' '0.95' 's']
 ['0.95' '1.0' 'eh']
 ['1.0' '1.05' 'd']
 ['1.05' '1.28' 'f']
 ['1.28' '1.43' 'ay']
 ['1.43' '1.55' 't']
 ['1.55' '1.59' 'sil']
 ['1.59' '1.65' 'n']
 ['1.65' '1.68' 'aa']
 ['1.68' '1.74' 't']
 ['1.74' '1.93' 's']
 ['1.93' '2.01' 'eh']
 ['2.01' '2.07' 'n']
 ['2.07' '2.21' 't']
 ['2.21' '2.39' 'er']
 ['2.39' '3.18' 'sil']]
phones [['ay' 's' 'eh' 'd' 'f' 'ay' 't' 'n' 'aa' 't' 's' 'eh' 'n' 't' 'er']]
phn_times [['0.61' '0.79']
 ['0.79' '0.95']
 ['0.95' '1.0']
 ['1.0' '1.05']
 ['1.05' '1.28']
 ['1.28' '1.43']
 ['1.43' '1.55']
 ['1.59' '1.65']
 ['1.65' '1.68']
 ['1.68' '1.74']
 ['1.74' '1.93']
 ['1.93' '2.01']
 ['2.01' '2.07']
 ['2.07' '2.21']
 ['2.21' '2.39']]
vowel [['ay' 'eh' 'ay' 'aa' 'eh' 'er']]
vowel_start_time [['0.61' '0.95' '1.28' '1.65' '1.93' '2.21']]
vowel_end_time [['0.79' '1.0' '1.43' '1.68' '2.01' '2.39']]
words ['i', 'said', 'fight', 'not', 'centre']
word_syls [['aa', 'ae', 'ah', 'ay', 'ay . ah', 'ay 

In [None]:
# train data
all_contours = []
# all_labels = []
for i, file in enumerate(ger_train_files):
    contours, success = feature_contour(i, file)

    if success:
        all_contours.extend(contours)   #NOTE how are you storing the contours? store labels too
        # all_labels.extend(labels)   

    clear_output(wait=True)
    print("Processed: {}/{}".format(i+1, len(ger_train_files)), end="\r")
    print("Progress: {:.2f}%".format((i+1)/len(ger_train_files)*100), end="\r")

data array [['0.0' '0.25' 'sil']
 ['0.25' '0.42' 'ay']
 ['0.42' '0.52' 's']
 ['0.52' '0.56' 'eh']
 ['0.56' '0.59' 'd']
 ['0.59' '0.68' 'hh']
 ['0.68' '0.85' 'w']
 ['0.85' '0.98' 'ay']
 ['0.98' '1.13' 't']
 ['1.13' '1.16' 'sil']
 ['1.16' '1.26' 'n']
 ['1.26' '1.3' 'aa']
 ['1.3' '1.41' 't']
 ['1.41' '1.48' 'sil']
 ['1.48' '1.55' 'b']
 ['1.55' '1.7' 'ey']
 ['1.7' '1.84' 't']
 ['1.84' '2.54' 'sil']]
phones [['ay' 's' 'eh' 'd' 'hh' 'w' 'ay' 't' 'n' 'aa' 't' 'b' 'ey' 't']]
phn_times [['0.25' '0.42']
 ['0.42' '0.52']
 ['0.52' '0.56']
 ['0.56' '0.59']
 ['0.59' '0.68']
 ['0.68' '0.85']
 ['0.85' '0.98']
 ['0.98' '1.13']
 ['1.16' '1.26']
 ['1.26' '1.3']
 ['1.3' '1.41']
 ['1.48' '1.55']
 ['1.55' '1.7']
 ['1.7' '1.84']]
vowel [['ay' 'eh' 'ay' 'aa' 'ey']]
vowel_start_time [['0.25' '0.52' '0.85' '1.26' '1.55']]
vowel_end_time [['0.42' '0.56' '0.98' '1.3' '1.7']]
words ['i', 'said', 'white', 'not', 'bait']
word_syls [['aa', 'ae', 'ah', 'ay', 'ay . ah', 'ay hh', 'eh', 'ey', 'hh ae', 'hh ah', 'hh ay', '

KeyboardInterrupt: 

In [None]:
# save train data as pickle
df = pd.DataFrame({'contour': all_contours})
df.to_pickle('../saved/ger_train.pkl')