In [1]:
import os
import pickle

import librosa
import numpy as np
import scipy.io
import scipy.signal
import seaborn as sns
from IPython.display import clear_output
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Current working directory of the kernel; the bare expression on the next line
# displays it as the cell output.
base_dir = os.getcwd()
base_dir

'D:\\projects\\current\\listen_italian_motor_entrainment\\analysis\\python\\acoustic-articulatory'

In [2]:
# load data
# The project root is three directory levels above the kernel's working directory.
data_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Paths are assembled with os.path.join so they contain no backslash escape
# sequences and resolve on every platform.
# NOTE(review): 'exp_running_scirpt' is spelled exactly as in the original path;
# presumably it matches the directory name on disk - confirm before renaming.
raw_fname = os.path.join(data_path, 'exp', 'exp_running_scirpt',
                         'olm_stimuli_normalRepeatTwice.mat')
data = scipy.io.loadmat(raw_fname)

raw_fname = os.path.join(data_path, 'analysis', 'behaviour', 'data',
                         'palate_trace_only_stimuli_new.mat')
palate_trace = scipy.io.loadmat(raw_fname)
palate_trace = palate_trace['palate_trace']


# Unpack the fields of the MATLAB struct stored under 'data'; the chained [0]
# indexing strips the singleton dimensions that loadmat wraps around each field.
filenames = data['data']['filename'][0][0][0]
speech = data['data']['speech'][0][0][0]
fs = data['data']['fs'][0][0][0][0]
#lab = data['data']['lab'][0][0][0]
ema = data['data']['ema'][0][0][0]

In [7]:
# add context to input frames
# Width, in frames, of the sliding context window applied to the stacked MFCC
# features during feature extraction. 0 disables context stacking. The value is
# also embedded in the name of the saved feature file.
frame_context = 0

In [18]:
# extract features
n_mfcc = 13  # number of MFCC coefficients requested from librosa.feature.mfcc
resample_freq = 100  # target sampling rate (Hz) the EMA trajectories are downsampled to


def rolling_window(a, window):
    """Return every length-``window`` sliding window over the last axis of ``a``.

    Parameters
    ----------
    a : array_like
        Input data; windows slide along its last axis.
    window : int
        Number of consecutive elements per window. Must satisfy
        ``1 <= window <= a.shape[-1]``.

    Returns
    -------
    numpy.ndarray
        Read-only view of shape ``a.shape[:-1] + (a.shape[-1] - window + 1, window)``.
        Windows overlap in memory, so copy the result before modifying it.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1 or larger than the last axis of ``a``.
    """
    a = np.asarray(a)
    n_last = a.shape[-1]
    if window < 1 or window > n_last:
        raise ValueError(
            'window must be between 1 and the length of the last axis '
            '(%d), got %r' % (n_last, window))
    shape = a.shape[:-1] + (n_last - window + 1, window)
    # Repeating the last stride makes consecutive windows start one element apart.
    strides = a.strides + (a.strides[-1],)
    # writeable=False: the windows alias the same memory, so writing through the
    # view would silently change several windows (and the input) at once.
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides,
                                           writeable=False)



# Per-utterance feature extraction: MFCC + delta + delta-delta from the audio and
# EMA trajectories downsampled to `resample_freq`, both trimmed to a common number
# of frames, then concatenated over utterances along the frame axis and saved.
ema_fs = 400  # sampling rate (Hz) of the raw EMA trajectories

if frame_context < 0:
    raise ValueError('frame_context must be >= 0, got ' + str(frame_context))

audio_feat = []
ema_feat = []
for i in range(len(filenames)):
    # downsample ema to resample_freq
    # The output length is ceil(n_in * resample_freq / ema_fs), computed with
    # integer ceiling division: the float form n_in / 400 * 100 can land just
    # above a whole number (28 / 400 * 100 == 7.000000000000001) and round up
    # to one sample too many.
    n_ema_in = ema[i].shape[1]
    n_ema_out = int(-(-n_ema_in * resample_freq // ema_fs))
    ema_ds = scipy.signal.resample(ema[i], n_ema_out, axis=1)

    # mfcc: 25 ms analysis window, 10 ms hop (100 frames per second of audio)
    mfcc = librosa.feature.mfcc(y=speech[i].flatten(), sr=fs,
                                hop_length=int(0.010*fs), n_fft=int(0.025*fs), n_mfcc=n_mfcc)

    # Trim both streams to the shorter one so every audio frame has an EMA frame.
    n_frames = min(mfcc.shape[1], ema_ds.shape[1])
    mfcc = mfcc[:, :n_frames]
    ema_ds = ema_ds[:, :n_frames]

    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    feats = np.vstack((mfcc, mfcc_delta, mfcc_delta2))

    # adding context to frames
    if frame_context != 0:
        # Zero-pad so that exactly one window exists per original frame. For odd
        # widths the padding is symmetric; for even widths the extra frame goes
        # on the right, which keeps the frame count equal to the EMA frame count.
        pad_left = (frame_context - 1) // 2
        pad_right = frame_context - 1 - pad_left
        padded = np.pad(feats, ((0, 0), (pad_left, pad_right)), 'constant')
        windows = rolling_window(padded, frame_context)  # (n_feat, n_frames, frame_context)
        # Move the window axis next to the feature axis and merge the two, giving
        # (n_feat * frame_context, n_frames).
        windows = np.swapaxes(windows, 2, 1)
        feats = windows.reshape(-1, windows.shape[-1])

    audio_feat.append(feats)
    ema_feat.append(ema_ds)

    # Progress indicator: replace the previous index instead of accumulating output.
    clear_output(wait=True)
    print(i, end=' ')

# Concatenate utterances along the frame axis and store as float32 ('f').
audio_feat = np.hstack(audio_feat).astype('f')
ema_feat = np.hstack(ema_feat).astype('f')

out_dir = os.path.join(data_path, 'analysis', 'python', 'data', 'extracted_features')
os.makedirs(out_dir, exist_ok=True)  # np.save does not create missing directories

save_path = os.path.join(out_dir, 'mfcc_contextSize-' + str(frame_context) + '.npy')
np.save(save_path, audio_feat)
save_path = os.path.join(out_dir, 'raw_ema.npy')
np.save(save_path, ema_feat)
clear_output()