In [1]:
import os
import pickle
import warnings

warnings.filterwarnings('ignore')

import mne
import numpy as np
import pandas as pd
import scipy.io
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.signal import butter, lfilter, resample

sns.set(style="darkgrid")
sns.set(font_scale=1.5)

# Root folder of the project: the parent of the current working directory.
data_path = os.path.dirname(os.getcwd())

# Per-subject metadata. Position k in each of the four lists below refers to
# the same subject.
subject_name = [
    'stella', 'juliet',
    'shai', 'ayoub',
    'lucas', 'simone',
    'henry', 'julien',
    'marion', 'elvira',
]
# Group label per subject; consecutive subjects share a label, i.e.
# (stella, juliet), (shai, ayoub), (lucas, simone), (henry, julien),
# (marion, elvira).
subject_group = [
    1, 1,
    2, 2,
    3, 3,
    4, 4,
    5, 5,
]
subject_gender = [
    'f', 'f',
    'm', 'm',
    'm', 'm',
    'm', 'm',
    'f', 'f',
]
subject_speak = [
    1, 2,
    1, 2,
    1, 2,
    1, 2,
    1, 2,
]

# Sessions in recording order and the number of words in each of them.
session_name = [
    'pre',
    'duet1', 'duet2', 'duet3', 'duet4', 'duet5', 'duet6',
    'post',
]
session_words = [
    60,
    50, 50, 50, 50, 50, 50,
    60,
]

# Channel / feature labels.
eeg_label = ['F3', 'C3', 'P3', 'Cz', 'F4', 'C4', 'P4', 'P7']
ema_label = [
    'jawaopening', 'lipaparature', 'lipProtrusion',
    'TTCD', 'TMCD', 'TBCD',
]

ema_fs = 100

In [2]:
# reaction time and words
#
# Builds `data1`: one row per (subject, session, word) holding the reaction
# time plus the subject metadata, keyed by the 'wav' file name that is later
# used to merge with the trial table.
raw_fname = data_path + '/behaviour/behaviour.mat'
mat = scipy.io.loadmat(raw_fname)

RT = mat['reaction_time']
words = mat['words']

# Collect plain dicts and build the DataFrame once, instead of creating one
# single-row DataFrame per word and concatenating thousands of them. The
# resulting frame has the same columns in the same order; its index is a
# unique 0..n-1 range rather than all zeros.
frame = []
for sub, name in enumerate(subject_name):
    for session, (label, n_words) in enumerate(zip(session_name, session_words)):
        for w in range(n_words):
            frame.append({'gender': subject_gender[sub],
                          'speak': subject_speak[sub],
                          'group': subject_group[sub],
                          # RT is indexed [subject][session][word]; the
                          # trailing [0] unwraps the value.
                          'RT': RT[sub][session][w][0],
                          'wordNo': w+1,
                          'wav': name+'-'+label+'_'+str(w+1)+'_.wav'})

data1 = pd.DataFrame(frame)
data1.head()

Unnamed: 0,gender,speak,group,RT,wordNo,wav
0,f,1,1,0.503827,1,stella-pre_1_.wav
0,f,1,1,0.551558,2,stella-pre_2_.wav
0,f,1,1,0.468265,3,stella-pre_3_.wav
0,f,1,1,0.640828,4,stella-pre_4_.wav
0,f,1,1,0.486797,5,stella-pre_5_.wav


In [3]:
# Load the four processed .mat files. The EEG, EMA and speech files are read
# with struct_as_record=False so that their fields can be reached with
# attribute access in the next cell; the index file uses loadmat's defaults.

# EEG
mat = scipy.io.loadmat(data_path + '/data/processed_data_EEG.mat',
                       struct_as_record=False)

# EMA
mat2 = scipy.io.loadmat(data_path + '/data/processed_data_EMA.mat',
                        struct_as_record=False)

# speech features
mat3 = scipy.io.loadmat(data_path + '/data/processed_data_SPEECH_features.mat',
                        struct_as_record=False)

# trial index
raw_fname = data_path + '/data/processed_data_idx.mat'
mat1 = scipy.io.loadmat(raw_fname)

In [4]:
# arrange
#
# Flatten the loaded MATLAB structs into `data`, a table with one row per
# trial that holds the EEG (dataS / dataL), EMA and speech-feature arrays
# together with the trial bookkeeping columns.
eeg_struct = mat['EEG'][0,0]
eeg_s = eeg_struct.dataS[0,0]
eeg_l = eeg_struct.dataL[0,0]
trial, time, trialinfo = eeg_s.trial, eeg_s.time, eeg_s.trialinfo
trial1, time1, trialinfo1 = eeg_l.trial, eeg_l.time, eeg_l.trialinfo

ema_struct = mat2['EMA'][0,0]
trial2 = ema_struct.tract
time2 = ema_struct.time

speech = mat3['SPEECH_features'][0,0]
envelop = speech.envelop
envelop_time = speech.envelop_time
f0 = speech.F0
f1 = speech.f1
f2 = speech.f2
intensity = speech.intensity

index = mat1['D']
word = mat1['WORD']

frame = []
for i in range(trial.shape[1]):
    # Row i of `index` holds 1-based codes: subject, session, word, partner.
    key = index[i]
    speaker = subject_name[key[0]-1]
    session_label = session_name[key[1]-1]
    info_s = trialinfo[i]
    info_l = trialinfo1[i]

    # Array-valued entries are wrapped in a one-element list so that the
    # whole array is stored in a single cell of this one-row frame.
    row = pd.DataFrame({
        'subject':speaker,
        'session':session_label,
        'words':word[key[2]-1][0][0],
        'partner':subject_name[key[3]-1],
        'wav':speaker+'-'+session_label+'_'+str(key[2])+'_.wav',

        'EEGS':[trial[0][i]],
        'EEGS_time':[time[0][i]],
        'EEGL':[trial1[0][i]],
        'EEGL_time':[time1[0][i]],

        'trial_noS':info_s[0],
        'baseline_startS':info_s[1],
        'baseline_endS':info_s[2],
        'visual_representS':info_s[3],
        'go_signalS':info_s[4],
        'voice_onsetS':info_s[5],
        'voice_offsetS':info_s[6],
        'trial_endS':info_s[7],

        'trial_noL':info_l[0],
        'baseline_startL':info_l[1],
        'baseline_endL':info_l[2],
        'visual_representL':info_l[3],
        'go_signalL':info_l[4],
        'voice_onsetL':info_l[5],
        'voice_offsetL':info_l[6],
        'trial_endL':info_l[7],

        'EMA':[trial2[i][0]],
        'EMA_time':[time2[i][0]],

        'envelop':[envelop[i][0]],
        'envelop_time':[envelop_time[i][0]],
        'f0':[f0[i][0]],
        'f1':[f1[i][0]],
        'f2':[f2[i][0]],
        'intensity':[intensity[i][0]],
    })
    frame.append(row)

data = pd.concat(frame, axis=0)
data.head()

Unnamed: 0,subject,session,words,partner,wav,EEGS,EEGS_time,EEGL,EEGL_time,trial_noS,...,voice_offsetL,trial_endL,EMA,EMA_time,envelop,envelop_time,f0,f1,f2,intensity
0,stella,pre,péchant,juliet,stella-pre_1_.wav,"[[43577.10795400812, 20184.207429459435, -159....","[[-0.512, -0.51, -0.508, -0.506, -0.504, -0.50...",[],[],1.0,...,0.0,0.0,"[[15.38124559873013, nan, nan, 9.7627096612492...","[[0.0], [0.01171875], [0.0234375], [0.03515625...","[[0.0004973213972817715, 0.0006604769865521556...","[[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,stella,pre,paumé,juliet,stella-pre_2_.wav,"[[-8101.605861944673, -30411.99562233412, -495...","[[-0.512, -0.51, -0.508, -0.506, -0.504, -0.50...",[],[],2.0,...,0.0,0.0,"[[14.12412896181104, nan, nan, 13.850937667830...","[[0.0], [0.01171875], [0.0234375], [0.03515625...","[[0.0015064791565754282, 0.002063941571661309,...","[[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,stella,pre,city,juliet,stella-pre_3_.wav,"[[73245.14487949017, 42778.887419735394, 16209...","[[-0.514, -0.512, -0.51, -0.508, -0.506, -0.50...",[],[],3.0,...,0.0,0.0,"[[14.679319077645033, nan, nan, 15.63574844572...","[[0.0], [0.01171875], [0.0234375], [0.03515625...","[[0.00016057039050831106, 0.000212352626467842...","[[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,stella,pre,roman,juliet,stella-pre_4_.wav,"[[152923.95055306112, 124148.80739808134, 9917...","[[-0.512, -0.51, -0.508, -0.506, -0.504, -0.50...",[],[],4.0,...,0.0,0.0,"[[14.317038115567648, nan, nan, 14.90113818271...","[[0.0], [0.01171875], [0.0234375], [0.03515625...","[[0.0014144607955710014, 0.0019507285620930302...","[[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,stella,pre,coma,juliet,stella-pre_5_.wav,"[[6208.734423970084, 7731.603650287002, 8842.8...","[[-0.514, -0.512, -0.51, -0.508, -0.506, -0.50...",[],[],5.0,...,0.0,0.0,"[[13.790001525400518, nan, nan, 13.92859113344...","[[0.0], [0.01171875], [0.0234375], [0.03515625...","[[0.00016434728626385265, 0.000217718200923781...","[[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [5]:
# Attach the behavioural columns of data1 to every trial in data through the
# shared 'wav' key (default inner join: rows without a match are dropped).
a = data.merge(data1, on='wav')

In [6]:
# Persist the merged trial table so later cells can start from the pickle.
save_path = data_path + '/python/data/data.pkl'
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
a.to_pickle(save_path)

In [None]:
# Reload the table from the same path the merged frame was pickled to. From
# here on `data` is rebound to whatever that pickle contains (the merged
# trial + behaviour table when written by the save cell), replacing the
# un-merged frame built earlier under the same name.
save_path = data_path + '/python/data/data.pkl'
data=pd.read_pickle(save_path)


In [None]:
# Target sampling rate handed to get_egg_withinVoise / get_downsample below
# (presumably in Hz; same value as ema_fs).
resample_freq=100

In [None]:
# function 

def get_downsample(data,resample_freq,sr):
    """Resample a (channels, samples) array from ``sr`` to ``resample_freq``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; the second axis is the one that gets resampled.
    resample_freq : int or float
        Target sampling rate.
    sr : int or float
        Sampling rate of ``data`` (same unit as ``resample_freq``).

    Returns
    -------
    numpy.ndarray
        Array with the same first dimension as ``data`` and
        ``ceil(data.shape[1] * resample_freq / sr)`` samples on the second.
    """
    # Multiply before dividing: shape / sr * resample_freq can overshoot an
    # exact integer in floating point (35 / 500 * 100 == 7.000000000000001),
    # which made ceil() return one sample too many.
    n_out = int(np.ceil(data.shape[1] * resample_freq / sr))
    # Resample directly along the sample axis instead of transposing twice.
    return resample(data, n_out, axis=1)


def get_egg_withinVoise(aa,i,resample_freq,tmp,sr=500):
    """Cut one trial's signal to the voiced interval and downsample it.

    The interval runs from the sample of ``EEGS_time`` closest to
    ``voice_onsetS`` up to and including the sample closest to
    ``voice_offsetS``.

    Parameters
    ----------
    aa : pandas.DataFrame
        Trial table with the columns 'EEGS_time', 'voice_onsetS',
        'voice_offsetS' and the column named by ``tmp``.
    i : int
        Positional index of the trial (used with ``.iloc``).
    resample_freq : int or float
        Target sampling rate passed on to ``get_downsample``.
    tmp : str
        Name of the column holding the 2-D array to cut; it is sliced along
        its second axis.
    sr : int or float, optional
        Sampling rate of the stored signal. Defaults to 500, the value that
        used to be hard-coded here.

    Returns
    -------
    The result of ``get_downsample`` for the cut segment.
    """
    # Fetch the row once; every .iloc[i] call builds a new row object.
    row = aa.iloc[i]
    times = row['EEGS_time'][0]
    onset = np.abs(times - row['voice_onsetS']).argmin()
    offset = np.abs(times - row['voice_offsetS']).argmin()
    # NOTE(review): the window is always located on the EEGS time axis and
    # the S voice markers, even when `tmp` selects another column such as
    # 'EEGL' -- confirm that both signals share the same time base.
    segment = row[tmp][:, onset:offset + 1]
    return get_downsample(segment, resample_freq, sr)

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    The two cut-off frequencies are expressed as fractions of the Nyquist
    frequency (``fs / 2``) before being handed to ``butter``.

    Returns the ``(b, a)`` coefficient pair produced by ``butter``.
    """
    nyquist = fs / 2.0
    band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, band, btype='band')
    return numerator, denominator


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` between ``lowcut`` and ``highcut``.

    The coefficients come from ``butter_bandpass`` and are applied with a
    single forward pass of ``lfilter``; the filtered signal is returned.
    """
    coeffs = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(coeffs[0], coeffs[1], data)




In [None]:
# get eeg data and save
#
# For every pair of subjects: take each member's trials whose trial_noS lies
# between idx[1][0] and idx[6][-1], cut the 'EEGL' signal to the voiced
# interval, trim it and the speech envelope to a common length, and pickle
# the rows of both members, sorted by 'idx', to '<name1>_<name2>_EEG.pkl'.
#
# NOTE(review): `idx` is not defined in any cell shown above, and
# `clear_output` is not imported there either (presumably
# IPython.display.clear_output). Both must be provided before this cell can
# run on a fresh kernel -- confirm where they are meant to come from.

# subject_group holds one integer label per subject, so indexing it as
# subject_group[s][0] fails. Collect the names that share a label instead;
# dict insertion order keeps the pairs in subject order.
subject_pairs = {}
for name, group in zip(subject_name, subject_group):
    subject_pairs.setdefault(group, []).append(name)

for pair in subject_pairs.values():
    A = []
    # The same processing applies to every member of the pair, so loop over
    # them instead of repeating the block once per member.
    for member in pair:
        trials = data[(data['subject']==member) & (data['trial_noS']>=idx[1][0]) &
                      (data['trial_noS']<=idx[6][-1])].reset_index()

        for i in range(0,trials.shape[0]):
            row = trials.iloc[i]
            # Shift the trial number by 60 (presumably the number of 'pre'
            # trials, session_words[0] -- confirm).
            trial_idx = row['trial_noL']-60
            eeg = get_egg_withinVoise(trials,i,resample_freq,'EEGL')
            envelope = row['envelop']

            # Trim both signals to the shorter of the two lengths.
            n_common = min(eeg.shape[1], envelope.shape[1])
            eeg = eeg[:,:n_common]
            envelope = envelope[:,:n_common]

            df = pd.DataFrame({'idx':trial_idx,'data':[eeg],'subject':member,
                               'wordNo':row['wordNo'],
                               'type':'L','session':row['session'],
                               'envelop':[envelope]})
            A.append(df)
            print(i,end=' ')

    A = pd.concat((A),axis=0).sort_values('idx')

    save_path = data_path + '/python/data/'+'_'.join(pair)+'_EEG.pkl'
    A.to_pickle(save_path)
    clear_output()