In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# NOTE(review): pin the version (regex==X.Y.Z) once the required release is known.
%pip install regex



In [396]:
from scripts.load.utils_ZuCo import *
import scipy.io as io
import mat73
import pandas as pd
import numpy as np

In [35]:
def get_matfiles(task:str, subdir = '/task1- SR/Matlab_files/'):
    """
        Args: Task number ("task1", "task2", "task3") plus sub-directory
              (NOTE: task is currently not used to build the path; only subdir,
              appended to the current working directory, selects the files)
        Return: 12 matlab files (one per subject) for given task, sorted by file name
        Raises: AssertionError if the directory does not contain exactly 12 .mat files
    """
    path = os.getcwd() + subdir #+ task
    # os.listdir returns entries in arbitrary order, so skipping "the first" entry could drop
    # a real recording; keep only .mat files instead and sort them so that a subject index
    # always refers to the same file.
    files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.mat')]
    assert len(files) == 12, 'each task must contain 12 .mat files'
    return files

In [47]:
class DataTransformer:
    """
        Transforms ET (and EEG data) to use for further analysis (per test subject)

        An instance stores the task, the data level, the feature scaling technique and the
        NaN treatment; calling it with a subject index loads that subject's .mat file and
        returns the transformed features as a DataFrame.
    """
    
    def __init__(self, task:str, level:str, scaling='min-max', fillna='zeros'):
        """
            Args: task ("task1", "task2", or "task3"), data level ("sentence" or "word"),
                  scaling technique ("min-max", "mean-norm", "standard", or "raw"),
                  how to treat NaNs ("zeros", "mean", or "min")
            Raises: ValueError if an argument is not one of the supported options
        """
        tasks = ['task1', 'task2', 'task3']
        if task in tasks:
            self.task = task
        else:
            raise ValueError('Task can only be one of "task1", "task2", or "task3"') 
        levels = ['sentence', 'word']
        if level in levels:
            self.level = level
        else:
            raise ValueError('Data can only be processed on sentence or word level')
        #display raw (absolute) values or normalize data according to specified feature scaling technique
        feature_scalings = ['min-max', 'mean-norm', 'standard', 'raw']
        if scaling in feature_scalings:
            self.scaling = scaling
        else:
            raise ValueError('Features must either be min-max scaled, mean-normalized or standardized')
        fillnans = ['zeros', 'mean', 'min']
        if fillna in fillnans:
            self.fillna = fillna
        else:
            raise ValueError('Missing values should be replaced with zeros, the mean or min per feature')
    
    def __call__(self, subject:int):
        """
            Args: test subject (0-11)
            Return: DataFrame with normalized features (i.e., attributes) on sentence or word level
            Raises: ValueError if subject is not an integer between 0 and 11
        """
        # subject should not be a property of data transform object (thus, it's not in the init method), 
        # since we want to apply the same data transformation to each subject
        subjects = list(range(12))
        if subject not in subjects:
            raise ValueError('Access subject data with an integer value between 0 - 11')  
        files = get_matfiles(self.task)
        data = io.loadmat(files[subject], squeeze_me=True, struct_as_record=False)['sentenceData']
        
        # Pre-allocate the output container. For some task/subject combinations a hard-coded
        # set of sentences is skipped in the main loop below, so the container is sized
        # without them. The offsets/ranges here mirror the skip conditions of the main loop
        # and have to be kept in sync with them by hand.
        # NOTE(review): presumably these are sentences with unusable recordings - TODO confirm.
        if self.level == 'sentence':
            fields = ['SentLen',  'omissionRate', 'nFixations', 'meanPupilSize', 'GD', 'TRT', 
                      'FFD', 'SFD', 'GPT']
            if self.task == 'task1' and subject == 2:
                features = np.zeros((len(data)-101, len(fields)))
            elif self.task == 'task2' and (subject == 6 or subject == 11):
                features = np.zeros((len(data)-50, len(fields)))
            elif self.task == 'task3' and subject == 3:
                features = np.zeros((len(data)-47, len(fields)))
            elif self.task == 'task3' and subject == 7:
                features = np.zeros((len(data)-48, len(fields)))
            elif self.task == 'task3' and subject == 11:
                features = np.zeros((len(data)-89, len(fields)))
            else:
                features = np.zeros((len(data), len(fields)))

        elif self.level == 'word':
            # total number of words over all sentences that are not skipped
            if self.task == 'task1' and subject == 2:
                n_words = sum([len(sent.word) for i, sent in enumerate(data[:-1]) if i < 150 or i > 249])
            elif self.task == 'task2' and subject == 6:
                n_words = sum([len(sent.word) for i, sent in enumerate(data) if i > 49])  
            elif self.task == 'task2' and subject == 11:
                n_words = sum([len(sent.word) for i, sent in enumerate(data) if i < 50 or i > 99])
            elif self.task == 'task3' and subject == 3:
                n_words = sum([len(sent.word) for i, sent in enumerate(data) if i < 178 or i > 224])
            elif self.task == 'task3' and subject == 7:
                n_words = sum([len(sent.word) for i, sent in enumerate(data) if i < 359])
            elif self.task == 'task3' and subject == 11:
                n_words = sum([len(sent.word) for i, sent in enumerate(data) if i < 270 or (i > 313 and i < 362)])
            else:
                n_words = sum([len(sent.word) for sent in data])
            fields = ['Sent_ID', 'Word_ID', 'Word', 'nFixations', 'meanPupilSize', 
                      'GD', 'TRT', 'FFD', 'SFD', 'GPT', 'WordLen']
            # NOTE(review): `columns=[fields]` wraps the field list in another list, which pandas
            # may turn into a MultiIndex - confirm whether `columns=fields` was intended.
            df = pd.DataFrame(index=range(n_words), columns=[fields])
            # k is the running row position in the word-level frame
            k = 0
        
        # idx counts the sentences that are kept (row position on sentence level, sentence ID on word level)
        idx = 0
        for i, sent in enumerate(data):
            if (self.task == 'task1' and subject == 2) and ((i >= 150 and i <= 249) or i == 399):
                continue
            elif (self.task == 'task2' and subject == 6) and (i <= 49):
                continue
            elif (self.task == 'task2' and subject == 11) and (i >= 50 and i <= 99):
                continue
            elif (self.task == 'task3' and subject == 3) and (i >= 178 and i <= 224):
                continue
            elif (self.task == 'task3' and subject == 7) and (i >= 359):
                continue
            elif (self.task == 'task3' and subject == 11) and ((i >= 270 and i <= 313) or (i >= 362 and i <= 406)):
                continue
            else:
                nwords_fixated = 0
                for j, word in enumerate(sent.word):
                    # strip everything that is neither a word character nor whitespace
                    # (raw string: '\w' in a normal string literal is an invalid escape sequence)
                    token = re.sub(r'[^\w\s]', '', word.content)
                    #lowercase words at the beginning of the sentence only
                    token = token.lower() if j == 0 else token 
                    if self.level == 'sentence':
                        # a missing field or an ndarray-valued field counts as 0
                        word_features = [getattr(word, field) if hasattr(word, field)\
                                         and not isinstance(getattr(word, field), np.ndarray) else\
                                         0 for field in fields[2:]]
                        # accumulate word features per sentence (averaged after the word loop)
                        features[idx, 2:] += word_features
                        # a word only counts as fixated if not all of its features are 0
                        nwords_fixated += 0 if len(set(word_features)) == 1 and next(iter(set(word_features))) == 0 else 1
                    elif self.level == 'word':
                        df.iloc[k, 0] = str(idx)+'_NR' if self.task=='task1' or self.task=='task2'\
                                        else str(idx)+'_TSR'
                        df.iloc[k, 1] = j
                        df.iloc[k, 2] = token
                        # a missing field or an ndarray-valued field is stored as 0
                        df.iloc[k, 3:-1] = [getattr(word, field) if hasattr(word, field)\
                                            and not isinstance(getattr(word, field), np.ndarray) else\
                                            0 for field in fields[3:-1]]
                        df.iloc[k, -1] = len(token)
                        k += 1

                if self.level == 'sentence':
                    features[idx, 0] = len(sent.word)
                    features[idx, 1] = sent.omissionRate
                    #normalize by number of words for which fixations were reported
                    # (nwords_fixated can be 0 here; the resulting non-finite values are handled below)
                    features[idx, 2:] /= nwords_fixated
                    
                idx += 1

        #handle -inf, inf and NaN values
        if self.level == 'sentence': 
            # rows containing +/-inf are removed; NaNs are filled after scaling (see end of method)
            features = self.check_inf(features)
            
        elif self.level == 'word':
            # NOTE(review): the fillna calls below operate in place on `df.iloc[...]` selections;
            # whether that modifies `df` itself depends on the pandas version - TODO confirm.
            # NOTE(review): the 'min'/'mean' loops run over all fields, including the string
            # columns 'Sent_ID' and 'Word' - TODO confirm this works as intended.
            if self.fillna == 'zeros':
                df.iloc[:,:].fillna(0, inplace=True)
            elif self.fillna == 'min':
                for i, field in enumerate(fields):
                    df.iloc[:,i].fillna(getattr(df, field).values.min(), inplace=True)
            elif self.fillna == 'mean':
                for i, field in enumerate(fields):
                    df.iloc[:,i].fillna(getattr(df, field).values.mean(), inplace=True)
                    
            # NOTE(review): replace() returns a new frame and dropna(inplace=True) acts on that
            # temporary, so `df` is left unchanged by this line. Left as is because making it
            # effective would change which rows are returned - TODO decide and reassign to df.
            df.replace([np.inf, -np.inf], np.nan).dropna(axis=0, inplace=True)

        #normalize data according to feature scaling technique
        # On sentence level every branch iterates over features.T (one entry per feature), so the
        # scaled array has one row per feature; scaling == 'raw' leaves `features` untouched.
        # NOTE(review): on word level a list with one entry per field is assigned to
        # df.iloc[:, 3:] - confirm the orientation matches the (rows x fields) target.
        if self.scaling == 'min-max':
            if self.level == 'sentence':
                features = np.array([(feat - min(feat))/(max(feat) - min(feat)) for feat in features.T])
            elif self.level == 'word':
                df.iloc[:, 3:] = [(getattr(df,field).values - getattr(df,field).values.min())/\
                                  (getattr(df,field).values.max() - getattr(df,field).values.min())\
                                  for field in fields[3:]]
                
        elif self.scaling == 'mean-norm':
            if self.level == 'sentence':
                features = np.array([(feat - np.mean(feat))/(max(feat) - min(feat)) for feat in features.T])
            elif self.level == 'word':
                df.iloc[:, 3:] = [(getattr(df,field).values - getattr(df,field).values.mean())/\
                                  (getattr(df,field).values.max() - getattr(df,field).values.min())\
                                  for field in fields[3:]]
                
        elif self.scaling == 'standard':
            if self.level == 'sentence':
                features = np.array([(feat - np.mean(feat))/np.std(feat) for feat in features.T])
            elif self.level == 'word':
                df.iloc[:, 3:] = [(getattr(df,field).values - getattr(df,field).values.mean())/\
                                  getattr(df,field).values.std() for field in fields[3:]]
                
        if self.level == 'sentence':
            # scaled features are stored feature-major and are transposed back to one row per sentence
            if self.scaling == 'raw':
                df = pd.DataFrame(data=features, index=range(features.shape[0]), columns=[fields])
            else:
                df = pd.DataFrame(data=features.T, index=range(features.shape[1]), columns=[fields])
                
            if self.fillna == 'zeros':
                df.iloc[:,:].fillna(0, inplace=True)
            elif self.fillna == 'min':
                for i, field in enumerate(fields):
                    df.iloc[:,i].fillna(getattr(df, field).values.min(), inplace=True)
            elif self.fillna == 'mean':
                for i, field in enumerate(fields):
                    df.iloc[:,i].fillna(getattr(df, field).values.mean(), inplace=True)
           
        return df
    
    @staticmethod
    def check_inf(features):
        """
            Args: 2D array of features (one row per sentence)
            Return: array without the rows that contain inf or -inf (rows with NaNs are kept)
        """
        features = np.asarray(features)
        # np.isinf flags both +inf and -inf; one boolean mask replaces the repeated np.delete calls
        keep = ~np.isinf(features).any(axis=1)
        return features[keep]

In [33]:
def split_data(sbjs): 
    """
        Args: Data per sbj on sentence level for task 1
        Return: Two lists (first halves, second halves) with one entry per sbj; for an odd
                number of rows the second half holds the extra row
        Purpose: Function is necessary to control for order effects (only relevant for Task 1 (NR))
    """
    # cut every subject's data once at its midpoint, then separate the two sides
    halves = [(sbj[:len(sbj)//2], sbj[len(sbj)//2:]) for sbj in sbjs]
    first_half = [front for front, _ in halves]
    second_half = [back for _, back in halves]
    return first_half, second_half

In [51]:
!ls

preprocess.ipynb  scripts  task1- SR


In [115]:
# Path of a single .mat recording, relative to the notebook's working directory.
file_name = "task1- SR/Matlab_files/resultsZAB_SR.mat"

# Load the 'sentenceData' entry of the file; the index into the array `data` is the sentence number.
data = io.loadmat(file_name, squeeze_me=True, struct_as_record=False)['sentenceData']

In [124]:
# Explore the structure of the loaded recording.
# Sentence-level examples (not executed here):
#   data[0]._fieldnames   -> all field names of the sentence data
#   data[1].content       -> text of a sentence
#   data[0].omissionRate  -> omission rate of the first sentence

# Word-level data of the first sentence;
# the index into the array `word_data` is the position of the word.
word_data = data[0].word

# Names of all word-level features.
print(word_data[0]._fieldnames)

# word_data[0].content would give the first word itself.

# Example: number of fixations on the first word.
print(word_data[0].nFixations)

['content', 'fixPositions', 'nFixations', 'meanPupilSize', 'rawEEG', 'rawET', 'FFD', 'FFD_pupilsize', 'FFD_t1', 'FFD_t2', 'FFD_a1', 'FFD_a2', 'FFD_b1', 'FFD_b2', 'FFD_g1', 'FFD_g2', 'FFD_t1_diff', 'FFD_t2_diff', 'FFD_a1_diff', 'FFD_a2_diff', 'FFD_b1_diff', 'FFD_b2_diff', 'FFD_g1_diff', 'FFD_g2_diff', 'TRT', 'TRT_pupilsize', 'TRT_t1', 'TRT_t2', 'TRT_a1', 'TRT_a2', 'TRT_b1', 'TRT_b2', 'TRT_g1', 'TRT_g2', 'TRT_t1_diff', 'TRT_t2_diff', 'TRT_a1_diff', 'TRT_a2_diff', 'TRT_b1_diff', 'TRT_b2_diff', 'TRT_g1_diff', 'TRT_g2_diff', 'GD', 'GD_pupilsize', 'GD_t1', 'GD_t2', 'GD_a1', 'GD_a2', 'GD_b1', 'GD_b2', 'GD_g1', 'GD_g2', 'GD_t1_diff', 'GD_t2_diff', 'GD_a1_diff', 'GD_a2_diff', 'GD_b1_diff', 'GD_b2_diff', 'GD_g1_diff', 'GD_g2_diff', 'GPT', 'GPT_pupilsize', 'GPT_t1', 'GPT_t2', 'GPT_a1', 'GPT_a2', 'GPT_b1', 'GPT_b2', 'GPT_g1', 'GPT_g2', 'GPT_t1_diff', 'GPT_t2_diff', 'GPT_a1_diff', 'GPT_a2_diff', 'GPT_b1_diff', 'GPT_b2_diff', 'GPT_g1_diff', 'GPT_g2_diff', 'SFD', 'SFD_pupilsize', 'SFD_t1', 'SFD_t2', 

In [208]:
# Display the text of the first sentence.
data[0].content

'Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.'

In [308]:
# Sentence-level EEG fields of interest.
# NOTE(review): the original '#x' marks presumably meant "already exported" - TODO confirm.
EEG = [ 'mean_t1',#x
'mean_t2',#x
'mean_a1',#x
'mean_a2',#x
'mean_b1', #x
'mean_b2', #x
'mean_g2'] if False else [ 'mean_t1', 'mean_t2', 'mean_a1', 'mean_a2',
'mean_b1', 'mean_b2', 'mean_g1', 'mean_g2']

# Field exported by this cell; change it (to one of the names in EEG) to export another field.
band = 'mean_g2'
mat_dir = 'task1- SR/Matlab_files'
out_dir = f'eeg/{band}'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for i in sorted(os.listdir(mat_dir)):
    if i.endswith('.mat'):

        file_name = f'{mat_dir}/{i}'
        data = io.loadmat(file_name, squeeze_me=True, struct_as_record=False)['sentenceData']
        # characters 7-9 of the file name, e.g. 'ZAB' in 'resultsZAB_SR.mat'
        patient = i[7:10]
        L = []
        rows = []

        for sent in data:
            arr = getattr(sent, band)

            # a sentence without any valid value is replaced by a row of 105 zeros
            # NOTE(review): 105 is presumably the number of EEG channels - TODO confirm.
            if np.isnan(arr).all():
                arr = [0] * 105

            rows.append(pd.DataFrame(arr).T)
            L.append(sent.content)

        # Concatenate once instead of growing the frame inside the loop. reset_index() is kept
        # without drop=True on purpose: the resulting 'index' column is part of the written
        # CSV layout that later cells expect.
        eeg = (pd.concat(rows, axis = 0) if rows else pd.DataFrame()).reset_index()

        sent_df = pd.DataFrame(L, columns = ['new_words'])

        new_df = pd.concat([sent_df, eeg], axis = 1)
        new_df.to_csv(f'{out_dir}/{patient}_{band}_df.csv')
    


In [394]:
# Prefix the data columns of every ZPH csv in 'eeg' with the field name taken from the file name.
for i in sorted(os.listdir('eeg')):
    # The previous test `'.csv' and 'ZPH' in i` only checked for 'ZPH', because the non-empty
    # literal '.csv' is always truthy; test both conditions explicitly.
    if i.endswith('.csv') and 'ZPH' in i:
        read = pd.read_csv(f'eeg/{i}')
        # characters 4-10 of the file name, e.g. 'mean_g2' in 'ZPH_mean_g2_df.csv'
        name = i[4:11]
        # the first three columns are left untouched, all others get the prefix
        prefixed = {col: f'{name}_{col}' for col in read.columns[3:]}
        read = read.rename(columns=prefixed)

        # NOTE(review): the result is written to the working directory, not back into 'eeg'.
        read.to_csv(f'{i}')

In [395]:
# Join all ZPH csv files found in 'eeg' side by side and store them as one table.
frames = []
for i in sorted(os.listdir('eeg')):
    # The previous test `'.csv' and 'ZPH' in i` only checked for 'ZPH', because the non-empty
    # literal '.csv' is always truthy; test both conditions explicitly.
    if i.endswith('.csv') and 'ZPH' in i:
        read = pd.read_csv(f'eeg/{i}')
        frames.append(read)

# concatenate once instead of growing the frame inside the loop;
# without any matching file an empty frame is written, as before
df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

df.to_csv('ZPH_mean.csv')

In [405]:
# Put the sentiment labels in front of every preprocessed EEG table and write the
# combined table to the working directory under the same file name.
sentiment = pd.read_csv('label.csv')
for i in sorted(os.listdir('preprocessed_eeg')):
    if '.csv' not in i:
        continue
    read = pd.read_csv(f'preprocessed_eeg/{i}')
    df = pd.concat([sentiment, read], axis='columns')
    df.to_csv(i)
        

In [None]:
# Load the ZAB table and remove the leftover index columns by name.
read = pd.read_csv('preprocessed_eeg/ZAB_mean.csv')
read = read.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'index'])

In [None]:
# Write the cleaned table to the working directory.
# NOTE(review): to_csv also writes the row index by default, which presumably is where the
# 'Unnamed: 0' columns come from when a file is read back - confirm whether index=False is wanted.
read.to_csv('ZAB_mean.csv')