In [None]:
"""This code is based on the code from this repository: https://github.com/MikeWangWZHL/EEG-To-Text"""

import scipy.io as io
import h5py
import os
import json
from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
import argparse

In [None]:
# Current working directory
home_directory = os.getcwd()
# directory of the data 
data_directory = '/home/sposso22/Documents/datasets/ZuCo'


# Tasks
tasks = ['task1-SR', 'task2-NR', 'task3-TSR']

task = tasks[0]



# Load the Matlab file
input_mat_files_dir = os.path.join(data_directory, f'{task}/Matlab_files')

# Create output folder to save EEG and eye-tracking data as pickle files
output_file_folder = os.path.join(home_directory, f'{task}_pickle')

os.makedirs(output_file_folder, exist_ok=True)

### Load info for each subject 

In [None]:
mat_files = os.listdir(input_mat_files_dir)
path_mat_files = [os.path.join(input_mat_files_dir,mat_file) for mat_file in mat_files]

In [None]:
# Whole dataset dictionary
dataset_dict = {}

for mat_file in tqdm(path_mat_files):

    # get subject name from the file name
    subject_name = os.path.basename(mat_file).split('.')[0].replace('results','').strip()
     
    # The subject data will be saved in a list
    dataset_dict[subject_name] = []

   

    mat_data = io.loadmat(mat_file,squeeze_me=True,struct_as_record=False)['sentenceData']

    
     # Sentence level data
    for sent in mat_data: 

        word_data = sent.word

        if not isinstance(word_data, float):

            # First key: sentence content
            sent_obj = {'content': sent.content}
            
            # second key : Oscillatory in different power bands (Theta, Alpha, Beta, Gamma)
            sent_obj['sentence_level_EEG'] = {'mean_t1':sent.mean_t1, 'mean_t2':sent.mean_t2, 
                                              'mean_a1':sent.mean_a1, 'mean_a2':sent.mean_a2, 
                                              'mean_b1':sent.mean_b1, 'mean_b2':sent.mean_b2, 
                                              'mean_g1':sent.mean_g1, 'mean_g2':sent.mean_g2}

            if task == 'task1-SR':

                # task1-SR: Read sentences, answer control questions

                sent_obj['answer_EEG'] = {'answer_mean_t1':sent.answer_mean_t1, 'answer_mean_t2':sent.answer_mean_t2,
                                         'answer_mean_a1':sent.answer_mean_a1,'answer_mean_a2':sent.answer_mean_a2, 
                                         'answer_mean_b1':sent.answer_mean_b1, 'answer_mean_b2':sent.answer_mean_b2, 
                                         'answer_mean_g1':sent.answer_mean_g1, 'answer_mean_g2':sent.answer_mean_g2}
                                         

            # world level data
            sent_obj['word'] = []

            # Features from eye-tracking 
            word_tokens_has_fixation =[]
            word_tokens_with_mask = []
            word_tokens_all = []

            for word in word_data:
                word_obj = {'content': word.content}
                word_tokens_all.append(word.content)

                word_obj['n_fixations'] = word.nFixations

  
                if isinstance(word.nFixations, (int, np.integer)) and word.nFixations > 0:

                    print('word n fixations:', word.nFixations)
                    print(type(word.nFixations))

                    word_obj['word_level_EEG'] = {'FFD':{'FFD_t1':word.FFD_t1, 'FFD_t2':word.FFD_t2, 
                                                         'FFD_a1':word.FFD_a1, 'FFD_a2':word.FFD_a2, 
                                                         'FFD_b1':word.FFD_b1, 'FFD_b2':word.FFD_b2, 
                                                         'FFD_g1':word.FFD_g1, 'FFD_g2':word.FFD_g2}}

                    word_obj['word_level_EEG']['TRT'] = {'TRT_t1':word.TRT_t1, 'TRT_t2':word.TRT_t2, 
                                                         'TRT_a1':word.TRT_a1, 'TRT_a2':word.TRT_a2, 
                                                         'TRT_b1':word.TRT_b1, 'TRT_b2':word.TRT_b2, 
                                                         'TRT_g1':word.TRT_g1, 'TRT_g2':word.TRT_g2}
                    word_obj['word_level_EEG']['GD'] = {'GD_t1':word.GD_t1, 'GD_t2':word.GD_t2, 
                                                        'GD_a1':word.GD_a1, 'GD_a2':word.GD_a2, 
                                                        'GD_b1':word.GD_b1, 'GD_b2':word.GD_b2, 
                                                        'GD_g1':word.GD_g1, 'GD_g2':word.GD_g2}
                    sent_obj['word'].append(word_obj)
                    word_tokens_has_fixation.append(word.content)
                    word_tokens_with_mask.append(word.content)

                else:

                    word_tokens_with_mask.append('[MASK]')
                    



                    continue
            sent_obj['word_tokens_has_fixation'] = word_tokens_has_fixation
            sent_obj['word_tokens_with_mask'] = word_tokens_with_mask
            sent_obj['word_tokens_all'] = word_tokens_all


            dataset_dict[subject_name].append(sent_obj)



# Save the dataset dictionary as a pickle file
output_name = f'{task}_dataset_dict_v1.pkl'

with open(os.path.join(output_file_folder,output_name), 'wb') as handle:
    pickle.dump(dataset_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    



## Sanity check 

In [7]:
with open(os.path.join(output_file_folder,output_name), 'rb') as handle:
    whole_dataset = pickle.load(handle)
print('subjects:', whole_dataset.keys())

subjects: dict_keys(['ZKB_SR', 'ZJS_SR', 'ZKH_SR', 'ZJM_SR', 'ZAB_SR', 'ZPH_SR', 'ZKW_SR', 'ZMG_SR', 'ZDN_SR', 'ZJN_SR', 'ZDM_SR', 'ZGW_SR'])


### Let's explore the data for the first subject and validate the gathered information above

In [18]:
subject_1 = list(whole_dataset.keys())[0]
subject_1_data = whole_dataset[subject_1]

"""

The first task (SR) correspond to the sentiment reading from the Stanford Sentiment Treebank dataset. 400 sentences were presented
to the subjects. Each sentence is annotated with a sentiment label (positive, negative, neutral). 

""" 
print("Number of sentences for task 1 (SR):", len(subject_1_data))



Number of sentences for task 1 (SR): 400


#### Let's go deeper into the info stored for each sentence

In [22]:
first_sentence_info = subject_1_data[0]
print('The info for first sentence is stored in :', type(first_sentence_info))
print(first_sentence_info.keys())

The info for first sentence is stored in : <class 'dict'>
dict_keys(['content', 'sentence_level_EEG', 'answer_EEG', 'word', 'word_tokens_has_fixation', 'word_tokens_with_mask', 'word_tokens_all'])


### Sentence read by the subject.

In [24]:
content = first_sentence_info['content']
print('The content of the first sentence is the following sentence :', content)

The content of the first sentence is the following sentence : Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.


### EEG features per sentence 

In [26]:
sentence_level_EEG = first_sentence_info['sentence_level_EEG']
print('The sentence level EEG features are stored in :', type(sentence_level_EEG))

"""
From the ZuCo paper:

They extracted EEG features based on the sentence-level by calculating the power in each frequency band.
For all the EEG recorded while a subject reads one sentence, they compute the average power per band

""" 
print(sentence_level_EEG.keys())

The sentence level EEG features are stored in : <class 'dict'>
dict_keys(['mean_t1', 'mean_t2', 'mean_a1', 'mean_a2', 'mean_b1', 'mean_b2', 'mean_g1', 'mean_g2'])


### EEG Band Power Feature Extraction 

1. **Band-pass filter the EEG**  
   Each EEG channel is filtered into frequency bands (theta, alpha, beta, gamma).  

2. **Hilbert transform**  
   For each band-limited signal $x_b(t)$, apply the Hilbert transform:

   
   $$z_b(t) = x_b(t) + i \, H(x_b(t))$$


3. **Amplitude envelope**  
   Compute the instantaneous amplitude:

   
   $$ A_b(t) = |z_b(t)| = \sqrt{x_b(t)^2 + H(x_b(t))^2} $$
   

4. **Instantaneous power**  
   Power is amplitude squared:

   
   $$P_b(t) = A_b(t)^2$$
   

5. **Average over a time window**  
   For a fixation or sentence time window \(W\):

   
   $$\overline{P}_b = \frac{1}{|W|} \sum_{t \in W} P_b(t)$$
   


In [27]:
# print shape of each frequency band
"""
105 EEG channels were used for scalp recordings
"""

for band in sentence_level_EEG.keys():
    print(band, np.shape(sentence_level_EEG[band]))

mean_t1 (105,)
mean_t2 (105,)
mean_a1 (105,)
mean_a2 (105,)
mean_b1 (105,)
mean_b2 (105,)
mean_g1 (105,)
mean_g2 (105,)


In [28]:
## Anser EEG features for task 1 (SR)

answer_EEG = first_sentence_info['answer_EEG']
print('The answer EEG features are stored in :', type(answer_EEG))
print(answer_EEG.keys())

The answer EEG features are stored in : <class 'dict'>
dict_keys(['answer_mean_t1', 'answer_mean_t2', 'answer_mean_a1', 'answer_mean_a2', 'answer_mean_b1', 'answer_mean_b2', 'answer_mean_g1', 'answer_mean_g2'])


In [36]:
## Only words with fixations are stored, so their number is smaller than the total words in the sentence because some words do not have fixations
print('The number of words in the first sentence that has fixations is :', len(first_sentence_info['word_tokens_has_fixation']))
print('The words that have fixations are :', first_sentence_info['word_tokens_has_fixation'])

The number of words in the first sentence that has fixations is : 18
The words that have fixations are : ['Presents', 'a', 'good', 'case', 'while', 'failing', 'to', 'provide', 'a', 'reason', 'for', 'to', 'care', 'beyond', 'very', 'basic', 'dictums', 'human']


## Word level data

In [37]:
### word level data
word_data = first_sentence_info['word']
print('The word level data is stored in :', type(word_data))
print('The number of words in the first sentence is :', len(word_data))

The word level data is stored in : <class 'list'>
The number of words in the first sentence is : 18


### Info available for each word

In [44]:
first_word = word_data[0]
print('The info for the first word is stored in :', type(first_word))
print(first_word.keys())

The info for the first word is stored in : <class 'dict'>
dict_keys(['content', 'n_fixations', 'word_level_EEG'])


In [45]:
print('The content of the first word is :', first_word['content'])
print('The number of fixations on the first word is :', first_word['n_fixations'])

The content of the first word is : Presents
The number of fixations on the first word is : 1


In [46]:
## EEG features for the first word
word_level_EEG = first_word['word_level_EEG']
print('The EEG features for the first word are stored in :', type(word_level_EEG))
print(word_level_EEG.keys())

The EEG features for the first word are stored in : <class 'dict'>
dict_keys(['FFD', 'TRT', 'GD'])


Reminder:

**FFD** = The duration of the first fixation on the prevailing word. <br>
**TRT** =  The sum of all fixation durations on the current word, including regression (when you go back to the same word and make another fixation that lasts x time). <br>
**GD** =  The sum of all fixations on the current word in the first-pass reading before the eyes move out of the word. <br>

In [47]:
for eeg_feature in word_level_EEG.keys():
    print(eeg_feature, word_level_EEG[eeg_feature].keys())

FFD dict_keys(['FFD_t1', 'FFD_t2', 'FFD_a1', 'FFD_a2', 'FFD_b1', 'FFD_b2', 'FFD_g1', 'FFD_g2'])
TRT dict_keys(['TRT_t1', 'TRT_t2', 'TRT_a1', 'TRT_a2', 'TRT_b1', 'TRT_b2', 'TRT_g1', 'TRT_g2'])
GD dict_keys(['GD_t1', 'GD_t2', 'GD_a1', 'GD_a2', 'GD_b1', 'GD_b2', 'GD_g1', 'GD_g2'])
