In [12]:
import os
import numpy as np
from scipy.io import loadmat
from typing import Dict, List, Any

class ZucoDataLoader:
    def __init__(self, data_dir='../zuco_data/zuco1.0/task1-SR/Matlab files'):
        """Initialize ZuCo data loader for SR task."""
        self.data_dir = data_dir
        self.subject_files = self._get_subject_files()
        
    def _get_subject_files(self) -> Dict[str, str]:
        """Get mapping of subject IDs to file paths."""
        subject_files = {}
        for file_name in os.listdir(self.data_dir):
            if file_name.endswith(".mat"):
                subject_id = file_name.split('.')[0]
                subject_files[subject_id] = os.path.join(self.data_dir, file_name)
        return subject_files
    
    def get_subject_ids(self) -> List[str]:
        """Get list of available subject IDs."""
        return list(self.subject_files.keys())
    
    def load_subject_data(self, subject_id: str) -> Dict[str, Any]:
        """Load raw .mat file for a subject."""
        file_path = self.subject_files[subject_id]
        print(f"Loading data from {file_path}")
        
        # Load .mat file
        data = loadmat(file_path, squeeze_me=True, struct_as_record=False)
        return data
    
    def extract_word_level_features(self, subject_id: str) -> Dict[int, Dict[int, Dict[str, Any]]]:
        """
        Extract word-level EEG and eye-tracking features.
        
        Returns:
            Nested dict: {sentence_id: {word_position: {features}}}
        """
        data = self.load_subject_data(subject_id)
        sentences = data['sentenceData']
        
        # Initialize result structure
        result = {}
        
        # Extract word-level data
        for sent_idx, sentence in enumerate(sentences):
            result[sent_idx] = {}
            
            try:
                words = sentence.word
                
                for word_idx, word in enumerate(words):
                    result[sent_idx][word_idx] = {
                        'word': word.content if hasattr(word, 'content') else '',
                        'sentence': sentence.content if hasattr(sentence, 'content') else '',
                        'eeg_features': {},
                    }
                    
                    # Extract EEG features
                    for feature in ['FFD', 'TRT', 'GD', 'GPT']:
                        for band in ['_t1', '_t2', '_a1', '_a2', '_b1', '_b2', '_g1', '_g2']:
                            feature_name = feature + band
                            if hasattr(word, feature_name):
                                result[sent_idx][word_idx]['eeg_features'][feature_name] = getattr(word, feature_name)
            
            except (AttributeError, IndexError) as e:
                print(f"Error processing sentence {sent_idx}: {e}")
                continue
        
        return result
    
    def get_sentences(self) -> Dict[int, str]:
        """Get all sentences in the dataset."""
        # Use first subject for consistency
        subject_id = self.get_subject_ids()[0]
        data = self.load_subject_data(subject_id)
        sentences = data['sentenceData']
        
        result = {}
        for sent_idx, sentence in enumerate(sentences):
            try:
                result[sent_idx] = sentence.content
            except AttributeError:
                continue
        
        return result

In [8]:
SR_DIR = '../zuco_data/zuco1.0/task1-SR/Matlab files'


In [None]:
zloader = ZucoDataLoader(SR_DIR)
subject_ids = zloader.get_subject_ids()
print(f"Available subject IDs: {subject_ids}")



Available subject IDs: ['resultsZKB_SR', 'resultsZDM_SR', 'resultsZJN_SR', 'resultsZAB_SR', 'resultsZKH_SR', 'resultsZMG_SR', 'resultsZGW_SR', 'resultsZKW_SR', 'resultsZJM_SR', 'resultsZDN_SR', 'resultsZJS_SR', 'resultsZPH_SR']


In [None]:
test_id = 'resultsZAB_SR'
test_subj = zloader.load_subject_data('resultsZAB_SR')
test_subj['sentenceData'][0]
test_feat = zloader.extract_word_level_features('resultsZAB_SR')
test_feat[0][3]['eeg_features']['FFD_t1'].shape

Loading data from ../zuco_data/zuco1.0/task1-SR/Matlab files/resultsZAB_SR.mat


Loading data from ../zuco_data/zuco1.0/task1-SR/Matlab files/resultsZAB_SR.mat


In [52]:
test_words_per_sentence = [test_feat[i].keys().__len__() for i in range(len(test_feat))]
print(f"Number of words per sentence: {words_per_sentence}")

Number of words per sentence: [22, 22, 16, 5, 13, 16, 19, 11, 28, 21, 18, 7, 7, 23, 8, 21, 13, 36, 28, 25, 21, 17, 16, 6, 7, 12, 8, 10, 9, 27, 16, 23, 17, 21, 5, 26, 28, 11, 22, 17, 3, 6, 9, 6, 11, 19, 11, 7, 12, 9, 6, 29, 23, 10, 24, 31, 9, 13, 5, 12, 16, 33, 26, 7, 13, 22, 11, 14, 28, 40, 33, 29, 16, 19, 24, 14, 13, 24, 13, 9, 5, 23, 12, 13, 36, 23, 15, 18, 20, 5, 14, 26, 26, 10, 8, 21, 21, 4, 23, 16, 23, 20, 11, 28, 20, 13, 27, 15, 13, 23, 14, 10, 20, 20, 21, 20, 17, 25, 14, 28, 29, 13, 9, 14, 11, 13, 6, 32, 15, 26, 35, 13, 18, 13, 18, 3, 9, 31, 27, 16, 26, 19, 25, 13, 15, 15, 18, 24, 5, 25, 9, 18, 8, 38, 21, 14, 5, 10, 21, 16, 26, 12, 23, 24, 14, 18, 20, 20, 23, 12, 24, 7, 27, 11, 18, 8, 16, 16, 19, 18, 18, 26, 18, 7, 14, 16, 22, 16, 18, 16, 15, 13, 28, 8, 24, 18, 23, 31, 5, 30, 21, 10, 6, 26, 12, 12, 12, 10, 18, 18, 25, 24, 34, 23, 5, 31, 22, 21, 11, 15, 32, 40, 10, 26, 10, 8, 26, 15, 15, 20, 21, 24, 24, 9, 19, 24, 7, 20, 27, 23, 13, 9, 4, 35, 25, 43, 20, 13, 6, 21, 15, 19, 29, 25

In [39]:
sentences = zloader.get_sentences()

Loading data from ../zuco_data/zuco1.0/task1-SR/Matlab files/resultsZKB_SR.mat


In [53]:
words_per_sentence = [len(sentences[i].split()) for i in range(len(sentences))]
print(f"Number of sentences: {len(sentences)}")
print(f"Average words per sentence: {np.mean(words_per_sentence)}")

Number of sentences: 400
Average words per sentence: 17.8225


In [None]:
# sanity check
not any(np.array(words_per_sentence) - np.array(test_words_per_sentence))

True

In [69]:
test_feat[0][3]['eeg_features']['FFD_t1'].shape

(105,)