In [42]:
import numpy as np
import scipy.io
import os
from collections import defaultdict
import pickle

In [9]:
from utils_ZuCo import *

In [10]:
dataset_dir = "/home/singh_shruti/ZuCo_dataset/"

In [11]:
task1_dir = dataset_dir + "task1_sr/Matlab files/"
task2_dir = dataset_dir + "task2_nr/Matlab files/"
task3_dir = dataset_dir + "task3_tsr/Matlab files/"

## Read data

In [93]:
subjects = ['ZAB', 'ZDM', 'ZGW', 'ZJM', 'ZJN', 'ZJS', 'ZKB', 'ZKH','ZKW', 'ZMG', 'ZPH', 'ZDN']

In [94]:
words_dict = defaultdict(int)
sentences_dict = defaultdict(int)

In [95]:
with open('data/rev_sentiment_gold.pkl', 'rb') as f:
    rev_sentiment_dict = pickle.load(f)

In [100]:
taskfile_paths = [task1_dir, task2_dir, task3_dir]
subjects_data = {}

sentence_level_feats = ["sent_mean", "sm_a1", "sm_a2", "sm_b1", "sm_b2", "sm_g1", "sm_g2", "sm_t1", "sm_t2"]
word_level_feats = ["word_mean", "FFD", "TRT", "GD", "GPT", "SFD", "wm_a1", "wm_a2", "wm_b1", "wm_b2", "wm_g1", "wm_g2", 
                    "wm_t1", "wm_t2"]

# First experiment with only the first task (sentiment analysis) data
for path in taskfile_paths[0:1]:
    print("Files: ", os.listdir(path))
    mat_files = [os.path.join(path,file) for file in os.listdir(path)]
    
    for filepath in mat_files:
        subject_id = filepath[-10:-7]  # Is of form '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZDN_SR.mat'
        subjects_data[subject_id] = {}
        
        data = io.loadmat(filepath, squeeze_me=True, struct_as_record=False)['sentenceData']

        for sent_item in data:
            #print(data[0].content)
            prc_sent = sent_item.content.strip()
            sentences_dict[prc_sent] += 1
            if prc_sent in rev_sentiment_dict:
                sentid = rev_sentiment_dict[prc_sent]['id']
                sent_mean_vec = []
                
#                 if np.all(np.isnan(sent_item.word)):
#                     continue
                
                subjects_data[subject_id][sentid] = {k: None for k in sentence_level_feats}
                if 'mean_a1' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_a1'])) and np.shape(sent_item.__dict__['mean_a1'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_a1"] = sent_item.__dict__['mean_a1']
                    sent_mean_vec.append(sent_item.__dict__['mean_a1'])
                else:
                    subjects_data[subject_id][sentid]["sm_a1"] = np.zeros(105, )
                
                if 'mean_a2' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_a2'])) and np.shape(sent_item.__dict__['mean_a2'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_a2"] = sent_item.__dict__['mean_a2']
                    sent_mean_vec.append(sent_item.__dict__['mean_a2'])
                else:
                    subjects_data[subject_id][sentid]["sm_a2"] =  np.zeros(105, )
                
                if 'mean_b1' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_b1'])) and np.shape(sent_item.__dict__['mean_b1'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_b1"] = sent_item.__dict__['mean_b1'] 
                    sent_mean_vec.append(subjects_data[subject_id][sentid]["sm_b1"])
                else:
                    subjects_data[subject_id][sentid]["sm_b1"] = np.zeros(105, )
                
                if 'mean_b2' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_b2'])) and np.shape(sent_item.__dict__['mean_b2'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_b2"] = sent_item.__dict__['mean_b2'] 
                    sent_mean_vec.append(sent_item.__dict__['mean_b2'])
                else:
                    subjects_data[subject_id][sentid]["sm_b2"] =  np.zeros(105, )
                
                if 'mean_g1' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_g1'])) and np.shape(sent_item.__dict__['mean_g1'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_g1"] = sent_item.__dict__['mean_g1']
                    sent_mean_vec.append(sent_item.__dict__['mean_g1'])
                else:
                    subjects_data[subject_id][sentid]["sm_g1"] = np.zeros(105, )
                
                if 'mean_g2' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_g2'])) and np.shape(sent_item.__dict__['mean_g2'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_g2"] = sent_item.__dict__['mean_g2']
                    sent_mean_vec.append(sent_item.__dict__['mean_g2'])
                else:
                    subjects_data[subject_id][sentid]["sm_g2"] =  np.zeros(105, )
                
                if 'mean_t1' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_t1'])) and np.shape(sent_item.__dict__['mean_t1'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_t1"] = sent_item.__dict__['mean_t1'] 
                    sent_mean_vec.append(subjects_data[subject_id][sentid]["sm_t1"])
                else:
                    subjects_data[subject_id][sentid]["sm_t1"] = np.zeros(105, )
                
                if 'mean_t2' in sent_item.__dict__ and not np.any(np.isnan(sent_item.__dict__['mean_t2'])) and np.shape(sent_item.__dict__['mean_t2'])[0] != 0:
                    subjects_data[subject_id][sentid]["sm_t2"] = sent_item.__dict__['mean_t2'] 
                    sent_mean_vec.append(sent_item.__dict__['mean_t2'])
                else:
                    subjects_data[subject_id][sentid]["sm_t2"] =  np.zeros(105, )
                subjects_data[subject_id][sentid]["sent_mean"] = np.mean(sent_mean_vec, axis=0)
                sent_mean_vec = []
                
                try:
                    # Process words data now
                    subjects_data[subject_id][sentid]["words"] = {}
                    subjects_data[subject_id][sentid]["words_list"] = []
                    word_counter = 0
                    inside_loop = False
                    for w_item in sent_item.word:
                        inside_loop = True
                        prc_word = w_item.content.strip().lower()
                        words_dict[prc_word] += 1
                        subjects_data[subject_id][sentid]["words_list"].append(w_item.content.strip())
                        subjects_data[subject_id][sentid]["words"][word_counter] = {k: None for k in word_level_feats}

                        word_mean_vec = []
                        for feat in ["FFD", "TRT", "GD", "GPT", "SFD"]:
                            if feat in w_item.__dict__ and np.shape(w_item.__dict__[feat]) != 0:
                                subjects_data[subject_id][sentid]["words"][word_counter][feat] = w_item.__dict__[feat]
                                word_mean_vec.append(w_item.__dict__[feat])
                            else:
                                subjects_data[subject_id][sentid]["words"][word_counter][feat] = np.zeros(105, )
                        subjects_data[subject_id][sentid]["words"][word_counter]["word_mean"] = np.mean(word_mean_vec, axis=0)
                        word_mean_vec = []

                        for f1 in ["_a1", "_a2", "_b1", "_b2", "_g1", "_g2", "_t1", "_t2"]:
                            f1_mean_vec = []
                            for f2 in ["FFD", "TRT", "GD", "GPT", "SFD"]:
                                composite_key = f2 + f1
                                if composite_key in w_item.__dict__ and np.shape(w_item.__dict__[composite_key])[0] != 0:
                                    f1_mean_vec.append(w_item.__dict__[composite_key])
                            if len(f1_mean_vec) > 0:
                                subjects_data[subject_id][sentid]["words"][word_counter]["wm" + f1] = np.mean(f1_mean_vec, axis=0)
                            else:
                                subjects_data[subject_id][sentid]["words"][word_counter]["wm" + f1] = np.zeros(105, )
                        word_counter += 1
                except Exception as ex:
                    if not inside_loop:
                        subjects_data[subject_id].pop(sentid)
                    print(ex)
#                     print(sent_item)
#                     print(prc_sent, sentid, w_item, ex)
            else:
                print("{} - Sent not found: {}".format(subject_id, prc_sent))
            
            # stop at one sentence (total 400)
            # break
        
        # stop at one subject (total 12)
        # break
    
    # stop at the first task (total 3)
    break

Files:  ['resultsZDN_SR.mat', 'resultsZKH_SR.mat', 'resultsZKB_SR.mat', 'resultsZJN_SR.mat', 'resultsZDM_SR.mat', 'resultsZKW_SR.mat', 'resultsZPH_SR.mat', 'resultsZMG_SR.mat', 'resultsZAB_SR.mat', 'resultsZJM_SR.mat', 'resultsZJS_SR.mat', 'resultsZGW_SR.mat']


  return array(a, dtype, copy=False, order=order, subok=True)


ZDN - Sent not found: Ultimately feels emp11111ty and unsatisfying, like swallowing a Communion wafer without the wine.
ZDN - Sent not found: Bullock's complete lack of focus and ability quickly derails the film.1


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' object is not iterable
'float' 

In [101]:
with open('data/sentiment_subject_data_new.pkl', 'wb') as f:
    pickle.dump(subjects_data, f)

In [92]:
400*12, sum(len(subjects_data[i]) for i in subjects)

(4800, 4616)

In [97]:
400*12, sum(len(subjects_data[i]) for i in subjects)

(4800, 4675)

In [80]:
len(subjects_data[subject_id])

393

In [79]:
sent_item.content

'Martyr gets royally screwed and comes back for more.'

In [78]:
sent_item.word

nan

In [67]:
sent_item.__dict__['mean_a1'] == nan

NameError: name 'nan' is not defined

In [47]:
'/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZDN_SR.mat'[-10:-7]

'ZDN'

In [56]:
dir(sent_item)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fieldnames',
 'allFixations',
 'answer_mean_a1',
 'answer_mean_a1_diff',
 'answer_mean_a2',
 'answer_mean_a2_diff',
 'answer_mean_b1',
 'answer_mean_b1_diff',
 'answer_mean_b2',
 'answer_mean_b2_diff',
 'answer_mean_g1',
 'answer_mean_g1_diff',
 'answer_mean_g2',
 'answer_mean_g2_diff',
 'answer_mean_t1',
 'answer_mean_t1_diff',
 'answer_mean_t2',
 'answer_mean_t2_diff',
 'content',
 'mean_a1',
 'mean_a1_diff',
 'mean_a1_diff_sec',
 'mean_a1_sec',
 'mean_a2',
 'mean_a2_diff',
 'mean_a2_diff_sec',
 'mean_a2_sec',
 'mean_b1',
 'mean_b1_diff',
 'mean_b1_diff_sec',
 'mean_b1_sec',
 'mean_b2',
 'mean_b2_dif

In [16]:
# List of sentences datastructure
len(data)

400

In [18]:
dir(data[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fieldnames',
 'allFixations',
 'answer_mean_a1',
 'answer_mean_a1_diff',
 'answer_mean_a2',
 'answer_mean_a2_diff',
 'answer_mean_b1',
 'answer_mean_b1_diff',
 'answer_mean_b2',
 'answer_mean_b2_diff',
 'answer_mean_g1',
 'answer_mean_g1_diff',
 'answer_mean_g2',
 'answer_mean_g2_diff',
 'answer_mean_t1',
 'answer_mean_t1_diff',
 'answer_mean_t2',
 'answer_mean_t2_diff',
 'content',
 'mean_a1',
 'mean_a1_diff',
 'mean_a1_diff_sec',
 'mean_a1_sec',
 'mean_a2',
 'mean_a2_diff',
 'mean_a2_diff_sec',
 'mean_a2_sec',
 'mean_b1',
 'mean_b1_diff',
 'mean_b1_diff_sec',
 'mean_b1_sec',
 'mean_b2',
 'mean_b2_dif

In [19]:
data[0].mean_a1

array([0.7034963 , 0.6258178 , 0.8860085 , 0.7699399 , 0.7714355 ,
       0.5110524 , 0.52470464, 0.69962174, 0.93221384, 0.90646076,
       0.8137055 , 0.55650216, 0.6928516 , 0.6534393 , 0.8337521 ,
       1.0126307 , 0.48507643, 0.6197668 , 0.7751927 , 0.54318595,
       0.7491286 , 0.91322684, 0.8836411 , 0.78567386, 0.6354318 ,
       0.53772336, 0.9543252 , 0.95500284, 0.9461102 , 0.8219461 ,
       0.45179316, 0.86706465, 0.9247909 , 1.0417749 , 0.9287476 ,
       0.6299081 , 0.63640034, 1.1421303 , 1.1596866 , 1.104492  ,
       1.523519  , 1.2779826 , 1.1187562 , 0.9509142 , 0.6946317 ,
       0.43775293, 1.8377604 , 1.7060714 , 1.4577321 , 1.1865503 ,
       0.8063267 , 0.9023357 , 2.4548678 , 1.9295872 , 1.5575138 ,
       1.2764499 , 2.630885  , 2.0274146 , 1.7164342 , 1.2034322 ,
       2.7603424 , 1.973675  , 1.3827966 , 1.0632343 , 0.7378168 ,
       0.7264999 , 0.47979254, 2.6444514 , 1.8167125 , 1.2998445 ,
       0.9866424 , 0.84460247, 0.7777054 , 2.294515  , 1.48287

In [39]:
data[0].mean_a1_diff

array([-0.03962821, -0.16031033, -0.006051  ,  0.00805688,  0.07023698,
       -0.05225641, -0.0346148 ,  0.24869365,  0.21169072,  0.16600877,
        0.18427384,  0.16133398,  0.06054109,  0.0586704 ,  0.12116051,
        0.06538439,  0.30067533,  0.17690182,  0.13555306, -0.03873003,
        0.04424071,  0.10631174,  0.12324637,  0.20491636,  0.38682747,
        0.19990784,  0.35613394,  0.398031  ,  0.25766933,  0.4467106 ,
        0.21070206,  0.12259886, -0.1298784 ,  0.03226012,  0.21139592,
        0.10671306,  0.39461863,  0.73092186,  0.33637   ,  0.11589098,
        0.3336376 ,  0.21321559,  0.06850988, -0.03186822,  0.15563929,
       -0.07533395,  0.13652086, -0.04618245], dtype=float32)

In [20]:
first_sent_word_data = data[0].word

In [24]:
dir(first_sent_word_data[0])

['FFD',
 'FFD_a1',
 'FFD_a1_diff',
 'FFD_a2',
 'FFD_a2_diff',
 'FFD_b1',
 'FFD_b1_diff',
 'FFD_b2',
 'FFD_b2_diff',
 'FFD_g1',
 'FFD_g1_diff',
 'FFD_g2',
 'FFD_g2_diff',
 'FFD_pupilsize',
 'FFD_t1',
 'FFD_t1_diff',
 'FFD_t2',
 'FFD_t2_diff',
 'GD',
 'GD_a1',
 'GD_a1_diff',
 'GD_a2',
 'GD_a2_diff',
 'GD_b1',
 'GD_b1_diff',
 'GD_b2',
 'GD_b2_diff',
 'GD_g1',
 'GD_g1_diff',
 'GD_g2',
 'GD_g2_diff',
 'GD_pupilsize',
 'GD_t1',
 'GD_t1_diff',
 'GD_t2',
 'GD_t2_diff',
 'GPT',
 'GPT_a1',
 'GPT_a1_diff',
 'GPT_a2',
 'GPT_a2_diff',
 'GPT_b1',
 'GPT_b1_diff',
 'GPT_b2',
 'GPT_b2_diff',
 'GPT_g1',
 'GPT_g1_diff',
 'GPT_g2',
 'GPT_g2_diff',
 'GPT_pupilsize',
 'GPT_t1',
 'GPT_t1_diff',
 'GPT_t2',
 'GPT_t2_diff',
 'SFD',
 'SFD_a1',
 'SFD_a1_diff',
 'SFD_a2',
 'SFD_a2_diff',
 'SFD_b1',
 'SFD_b1_diff',
 'SFD_b2',
 'SFD_b2_diff',
 'SFD_g1',
 'SFD_g1_diff',
 'SFD_g2',
 'SFD_g2_diff',
 'SFD_pupilsize',
 'SFD_t1',
 'SFD_t1_diff',
 'SFD_t2',
 'SFD_t2_diff',
 'TRT',
 'TRT_a1',
 'TRT_a1_diff',
 'TRT_a2',
 'TR

In [25]:
feat_keys = ['FFD', 'TRT', 'GD', 'GPT', 'SFD']

In [37]:
mean_vec = []
for w in first_sent_word_data:
    if np.shape(w.FFD_a1)[0] != 0:
        mean_vec.append(w.FFD_a1)
    else:
        mean_vec.append(np.zeros(105, ))
    if np.shape(w.TRT_a1)[0] != 0:
        mean_vec.append(w.TRT_a1)
    else:
        mean_vec.append(np.zeros(105,))
    if np.shape(w.GD_a1)[0] != 0:
        mean_vec.append(w.GD_a1)
    else:
        mean_vec.append(np.zeros(105,))
    if np.shape(w.GPT_a1)[0] != 0:
        mean_vec.append(w.GPT_a1)
    else:
        mean_vec.append(np.zeros(105,))
    if np.shape(w.SFD_a1)[0] != 0:
        mean_vec.append(w.SFD_a1)
    else:
        mean_vec.append(np.zeros(105,))

In [36]:
# mean_vec = []
# for w in first_sent_word_data:
#     if np.shape(w.FFD_a1)[0] == 0:
#         print("FFD")
#         #mean_vec.append(w.FFD_a1)
#     if np.shape(w.TRT_a1)[0] == 0:
#         print("TRT")
#         #mean_vec.append(w.TRT_a1)
#     if np.shape(w.GD_a1)[0] == 0:
#         print("GD")
#         #mean_vec.append(w.GD_a1)
#     if np.shape(w.GPT_a1)[0] == 0:
#         print("GPT")
#         #mean_vec.append(w.GPT_a1)
#     if np.shape(w.SFD_a1)[0] == 0:
#         print("SFD")
#         #mean_vec.append(w.SFD_a1)

In [35]:
np.shape(w.FFD_a1)

(105,)

In [38]:
np.mean(mean_vec)

0.6054074901356522

In [30]:
w.SFD_a1

array([0.40591857, 0.75719452, 0.76280004, 0.94856644, 1.05752206,
       0.50663656, 0.63521439, 0.75372767, 0.82306552, 0.83693457,
       0.66437054, 0.70226735, 0.76326489, 0.73722726, 0.6980688 ,
       1.00000942, 0.4407219 , 0.66637623, 0.64499789, 0.3702977 ,
       0.59544927, 0.66627342, 0.49173701, 0.44956049, 0.3528302 ,
       0.31686187, 0.33261576, 0.53838074, 0.46792671, 0.31606403,
       0.31704211, 0.40655693, 0.88960081, 0.5740729 , 0.51724094,
       0.50712609, 0.60694355, 1.03674746, 1.12922704, 1.0517832 ,
       1.27990973, 0.99842197, 0.77063876, 0.57528192, 0.38948461,
       0.28921601, 0.96650958, 0.97208166, 0.91678911, 0.79044932,
       0.3661375 , 0.45487836, 1.24283755, 0.92326736, 0.95216793,
       0.72229791, 1.2043165 , 0.84532601, 1.00261462, 0.60786444,
       1.25376236, 0.67564827, 0.56797945, 0.47103262, 0.37136737,
       0.44428673, 0.46344185, 1.20341754, 0.74520987, 0.55240291,
       0.38696954, 0.60517997, 0.46290782, 1.18297613, 0.69435

In [66]:
words_dict

defaultdict(int,
            {'a': 2,
             'basic': 1,
             'beyond': 1,
             'care': 1,
             'case': 1,
             'decency.': 1,
             'dictums': 1,
             'failing': 1,
             'for': 1,
             'good': 1,
             'human': 1,
             'of': 1,
             'presents': 1,
             'provide': 1,
             'reason': 1,
             'the': 1,
             'to': 2,
             'us': 1,
             'very': 1,
             'while': 1})

In [67]:
import spacy

In [68]:
nlp = spacy.load('en_core_web_sm')

In [70]:
import nltk

In [81]:
nltk.word_tokenize("He was falling.")[2]

AttributeError: 'str' object has no attribute 'lemma'

In [85]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [90]:
nltk.pos_tag(['he', 'is', 'falling'])

[('he', 'PRP'), ('is', 'VBZ'), ('falling', 'VBG')]

In [95]:
wordnet_lemmatizer.lemmatize('falling', 'v')

'fall'

In [96]:
wordnet_lemmatizer.lemmatize??

### Extract data

In [3]:
ls $dataset_dir

[0m[38;5;9manswers.zip[0m  [38;5;27mscripts[0m/     [38;5;27mtask1_sr[0m/     [38;5;9mtask2_NR.zip[0m   [38;5;9mtask_materials.zip[0m
nohup.out    [38;5;9mscripts.zip[0m  [38;5;9mtask1_SR.zip[0m  [38;5;9mtask3_TSR.zip[0m  wget-log


In [17]:
#!unzip ~/ZuCo_dataset/task2_NR.zip -d task2_nr

In [18]:
#!unzip ~/ZuCo_dataset/task3_TSR.zip -d task3_tsr

In [14]:
ls task2_nr/Preprocessed/

relations_normal_reading_control_questions.csv  [0m[38;5;27mZAB[0m/  [38;5;27mZGW[0m/  [38;5;27mZJS[0m/  [38;5;27mZKW[0m/
relations_normal_reading.csv                    [38;5;27mZDM[0m/  [38;5;27mZJM[0m/  [38;5;27mZKB[0m/  [38;5;27mZMG[0m/
sentencesNR.mat                                 [38;5;27mZDN[0m/  [38;5;27mZJN[0m/  [38;5;27mZKH[0m/  [38;5;27mZPH[0m/


In [16]:
!head relations_normal_reading.csv

head: cannot open ‘relations_normal_reading.csv’ for reading: No such file or directory


In [19]:
pwd

'/home/singh_shruti/workspace/aiproj'

In [23]:
ls

0_DataExploration.ipynb  nohup.out  [0m[38;5;27mscripts[0m/  utils_ZuCo.py


### EEG processing imports

In [4]:
from utils_ZuCo import *

In [5]:
import os
path = "/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/"

In [21]:
os.listdir(path)

['resultsZDN_SR.mat',
 'resultsZKH_SR.mat',
 'resultsZKB_SR.mat',
 'resultsZJN_SR.mat',
 'resultsZDM_SR.mat',
 'resultsZKW_SR.mat',
 'resultsZPH_SR.mat',
 'resultsZMG_SR.mat',
 'resultsZAB_SR.mat',
 'resultsZJM_SR.mat',
 'resultsZJS_SR.mat',
 'resultsZGW_SR.mat']

In [19]:
for file in os.listdir(path)[1:]:
    print(file)

resultsZKH_SR.mat
resultsZKB_SR.mat
resultsZJN_SR.mat
resultsZDM_SR.mat
resultsZKW_SR.mat
resultsZPH_SR.mat
resultsZMG_SR.mat
resultsZAB_SR.mat
resultsZJM_SR.mat
resultsZJS_SR.mat
resultsZGW_SR.mat


In [7]:
files, len(files)

NameError: name 'files' is not defined

In [17]:
sorted(files)

['/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZAB_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZDM_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZGW_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZJM_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZJN_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZJS_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZKB_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZKH_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZKW_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZMG_SR.mat',
 '/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZPH_SR.mat']

### Visualize data

In [99]:
# datatransform_t1 = DataTransformer('task1', level='word', scaling='min-max', fillna='zeros')

In [100]:
# type(datatransform_t1)

In [101]:
# sbjs_t1 = [datatransform_t1(i) for i in range(12)]

In [97]:
data = io.loadmat("/home/singh_shruti/ZuCo_dataset/task1_sr/Matlab files/resultsZAB_SR.mat", squeeze_me=True, struct_as_record=False)['sentenceData']

In [98]:
type(data)

numpy.ndarray

In [103]:
len(data)

400

In [104]:
# get all field names for sentence data
print(data[0]._fieldnames)

['content', 'rawData', 'mean_t1', 'mean_t2', 'mean_a1', 'mean_a2', 'mean_b1', 'mean_b2', 'mean_g1', 'mean_g2', 'mean_t1_sec', 'mean_t2_sec', 'mean_a1_sec', 'mean_a2_sec', 'mean_b1_sec', 'mean_b2_sec', 'mean_g1_sec', 'mean_g2_sec', 'mean_t1_diff', 'mean_t2_diff', 'mean_a1_diff', 'mean_a2_diff', 'mean_b1_diff', 'mean_b2_diff', 'mean_g1_diff', 'mean_g2_diff', 'mean_t1_diff_sec', 'mean_t2_diff_sec', 'mean_a1_diff_sec', 'mean_a2_diff_sec', 'mean_b1_diff_sec', 'mean_b2_diff_sec', 'mean_g1_diff_sec', 'mean_g2_diff_sec', 'word', 'omissionRate', 'allFixations', 'wordbounds', 'answer_mean_t1', 'answer_mean_t2', 'answer_mean_a1', 'answer_mean_a2', 'answer_mean_b1', 'answer_mean_b2', 'answer_mean_g1', 'answer_mean_g2', 'answer_mean_t1_diff', 'answer_mean_t2_diff', 'answer_mean_a1_diff', 'answer_mean_a2_diff', 'answer_mean_b1_diff', 'answer_mean_b2_diff', 'answer_mean_g1_diff', 'answer_mean_g2_diff']


In [105]:
# example: print sentence
print(data[0].content)

Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.


In [112]:
dir(data[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fieldnames',
 'allFixations',
 'answer_mean_a1',
 'answer_mean_a1_diff',
 'answer_mean_a2',
 'answer_mean_a2_diff',
 'answer_mean_b1',
 'answer_mean_b1_diff',
 'answer_mean_b2',
 'answer_mean_b2_diff',
 'answer_mean_g1',
 'answer_mean_g1_diff',
 'answer_mean_g2',
 'answer_mean_g2_diff',
 'answer_mean_t1',
 'answer_mean_t1_diff',
 'answer_mean_t2',
 'answer_mean_t2_diff',
 'content',
 'mean_a1',
 'mean_a1_diff',
 'mean_a1_diff_sec',
 'mean_a1_sec',
 'mean_a2',
 'mean_a2_diff',
 'mean_a2_diff_sec',
 'mean_a2_sec',
 'mean_b1',
 'mean_b1_diff',
 'mean_b1_diff_sec',
 'mean_b1_sec',
 'mean_b2',
 'mean_b2_dif

In [116]:
data[0]._fieldnames

['content',
 'rawData',
 'mean_t1',
 'mean_t2',
 'mean_a1',
 'mean_a2',
 'mean_b1',
 'mean_b2',
 'mean_g1',
 'mean_g2',
 'mean_t1_sec',
 'mean_t2_sec',
 'mean_a1_sec',
 'mean_a2_sec',
 'mean_b1_sec',
 'mean_b2_sec',
 'mean_g1_sec',
 'mean_g2_sec',
 'mean_t1_diff',
 'mean_t2_diff',
 'mean_a1_diff',
 'mean_a2_diff',
 'mean_b1_diff',
 'mean_b2_diff',
 'mean_g1_diff',
 'mean_g2_diff',
 'mean_t1_diff_sec',
 'mean_t2_diff_sec',
 'mean_a1_diff_sec',
 'mean_a2_diff_sec',
 'mean_b1_diff_sec',
 'mean_b2_diff_sec',
 'mean_g1_diff_sec',
 'mean_g2_diff_sec',
 'word',
 'omissionRate',
 'allFixations',
 'wordbounds',
 'answer_mean_t1',
 'answer_mean_t2',
 'answer_mean_a1',
 'answer_mean_a2',
 'answer_mean_b1',
 'answer_mean_b2',
 'answer_mean_g1',
 'answer_mean_g2',
 'answer_mean_t1_diff',
 'answer_mean_t2_diff',
 'answer_mean_a1_diff',
 'answer_mean_a2_diff',
 'answer_mean_b1_diff',
 'answer_mean_b2_diff',
 'answer_mean_g1_diff',
 'answer_mean_g2_diff']

In [49]:
# get word level data
word_data = data[0].word
word_data

array([<scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896f98>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f8968d0>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896898>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896908>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896ac8>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896978>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896860>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896a20>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896d30>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc16f896e10>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc177dbf588>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc177dbf2b0>,
       <scipy.io.matlab.mio5_params.mat_struct object at 0x7fc177dbf320>,
       <scipy.io.matlab.mio5_params.ma

In [106]:
# get names of all word features
# index of the array `word_data` is the number of the word
print(word_data[0]._fieldnames)

['content', 'fixPositions', 'nFixations', 'meanPupilSize', 'rawEEG', 'rawET', 'FFD', 'FFD_pupilsize', 'FFD_t1', 'FFD_t2', 'FFD_a1', 'FFD_a2', 'FFD_b1', 'FFD_b2', 'FFD_g1', 'FFD_g2', 'FFD_t1_diff', 'FFD_t2_diff', 'FFD_a1_diff', 'FFD_a2_diff', 'FFD_b1_diff', 'FFD_b2_diff', 'FFD_g1_diff', 'FFD_g2_diff', 'TRT', 'TRT_pupilsize', 'TRT_t1', 'TRT_t2', 'TRT_a1', 'TRT_a2', 'TRT_b1', 'TRT_b2', 'TRT_g1', 'TRT_g2', 'TRT_t1_diff', 'TRT_t2_diff', 'TRT_a1_diff', 'TRT_a2_diff', 'TRT_b1_diff', 'TRT_b2_diff', 'TRT_g1_diff', 'TRT_g2_diff', 'GD', 'GD_pupilsize', 'GD_t1', 'GD_t2', 'GD_a1', 'GD_a2', 'GD_b1', 'GD_b2', 'GD_g1', 'GD_g2', 'GD_t1_diff', 'GD_t2_diff', 'GD_a1_diff', 'GD_a2_diff', 'GD_b1_diff', 'GD_b2_diff', 'GD_g1_diff', 'GD_g2_diff', 'GPT', 'GPT_pupilsize', 'GPT_t1', 'GPT_t2', 'GPT_a1', 'GPT_a2', 'GPT_b1', 'GPT_b2', 'GPT_g1', 'GPT_g2', 'GPT_t1_diff', 'GPT_t2_diff', 'GPT_a1_diff', 'GPT_a2_diff', 'GPT_b1_diff', 'GPT_b2_diff', 'GPT_g1_diff', 'GPT_g2_diff', 'SFD', 'SFD_pupilsize', 'SFD_t1', 'SFD_t2', 

In [107]:
# example: get first word
print(word_data[0].content)

Presents


In [26]:
# example: get number of fixations of first word
print(word_data[0].nFixations)

4


In [111]:
type(word_data[0].FFD_t2), word_data[0].FFD_t2.shape

(numpy.ndarray, (105,))

In [110]:
type(word_data[0].FFD_t1), word_data[0].FFD_t1.shape, word_data[0].FFD_t1

(numpy.ndarray,
 (105,),
 array([0.19429664, 0.17923741, 0.36307213, 0.50747943, 0.62206709,
        0.51871073, 0.42868289, 0.06850591, 0.28140983, 0.57861644,
        1.08024967, 0.36859021, 0.36467823, 0.41722685, 1.14768839,
        0.55932534, 0.4871715 , 0.79681689, 0.51210332, 0.40454227,
        0.55180544, 0.43953407, 0.75510144, 0.38622764, 0.43004686,
        0.29673383, 0.3575944 , 0.36961138, 0.3506383 , 0.85081506,
        1.26278675, 0.41491231, 0.3718234 , 0.45663175, 0.42179471,
        1.03561676, 0.85461956, 0.43418464, 0.46566522, 0.33808869,
        0.49680769, 0.46951395, 0.50899029, 0.15184164, 1.6930244 ,
        0.76735342, 0.57302761, 0.94475245, 0.7187323 , 1.07659662,
        0.72896671, 1.66536224, 1.39709103, 1.28687668, 1.10441923,
        1.43177247, 1.32517946, 1.66541123, 1.60669196, 1.8083328 ,
        1.39522731, 1.99193358, 1.38413417, 1.54831302, 1.34435618,
        1.0585711 , 0.49228552, 1.85182977, 1.85807848, 1.41647136,
        1.35392964, 1.0

In [27]:
eeg_features_list = ['FFD_t1', 'FFD_t2', 'FFD_a1', 'FFD_a2', 'FFD_b1', 'FFD_b2', 
                     'FFD_g1', 'FFD_g2', 'TRT_t1', 'TRT_t2', 'TRT_a1', 'TRT_a2', 
                     'TRT_b1', 'TRT_b2', 'TRT_g1', 'TRT_g2', 'GD_t1', 'GD_t2', 
                     'GD_a1', 'GD_a2', 'GD_b1', 'GD_b2', 'GD_g1', 'GD_g2', 'GPT_t1', 
                     'GPT_t2', 'GPT_a1', 'GPT_a2', 'GPT_b1', 'GPT_b2', 'GPT_g1', 
                     'GPT_g2',  'SFD_t1', 'SFD_t2', 'SFD_a1', 'SFD_a2', 'SFD_b1', 
                     'SFD_b2', 'SFD_g1', 'SFD_g2']

In [34]:
for i in eeg_features_list:
    if not i in word_data[0]._fieldnames:
        print(i)

In [33]:
arr_sizes = []

for i in eeg_features_list:
    arr_sizes.append(len(word_data[0]))
        

TypeError: object of type 'mat_struct' has no len()

In [36]:
dir(word_data[0])

['FFD',
 'FFD_a1',
 'FFD_a1_diff',
 'FFD_a2',
 'FFD_a2_diff',
 'FFD_b1',
 'FFD_b1_diff',
 'FFD_b2',
 'FFD_b2_diff',
 'FFD_g1',
 'FFD_g1_diff',
 'FFD_g2',
 'FFD_g2_diff',
 'FFD_pupilsize',
 'FFD_t1',
 'FFD_t1_diff',
 'FFD_t2',
 'FFD_t2_diff',
 'GD',
 'GD_a1',
 'GD_a1_diff',
 'GD_a2',
 'GD_a2_diff',
 'GD_b1',
 'GD_b1_diff',
 'GD_b2',
 'GD_b2_diff',
 'GD_g1',
 'GD_g1_diff',
 'GD_g2',
 'GD_g2_diff',
 'GD_pupilsize',
 'GD_t1',
 'GD_t1_diff',
 'GD_t2',
 'GD_t2_diff',
 'GPT',
 'GPT_a1',
 'GPT_a1_diff',
 'GPT_a2',
 'GPT_a2_diff',
 'GPT_b1',
 'GPT_b1_diff',
 'GPT_b2',
 'GPT_b2_diff',
 'GPT_g1',
 'GPT_g1_diff',
 'GPT_g2',
 'GPT_g2_diff',
 'GPT_pupilsize',
 'GPT_t1',
 'GPT_t1_diff',
 'GPT_t2',
 'GPT_t2_diff',
 'SFD',
 'SFD_a1',
 'SFD_a1_diff',
 'SFD_a2',
 'SFD_a2_diff',
 'SFD_b1',
 'SFD_b1_diff',
 'SFD_b2',
 'SFD_b2_diff',
 'SFD_g1',
 'SFD_g1_diff',
 'SFD_g2',
 'SFD_g2_diff',
 'SFD_pupilsize',
 'SFD_t1',
 'SFD_t1_diff',
 'SFD_t2',
 'SFD_t2_diff',
 'TRT',
 'TRT_a1',
 'TRT_a1_diff',
 'TRT_a2',
 'TR

In [53]:
word_data[0].SFD_g2, word_data[0].FFD_g2

(array([], dtype=float64),
 array([0.7923798 , 0.63489175, 1.02781081, 0.80003172, 0.64115089,
        0.32353818, 0.47877821, 0.7482419 , 1.02459919, 0.66422927,
        0.54948336, 0.51821971, 0.72873878, 0.63881034, 0.5773499 ,
        0.71563071, 0.35449752, 0.59358221, 0.59261787, 0.43347245,
        0.60748667, 0.65205139, 0.56229717, 0.48488885, 0.34160981,
        0.45462072, 0.71885067, 0.6506626 , 0.51853442, 0.39688364,
        0.19659023, 0.7033326 , 0.81556404, 0.4577812 , 0.44225255,
        0.5253005 , 0.31454113, 0.66364211, 1.13975155, 0.66868407,
        0.80140257, 0.88212639, 0.4877941 , 0.31965712, 0.27589515,
        0.23692214, 0.85550123, 0.76653475, 0.65327018, 0.47429478,
        0.37908867, 0.58112049, 0.91268718, 0.74974769, 0.78024673,
        0.68926531, 0.85103428, 0.7415393 , 0.81330562, 0.70207113,
        0.81092423, 0.74885523, 0.76991349, 0.73557526, 0.57275552,
        0.50937349, 0.37304649, 0.70928764, 0.68744558, 0.78911626,
        0.73298335, 0

In [51]:
dir(word_data[0])

['FFD',
 'FFD_a1',
 'FFD_a1_diff',
 'FFD_a2',
 'FFD_a2_diff',
 'FFD_b1',
 'FFD_b1_diff',
 'FFD_b2',
 'FFD_b2_diff',
 'FFD_g1',
 'FFD_g1_diff',
 'FFD_g2',
 'FFD_g2_diff',
 'FFD_pupilsize',
 'FFD_t1',
 'FFD_t1_diff',
 'FFD_t2',
 'FFD_t2_diff',
 'GD',
 'GD_a1',
 'GD_a1_diff',
 'GD_a2',
 'GD_a2_diff',
 'GD_b1',
 'GD_b1_diff',
 'GD_b2',
 'GD_b2_diff',
 'GD_g1',
 'GD_g1_diff',
 'GD_g2',
 'GD_g2_diff',
 'GD_pupilsize',
 'GD_t1',
 'GD_t1_diff',
 'GD_t2',
 'GD_t2_diff',
 'GPT',
 'GPT_a1',
 'GPT_a1_diff',
 'GPT_a2',
 'GPT_a2_diff',
 'GPT_b1',
 'GPT_b1_diff',
 'GPT_b2',
 'GPT_b2_diff',
 'GPT_g1',
 'GPT_g1_diff',
 'GPT_g2',
 'GPT_g2_diff',
 'GPT_pupilsize',
 'GPT_t1',
 'GPT_t1_diff',
 'GPT_t2',
 'GPT_t2_diff',
 'SFD',
 'SFD_a1',
 'SFD_a1_diff',
 'SFD_a2',
 'SFD_a2_diff',
 'SFD_b1',
 'SFD_b1_diff',
 'SFD_b2',
 'SFD_b2_diff',
 'SFD_g1',
 'SFD_g1_diff',
 'SFD_g2',
 'SFD_g2_diff',
 'SFD_pupilsize',
 'SFD_t1',
 'SFD_t1_diff',
 'SFD_t2',
 'SFD_t2_diff',
 'TRT',
 'TRT_a1',
 'TRT_a1_diff',
 'TRT_a2',
 'TR

In [54]:
word_data[0].__dict__['SFD_g2'], word_data[0].__dict__['FFD_g2']

(array([], dtype=float64),
 array([0.7923798 , 0.63489175, 1.02781081, 0.80003172, 0.64115089,
        0.32353818, 0.47877821, 0.7482419 , 1.02459919, 0.66422927,
        0.54948336, 0.51821971, 0.72873878, 0.63881034, 0.5773499 ,
        0.71563071, 0.35449752, 0.59358221, 0.59261787, 0.43347245,
        0.60748667, 0.65205139, 0.56229717, 0.48488885, 0.34160981,
        0.45462072, 0.71885067, 0.6506626 , 0.51853442, 0.39688364,
        0.19659023, 0.7033326 , 0.81556404, 0.4577812 , 0.44225255,
        0.5253005 , 0.31454113, 0.66364211, 1.13975155, 0.66868407,
        0.80140257, 0.88212639, 0.4877941 , 0.31965712, 0.27589515,
        0.23692214, 0.85550123, 0.76653475, 0.65327018, 0.47429478,
        0.37908867, 0.58112049, 0.91268718, 0.74974769, 0.78024673,
        0.68926531, 0.85103428, 0.7415393 , 0.81330562, 0.70207113,
        0.81092423, 0.74885523, 0.76991349, 0.73557526, 0.57275552,
        0.50937349, 0.37304649, 0.70928764, 0.68744558, 0.78911626,
        0.73298335, 0