In [1]:
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
from word import Word

In [2]:
datatype = {'index': str,
            'round': str, 
            'session': str,
            'line_part': str,
            'participant': str,
            'stimuli_image': str,
            'stimuli_text': str,
            'duration': 'int64',
            'x_cord': 'float64',
            'y_cord': 'float64',
            'aoi_x': 'float64',
            'aoi_y': 'float64', 
            'aoi_width': 'float64', 
            'aoi_height': 'float64',
            'token': str,
            'length': 'int64'}

df = pd.read_csv('EZReader_Filtered_Fixation_Result.csv', dtype=datatype)

In [3]:
df = df.drop(columns=["Unnamed: 0"])

In [4]:
df = df.dropna(subset=['token'])

In [5]:
df

Unnamed: 0,index,round,session,participant,line_part,stimuli_image,stimuli_text,timestamp,duration,x_cord,y_cord,aoi_x,aoi_y,aoi_width,aoi_height,token,length
0,11001_0,1,1,1,line 12 part 3,TEX_R1S1_bg.png,TEX_R1S1_text.txt,263.0,120,722.360000,488.112500,700.5,467.0,102.0,32.0,brought,7
1,11001_1,1,1,1,line 13 part 3,TEX_R1S1_bg.png,TEX_R1S1_text.txt,263.0,120,722.360000,488.112500,708.5,500.0,96.0,32.0,broker,6
2,11001_2,1,1,1,line 12 part 3,TEX_R1S1_bg.png,TEX_R1S1_text.txt,519.0,255,736.400394,491.933725,700.5,467.0,102.0,32.0,brought,7
3,11001_3,1,1,1,line 13 part 3,TEX_R1S1_bg.png,TEX_R1S1_text.txt,519.0,255,736.400394,491.933725,708.5,500.0,96.0,32.0,broker,6
4,11001_4,1,1,1,line 12 part 4,TEX_R1S1_bg.png,TEX_R1S1_text.txt,698.0,161,809.180746,494.226708,803.5,467.0,28.0,32.0,to,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454973,92314_302,9,2,314,line 7 part 5,TEX_R9S2_bg.png,TEX_R9S2_text.txt,59342.0,150,964.618662,267.236005,928.5,269.0,159.0,32.0,every,5
454974,92314_305,9,2,314,line 6 part 6,TEX_R9S2_bg.png,TEX_R9S2_text.txt,59573.0,148,1046.345271,264.358783,946.5,236.0,130.0,32.0,frequently,10
454975,92314_306,9,2,314,line 7 part 5,TEX_R9S2_bg.png,TEX_R9S2_text.txt,59573.0,148,1046.345271,264.358783,928.5,269.0,159.0,32.0,every,5
454976,92314_307,9,2,314,line 6 part 7,TEX_R9S2_bg.png,TEX_R9S2_text.txt,59946.0,364,1081.592856,260.325826,1077.5,236.0,65.0,32.0,finds,5


In [6]:
df['line'] = df['line_part'].apply(lambda line_part: line_part.split()[1])

In [7]:
df['line'] = df['line'].astype('int')

In [9]:
df['block'] = df['block'].apply(lambda line: 1 if line == 1 else math.ceil( (line - 1) / 4 ) + 1)

In [8]:
df = df.set_index('index')

In [9]:
lexicon = {}

participants = {}

for row in tqdm(df.iterrows()):
    
    index = row[0]
    cols = row[1]
    
    round_id = index[0]
    session_id = index[1]
    file_id = round_id + session_id
    
    participant_id = index[2:5]
    
    line_part = cols['line_part']
    _, line_id, _, part_id = line_part.split()
    
    timestamp = cols['timestamp']
    duration = cols['duration']
    length = cols['length']
    frequency = 0
    predict = 0
    level = 0
    
    token = cols['token']
    
    key = f'{file_id}_{line_id}_{part_id}_{participant_id}'
    
    if lexicon.get(key, 0) == 0:
        lexicon[key] = Word(file_id, line_part, token)
        
    lexicon[key].add(participant=participant_id, 
                     duration=duration, 
                     start_time=timestamp, 
                     length=length, 
                     freq=frequency, 
                     predict=predict,
                     level=level)
    
    if participants.get(file_id, 0) == 0:
        participants[file_id] = []
    
    if participant_id not in participants[file_id]:
        participants[file_id].append(participant_id)

454772it [00:43, 10541.25it/s]


In [11]:
lines = dict()

for key, word in tqdm(lexicon.items()):
    file, line, part, subject = key.split('_')
    
    line_key = file + '_' + str(line) + '_' + subject

    if lines.get(line_key, 0) == 0:
        lines[line_key] = []
    
    lines[line_key].append(word)

100%|██████████| 220937/220937 [00:00<00:00, 688680.63it/s]


In [13]:
lines

{'11_12_001': [<word.Word at 0x11ddb5850>,
  <word.Word at 0x11ddb5a90>,
  <word.Word at 0x1233c9850>,
  <word.Word at 0x1233c98b0>,
  <word.Word at 0x1233c98e0>,
  <word.Word at 0x1233c9910>],
 '11_13_001': [<word.Word at 0x11ddb5910>,
  <word.Word at 0x11ddb5a00>,
  <word.Word at 0x1233c9970>,
  <word.Word at 0x1233c99d0>,
  <word.Word at 0x1233c9880>],
 '11_2_001': [<word.Word at 0x11ddb5b20>,
  <word.Word at 0x11ddb5970>,
  <word.Word at 0x11ddb57f0>,
  <word.Word at 0x11ddb5af0>,
  <word.Word at 0x11ddb5bb0>,
  <word.Word at 0x11ddb5b50>,
  <word.Word at 0x11ddb59a0>],
 '11_3_001': [<word.Word at 0x11ddb5c40>,
  <word.Word at 0x11ddb5ca0>,
  <word.Word at 0x11ddb5c70>,
  <word.Word at 0x11ddb5d00>,
  <word.Word at 0x11ddb5cd0>,
  <word.Word at 0x11ddb5d30>,
  <word.Word at 0x1233da7f0>],
 '11_4_001': [<word.Word at 0x11ddb5d60>,
  <word.Word at 0x11ddb5d90>,
  <word.Word at 0x11ddb5dc0>,
  <word.Word at 0x11ddb5df0>,
  <word.Word at 0x11ddb5e20>,
  <word.Word at 0x11ddb5e50>],
 '1

In [18]:
def get_length(file_block):
    round_id = file_block[0][0]
    session_id = file_block[0][1]

    with open(f'datasets/GazeBase_TEX/StimuliExamples/TEXTexts/TEX_R{round_id}S{session_id}_text.txt') as file:
        lines = [line for line in file.readlines() if line != '\n']

        return len(lines[0].split())

In [20]:
headers = ['file',
           'line',
           'tokens',
           'subject',
           'duration',
           'nSFD',
           'nFFD',
           'nGD',
           'nTT',
           'PrF',
           'Pr1',
           'Pr2',
           'PrS',
           'length',
           'wpm']

result = pd.DataFrame(columns=headers)

for key, words in tqdm(lines.items()):
    file, line, subject = key.split('_')
    
    tokens = ""
    duration = np.sum([np.sum(word._durations) for word in words])
    
    SFD = np.mean([word.single_fix_duration() / word.get_leng() for word in words])
    FFD = np.mean([word.first_fix_duration() / word.get_leng() for word in words])
    GD = np.mean([word.gaze_duration() / word.get_leng() for word in words])
    TT = np.mean([word.total_time() / word.get_leng() for word in words])
    
    line_length = get_length([file, line])
    
    one_fix = len([word for word in words if len(word._durations) == 1])
    two_or_more_fix = len([word for word in words if len(word._durations) > 1])
    
    PrF = len(words) / line_length
    Pr1 = one_fix / line_length
    Pr2 = two_or_more_fix / line_length
    PrS = (line_length - len(words)) / line_length
    

    length = 0
    wpm = 0
    
    values = [file, line, tokens, subject, duration, SFD, FFD, GD, TT, PrF, Pr1, Pr2, PrS, length, wpm]
    
    dic = dict(zip(headers, values))
    
    result = result.append(dic, ignore_index=True)

100%|██████████| 41800/41800 [06:15<00:00, 111.29it/s]


In [21]:
result

Unnamed: 0,file,line,tokens,subject,duration,nSFD,nFFD,nGD,nTT,PrF,Pr1,Pr2,PrS,length,wpm
0,11,12,,001,3556,,32.914484,32.914484,100.328770,3.000000,0.500000,2.500000,-2.000000,0,0
1,11,13,,001,1585,,53.780000,53.780000,73.113333,2.500000,2.000000,0.500000,-1.500000,0,0
2,11,2,,001,4013,,52.490136,52.490136,136.861905,3.500000,0.500000,3.000000,-2.500000,0,0
3,11,3,,001,2598,,61.333333,61.333333,123.916667,3.500000,0.500000,3.000000,-2.500000,0,0
4,11,4,,001,2205,,49.544444,49.544444,98.366667,3.000000,1.000000,2.000000,-2.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41795,92,20,,314,2320,,79.569444,79.569444,115.150926,0.545455,0.272727,0.272727,0.454545,0,0
41796,92,21,,314,1365,,80.800000,80.800000,102.800000,0.454545,0.363636,0.090909,0.545455,0,0
41797,92,23,,314,1560,,46.873333,46.873333,73.263333,0.454545,0.181818,0.272727,0.545455,0,0
41798,92,24,,314,2087,,64.219444,64.219444,106.519444,0.545455,0.363636,0.181818,0.454545,0,0


In [22]:
result.to_csv("EZReader_Filtered_Result_Line_Level.csv")