In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

from read_data import *
from features import * 
from special_character import * 



#### single file

In [None]:
# Load one fixation file on its own and report how many rows it contains.
file_path = "dundee_corpus/english/sa02ma1p.dat"

# NOTE(review): the variable is called sf03ma1p but the file read is
# sa02ma1p.dat -- confirm which participant/text file was intended.
sf03ma1p = pd.read_csv(
    file_path,
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    sep=r'\s+',
    skiprows=1,  # the first line of the file is skipped; column names are given below
    names=['WORD', 'TEXT', 'LINE', 'OLEN', 'WLEN', 'XPOS', 'WNUM', 'FDUR', 'OBLP', 'WDLP', 'LAUN', 'TXFR'],
    encoding='windows-1252',
)

print(len(sf03ma1p))

### MA1P.DAT files

In [None]:
#len(result_df[result_df['Text File'] == '02'])
#print(len(result_df))
#filtered_df = result_df[(result_df['Text File'] == '03') & (result_df['Participant ID'] == 'sf') & (result_df['LINE'] == 3) & (result_df['TEXT'] == 8)]
#print(filtered_df)
#print(result_df[:7430])

In [None]:
def preprocess_data(result_df):
    """Pull the columns used by the downstream steps out of ``result_df``.

    The input frame is not modified: missing words are replaced in a new
    Series rather than in place.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain the columns 'WORD', 'WNUM', 'WLEN', 'XPOS', 'WDLP',
        'OBLP', 'FDUR', 'Participant ID' and 'Text File'.

    Returns
    -------
    tuple
        ``(words, wnum, wlen, xpos, wdlp, oblp, fdur, words_to_ignore,
        participant_id, text_file)`` where

        * ``words`` is the 'WORD' column with missing values replaced by the
          string ``'0'``;
        * ``wdlp`` and ``oblp`` are plain Python lists;
        * ``words_to_ignore`` is the fixed list ``['*Off-screen', '*Blink']``;
        * every other element is the corresponding column as a Series.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    # fillna without inplace returns a new Series, so the caller's frame keeps
    # its original NaN entries regardless of the pandas copy-on-write mode.
    words = result_df['WORD'].fillna('0')
    wnum = result_df["WNUM"]
    wlen = result_df["WLEN"]
    xpos = result_df['XPOS']
    wdlp = list(result_df['WDLP'])
    oblp = list(result_df['OBLP'])
    fdur = result_df['FDUR']
    participant_id = result_df['Participant ID']
    text_file = result_df["Text File"]
    # Marker entries that appear in the WORD column (e.g. '*Blink' rows).
    words_to_ignore = ['*Off-screen', '*Blink']
    return words, wnum, wlen, xpos, wdlp, oblp, fdur, words_to_ignore, participant_id, text_file

In [None]:
# Directory passed to the loader below.
directory = "dundee_corpus/english/"
# read_data_ma1p comes from one of the star-imports at the top of the notebook
# (presumably read_data -- confirm). Its result is the frame that all later
# cells work on.
result_df = read_data_ma1p(directory)

In [15]:
# Preview the loaded frame from row position 7434 onward.
result_df[7434:]

Unnamed: 0,WORD,TEXT,LINE,OLEN,WLEN,XPOS,WNUM,FDUR,OBLP,WDLP,LAUN,TXFR,Participant ID,Text File
7434,'I,40,4,2,1,47,2464,162,0,-1,-11,-99,sa,04
7435,"thought,",40,4,8,7,57,2465,157,7,7,-10,24,sa,04
7436,a,40,4,1,1,69,2468,199,1,1,-12,1434,sa,04
7437,strange,40,5,7,7,5,2470,226,5,5,-99,8,sa,04
7438,project.'',40,5,10,7,14,2472,252,1,1,-9,-99,sa,04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437969,ahead,40,4,5,5,53,2452,254,4,4,-8,9,sj,20
437970,too.,40,4,4,3,58,2453,256,3,3,-5,56,sj,20
437971,hair.,40,4,5,4,60,2395,94,5,5,-2,12,sj,20
437972,*Blink,40,-99,5,4,-99,-99,65,-99,-99,-99,-99,sj,20


In [None]:

# Step 2: Preprocess data
# Unpack the columns needed below. These names are read as globals by main(),
# so they must keep exactly these spellings.
words, wnum, wlen, xpos, wdlp, oblp, fdur, words_to_ignore, participant_id, text_file = preprocess_data(result_df)

# Step 3: Clean text
# remove_special_characters yields (cleaned_text, special_char) pairs; split
# them into two parallel lists.
cleaned_text_list = remove_special_characters(words, words_to_ignore)
cleaned_text_only = [cleaned_text for cleaned_text, special_char in cleaned_text_list]
special_char_only = [special_char for cleaned_text, special_char in cleaned_text_list]

# Step 4: Tokenize words into subwords
# Done twice: once on the raw words and once on the cleaned words ("cleaned_" prefix).
subwords, makeup_words = tokenize_words_into_subwords(words, words_to_ignore)
cleaned_subwords, cleaned_makeup_words = tokenize_words_into_subwords(cleaned_text_only, words_to_ignore)

# Step 5: Calculate subword information
# Same computation for the raw and the cleaned ("cl_" prefix) subwords.
num_subwords, subword_lengths, subword_infos = calculate_subword_info(subwords, wnum, words_to_ignore)
cl_num_subwords, cl_subword_lengths, cl_subword_infos = calculate_subword_info(cleaned_subwords, wnum, words_to_ignore)

# Step 6: Analyze gaze landing
# The raw subwords are paired with OBLP and the cleaned subwords with WDLP.
# NOTE(review): confirm this pairing of landing-position columns is intended.
fixation_letter, subpart_subword, fixation_position = gaze_landed_on_subwords(subwords, oblp, words_to_ignore)
cl_fixation_letter, cl_subpart_subword, cl_fixation_position = gaze_landed_on_subwords(cleaned_subwords, wdlp, words_to_ignore)


In [13]:
def main():
    """Assemble the per-row results into one DataFrame.

    Takes no arguments: every value is read from notebook-level variables
    that the earlier cells must already have defined. The column order of the
    returned frame follows the order of ``column_specs``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``column_specs``.
    """
    # Step 7: Create DataFrame
    # (column label, notebook-level variable holding that column's values)
    column_specs = [
        ('id', participant_id),
        ('text_file', text_file),
        ('WORD', words),
        ('SBW', subwords),
        ('WLEN', wlen),
        ('SWONUM', num_subwords),
        ('SWNUM', cl_num_subwords),
        ('SWOLEN', subword_lengths),
        ('SWLEN', cl_subword_lengths),
        ('SPCHAR', special_char_only),
        ('SWIDX', subword_infos),
        ('XPOS', xpos),
        ('SWD', cl_subpart_subword),
        ('SWDLP', cl_fixation_position),
        ('SWDLL', cl_fixation_letter),
        ('FDUR', fdur),
    ]
    return pd.DataFrame(dict(column_specs))

# Build the combined DataFrame. main() only assembles variables that the
# earlier cells have already computed; it does not rerun the processing steps.
df1 = main()


In [14]:
# Preview the assembled frame from row position 7434 onward.
df1[7434:]

Unnamed: 0,id,text_file,WORD,SBW,WLEN,SWONUM,SWNUM,SWOLEN,SWLEN,SPCHAR,SWIDX,XPOS,SWD,SWDLP,SWDLL,FDUR
7434,sa,04,'I,"[', I]",1,2,1,"[1, 1]",[1],['],"[2464-1, 2464-2]",47,[],[],[],162
7435,sa,04,"thought,","[thought, ,]",7,2,1,"[7, 1]",[7],"[,]","[2465-1, 2465-2]",57,[thought],[7],[t],157
7436,sa,04,a,[a],1,1,1,[1],[1],0,[2468-1],69,[a],[1],[a],199
7437,sa,04,strange,"[str, ange]",7,2,2,"[3, 4]","[3, 4]",0,"[2470-1, 2470-2]",5,[ange],[2],[n],226
7438,sa,04,project.'',"[project, .'']",7,2,2,"[7, 3]","[7, 2]",['],"[2472-1, 2472-2]",14,[project],[1],[p],252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437969,sj,20,ahead,[ahead],5,1,1,[5],[5],0,[2452-1],53,[ahead],[4],[a],254
437970,sj,20,too.,"[too, .]",3,2,1,"[3, 1]",[3],[.],"[2453-1, 2453-2]",58,[too],[3],[o],256
437971,sj,20,hair.,"[hair, .]",4,2,1,"[4, 1]",[4],[.],"[2395-1, 2395-2]",60,[],[],[],94
437972,sj,20,*Blink,*Blink,4,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,65
