In [15]:
from numpy import mean, sum

In [25]:
class Word:
    """Fixation records and reading-time measures for one word of one file.

    A ``Word`` is created from a file name, a ``"line <n> part <m>"`` label
    and the word's text.  Fixations are then appended one at a time with
    :meth:`add`; the measure methods aggregate over everything added so far.

    The probability methods (``fix_probability``, ``prob_of_one_fix``,
    ``prob_of_2_or_more_fix``, ``prob_of_skip``) read the module-level
    ``participants`` dictionary (file name -> list of participant ids), which
    must exist before they are called.  Participant ids are expected to be
    hashable (they are strings in this notebook).
    """

    # Successive fixations (ordered by start time) are counted as one gaze
    # while their start times are less than this far apart.
    # NOTE(review): presumably milliseconds (the CSV columns are named
    # duration_ms / start_ms) -- confirm the intended threshold.
    _GAZE_GAP = 100

    def __init__(self, file, line_part, text):
        """Create an empty word record.

        file      -- name of the stimulus file the word belongs to (str)
        line_part -- label whose 2nd and 4th whitespace-separated tokens are
                     the line and part numbers, e.g. "line 1 part 2"
        text      -- the word itself (str)
        """
        self._file = file

        tokenized_line_part = line_part.split()
        self._line = int(tokenized_line_part[1])
        self._part = int(tokenized_line_part[3])
        self._text = text

        # parallel lists: entry i of each list describes fixation i
        self._participants = []
        self._durations = []
        self._start_times = []

        # word-level attributes; placeholders until the first add()
        self._length = -1
        self._freq = -1
        self._predict = -1
        # defined here so get_level() works before any fixation is added
        self._level = None

    def add(self, participant, duration, start_time, length, freq, predict, level):
        """Record one fixation and refresh the word-level attributes.

        duration and start_time are converted with int(); length and freq
        with int(float()); predict with float().  length, freq, predict and
        level overwrite the values stored by earlier calls.
        """
        self._participants.append(participant)
        self._durations.append(int(duration))
        self._start_times.append(int(start_time))
        self._length = int(float(length))
        self._freq = int(float(freq))
        self._predict = float(predict)
        self._level = level

    def get_basic_info(self):
        """Return the raw stored data as one string, fields joined by two spaces."""
        return "  ".join([
            self._file,
            str(self._line),
            str(self._part),
            self._text,
            str(self._participants),
            str(self._durations),
            str(self._start_times),
            str(self._length),
            str(self._freq),
            str(self._predict),
        ])

    def get_text(self):
        """Return the word's text."""
        return self._text

    def get_freq(self):
        """Return the stored frequency (-1 until add() has been called)."""
        return self._freq

    def get_pred(self):
        """Return the stored predictability rounded to two decimals."""
        return round(self._predict, 2)

    def get_leng(self):
        """Return the stored length (-1 until add() has been called)."""
        return self._length

    def get_subject(self):
        """Return the participant of the first recorded fixation.

        Raises IndexError when no fixation has been added.
        """
        return self._participants[0]

    def get_level(self):
        """Return the level passed to the latest add(), or None if none yet."""
        return self._level

    def single_fix_duration(self):
        """Mean duration over participants who fixated the word exactly once.

        Returns NaN when no participant made exactly one fixation.
        """
        singles = [
            duration
            for person, duration in zip(self._participants, self._durations)
            if self._is_unique(person, self._participants)
        ]
        return self._rounded_mean(singles)

    def first_fix_duration(self):
        """Mean, over participants, of the duration of their earliest fixation.

        Returns NaN when no fixation has been added.
        """
        firsts = [
            self._durations[
                self._find_first_fix(self._get_index_of_person_fixations(person))
            ]
            for person in self._unique_participants()
        ]
        return self._rounded_mean(firsts)

    def gaze_duration(self):
        """Mean, over participants, of their gaze duration on the word.

        Returns NaN when no fixation has been added.
        """
        gazes = [
            self._person_gaze_duation(self._get_index_of_person_fixations(person))
            for person in self._unique_participants()
        ]
        return self._rounded_mean(gazes)

    def total_time(self):
        """Sum of all fixation durations divided by the number of fixating
        participants, rounded to two decimals (NaN when there are none)."""
        fixators = len(self._unique_participants())
        if fixators == 0:
            return float("nan")
        return round(sum(self._durations) / fixators, 2)

    def fix_probability(self):
        """Participants with at least one fixation / participants in the file."""
        return round(len(self._unique_participants()) / len(participants[self._file]), 2)

    def prob_of_one_fix(self):
        """Participants with exactly one fixation / participants in the file."""
        once = [
            person
            for person in self._unique_participants()
            if self._is_unique(person, self._participants)
        ]
        return round(len(once) / len(participants[self._file]), 2)

    def prob_of_2_or_more_fix(self):
        """Participants with two or more fixations / participants in the file."""
        repeated = [
            person
            for person in self._unique_participants()
            if not self._is_unique(person, self._participants)
        ]
        return round(len(repeated) / len(participants[self._file]), 2)

    def prob_of_skip(self):
        """Participants of the file with no fixation here / participants in the file."""
        everyone = participants[self._file]
        # computed once instead of once per participant of the file
        fixators = self._unique_participants()
        skippers = [person for person in everyone if person not in fixators]
        return round(len(skippers) / len(everyone), 2)

    @staticmethod
    def _rounded_mean(values):
        """Return mean(values) rounded to two decimals, or NaN for no values.

        The explicit NaN avoids the RuntimeWarning that taking the mean of an
        empty list emits while yielding the same value.
        """
        if not values:
            return float("nan")
        return round(mean(values), 2)

    def _unique_participants(self):
        """Return the participants without repetition, in first-seen order."""
        return list(dict.fromkeys(self._participants))

    def _is_unique(self, item, some_list):
        """Return True if item occurs exactly once in some_list."""
        return some_list.count(item) == 1

    def _get_index_of_person_fixations(self, someone):
        """Return the indexes of all fixations made by someone."""
        return [
            index
            for index, person in enumerate(self._participants)
            if person == someone
        ]

    def _find_first_fix(self, list_of_fix_indexes):
        """Return the index (from the given list) with the smallest start time.

        Ties go to the index listed first.  Raises on an empty list.
        """
        return min(list_of_fix_indexes, key=lambda index: self._start_times[index])

    def _person_gaze_duation(self, list_of_fix_indexes):
        """Sum the durations of one person's first run of fixations on the word.

        The fixations are visited in order of start time, beginning with the
        earliest; the run ends at the first fixation that starts _GAZE_GAP or
        more after the previous one.  Sorting matters because the fixations
        are not necessarily added in temporal order.
        """
        ordered = sorted(list_of_fix_indexes, key=lambda index: self._start_times[index])

        total = 0
        previous_start = self._start_times[ordered[0]]
        for index in ordered:
            if self._start_times[index] - previous_start >= self._GAZE_GAP:
                # every later fixation starts even later, so the gaze is over
                break
            total += self._durations[index]
            previous_start = self._start_times[index]

        return total
        
        

In [26]:
#smoke test for every Word measure, using a hand-made word
#file "nt1", label "line 1 part 1"
my_word = Word("nt1", "line 1 part 1", "test_word")

#each row: participant, duration, start_time, level
#(length, freq and predict are the same for every fixation of the word)
for subject, fix_duration, fix_start, subject_level in [
    ("1", "100", "10", "1"),
    ("1", "200", "15", "1"),
    ("1", "300", "125", "1"),
    ("2", "200", "10", "2"),
    ("2", "250", "20", "2"),
    ("3", "300", "100", "3"),
]:
    my_word.add(subject, fix_duration, fix_start, "11", "55.0", "0.0555", subject_level)

#participant "4" never fixates the word, so it counts as a skip
participants = {"nt1": ["1", "2", "3", "4"]}

#each row: measure name (also the method name), expected-value note
for measure_name, expected_note in [
    ("single_fix_duration", "\tshould print: 300"),
    ("first_fix_duration", "\tshould print: 200"),
    ("gaze_duration", "\tshould print: 350"),
    ("total_time", "\tshould print: 450.0"),
    ("fix_probability", "\tshould print: 0.75"),
    ("prob_of_one_fix", "\tshould print: 0.25"),
    ("prob_of_2_or_more_fix", "\tshould print: 0.5"),
    ("prob_of_skip", "\tshould print: 0.25"),
]:
    print(measure_name, getattr(my_word, measure_name)(), expected_note)




single_fix_duration 300.0 	should print: 300
first_fix_duration 200.0 	should print: 200
gaze_duration 350.0 	should print: 350
total_time 450.0 	should print: 450.0
fix_probability 0.75 	should print: 0.75
prob_of_one_fix 0.25 	should print: 0.25
prob_of_2_or_more_fix 0.5 	should print: 0.5
prob_of_skip 0.25 	should print: 0.25


In [27]:

#dictionary of all words
#file + line_part + participant as key and word object as value
lexicon = {}

#dictionary to list all participants in each experiment (or file)
participants = {}

#read Novice-code-filtered.csv; the with-block closes the file even if
#reading fails
with open('Novice-code-filtered.csv', 'rt') as file:
    text = file.read()

lines = text.split('\n')

#a file ending in a newline leaves one empty string at the end; drop only
#that, so the last record is kept when the file has no trailing newline
records = lines[:-1] if lines[-1] == '' else lines

skip_next_word = False

line_num = 0
for line_num, line in enumerate(records, start=1):
    print(line_num, line)

    #the row after a ';' row is skipped
    #NOTE(review): this skips exactly one row, not every row of the next
    #word -- confirm that is the intent
    if skip_next_word:
        skip_next_word = False
        continue

    if line_num == 1: continue #skip header

    tokens = line.split(',')

    line_part = tokens[1].split()

    #skip every row labelled "line 1 part 1"
    if line_part[1] == line_part[3] == "1":
        continue

    #skip rows whose text ends in ';' (endswith is safe on an empty field)
    if tokens[25].endswith(';'):
        skip_next_word = True
        continue

    #one Word per file, line-part, and subject
    key = tokens[0] + tokens[1] + tokens[2]
    if key not in lexicon:
        lexicon[key] = Word(tokens[0], tokens[1], tokens[25]) #file, line_part, text

    #participant, duration, start_time, len, freq, predict, level (the file name)
    lexicon[key].add(tokens[2], tokens[7], tokens[8], tokens[27], tokens[28], tokens[29], tokens[0])

    #participants dictionary: unique participant ids per file, in first-seen order
    file_participants = participants.setdefault(tokens[0], [])
    if tokens[2] not in file_participants:
        file_participants.append(tokens[2])
       


1 trial,aoi_sub_line,participant,text_type,text_nr,fix_x,fix_y,duration_ms,start_ms,end_ms,fix_x_original,fix_y_original,exp_id,trial_id,program,offset_kind,aoi_line,is_expert,kind,name,x,y,width,height,local_id,wordl1_sc1,line 1 part 1,6,sc,1
2 l1_sc1,line 1 part 1,4,sc,1,406,355,142,617,759,406,355,1,4,sc1,0,line 1,0,sub-line,line 1 part 1,582,357,65,50,,public,public,6,19831.13901,0.042731726
3 l1_sc1,line 1 part 1,4,sc,1,407,394,242,125,367,407,394,1,4,sc1,0,line 1,0,sub-line,line 1 part 1,582,357,65,50,,public,public,6,19831.13901,0.042731726
4 l1_sc1,line 1 part 1,8,sc,1,634,404,250,250,500,634,404,1,8,sc1,0,line 1,0,sub-line,line 1 part 1,582,357,65,50,,public,public,6,19831.13901,0.042731726
5 l1_sc1,line 1 part 1,4,sc,1,411,370,250,12724,12974,411,370,1,4,sc1,0,line 1,0,sub-line,line 1 part 1,582,357,65,50,,public,public,6,19831.13901,0.042731726
6 l1_sc1,line 1 part 1,4,sc,1,429,366,234,375,609,429,366,1,4,sc1,0,line 1,0,sub-line,line 1 part 1,582,357,65,50,,public,public,6,1

2019 l2_sc3,line 4 part 2,10,sc,3,919,506,134,16737,16871,919,508,3,10,sc3,0,line 4,0,sub-line,line 4 part 2,913,477,8,44,,(,(,1,1602.756868,2.54E-05
2020 l2_sc3,line 4 part 2,11,sc,3,916,495,301,16829,17130,916,507,3,11,sc3,0,line 4,0,sub-line,line 4 part 2,913,477,8,44,,(,(,1,1602.756868,2.54E-05
2021 l2_sc3,line 4 part 2,12,sc,3,921,510,242,9553,9795,921,562,3,12,sc3,0,line 4,0,sub-line,line 4 part 2,913,477,8,44,,(,(,1,1602.756868,2.54E-05
2022 l2_sc3,line 4 part 3,12,sc,3,934,507,318,13491,13809,934,559,3,12,sc3,0,line 4,0,sub-line,line 4 part 3,927,477,8,44,,),),1,6182.963337,0.001069234
2023 l2_sc3,line 5 part 1,3,sc,3,800,548,176,18965,19141,800,552,3,3,sc3,0,line 5,0,sub-line,line 5 part 1,728,527,125,44,,word.replace,word.replace,12,0,2.35E-07
2024 l2_sc3,line 5 part 1,3,sc,3,797,541,225,8469,8694,797,545,3,3,sc3,0,line 5,0,sub-line,line 5 part 1,728,527,125,44,,word.replace,word.replace,12,0,2.35E-07
2025 l2_sc3,line 5 part 1,11,sc,3,793,550,242,9170,9412,793,562,3,11,sc3,0,

4154 l3_sc3,line 2 part 1,10,sc,3,701,190,151,10295,10446,701,190,3,10,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4155 l3_sc3,line 2 part 1,12,sc,3,689,186,125,1944,2069,689,186,3,12,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4156 l3_sc3,line 2 part 1,3,sc,3,704,207,384,90029,90413,704,207,3,3,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4157 l3_sc3,line 2 part 1,10,sc,3,646,196,192,16194,16386,646,196,3,10,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4158 l3_sc3,line 2 part 1,10,sc,3,698,188,191,16395,16586,698,188,3,10,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4159 l3_sc3,line 2 part 1,5,sc,3,694,192,276,3412,3688,694,192,3,5,sc3,0,line 2,0,sub-line,line 2 part 1,643,179,63,32,,private,private,7,6816.692337,0.014667274
4160 l3_sc3,

6403 l4_sc3,line 3 part 4,4,sc,3,535,421,250,9971,10221,535,466,3,4,sc3,0,line 3,0,sub-line,line 3 part 4,699,387,40,49,,140,140,3,0,1.01E-06
6404 l4_sc3,line 3 part 5,3,sc,3,748,433,159,31856,32015,748,443,3,3,sc3,0,line 3,0,sub-line,line 3 part 5,746,387,6,49,,;,;,1,135664.4453,1.01E-06
6405 l4_sc3,line 4 part 1,11,sc,3,600,488,117,22252,22369,600,493,3,11,sc3,0,line 4,0,sub-line,line 4 part 1,579,444,27,49,,int,int,3,7523.832539,0.006635084
6406 l4_sc3,line 4 part 1,11,sc,3,598,454,184,22060,22244,598,459,3,11,sc3,0,line 4,0,sub-line,line 4 part 1,579,444,27,49,,int,int,3,7523.832539,0.006635084
6407 l4_sc3,line 4 part 1,11,sc,3,596,470,333,5432,5765,596,475,3,11,sc3,0,line 4,0,sub-line,line 4 part 1,579,444,27,49,,int,int,3,7523.832539,0.006635084
6408 l4_sc3,line 4 part 1,10,sc,3,599,453,225,37888,38113,599,438,3,10,sc3,0,line 4,0,sub-line,line 4 part 1,579,444,27,49,,int,int,3,7523.832539,0.006635084
6409 l4_sc3,line 4 part 1,10,sc,3,602,460,226,40891,41117,602,445,3,10,sc3,0,lin

8153 l6_sc1,line 7 part 16,3,sc,1,982,435,325,82718,83043,982,445,1,3,sc1,none,line 7,0,sub-line,line 7 part 16,974,419,23,30,,ch,ch,2,143.3421695,1.01E-06
8154 l6_sc1,line 7 part 16,4,sc,1,975,438,216,39782,39998,975,458,1,4,sc1,none,line 7,0,sub-line,line 7 part 16,974,419,23,30,,ch,ch,2,143.3421695,1.01E-06
8155 l6_sc1,line 7 part 17,5,sc,1,1025,429,283,58822,59105,1025,444,1,5,sc1,none,line 7,0,sub-line,line 7 part 17,1003,419,25,30,,>=,>=,2,642.6471014,1.01E-06
8156 l6_sc1,line 7 part 17,5,sc,1,1006,448,225,59114,59339,1006,463,1,5,sc1,none,line 7,0,sub-line,line 7 part 17,1003,419,25,30,,>=,>=,2,642.6471014,1.01E-06
8157 l6_sc1,line 7 part 17,5,sc,1,1010,423,175,43962,44137,1010,438,1,5,sc1,none,line 7,0,sub-line,line 7 part 17,1003,419,25,30,,>=,>=,2,642.6471014,1.01E-06
8158 l6_sc1,line 7 part 17,3,sc,1,1020,449,259,33449,33708,1020,459,1,3,sc1,none,line 7,0,sub-line,line 7 part 17,1003,419,25,30,,>=,>=,2,642.6471014,1.01E-06
8159 l6_sc1,line 7 part 18,5,sc,1,1042,420,267,46565

In [28]:
#sanity check of the participants dictionary: list the participant ids
#recorded for file 'l1_sc1'
participants['l1_sc1']

['13', '11', '4', '8', '1', '10', '12', '6', '7', '5', '2', '9']

In [29]:
#sanity check of the lexicon: look up one Word by its key
#(file + line_part + participant) and show its raw stored data
lexicon["l1_sc1" + "line 1 part 2" +"6"].get_basic_info()

"l1_sc1  1  2  class  ['6', '6']  [100, 100]  [1705, 25]  5  2563  0.01059009"

In [32]:

#output everything into a file called Novice-EZ-style-output-persubject.csv
#one CSV row per lexicon entry, columns as in the header written below

#the with-block flushes and closes the file even if a measure raises
#part-way through the loop
with open('Novice-EZ-style-output-persubject.csv', 'wt') as out:
    #writing the header
    out.write("SFD,FFD,GD,TT,PrF,Pr1,Pr2,PrS,word,freq,pred,leng,subj,level" + "\n")

    count = 0
    for word in lexicon:
        count += 1
        entry = lexicon[word]  #look the Word up once per row

        #write data
        fields = [
            str(entry.single_fix_duration()),
            str(entry.first_fix_duration()),
            str(entry.gaze_duration()),
            str(entry.total_time()),
            str(entry.fix_probability()),
            str(entry.prob_of_one_fix()),
            str(entry.prob_of_2_or_more_fix()),
            str(entry.prob_of_skip()),
            entry.get_text(),
            str(entry.get_freq()),
            str(entry.get_pred()),
            str(entry.get_leng()),
            str(entry.get_subject()),
            str(entry.get_level())[:2],  #only the first two characters of the level
        ]
        out.write(",".join(fields) + "\n")

In [33]:
#number of Word entries in the lexicon (one per file, line-part and participant)
len(lexicon)

2584

round, session, participant, code_file, code_language, timestamp, duration, x_cord, y_cord, aoi_x, aoi_y, aoi_width, aoi_height, token, length