# Description
This script counts stimuli and screencast frames, checks blind frames (aka animations), evaluates the Finder output and computes the ratio of stimuli pixels to screencast pixels.

### Imports

In [None]:
from PIL import Image
from collections import defaultdict
from collections import OrderedDict
import os
import re
import pandas as pd
import numpy as np

# Settings

In [None]:
# Dataset folders
dataset_stimuli_dir = r'C:/GazeMiningDataset/Dataset_stimuli'
dataset_visual_change_dir = r'C:/GazeMiningDataset/Dataset_visual_change'
dataset_overall_evaluation_dir = r'C:/GazeMiningDataset/Dataset_evaluation/quality'

# Participants and screencast dimensions
participants = ['p1', 'p2', 'p3', 'p4']
screencast_width = 1024
screencast_height = 768

# Categories
shopping = ['walmart', 'amazon', 'steam']
news = ['reddit', 'cnn', 'guardian']
health = ['nih', 'webmd', 'mayo']
cars = ['gm', 'nissan', 'kia']
categories = {'shopping': shopping, 'news': news, 'health': health, 'cars': cars}

# Name of the root-layer folder per site (the only part of the path that differs between sites)
_root_layer_names = {
    'amazon': '0_html',
    'steam': '0_html',
    'walmart': '1_html',

    'cnn': '0_html',
    'guardian': '1_html',
    'reddit': '0_html',

    'mayo': '0_html',
    'nih': '1_html',
    'webmd': '1_html',

    'nissan': '0_html',
    'gm': '2_html',
    'kia': '0_html',
}

# Folder with stimuli from root layer
root_layer_stimuli_dir = {
    site: dataset_stimuli_dir + '/' + site + '/stimuli/' + layer_name
    for site, layer_name in _root_layer_names.items()
}

# Regex
# The dot is escaped so that only a literal '.png' suffix matches
# (an unescaped '.' would also accept names such as 'notapng').
png_regex = re.compile(r'\.png$') # to filter for .png files

# Counting

In [None]:
# Report stimuli and frame counts for every site of every category
for sites_of_category in categories.values():
    for site in sites_of_category:

        # Stimuli from root layer: .png files directly inside the root-layer folder
        root_dir = root_layer_stimuli_dir[site]
        stimuli_count_root = 0
        for entry in os.listdir(root_dir):
            if os.path.isfile(os.path.join(root_dir, entry)) and png_regex.search(entry):
                stimuli_count_root += 1

        # Stimuli across all layers: .png files inside every sub-folder of the site's stimuli folder
        stimuli_count = 0
        stimuli_dir = os.path.join(dataset_stimuli_dir, site, 'stimuli')
        for layer in os.listdir(stimuli_dir):
            layer_dir = os.path.join(stimuli_dir, layer)
            if not os.path.isdir(layer_dir):
                continue # plain files next to the layer folders are not layers
            for entry in os.listdir(layer_dir):
                if os.path.isfile(os.path.join(layer_dir, entry)) and png_regex.search(entry):
                    stimuli_count += 1

        # Frame counts in screencasts, summed over all participants
        frame_count = 0
        for participant in participants:
            meta = pd.read_csv(dataset_visual_change_dir + '/' + participant + '/' + site + '_meta.csv')
            frame_count += meta.iloc[0, 2] # screencast_frame_total_count

        # Print information (the trailing empty string yields the blank separator line)
        print(
            site,
            'Stimuli count (root layer): ' + str(stimuli_count_root),
            'Stimuli count: ' + str(stimuli_count),
            'Frame count: ' + str(frame_count),
            '',
            sep='\n')

# Blind frames

In [None]:
print('Blind frames of root layer')

# Prepare variables
# The dot is escaped so that only a literal '-shots.csv' suffix matches
shots_regex = re.compile(r'-shots\.csv$') # to filter for -shots.csv files
blind_frame_idxs = defaultdict(list) # site -> blind frame idxs of all its root-layer shots
blind_frame_freq = defaultdict(int) # blind frame count of a shot -> number of such shots (overall)
blind_frame_front_freq = defaultdict(int) # same, counting only blind frames at the front of a shot
blind_frame_back_freq = defaultdict(int) # same, counting only blind frames at the back of a shot

# Go over categories
for (_,sites) in categories.items():
    
    # Go over sites
    for site in sites:
        
        # Retrieve shots file per stimulus of root layer
        root_dir = root_layer_stimuli_dir[site]
        shots_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir,f))]
        shots_files = list(filter(shots_regex.search, shots_files))

        # Folder holding the per-shot '-blind.csv' files of this site (same for every shot)
        shots_dir = os.path.join(dataset_stimuli_dir, site, 'shots')

        site_count = 0 # count of blind frames per site
        site_front_count = 0
        site_back_count = 0

        # For each shots file, get blind frames
        for shots_file in shots_files:
            df_shots = pd.read_csv(root_dir + "/" + shots_file)
            
            for session, shot_idx, frame_idx_start, frame_idx_end in zip(df_shots.session_id, df_shots.shot_idx, df_shots.frame_idx_start, df_shots.frame_idx_end): 
                
                # Load data
                df_blind = pd.read_csv(shots_dir + '/' + session + '_' + str(shot_idx) + '-blind.csv')
                shot_blind_idxs = list(df_blind.frame_idx)
                
                # Check whether blind frames are added in the front or back:
                # a blind frame before the first non-blind frame of the shot is 'front',
                # every other blind frame is 'back'.
                non_blind_frames = set(range(frame_idx_start, frame_idx_end+1, 1)) - set(shot_blind_idxs)
                # NOTE: if every frame of the shot is blind there is no first non-blind frame;
                # the default then places the boundary right after the shot, so its blind
                # frames are counted as 'front' instead of min() raising a ValueError.
                min_id = min(non_blind_frames, default=frame_idx_end+1)
                front_count = sum(1 for blind_frame_idx in shot_blind_idxs if blind_frame_idx < min_id)
                back_count = len(shot_blind_idxs) - front_count
                
                # Compute counts
                count = len(shot_blind_idxs) # count of blind frames
                blind_frame_idxs[site] += shot_blind_idxs # blind frame idxs per site
                blind_frame_freq[count] += 1 # frequency of blind frame count
                blind_frame_front_freq[front_count] += 1 # frequency of blind frame count at front
                blind_frame_back_freq[back_count] += 1 # frequency of blind frame count at back
                site_count += count # count of blind frames per site
                site_front_count += front_count
                site_back_count += back_count

        print(site + ': ' + str(site_count) + ' Front: ' + str(site_front_count) + ' Back: ' + str(site_back_count))

# Sort dictionaries before printing
blind_frame_freq = OrderedDict(sorted(blind_frame_freq.items()))
blind_frame_front_freq = OrderedDict(sorted(blind_frame_front_freq.items()))
blind_frame_back_freq = OrderedDict(sorted(blind_frame_back_freq.items()))

# Print dictionaries
print()
print('Blind frame frequencies (root layer):')

print(blind_frame_freq)
print()

print('Blind frame frequencies in front (root layer):')
print(blind_frame_front_freq)
print()

print('Blind frame frequencies in back (root layer):')
print(blind_frame_back_freq)
print()

# Calculate averages (disabled: the snippet below is only a string literal and is not executed)
'''
agg = 0
n = 0
for blind_count, freq in blind_frame_freq.items():
    agg += blind_count * freq
    n += freq
print('Blind frames mean: ' + str(agg / n))
'''

# Overall (Finder output)

In [None]:
# The dot is escaped so that only a literal '.csv' suffix matches
csv_regex = re.compile(r'\.csv$') # to filter for .csv files

# Get output files of Finder.exe
finder_files = [f for f in os.listdir(dataset_overall_evaluation_dir) if os.path.isfile(os.path.join(dataset_overall_evaluation_dir,f))]
finder_files = list(filter(csv_regex.search, finder_files))

# Analyze each file
for finder_file in finder_files:
    
    # The site name is the file name without the '_finder.csv' part
    site = finder_file.replace('_finder.csv','')
    print('### ' + site)
    df = pd.read_csv(dataset_overall_evaluation_dir + "/" + finder_file)
    
    # A file without rows has no frames to evaluate; skip it instead of dividing by zero below
    frame_count = len(df.same)
    if frame_count == 0:
        print('No frames found in ' + finder_file + ', skipping')
        print()
        continue
    
    # Blind frame idxs collected for this site; .get() avoids inserting an empty
    # entry into the dictionary for sites that have none
    site_blind_frames = blind_frame_idxs.get(site, [])
    
    # Counts
    same_count = np.count_nonzero(df.same)
    pixel_perfect_count = np.count_nonzero(df.pixel_perfect_same)
    wrong_frames = df.loc[df['same'] == 0].frame_idx
    intersection_blind_and_wrong = list(set(site_blind_frames) & set(wrong_frames))
    # Wrong frames that are not blind frames
    really_wrong_count = len(wrong_frames) - len(intersection_blind_and_wrong)
    
    # Inform user
    print('Correct Ratio (root layer): ' + str(same_count/frame_count))
    print('Correct Ratio (only really wrong, root layer): ' + str((frame_count - really_wrong_count)/frame_count))
    print('Pixel perfect count, root layer: ' + str(pixel_perfect_count))  
    print(
        'Frame count (root layer): '
        + str(frame_count) +
        ' Blind count: '
        + str(len(site_blind_frames)) +
        ' Wrong frames: '
        + str(len(wrong_frames)) +
        ' Intersection with blind frames: '
        + str(len(intersection_blind_and_wrong)) +
        ' Really wrong: '
        + str(really_wrong_count))
    print()

# Pixel ratio

In [None]:
# Values to compute
overall_stimuli_pixels = 0
overall_screencast_pixels = 0

# Go over categories
for (_,sites) in categories.items():
    
    # Go over sites
    for site in sites:
        
        print('# ' + site)
        print()
        
        # Values to compute
        site_stimuli_pixels = 0
        site_screencast_pixels = 0
        
        # Name of the root-layer folder, i.e. the last component of its path
        root_layer = os.path.basename(os.path.normpath(root_layer_stimuli_dir[site]))
        
        # First, compute opaque pixels in the discovered stimuli
        stimuli_dir = os.path.join(dataset_stimuli_dir, site, 'stimuli')
        layers = [d for d in os.listdir(stimuli_dir) if os.path.isdir(os.path.join(stimuli_dir,d))]
        for layer in layers: # go over layers
            
            # Use this to restrict pixel ratio to root-layer stimuli, only.
            # The folder name is compared exactly: a substring test against the whole
            # path would also accept e.g. '0_html' for a root folder named '10_html'.
            if layer != root_layer:
                continue
            
            print('--> ' + layer)
            layer_pixel_count = 0
            
            # Find stimuli
            layer_dir = os.path.join(stimuli_dir, layer)
            stimuli_files = [f for f in os.listdir(layer_dir) if os.path.isfile(os.path.join(layer_dir,f))]
            stimuli_files = list(filter(png_regex.search, stimuli_files))
            # print('Found ' + str(len(stimuli_files)) + ' stimuli')
            
            # Go over stimuli and count pixels
            for stimulus_file in stimuli_files:
                # The context manager closes the image file once the pixels are read
                with Image.open(layer_dir + '/' + stimulus_file) as img:
                    # Alpha channel of the RGBA image as an array (index 3 of the last axis)
                    alpha = np.asarray(img.convert('RGBA'))[:, :, 3]
                # A pixel counts as opaque when its alpha value is greater than zero
                layer_pixel_count += int(np.count_nonzero(alpha))
                            
            # Accumulate pixel count of site
            site_stimuli_pixels += layer_pixel_count
                            
            # Print pixel count
            # print('Found ' + str(layer_pixel_count) + ' opaque pixels for the layer')
            
        # Second, compute pixels in screencasts
        for p in participants:
            df = pd.read_csv(dataset_visual_change_dir + '/' + p + '/' + site + '_meta.csv')
            frame_count = df.iloc[0,2] # screencast_frame_total_count
            site_screencast_pixels += screencast_width * screencast_height * frame_count
            
        # Accumulate overall
        overall_stimuli_pixels += site_stimuli_pixels
        overall_screencast_pixels += site_screencast_pixels

        # Calculate percentage
        site_percentage = 100.0 * (float(site_stimuli_pixels) / float(site_screencast_pixels))
        print('Stimuli Pixels in relation to Screencast Pixels = ' + str(site_percentage) + '%')
        print()
        
# Calculate percentage
overall_percentage = 100.0 * (float(overall_stimuli_pixels) / float(overall_screencast_pixels))
print('Overall: Stimuli Pixels in relation to Screencast Pixels = ' + str(overall_percentage) + '%')
print()