In [1]:
from random import choice, shuffle, sample
import numpy as np
import multimatch_gaze as mm
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import product

screen_size = [1920, 1080]

# training_num <= 5
training_num = 3
missing_data = ["22K", "25I", "29N", "39Y"]
too_focus = ["8I", "32S", "37T", "30Y", "27I", "7Y", "19M", "23H", "34I", "35S"]
too_MW = ["24K", "31M", "26K", "21H", "10H"]
black_list = set(missing_data + too_focus + too_MW)

def read_scanpaths(input_dir, file_list):
    return [np.recfromcsv(os.path.join(input_dir, file), delimiter='\t', dtype=[('start_x', 'f8'), ('start_y', 'f8'), ('duration', 'f8')]) for file in file_list]

def create_pairs(scanpath_list1, scanpath_list2, label, TDir, TAmp, TDur):
    pairs = []
    for i in range(len(scanpath_list1)):
        for j in range(len(scanpath_list2)):
            similarity = mm.docomparison(scanpath_list1[i], scanpath_list2[j], screensize=screen_size, grouping=True, TDir=TDir, TAmp=TAmp, TDur=TDur)
            pairs.append((similarity, label))
    return pairs

def evaluate_parameters(TDir, TAmp, TDur, MW_training_list, Focus_training_list, MW_test_list, Focus_test_list):
    MW_MW_pair = create_pairs(MW_training_list, MW_training_list, 0, TDir, TAmp, TDur)
    Focus_Focus_pair = create_pairs(Focus_training_list, Focus_training_list, 0, TDir, TAmp, TDur)
    MW_Focus_pair = create_pairs(MW_training_list, Focus_training_list, 1, TDir, TAmp, TDur)

    training_data = MW_MW_pair + Focus_Focus_pair + MW_Focus_pair
    X_train = [pair[0] for pair in training_data]
    y_train = [pair[1] for pair in training_data]

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    test_pairs = create_pairs(MW_test_list, MW_test_list, 0, TDir, TAmp, TDur)
    test_pairs += create_pairs(Focus_test_list, Focus_test_list, 0, TDir, TAmp, TDur)
    test_pairs += create_pairs(MW_test_list, Focus_test_list, 1, TDir, TAmp, TDur)

    X_test = [pair[0] for pair in test_pairs]
    y_test = [pair[1] for pair in test_pairs]

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

def repeat_test(num_repeats=30):
    input_dir = "../../Preprocess/FreeViewing/Scanpath/MultiMatch"
    name_list = [name for name in os.listdir(input_dir) if name not in black_list and not name.startswith(".")]
    results = []

    for _ in range(num_repeats):
        name = choice(name_list)
        input_dir_participant = os.path.join(input_dir, name)
        Focus_stim_list = [name for name in os.listdir(input_dir_participant) if name.endswith("Focus.tsv")]
        MW_stim_list = [name for name in os.listdir(input_dir_participant) if name.endswith("MW.tsv")]
        Focus_list = read_scanpaths(input_dir_participant, Focus_stim_list)
        MW_list = read_scanpaths(input_dir_participant, MW_stim_list)

        shuffle(Focus_stim_list)
        shuffle(MW_stim_list)

        MW_training_list = MW_list[:training_num]
        Focus_training_list = Focus_list[:training_num]

        MW_test_list = MW_list[training_num:]
        Focus_test_list = Focus_list[training_num:]

        min_test_samples = min(len(MW_test_list), len(Focus_test_list))
        MW_test_list = MW_test_list[:min_test_samples]
        Focus_test_list = Focus_test_list[:min_test_samples]

        TDir_values = [0, 15.0, 30.0, 45.0]
        TAmp_values = [0, 50.0, 100.0, 150.0, 200.0, 250.0]
        TDur_values = [0, 0.1, 0.2, 0.3]

        best_accuracy = 0
        best_params = (0, 0, 0)

        for TDir, TAmp, TDur in product(TDir_values, TAmp_values, TDur_values):
            accuracy = evaluate_parameters(TDir, TAmp, TDur, MW_training_list, Focus_training_list, MW_test_list, Focus_test_list)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = (TDir, TAmp, TDur)

        results.append((best_accuracy, best_params))

    return results

results = repeat_test()

import pandas as pd
results_df = pd.DataFrame(results, columns=['Best Accuracy', 'Best Parameters'])
print(results_df)

    Best Accuracy     Best Parameters
0        0.750000   (15.0, 50.0, 0.3)
1        0.750000  (30.0, 100.0, 0.3)
2        0.632653  (15.0, 150.0, 0.3)
3        0.645833   (30.0, 50.0, 0.2)
4        0.693333   (45.0, 50.0, 0.2)
5        0.639456     (0, 200.0, 0.3)
6        0.750000  (30.0, 100.0, 0.3)
7        0.733333  (45.0, 150.0, 0.2)
8        0.833333     (0, 150.0, 0.3)
9        0.703704   (15.0, 50.0, 0.2)
10       0.833333     (0, 150.0, 0.3)
11       0.770833      (15.0, 0, 0.2)
12       0.666667           (0, 0, 0)
13       0.671875      (15.0, 0, 0.3)
14       0.693333   (45.0, 50.0, 0.2)
15       0.632653  (15.0, 150.0, 0.3)
16       0.693333   (45.0, 50.0, 0.2)
17       0.720000  (30.0, 150.0, 0.3)
18       0.639456     (0, 200.0, 0.3)
19       0.645833   (30.0, 50.0, 0.2)
20       0.729167  (15.0, 100.0, 0.3)
21       0.671875      (15.0, 0, 0.3)
22       0.814815  (30.0, 150.0, 0.3)
23       0.750000   (15.0, 50.0, 0.3)
24       0.770833      (15.0, 0, 0.2)
25       0.7

Top 3 Parameter Sets:
- (30.0, 100.0, 0.3), Count: 5
- (30.0, 150.0, 0.3), Count: 3
- (45.0, 150.0, 0.3), Count: 3



So try the top3 parameters for 30 times and find the average accuracy for each pair of parameters

In [3]:
from random import choice, shuffle, sample
import numpy as np
import multimatch_gaze as mm
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import product

screen_size = [1920, 1080]

# training_num <= 5
training_num = 3
missing_data = ["22K", "25I", "29N", "39Y"]
too_focus = ["8I", "32S", "37T", "30Y", "27I", "7Y", "19M", "23H", "34I", "35S"]
too_MW = ["24K", "31M", "26K", "21H", "10H"]
black_list = set(missing_data + too_focus + too_MW)

def read_scanpaths(input_dir, file_list):
    return [np.recfromcsv(os.path.join(input_dir, file), delimiter='\t', dtype=[('start_x', 'f8'), ('start_y', 'f8'), ('duration', 'f8')]) for file in file_list]

def create_pairs(scanpath_list1, scanpath_list2, label, TDir, TAmp, TDur):
    pairs = []
    for i in range(len(scanpath_list1)):
        for j in range(len(scanpath_list2)):
            similarity = mm.docomparison(scanpath_list1[i], scanpath_list2[j], screensize=screen_size, grouping=True, TDir=TDir, TAmp=TAmp, TDur=TDur)
            pairs.append((similarity, label))
    return pairs

def evaluate_parameters(TDir, TAmp, TDur, MW_training_list, Focus_training_list, MW_test_list, Focus_test_list):
    MW_MW_pair = create_pairs(MW_training_list, MW_training_list, 0, TDir, TAmp, TDur)
    Focus_Focus_pair = create_pairs(Focus_training_list, Focus_training_list, 0, TDir, TAmp, TDur)
    MW_Focus_pair = create_pairs(MW_training_list, Focus_training_list, 1, TDir, TAmp, TDur)

    training_data = MW_MW_pair + Focus_Focus_pair + MW_Focus_pair
    X_train = [pair[0] for pair in training_data]
    y_train = [pair[1] for pair in training_data]

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    test_pairs = create_pairs(MW_test_list, MW_test_list, 0, TDir, TAmp, TDur)
    test_pairs += create_pairs(Focus_test_list, Focus_test_list, 0, TDir, TAmp, TDur)
    test_pairs += create_pairs(MW_test_list, Focus_test_list, 1, TDir, TAmp, TDur)

    X_test = [pair[0] for pair in test_pairs]
    y_test = [pair[1] for pair in test_pairs]

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

def repeat_test(num_repeats=50):
    input_dir = "../../Preprocess/FreeViewing/Scanpath/MultiMatch"
    name_list = [name for name in os.listdir(input_dir) if name not in black_list and not name.startswith(".")]
    params_list = [(30.0, 100.0, 0.3), (30.0, 150.0, 0.3), (45.0, 100.0, 0.3), (45.0, 150.0, 0.3)]
    results = {params: [] for params in params_list}

    for params in params_list:
        TDir, TAmp, TDur = params
        for _ in range(num_repeats):
            name = choice(name_list)
            input_dir_participant = os.path.join(input_dir, name)
            Focus_stim_list = [name for name in os.listdir(input_dir_participant) if name.endswith("Focus.tsv")]
            MW_stim_list = [name for name in os.listdir(input_dir_participant) if name.endswith("MW.tsv")]
            Focus_list = read_scanpaths(input_dir_participant, Focus_stim_list)
            MW_list = read_scanpaths(input_dir_participant, MW_stim_list)

            shuffle(Focus_stim_list)
            shuffle(MW_stim_list)

            MW_training_list = MW_list[:training_num]
            Focus_training_list = Focus_list[:training_num]

            MW_test_list = MW_list[training_num:]
            Focus_test_list = Focus_list[training_num:]

            min_test_samples = min(len(MW_test_list), len(Focus_test_list))
            MW_test_list = MW_test_list[:min_test_samples]
            Focus_test_list = Focus_test_list[:min_test_samples]

            accuracy = evaluate_parameters(TDir, TAmp, TDur, MW_training_list, Focus_training_list, MW_test_list, Focus_test_list)
            results[params].append(accuracy)

    return results

results = repeat_test()

# Calculate average accuracy for each parameter set
average_results = {params: np.mean(acc) for params, acc in results.items()}

import pandas as pd
results_df = pd.DataFrame.from_dict(average_results, orient='index', columns=['Average Accuracy'])
print(results_df)


                    Average Accuracy
(30.0, 100.0, 0.3)          0.583208
(30.0, 150.0, 0.3)          0.571488
(45.0, 100.0, 0.3)          0.599122
(45.0, 150.0, 0.3)          0.536486
