In [1]:
import pandas as pd
import numpy as np
from scipy.fftpack import fft
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report





In [50]:
# Load the combined dataset
data = pd.read_csv("../data/eye_movement_type/combined_data.csv")
data

Unnamed: 0.1,Unnamed: 0,Recording timestamp,Computer timestamp,Sensor,Project name,Export date,Participant name,Recording name,Recording date,Recording date UTC,...,Original Media height,Eye movement type,Gaze event duration,Eye movement type index,Fixation point X,Fixation point Y,Fixation point X (MCSnorm),Fixation point Y (MCSnorm),Mouse position X,Mouse position Y
0,6103,48551458,515022635629,,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,,Saccade,133.0,241.0,,,,,,
1,6104,48551458,515022635629,,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,,Saccade,133.0,241.0,,,,,,
2,6105,48556071,515022640242,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
3,6106,48564392,515022648563,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
4,6107,48572740,515022656911,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35097,15622,63899509,620588764860,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35098,15623,63907845,620588773196,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35099,15624,63916183,620588781534,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35100,15625,63924508,620588789859,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,


In [4]:
# Handle missing values (for simplicity, we'll drop rows with missing data)
data = data.dropna(subset=["Gaze point X", "Gaze point Y", "Eye movement type"])
data

Unnamed: 0.1,Unnamed: 0,Recording timestamp,Computer timestamp,Sensor,Project name,Export date,Participant name,Recording name,Recording date,Recording date UTC,...,Original Media height,Eye movement type,Gaze event duration,Eye movement type index,Fixation point X,Fixation point Y,Fixation point X (MCSnorm),Fixation point Y (MCSnorm),Mouse position X,Mouse position Y
2,6105,48556071,515022640242,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
3,6106,48564392,515022648563,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
4,6107,48572740,515022656911,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
5,6108,48581065,515022665236,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
6,6109,48589384,515022673555,Eye Tracker,Control group experiment,30.09.2020,Participant0002,Recording1,30.09.2020,30.09.2020,...,416.0,Fixation,92.0,148.0,1237.0,347.0,06668,03217,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35097,15622,63899509,620588764860,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35098,15623,63907845,620588773196,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35099,15624,63916183,620588781534,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,
35100,15625,63924508,620588789859,Eye Tracker,Participant0056,08.09.2021,Participant0056,Recording2,08.09.2021,08.09.2021,...,,Fixation,108.0,139.0,630.0,635.0,,,,


In [15]:
# Extract gaze data (X and Y coordinates)
gaze_x = data["Gaze point X"].values
gaze_y = data["Gaze point Y"].values
eye_movement_type = data["Eye movement type"].values
len(gaze_x)

30568

In [6]:
# Function to extract time-domain features
def extract_time_domain_features(gaze_x, gaze_y):
    # Mean, standard deviation, max, min of gaze coordinates
    mean_x = np.mean(gaze_x)
    std_x = np.std(gaze_x)
    mean_y = np.mean(gaze_y)
    std_y = np.std(gaze_y)
    
    # Velocity (difference between consecutive points)
    velocity_x = np.diff(gaze_x)
    velocity_y = np.diff(gaze_y)
    mean_velocity_x = np.mean(velocity_x)
    std_velocity_x = np.std(velocity_x)
    mean_velocity_y = np.mean(velocity_y)
    std_velocity_y = np.std(velocity_y)
        # Return the features
    return [mean_x, std_x, mean_y, std_y, mean_velocity_x, std_velocity_x, mean_velocity_y, std_velocity_y]



In [7]:

# Function to extract frequency-domain features (using FFT)
def extract_frequency_domain_features(gaze_x, gaze_y):
    # Apply FFT to gaze coordinates (X, Y)
    fft_x = fft(gaze_x)
    fft_y = fft(gaze_y)
    
    # Get the power spectral density (absolute value squared)
    psd_x = np.abs(fft_x) ** 2
    psd_y = np.abs(fft_y) ** 2
    
    # Dominant frequency (index of the max value)
    dominant_freq_x = np.argmax(psd_x)
    dominant_freq_y = np.argmax(psd_y)
    
    # Power at dominant frequency
    power_x = psd_x[dominant_freq_x]
    power_y = psd_y[dominant_freq_y]
    
    # Return the features
    return [dominant_freq_x, power_x, dominant_freq_y, power_y]


In [51]:
df =data

In [32]:
df.columns

Index(['Unnamed: 0', 'Recording timestamp', 'Computer timestamp', 'Sensor',
       'Project name', 'Export date', 'Participant name', 'Recording name',
       'Recording date', 'Recording date UTC', 'Recording start time',
       'Recording start time UTC', 'Recording duration', 'Timeline name',
       'Recording Fixation filter name', 'Recording software version',
       'Recording resolution height', 'Recording resolution width',
       'Recording monitor latency', 'Eyetracker timestamp', 'Event',
       'Event value', 'Gaze point X', 'Gaze point Y', 'Gaze point left X',
       'Gaze point left Y', 'Gaze point right X', 'Gaze point right Y',
       'Gaze direction left X', 'Gaze direction left Y',
       'Gaze direction left Z', 'Gaze direction right X',
       'Gaze direction right Y', 'Gaze direction right Z',
       'Pupil diameter left', 'Pupil diameter right', 'Validity left',
       'Validity right', 'Eye position left X (DACSmm)',
       'Eye position left Y (DACSmm)', 'Eye po

In [45]:
df["Pupil diameter left"] = pd.to_numeric(df["Pupil diameter left"], errors="coerce")
df["Pupil diameter right"] = pd.to_numeric(df["Pupil diameter right"], errors="coerce")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Pupil diameter left"] = pd.to_numeric(df["Pupil diameter left"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Pupil diameter right"] = pd.to_numeric(df["Pupil diameter right"], errors="coerce")


In [46]:
# Extract gaze data (X and Y coordinates)
gaze_x = df["Gaze point X"].values
gaze_y = df["Gaze point Y"].values


fixation_durations = df["Gaze event duration"].values

pupil_diameter_left = df["Pupil diameter left"].values
pupil_diameter_right = df["Pupil diameter right"].values
gaze_point_left_x = df["Gaze point left X"].values
gaze_point_right_x = df["Gaze point right X"].values
gaze_point_left_y = df["Gaze point left Y"].values
gaze_point_right_y = df["Gaze point right Y"].values


eye_movement_type = data["Eye movement type"].values



array([ 92.,  92.,  92., ..., 108., 108., 108.])

In [52]:
df["Pupil diameter right"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
35097    2,67
35098     NaN
35099     NaN
35100    2,72
35101     NaN
Name: Pupil diameter right, Length: 35102, dtype: object

In [48]:
df["Pupil diameter left"].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: Pupil diameter left, dtype: float64

In [44]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Recording timestamp,Computer timestamp,Recording duration,Recording resolution height,Recording resolution width,Eyetracker timestamp,Gaze point X,Gaze point Y,Gaze point left X,...,Presented Media position X (DACSpx),Presented Media position Y (DACSpx),Original Media width,Original Media height,Gaze event duration,Eye movement type index,Fixation point X,Fixation point Y,Mouse position X,Mouse position Y
count,30568.0,30568.0,30568.0,30568.0,30568.0,30568.0,30568.0,30568.0,30568.0,27020.0,...,30504.0,30504.0,30504.0,30504.0,30568.0,30568.0,17662.0,17662.0,0.0,0.0
mean,11724.96866,47935310.0,556321900000.0,72633.392567,1080.0,1920.0,4820989000.0,882.085383,442.063465,835.281125,...,166.398571,-45.335431,908.648308,691.816942,185.246074,241.16743,869.186842,416.803307,,
std,2945.572758,30178320.0,51471890000.0,33703.186817,0.0,0.0,5300610000.0,270.41398,247.859396,269.398739,...,215.883545,225.606186,602.801899,580.500857,232.353444,217.034736,242.858001,242.230346,,
min,6105.0,3862530.0,515022600000.0,14124.0,1080.0,1920.0,516733700.0,-63.0,-213.0,-63.0,...,-747.0,-1168.0,640.0,416.0,8.0,10.0,75.0,-160.0,,
25%,9346.0,14877520.0,515055300000.0,64099.0,1080.0,1920.0,549419500.0,722.0,279.0,672.0,...,129.0,0.0,640.0,416.0,33.0,68.0,714.0,251.0,,
50%,11737.5,49476840.0,515115200000.0,64099.0,1080.0,1920.0,609315200.0,904.0,442.0,862.0,...,129.0,0.0,798.0,692.0,100.0,189.0,904.0,415.0,,
75%,14357.5,71181300.0,620552000000.0,106276.0,1080.0,1920.0,11428970000.0,1030.0,610.0,980.0,...,337.0,0.0,798.0,692.0,258.0,319.0,1008.0,578.0,,
max,17417.0,106111000.0,620588800000.0,106276.0,1080.0,1920.0,11465780000.0,2176.0,1887.0,1665.0,...,337.0,0.0,3415.0,3415.0,1300.0,1029.0,1681.0,1051.0,,


In [36]:
# Function to extract spatial features
def extract_spatial_features(gaze_x, gaze_y):
    # Mean and standard deviation of gaze point dispersion
    mean_dispersion_x = np.mean(np.abs(gaze_x - np.mean(gaze_x)))
    std_dispersion_x = np.std(np.abs(gaze_x - np.mean(gaze_x)))
    mean_dispersion_y = np.mean(np.abs(gaze_y - np.mean(gaze_y)))
    std_dispersion_y = np.std(np.abs(gaze_y - np.mean(gaze_y)))
    
    # Distance between consecutive gaze points
    distances = np.sqrt(np.diff(gaze_x)**2 + np.diff(gaze_y)**2)
    mean_distance = np.mean(distances)
    std_distance = np.std(distances)
    
    # Angles between consecutive gaze points
    angles = np.arctan2(np.diff(gaze_y), np.diff(gaze_x))
    mean_angle = np.mean(angles)
    std_angle = np.std(angles)
    
    # Return spatial features
    return [mean_dispersion_x, std_dispersion_x, mean_dispersion_y, std_dispersion_y, mean_distance, std_distance, mean_angle, std_angle]


In [37]:
# Function to extract behavioral features
def extract_behavioral_features(fixation_durations, pupil_diameter_left, pupil_diameter_right, gaze_point_left_x, gaze_point_right_x, gaze_point_left_y, gaze_point_right_y):
    # Fixation duration statistics
    mean_fixation_duration = np.mean(fixation_durations)
    std_fixation_duration = np.std(fixation_durations)
    
    # Pupil diameter asymmetry
    pupil_asymmetry = pupil_diameter_left - pupil_diameter_right
    mean_pupil_asymmetry = np.mean(pupil_asymmetry)
    std_pupil_asymmetry = np.std(pupil_asymmetry)
    
    # Left-right eye gaze asymmetry
    gaze_asymmetry_x = gaze_point_left_x - gaze_point_right_x
    gaze_asymmetry_y = gaze_point_left_y - gaze_point_right_y
    mean_gaze_asymmetry_x = np.mean(gaze_asymmetry_x)
    std_gaze_asymmetry_x = np.std(gaze_asymmetry_x)
    mean_gaze_asymmetry_y = np.mean(gaze_asymmetry_y)
    std_gaze_asymmetry_y = np.std(gaze_asymmetry_y)
    
    # Return behavioral features
    return [
        mean_fixation_duration, std_fixation_duration,
        mean_pupil_asymmetry, std_pupil_asymmetry,
        mean_gaze_asymmetry_x, std_gaze_asymmetry_x,
        mean_gaze_asymmetry_y, std_gaze_asymmetry_y
    ]


In [38]:
# Function to extract frequency-domain features (enhanced)
def extract_frequency_domain_features_enhanced(gaze_x, gaze_y):
    # Apply FFT to gaze coordinates
    fft_x = fft(gaze_x)
    fft_y = fft(gaze_y)
    
    # Get the power spectral density
    psd_x = np.abs(fft_x) ** 2
    psd_y = np.abs(fft_y) ** 2
    
    # Divide into bands (low, mid, high frequencies)
    n = len(psd_x)
    low_band_x = np.sum(psd_x[:n // 3])
    mid_band_x = np.sum(psd_x[n // 3: 2 * n // 3])
    high_band_x = np.sum(psd_x[2 * n // 3:])
    
    low_band_y = np.sum(psd_y[:n // 3])
    mid_band_y = np.sum(psd_y[n // 3: 2 * n // 3])
    high_band_y = np.sum(psd_y[2 * n // 3:])
    
    # Spectral entropy
    total_power_x = np.sum(psd_x)
    entropy_x = -np.sum((psd_x / total_power_x) * np.log2(psd_x / total_power_x + 1e-12))  # Avoid log(0)
    total_power_y = np.sum(psd_y)
    entropy_y = -np.sum((psd_y / total_power_y) * np.log2(psd_y / total_power_y + 1e-12))  # Avoid log(0)
    
    # Return enhanced frequency features
    return [low_band_x, mid_band_x, high_band_x, entropy_x, low_band_y, mid_band_y, high_band_y, entropy_y]


In [39]:
# Combine all feature extraction
def extract_all_features(gaze_x, gaze_y, fixation_durations, pupil_diameter_left, pupil_diameter_right, gaze_point_left_x, gaze_point_right_x, gaze_point_left_y, gaze_point_right_y):
    time_features = extract_time_domain_features(gaze_x, gaze_y)
    spatial_features = extract_spatial_features(gaze_x, gaze_y)
    behavioral_features = extract_behavioral_features(
        fixation_durations,
        pupil_diameter_left, pupil_diameter_right,
        gaze_point_left_x, gaze_point_right_x,
        gaze_point_left_y, gaze_point_right_y
    )
    freq_features = extract_frequency_domain_features_enhanced(gaze_x, gaze_y)
    
    # Combine all features
    return time_features + spatial_features + behavioral_features + freq_features


In [40]:
# Extract features for each sample in the dataset
time_domain_features = []
frequency_domain_features = []
spatial_features = []
behavioral_features = []
enhanced_frequency_features = []

# Iterate through the dataset with a window size of 100 data points
for i in range(0, len(gaze_x) - 100, 100):  # Step size: 100
    # Extract time-domain features
    time_domain_features.append(extract_time_domain_features(gaze_x[i:i+100], gaze_y[i:i+100]))
    
    # Extract frequency-domain features
    frequency_domain_features.append(extract_frequency_domain_features(gaze_x[i:i+100], gaze_y[i:i+100]))
    
    # Extract spatial features
    spatial_features.append(extract_spatial_features(gaze_x[i:i+100], gaze_y[i:i+100]))
    
    # Extract behavioral features (you may need to provide corresponding data like fixation durations, etc.)
    behavioral_features.append(extract_behavioral_features(
        fixation_durations[i:i+100],
        pupil_diameter_left[i:i+100], pupil_diameter_right[i:i+100],
        gaze_point_left_x[i:i+100], gaze_point_right_x[i:i+100],
        gaze_point_left_y[i:i+100], gaze_point_right_y[i:i+100]
    ))
    
    # Extract enhanced frequency-domain features
    enhanced_frequency_features.append(extract_frequency_domain_features_enhanced(gaze_x[i:i+100], gaze_y[i:i+100]))

# Convert extracted features into DataFrames
time_domain_df = pd.DataFrame(time_domain_features, columns=["mean_x", "std_x", "mean_y", "std_y", 
                                                             "mean_velocity_x", "std_velocity_x", 
                                                             "mean_velocity_y", "std_velocity_y"])

frequency_domain_df = pd.DataFrame(frequency_domain_features, columns=["dominant_freq_x", "power_x", 
                                                                        "dominant_freq_y", "power_y"])

spatial_df = pd.DataFrame(spatial_features, columns=["mean_dispersion_x", "std_dispersion_x", 
                                                     "mean_dispersion_y", "std_dispersion_y", 
                                                     "mean_distance", "std_distance", 
                                                     "mean_angle", "std_angle"])

behavioral_df = pd.DataFrame(behavioral_features, columns=["mean_fixation_duration", "std_fixation_duration", 
                                                           "mean_pupil_asymmetry", "std_pupil_asymmetry", 
                                                           "mean_gaze_asymmetry_x", "std_gaze_asymmetry_x", 
                                                           "mean_gaze_asymmetry_y", "std_gaze_asymmetry_y"])

enhanced_frequency_df = pd.DataFrame(enhanced_frequency_features, columns=["low_band_x", "mid_band_x", "high_band_x", "entropy_x", 
                                                                            "low_band_y", "mid_band_y", "high_band_y", "entropy_y"])

# Combine all features into a single DataFrame
all_features_df = pd.concat([time_domain_df, frequency_domain_df, spatial_df, behavioral_df, enhanced_frequency_df], axis=1)

# Display the resulting DataFrame
print(all_features_df.head())


TypeError: unsupported operand type(s) for -: 'str' and 'float'

In [8]:

# Extract features for each sample in the dataset
time_domain_features = []
frequency_domain_features = []

In [None]:
for i in range(len(gaze_x)):
    time_domain_features.append(extract_time_domain_features(gaze_x[i:i+100], gaze_y[i:i+100]))  # Window size: 100 data points
    frequency_domain_features.append(extract_frequency_domain_features(gaze_x[i:i+100], gaze_y[i:i+100]))  # Window size: 100 data points

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [10]:
# Convert features into a dataframe
time_domain_df = pd.DataFrame(time_domain_features, columns=["mean_x", "std_x", "mean_y", "std_y", 
                                                            "mean_velocity_x", "std_velocity_x", 
                                                            "mean_velocity_y", "std_velocity_y"])

frequency_domain_df = pd.DataFrame(frequency_domain_features, columns=["dominant_freq_x", "power_x", 
                                                                      "dominant_freq_y", "power_y"])

In [23]:
time_domain_df.isna().sum()

mean_x             0
std_x              0
mean_y             0
std_y              0
mean_velocity_x    1
std_velocity_x     1
mean_velocity_y    1
std_velocity_y     1
dtype: int64

In [21]:
frequency_domain_df.describe()

Unnamed: 0,dominant_freq_x,power_x,dominant_freq_y,power_y
count,30568.0,30568.0,30568.0,30568.0
mean,0.0,8212254000.0,0.038079,2425047000.0
std,0.0,3694250000.0,0.224725,2063552000.0
min,0.0,391876.0,0.0,403225.0
25%,0.0,5567100000.0,0.0,833664600.0
50%,0.0,8048422000.0,0.0,1974025000.0
75%,0.0,10493600000.0,0.0,3308263000.0
max,0.0,21332360000.0,3.0,10657670000.0


In [16]:
features = pd.concat([time_domain_df, frequency_domain_df], axis=1)

In [17]:
# Encode the target variable (Eye movement type) as numerical values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target = le.fit_transform(eye_movement_type)

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [19]:
# Train the Random Forest Classifier
rf = RandomForestClassifier(n_estimators= 50, random_state=42)
rf.fit(X_train, y_train)

In [20]:
# Make predictions and evaluate the model
y_pred = rf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

    Fixation       0.99      0.99      0.99      3538
     Saccade       0.93      0.93      0.93      1314
Unclassified       0.95      0.95      0.95      1262

    accuracy                           0.97      6114
   macro avg       0.96      0.96      0.96      6114
weighted avg       0.97      0.97      0.97      6114

