In [None]:
# Standard libraries (none here)

# Third-party libraries - Core data/science libs
import numpy as np
import pandas as pd
import wandb
import matplotlib.pyplot as plt

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from sklearn.neighbors import KNeighborsClassifier

# scikit-learn - model selection
from sklearn.model_selection import train_test_split

# scikit-learn - preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# scikit-learn - metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# scikit-learn - ensemble methods
from sklearn.ensemble import RandomForestClassifier

# scikit-learn - pipeline and feature selection
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE


# --- Read the cleaned fixation table ---
df = pd.read_csv("../data/csv/cleaned_fix.csv")  # Replace with your actual path

# --- Fail fast if any column used by the grouping below is absent ---
required_cols = {'Participant', 'Image', 'Scene', 'FixDur', 'ROI', 'experience'}
if required_cols - set(df.columns):
    raise ValueError(f"Missing required columns: {required_cols - set(df.columns)}")

# --- Total fixation duration per (Participant, Image, Scene, experience, ROI) ---
# One output row per distinct key combination; the summed FixDur column is
# exposed under the clearer name TotalViewTime.
viewing_times = (
    df.groupby(['Participant', 'Image', 'Scene', 'experience', 'ROI'], as_index=False)['FixDur']
    .sum()
    .rename(columns={'FixDur': 'TotalViewTime'})
)

# --- Show the aggregated table ---
print(viewing_times)

2025-06-25 15:45:22.429884: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-25 15:45:22.438132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750859122.447572   27516 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750859122.450641   27516 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750859122.458063   27516 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

      Participant  Image  Scene experience  ROI  TotalViewTime
0             2.0      1      1    Control    0        32907.0
1             2.0      1      1    Control    2         4113.0
2             2.0      1      1    Control    3           95.0
3             2.0      1      1    Control    4          407.0
4             2.0      1      1    Control    5         5517.0
...           ...    ...    ...        ...  ...            ...
3805       9008.0      2      3    Control   14          771.0
3806       9008.0      2      3    Control   15         2013.0
3807       9008.0      2      3    Control   16          806.0
3808       9008.0      2      3    Control   18          416.0
3809       9008.0      2      3    Control   19         1163.0

[3810 rows x 6 columns]


In [5]:
# Step 1: Collapse the per-ROI totals into one feature row per
# (Participant, Image, Scene, experience) combination.
agg_df = viewing_times.groupby(['Participant', 'Image', 'Scene', 'experience']).agg(
    total_view_time=('TotalViewTime', 'sum'),
    mean_roi_view_time=('TotalViewTime', 'mean'),
    max_roi_view_time=('TotalViewTime', 'max'),
    std_roi_view_time=('TotalViewTime', 'std'),
    num_rois_viewed=('ROI', 'count')
).fillna(0).reset_index()  # sample std is NaN for a group with a single ROI row -> 0

# Step 2: Feature and label selection
X = agg_df[['total_view_time', 'mean_roi_view_time', 'max_roi_view_time',
            'std_roi_view_time', 'num_rois_viewed', 'Image', 'Scene']]
y = agg_df['experience']

# Step 3: Encode labels as integers (label_encoder.classes_ maps them back)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 4: Separate numerical and categorical features.
# Only max_roi_view_time is fed to the model. The other aggregate columns in X
# (total/mean/std view time, ROI count) are not used; add them to this list to
# include them.
numerical_cols = ['max_roi_view_time']
categorical_cols = ['Image', 'Scene']

# Step 5: Pick the stratified train/test rows BEFORE fitting any preprocessing,
# so that test-set statistics cannot leak into the encoder or scaler.
# Splitting row positions with the same stratify/random_state yields the same
# partition as splitting the feature matrix itself.
# NOTE(review): rows from the same Participant can land in both splits; if
# `experience` is a per-participant label, consider a group-aware split — confirm.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, stratify=y_encoded, random_state=42
)
X_fit = X.iloc[train_idx]

# Step 6: One-hot encode categorical features (categories learned from training rows only;
# categories unseen during fit are encoded as all zeros via handle_unknown='ignore')
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_fit[categorical_cols])
X_cat = encoder.transform(X[categorical_cols])

# Step 7: Scale numeric features (mean/std learned from training rows only).
# The fitted scaler is kept so new data can be transformed consistently.
scaler = StandardScaler()
scaler.fit(X_fit[numerical_cols])
X_num = scaler.transform(X[numerical_cols])

# Step 8: Combine features (row order still matches agg_df)
X_processed = np.hstack([X_num, X_cat])

# Step 9: Materialise the train/test arrays from the row positions chosen in Step 5
X_train, X_test = X_processed[train_idx], X_processed[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]


In [6]:
# Build a 3-nearest-neighbours classifier and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
neigh  # show the fitted estimator as the cell output

In [7]:
# Fraction of X_test rows whose predicted class equals y_test (mean accuracy).
accuracy_score(y_test, neigh.predict(X_test))

0.13953488372093023