In [None]:
# Standard libraries (none here)

# Third-party libraries - Core data/science libs
import numpy as np
import pandas as pd
import wandb
import matplotlib.pyplot as plt

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

# XGBoost
import xgboost as xgb

# scikit-learn - model selection
from sklearn.model_selection import train_test_split

# scikit-learn - preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# scikit-learn - metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# scikit-learn - ensemble methods
from sklearn.ensemble import RandomForestClassifier

# scikit-learn - pipeline and feature selection
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE


# --- Load the dataset ---
df = pd.read_csv("../../data/csv/cleaned_fix.csv")  # Replace with your actual path

# --- Sanity check: confirm required columns exist ---
required_cols = {'Participant', 'Image', 'Scene', 'FixDur', 'ROI', 'experience'}
if not required_cols.issubset(df.columns):
    raise ValueError(f"Missing required columns: {required_cols - set(df.columns)}")

# --- Group by Participant and Image, then sum FixDur to get total viewing time ---
viewing_times = df.groupby(['Participant', 'Image', 'Scene', 'experience', 'ROI'])['FixDur'].sum().reset_index()

# --- Optional: rename column for clarity ---
viewing_times.rename(columns={'FixDur': 'TotalViewTime'}, inplace=True)

# --- Display result ---
print(viewing_times)

2025-06-14 14:57:40.737118: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-14 14:57:40.745330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749905860.754786  178496 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749905860.757762  178496 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749905860.765110  178496 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

      Participant  Image  Scene experience  ROI  TotalViewTime
0             2.0      1      1    Control    0        32907.0
1             2.0      1      1    Control    2         4113.0
2             2.0      1      1    Control    3           95.0
3             2.0      1      1    Control    4          407.0
4             2.0      1      1    Control    5         5517.0
...           ...    ...    ...        ...  ...            ...
3805       9008.0      2      3    Control   14          771.0
3806       9008.0      2      3    Control   15         2013.0
3807       9008.0      2      3    Control   16          806.0
3808       9008.0      2      3    Control   18          416.0
3809       9008.0      2      3    Control   19         1163.0

[3810 rows x 6 columns]


In [2]:
agg_df = viewing_times.groupby(['Participant', 'Image', 'Scene', 'experience']).agg(
    total_view_time=('TotalViewTime', 'sum'),
    mean_roi_view_time=('TotalViewTime', 'mean'),
    max_roi_view_time=('TotalViewTime', 'max'),
    std_roi_view_time=('TotalViewTime', 'std'),
    num_rois_viewed=('ROI', 'count')
).fillna(0).reset_index()

# Step 2: Feature and label selection
X = agg_df[['total_view_time', 'mean_roi_view_time', 'max_roi_view_time',
            'std_roi_view_time', 'num_rois_viewed', 'Image', 'Scene']]
y = agg_df['experience']

# Step 3: Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 4: Separate numerical and categorical features
numerical_cols = ['total_view_time', 'mean_roi_view_time', 'max_roi_view_time',
                  'std_roi_view_time', 'num_rois_viewed']
categorical_cols = ['Image', 'Scene']

# Step 5: One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[categorical_cols])

# Step 6: Scale numeric features
X_num = StandardScaler().fit_transform(X[numerical_cols])

# Step 7: Combine features
X_processed = np.hstack([X_num, X_cat])

# Step 8: Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


In [3]:


# Step 1: Aggregate the data
agg_df = viewing_times.groupby(['Participant', 'Image', 'Scene', 'experience']).agg(
    total_view_time=('TotalViewTime', 'sum'),
    mean_roi_view_time=('TotalViewTime', 'mean'),
    max_roi_view_time=('TotalViewTime', 'max'),
    std_roi_view_time=('TotalViewTime', 'std'),
    num_rois_viewed=('ROI', 'count')
).fillna(0).reset_index()

# Step 2: Create ratio and interaction features
agg_df['max_mean_ratio'] = agg_df['max_roi_view_time'] / (agg_df['mean_roi_view_time'] + 1e-6)
agg_df['std_mean_ratio'] = agg_df['std_roi_view_time'] / (agg_df['mean_roi_view_time'] + 1e-6)
agg_df['avg_view_time_per_roi'] = agg_df['total_view_time'] / (agg_df['num_rois_viewed'] + 1e-6)

# Step 3: Aggregate average view time per Image and Scene and merge back
image_avg = agg_df.groupby('Image')['total_view_time'].mean().rename('image_avg_view_time')
scene_avg = agg_df.groupby('Scene')['total_view_time'].mean().rename('scene_avg_view_time')

agg_df = agg_df.merge(image_avg, on='Image', how='left')
agg_df = agg_df.merge(scene_avg, on='Scene', how='left')

# Step 4: Frequency encode Image and Scene categories
image_freq = agg_df['Image'].value_counts(normalize=True).rename('image_freq_enc')
scene_freq = agg_df['Scene'].value_counts(normalize=True).rename('scene_freq_enc')

agg_df = agg_df.join(image_freq, on='Image')
agg_df = agg_df.join(scene_freq, on='Scene')

# Step 5: Log transform skewed numeric features
for col in ['total_view_time', 'mean_roi_view_time', 'max_roi_view_time', 'std_roi_view_time', 'avg_view_time_per_roi']:
    agg_df[f'log_{col}'] = np.log1p(agg_df[col])

# Step 6: Define feature columns
feature_cols = [
    'total_view_time', 'mean_roi_view_time', 'max_roi_view_time', 'std_roi_view_time', 'num_rois_viewed',
    'max_mean_ratio', 'std_mean_ratio', 'avg_view_time_per_roi',
    'image_avg_view_time', 'scene_avg_view_time',
    'image_freq_enc', 'scene_freq_enc',
    'log_total_view_time', 'log_mean_roi_view_time', 'log_max_roi_view_time', 'log_std_roi_view_time', 'log_avg_view_time_per_roi'
]

X = agg_df[feature_cols]
y = agg_df['experience']

# Step 7: Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 8: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 9: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


In [4]:
model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(32, activation='relu'),
    
    layers.Dense(36, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
W0000 00:00:1749905862.508815  178496 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [6]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True
)

history = model.fit(X_train, y_train,
                    validation_split=0.1,
                    epochs=500,
                    batch_size=64,
                    callbacks=[early_stopping])

Epoch 1/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0091 - loss: 4.4494 - val_accuracy: 0.0000e+00 - val_loss: 3.6952
Epoch 2/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0085 - loss: 4.2183 - val_accuracy: 0.0000e+00 - val_loss: 3.6518
Epoch 3/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0033 - loss: 4.0088 - val_accuracy: 0.0556 - val_loss: 3.6113
Epoch 4/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0052 - loss: 4.0205 - val_accuracy: 0.0556 - val_loss: 3.5729
Epoch 5/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0235 - loss: 3.8115 - val_accuracy: 0.1111 - val_loss: 3.5349
Epoch 6/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0267 - loss: 3.7045 - val_accuracy: 0.1667 - val_loss: 3.4960
Epoch 7/500
[1m3/3[0m [32m━━━

In [7]:
model.evaluate(X_test, y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3161 - loss: 1.4899


[1.485798954963684, 0.302325576543808]

In [8]:


rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Test accuracy: 0.20930232558139536


In [9]:


# --- Step 1: Aggregate your data ---
agg_df = viewing_times.groupby(['Participant', 'Image', 'Scene', 'experience']).agg(
    total_view_time=('TotalViewTime', 'sum'),
    mean_roi_view_time=('TotalViewTime', 'mean'),
    max_roi_view_time=('TotalViewTime', 'max'),
    std_roi_view_time=('TotalViewTime', 'std'),
    num_rois_viewed=('ROI', 'count')
).fillna(0).reset_index()

# --- Step 2: Define features and target ---
X = agg_df[['total_view_time', 'mean_roi_view_time', 'max_roi_view_time',
            'std_roi_view_time', 'num_rois_viewed', 'Image', 'Scene']]
y = agg_df['experience']

# --- Step 3: Encode target ---
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- Step 4: Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# --- Step 5: Preprocessing pipelines for numeric and categorical features ---
numeric_features = ['total_view_time', 'mean_roi_view_time', 'max_roi_view_time',
                    'std_roi_view_time', 'num_rois_viewed']
categorical_features = ['Image', 'Scene']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# --- Step 6: Create model (Random Forest) ---
model = RandomForestClassifier(random_state=42)

# --- Step 7: Create pipeline with preprocessor and RFE ---
# RFE with Random Forest as the estimator, selecting top 5 features
rfe_selector = RFE(estimator=model, n_features_to_select=5, step=1)

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', rfe_selector),
    ('classifier', model)
])

# --- Step 8: Train ---
pipeline.fit(X_train, y_train)

# --- Step 9: Evaluate ---
y_pred = pipeline.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# --- Step 10 (optional): Check which features were selected ---
# Get feature names after one-hot encoding
ohe_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numeric_features + list(ohe_feature_names)
selected_mask = pipeline.named_steps['feature_selector'].support_
selected_features = [f for f, selected in zip(all_features, selected_mask) if selected]

print("Selected features:", selected_features)


Test accuracy: 0.2558139534883721

Classification report:
               precision    recall  f1-score   support

         CSI       0.11      0.09      0.10        11
     Control       0.22      0.18      0.20        11
   FirstYear       0.33      0.40      0.36        10
   ThirdYear       0.31      0.36      0.33        11

    accuracy                           0.26        43
   macro avg       0.24      0.26      0.25        43
weighted avg       0.24      0.26      0.25        43

Selected features: ['total_view_time', 'mean_roi_view_time', 'max_roi_view_time', 'std_roi_view_time', 'num_rois_viewed']


In [10]:
X_numeric = agg_df[['total_view_time', 'mean_roi_view_time', 'max_roi_view_time', 'std_roi_view_time', 'num_rois_viewed']]
y = agg_df['experience']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Test accuracy: 0.2558139534883721
              precision    recall  f1-score   support

         CSI       0.11      0.09      0.10        11
     Control       0.22      0.18      0.20        11
   FirstYear       0.33      0.40      0.36        10
   ThirdYear       0.31      0.36      0.33        11

    accuracy                           0.26        43
   macro avg       0.24      0.26      0.25        43
weighted avg       0.24      0.26      0.25        43



In [11]:


# Features and target
X_numeric = agg_df[['total_view_time', 'mean_roi_view_time', 'max_roi_view_time', 
                    'std_roi_view_time', 'num_rois_viewed']]
y = agg_df['experience']

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Scale features (optional, but good practice)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1
)

# Train
model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_scaled)

print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Test accuracy: 0.4186046511627907

Classification report:
               precision    recall  f1-score   support

         CSI       0.43      0.27      0.33        11
     Control       0.50      0.55      0.52        11
   FirstYear       0.40      0.40      0.40        10
   ThirdYear       0.36      0.45      0.40        11

    accuracy                           0.42        43
   macro avg       0.42      0.42      0.41        43
weighted avg       0.42      0.42      0.41        43



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
