In [8]:
# Cell 1: Imports and load data
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Name of the engineered-features CSV. It is a relative path, so it is
# resolved against the current working directory; change it here if the
# file was saved under a different name.
FEATURES_FILE = "engineered_features.csv"

# Print where the relative path resolves to, so a wrong working directory
# is easy to spot before the read is attempted.
working_directory = os.getcwd()
resolved_features_path = os.path.abspath(FEATURES_FILE)
print("Current working directory:", working_directory)
print("Looking for file:", resolved_features_path)

df = pd.read_csv(FEATURES_FILE)

# Summarise what was loaded: dimensions, column names, then class balance.
loaded_shape = df.shape
print("\nLoaded data shape:", loaded_shape)
column_names = df.columns.to_list()
print("\nColumns:", column_names)
label_counts = df['emotion_label'].value_counts()
print("\nLabel distribution:\n", label_counts)

Current working directory: d:\emotion-drift-project\src\notebooks
Looking for file: d:\emotion-drift-project\src\notebooks\engineered_features.csv

Loaded data shape: (10, 23)

Columns: ['Participant', 'Recording', 'source_file', 'gaze_x_mean', 'gaze_x_std', 'gaze_x_min', 'gaze_x_max', 'gaze_y_mean', 'gaze_y_std', 'gaze_y_min', 'gaze_y_max', 'fixation_count', 'saccade_count', 'unclassified_count', 'fixation_ratio', 'saccade_ratio', 'avg_fixation_duration_ms', 'avg_saccade_duration_ms', 'total_fixation_duration_ms', 'total_duration_ms', 'mean_saccade_speed_px_s', 'max_saccade_speed_px_s', 'emotion_label']

Label distribution:
 emotion_label
anxious    7
neutral    3
Name: count, dtype: int64


In [9]:
# Cell 2: Prepare X (features) and y (labels)
# Remove classes with fewer than 2 samples (required for stratified split)
class_counts = df['emotion_label'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df_filtered = df[df['emotion_label'].isin(valid_classes)].copy()

print(f"Original shape: {df.shape}")
print(f"Filtered shape: {df_filtered.shape}")
print(f"Removed classes with <2 samples: {set(df['emotion_label'].unique()) - set(valid_classes)}")
print(f"\nFiltered label distribution:\n{df_filtered['emotion_label'].value_counts()}")

# Identifier columns and the target itself must not be used as features.
# If you have extra columns like 'Participant name' from merge, add them here
columns_to_drop = ['Participant', 'Recording', 'source_file', 'emotion_label']
candidate_features = df_filtered.drop(columns=columns_to_drop, errors='ignore')

# Keep only numeric columns. Any leftover text column that is not listed in
# columns_to_drop would otherwise reach the classifier and make fitting fail.
# Dropped columns are reported so the omission is never silent.
X = candidate_features.select_dtypes(include='number')
non_numeric_columns = [col for col in candidate_features.columns if col not in X.columns]
if non_numeric_columns:
    print(f"\nDropped non-numeric columns: {non_numeric_columns}")

# Labels come from the same filtered frame, so X and y share one index.
y = df_filtered['emotion_label']

print("\nNumber of features:", X.shape[1])
print("Number of samples:", X.shape[0])
print("\nFeatures preview (first 3 rows):\n", X.head(3))


Original shape: (10, 23)
Filtered shape: (10, 23)
Removed classes with <2 samples: set()

Filtered label distribution:
emotion_label
anxious    7
neutral    3
Name: count, dtype: int64

Number of features: 19
Number of samples: 10

Features preview (first 3 rows):
    gaze_x_mean  gaze_x_std  gaze_x_min  gaze_x_max  gaze_y_mean  gaze_y_std  \
0  1070.355771  273.425806       280.0      1610.0   253.597742  293.880068   
1   965.955342  306.457171       328.0      1679.0   306.260339  273.827454   
2  1017.325455  304.629167      -258.0      1863.0   555.689808  301.311213   

   gaze_y_min  gaze_y_max  fixation_count  saccade_count  unclassified_count  \
0      -201.0      1869.0            3842           1917                1060   
1      -165.0      1024.0            4221           1903                 840   
2        -5.0      1357.0            2013           2199                1881   

   fixation_ratio  saccade_ratio  avg_fixation_duration_ms  \
0        0.563426       0.281126  

In [10]:
# Cell 3: Train a simple Random Forest model
# Safety check: a stratified split needs at least 2 samples in every class
class_counts_check = y.value_counts()
print("Class distribution before train-test split:")
print(class_counts_check)

if (class_counts_check >= 2).all():
    # Nothing to remove; split the data exactly as prepared.
    X_filtered = X
    y_filtered = y
else:
    # Remove underrepresented classes
    valid_classes = class_counts_check[class_counts_check >= 2].index
    mask = y.isin(valid_classes)
    X_filtered = X[mask]
    y_filtered = y[mask]
    print(f"Removed samples: {(~mask).sum()}")

# Decide on stratification from the data that will actually be split.
# Checking the unfiltered labels instead would disable stratification in
# exactly the case where the rare classes have just been removed.
use_stratify = bool((y_filtered.value_counts() >= 2).all())
print(f"\nUsing stratified split: {use_stratify}")

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered,
    test_size=0.3,          # 30% for testing
    random_state=42,        # fixed seed so the split is reproducible
    stratify=y_filtered if use_stratify else None  # keep label balance if possible
)

print(f"\nTraining samples: {len(X_train)}, Test samples: {len(X_test)}")

# Create and train model
model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    random_state=42         # fixed seed so the fitted forest is reproducible
)

model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Pin the label order to the classes the model was trained on. This keeps the
# report rows and the confusion-matrix rows/columns in a fixed order even
# when a class happens to be missing from the test set.
class_labels = model.classes_

# Show results
print("\nAccuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n")
# zero_division=0 reports 0 for a class with no samples instead of warning.
print(classification_report(y_test, y_pred, labels=class_labels, zero_division=0))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=class_labels))


Class distribution before train-test split:
emotion_label
anxious    7
neutral    3
Name: count, dtype: int64

Using stratified split: True

Training samples: 7, Test samples: 3

Accuracy: 66.67 %

Classification Report:

              precision    recall  f1-score   support

     anxious       1.00      0.50      0.67         2
     neutral       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3


Confusion Matrix:

[[1 1]
 [0 1]]
