In [None]:
# Upload: open the Colab file picker so the dataset archive can be sent to the runtime.
# This import only exists inside Google Colab.
from google.colab import files
# Keep the return value of files.upload() around for later inspection.
uploaded = files.upload()





Saving dataset.zip to dataset.zip


In [None]:
# Unzip the uploaded archive, then list what each class folder contains
import os
import zipfile

zip_path = 'dataset.zip'  # Replace with your uploaded zip name
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall('new folder')

# The archive nests its content one level down, hence the doubled folder name.
for heading, class_dir in (("Patient files:", 'patients'),
                           ("Non-patient files:", 'non_patients')):
    print(heading, os.listdir(f'new folder/new folder/{class_dir}'))


Patient files: ['best_segment_cleaned (16).csv', 'best_segment_cleaned (20).csv', 'best_segment_cleaned (18).csv', 'best_segment_cleaned (19).csv']
Non-patient files: ['D_First10k.csv', 'processed_eye_data (15).csv', 'processed_eye_data (14).csv', 'D_remaining.csv']


In [None]:
import os
import shutil


def _move_and_rename(src_dir, dest_dir, prefix):
    """Move every CSV in `src_dir` into `dest_dir`, renamed to `<prefix>_<n>.csv`.

    Only `.csv` entries are numbered, and they are numbered in sorted filename
    order. `os.listdir` returns entries in arbitrary order, so without the sort
    the mapping from original file to `<prefix>_<n>.csv` could change between
    runs, and non-CSV entries would leave gaps in the numbering.

    Returns the number of files moved.
    """
    csv_names = sorted(name for name in os.listdir(src_dir) if name.endswith('.csv'))
    for n, name in enumerate(csv_names):
        shutil.move(os.path.join(src_dir, name),
                    os.path.join(dest_dir, f'{prefix}_{n}.csv'))
    return len(csv_names)


# Create new folder
os.makedirs('new_folder', exist_ok=True)

patient_path = 'new folder/new folder/patients'
non_patient_path = 'new folder/new folder/non_patients'

# The filename prefix is the class label used by the later cells:
# P_ marks patient recordings, N_ marks non-patient recordings.
_move_and_rename(patient_path, 'new_folder', 'P')
_move_and_rename(non_patient_path, 'new_folder', 'N')

print("All files moved and renamed successfully!")


All files moved and renamed successfully!


In [None]:
import os
import pandas as pd

folder_path = 'new_folder'

# Column lists of the first patient / non-patient file encountered
patient_cols = None
non_patient_cols = None

for file in os.listdir(folder_path):
    if not file.endswith('.csv'):
        continue

    # The header plus a single data row is enough to learn the column names
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, nrows=1)
    header = df.columns.tolist()

    if patient_cols is None and file.startswith('P_'):
        patient_cols = header
        print("Patient File Columns:")
        print(patient_cols)
    elif non_patient_cols is None and file.startswith('N_'):
        non_patient_cols = header
        print("\nNon-Patient File Columns:")
        print(non_patient_cols)

    # Stop scanning as soon as one file of each class has been seen
    if patient_cols and non_patient_cols:
        break


Patient File Columns:
['timestamp', 'RecordingTime [ms]', 'gaze_x', 'gaze_y', 'blink', 'saccade_velocity', 'fixation', 'pupil_size', 'left_pupil_x', 'left_pupil_y', 'left_pupil_diameter', 'right_pupil_x', 'right_pupil_y', 'right_pupil_diameter', 'PoR_binocular_x', 'PoR_binocular_y', 'Point of Regard Right X', 'Point of Regard Right Y', 'Point of Regard Left X', 'Point of Regard Left Y', 'Category Binocular', 'Index Binocular', 'group_change', 'is_short_blink', 'is_long_blink', 'is_uncategorized']

Non-Patient File Columns:
['timestamp', 'gaze_x', 'gaze_y', 'eye_aspect_ratio', 'blink', 'saccade_velocity', 'fixation', 'pupil_size', 'left_pupil_x', 'left_pupil_y', 'left_pupil_diameter', 'right_pupil_x', 'right_pupil_y', 'right_pupil_diameter', 'por_binocular_x', 'por_binocular_y', 'point_of_regard_right_x', 'point_of_regard_right_y', 'point_of_regard_left_x', 'point_of_regard_left_y', 'category_binocular', 'index_binocular', 'gx_med', 'gy_med', 'sv_grp', 'RecordingTime [ms]']


In [None]:
import pandas as pd
import os

def extract_features(file_path):
    """Summarise one eye-tracking CSV as a flat dict of per-recording features.

    Parameters
    ----------
    file_path : str
        Path to a CSV that has a binocular category column (named
        ``category_binocular`` or ``category binocular`` once the header is
        lowercased and stripped) plus ``saccade_velocity`` and ``pupil_size``.

    Returns
    -------
    dict
        ``blink_ratio``, ``saccade_ratio``, ``visual_ratio`` (share of rows in
        each category; 0.0 for a file with no rows), ``mean_saccade_velocity``,
        ``mean_pupil_size``, ``std_pupil_size``, ``short_blink_count`` and
        ``long_blink_count`` (0 when the optional columns are absent).

    Raises
    ------
    KeyError
        If no binocular category column, ``saccade_velocity`` or ``pupil_size``
        column is present.
    """
    df = pd.read_csv(file_path)

    # Normalise headers: lowercase and strip stray whitespace so both export
    # styles seen in this notebook resolve to the same names.
    df.columns = [col.lower().strip() for col in df.columns]

    # Fix naming differences between the two file families
    cat_col = 'category_binocular' if 'category_binocular' in df.columns else 'category binocular'
    if cat_col not in df.columns:
        raise KeyError(f"No binocular category column found in {file_path}")

    # Compare category values case- and whitespace-insensitively, the same way
    # the sequence loaders in this notebook do, so ' blink' and 'Blink' agree.
    category = df[cat_col].astype(str).str.strip().str.lower()
    total_rows = len(df)

    def _share(mask):
        # Fraction of rows selected by `mask`; 0.0 instead of 0/0 for empty files.
        return float(mask.sum()) / total_rows if total_rows else 0.0

    blink_ratio = _share(category == 'blink')
    saccade_ratio = _share(category == 'saccade')
    visual_ratio = _share(category.isin(['visual intake', 'fixation']))

    mean_saccade_velocity = df['saccade_velocity'].mean()
    mean_pupil_size = df['pupil_size'].mean()
    std_pupil_size = df['pupil_size'].std()

    # Optional blink features.
    # NOTE(review): in the column listings printed earlier in this notebook only
    # the patient file has is_short_blink / is_long_blink, so these two counts
    # are always 0 for such non-patient files and can leak the label into any
    # classifier trained on them. Confirm before relying on reported accuracy.
    short_blink_count = df['is_short_blink'].sum() if 'is_short_blink' in df.columns else 0
    long_blink_count = df['is_long_blink'].sum() if 'is_long_blink' in df.columns else 0

    return {
        'blink_ratio': blink_ratio,
        'saccade_ratio': saccade_ratio,
        'visual_ratio': visual_ratio,
        'mean_saccade_velocity': mean_saccade_velocity,
        'mean_pupil_size': mean_pupil_size,
        'std_pupil_size': std_pupil_size,
        'short_blink_count': short_blink_count,
        'long_blink_count': long_blink_count
    }


In [None]:
folder_path = 'new_folder'

# os.listdir returns names in arbitrary order. Sorting fixes the row order of
# X / y, so the seeded split and models further down see the same data layout
# on every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# One feature dict per recording; the P_ filename prefix marks patients (label 1).
data = [extract_features(os.path.join(folder_path, f)) for f in csv_files]
labels = [1 if f.startswith('P_') else 0 for f in csv_files]

X = pd.DataFrame(data)
y = pd.Series(labels)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Stratify on y so both classes keep their proportions on each side of the
# split. With this few recordings an unstratified draw can leave the test set
# (or the training set) dominated by one class, as in the 1-vs-3 report below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import LeaveOneOut

# Leave-one-out CV: every recording is the single test sample exactly once.
loo = LeaveOneOut()
y_true, y_pred = [], []

for train_idx, test_idx in loo.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # A fresh, identically seeded forest per fold; fit() returns the estimator.
    model = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42).fit(X_train, y_train)
    pred = model.predict(X_test)

    # Each fold contributes exactly one (truth, prediction) pair.
    y_true.append(y_test.values[0])
    y_pred.append(pred[0])

# Evaluate over the pooled per-fold predictions
print("LOOCV Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))


LOOCV Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Leave-one-out CV with a 1-nearest-neighbour classifier on the raw features.
loo = LeaveOneOut()
y_true, y_pred = [], []

for train_idx, test_idx in loo.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # You can also try k=3 later
    model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
    pred = model.predict(X_test)

    # Each fold contributes exactly one (truth, prediction) pair.
    y_true.append(y_test.values[0])
    y_pred.append(pred[0])

# Evaluate over the pooled per-fold predictions
print("LOOCV Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))


LOOCV Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [None]:
import pandas as pd
import numpy as np

def load_sequence(file_path, max_len=300):
    """Load one recording as a fixed-length 2-D array of per-row features.

    Parameters
    ----------
    file_path : str
        CSV with a binocular category column (``category_binocular`` or
        ``category binocular`` after lowercasing/stripping the header).
    max_len : int, default 300
        Number of rows in the result. Longer recordings are truncated to their
        first ``max_len`` rows; shorter ones are padded at the end with zeros.

    Returns
    -------
    numpy.ndarray
        Shape ``(max_len, n_features)``. Columns follow the order gaze_x,
        gaze_y, saccade_velocity, pupil_size, blink_bin; any of these that is
        missing from the file is silently left out, which changes n_features.
    """
    df = pd.read_csv(file_path)

    # Normalize column names
    df.columns = [c.lower().strip() for c in df.columns]

    # Use category_binocular to define blink (case/whitespace-insensitive)
    cat_col = 'category_binocular' if 'category_binocular' in df.columns else 'category binocular'
    df['blink_bin'] = df[cat_col].astype(str).str.strip().str.lower().eq('blink').astype(int)

    # Select features (you can add more like saccade_velocity, pupil_size, etc.)
    features = ['gaze_x', 'gaze_y', 'saccade_velocity', 'pupil_size', 'blink_bin']
    df = df[[f for f in features if f in df.columns]].copy()

    # Fill missing values: carry forward, then backward for leading gaps, then 0
    # for columns that are entirely empty. DataFrame.ffill()/bfill() replace the
    # deprecated fillna(method=...) form that produced the FutureWarning.
    df = df.ffill().bfill().fillna(0)

    # Pad or truncate. Recordings of exactly max_len rows pass through as-is,
    # which avoids concatenating a zero-row padding frame.
    if len(df) > max_len:
        df = df.iloc[:max_len]
    elif len(df) < max_len:
        pad_len = max_len - len(df)
        padding = pd.DataFrame(np.zeros((pad_len, df.shape[1])), columns=df.columns)
        df = pd.concat([df, padding], axis=0)

    return df.values


In [None]:
# Every CSV in new_folder becomes one fixed-length sequence.
csv_names = [name for name in os.listdir('new_folder') if name.endswith('.csv')]

# X stacks the per-file arrays; y is 1 for P_-prefixed (patient) files, else 0.
X = np.array([load_sequence(os.path.join('new_folder', name)) for name in csv_names])
y = np.array([1 if name.startswith('P_') else 0 for name in csv_names])


  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout

# Binary sequence classifier: one LSTM layer summarises the (timesteps, features)
# sequence, followed by a small dense head ending in a sigmoid probability.
# The input shape is declared with an explicit Input layer; passing input_shape=
# to the first layer is what triggered the Keras UserWarning in the output.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


  super().__init__(**kwargs)


In [None]:
# Fit on every sequence in X with batch_size=1. No validation data is passed
# here, so the accuracy printed per epoch is training accuracy only.
model.fit(X, y, epochs=30, batch_size=1, verbose=1)


Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.6964 - loss: 0.6056
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9573 - loss: 0.2714
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9053 - loss: 0.2587
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 0.1447
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 0.0868
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0702
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0533
Epoch 8/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.0433
Epoch 9/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f421113da10>

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, classification_report

# Leave-one-out CV for the LSTM: each sequence is the single test sample once.
loo = LeaveOneOut()
y_true = []
y_pred = []

for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # clone_model copies the architecture with freshly initialised weights, so
    # every fold starts untrained. The clone is bound to its own name: rebinding
    # `model` here would leave the notebook-level `model` pointing at the last
    # fold's network (trained on n-1 samples) for the prediction cells below.
    fold_model = tf.keras.models.clone_model(model)
    fold_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    fold_model.fit(X_train, y_train, epochs=62, batch_size=1, verbose=0)

    # Sigmoid output is a probability; threshold at 0.5 for the class label.
    pred = (fold_model.predict(X_test)[0][0] > 0.5).astype(int)

    y_true.append(y_test[0])
    y_pred.append(pred)

# Evaluate over the pooled per-fold predictions
print("Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         8

    accuracy       

In [None]:
# Upload: open the Colab file picker again to send a new recording to classify.
# This import only exists inside Google Colab.
from google.colab import files
# Rebinds `uploaded` to the return value of this second upload.
uploaded = files.upload()


Saving processed_eye_data (22).csv to processed_eye_data (22) (1).csv


In [None]:
import numpy as np
import pandas as pd

def preprocess_new_file(file_path, max_len=300):
    """Turn one recording into a single-sample batch for the sequence model.

    Parameters
    ----------
    file_path : str
        CSV with a binocular category column (``category_binocular`` or
        ``category binocular`` after lowercasing/stripping the header).
    max_len : int, default 300
        Number of timesteps. Longer recordings are truncated to their first
        ``max_len`` rows; shorter ones are zero-padded at the end.

    Returns
    -------
    numpy.ndarray
        Shape ``(1, max_len, n_features)``. Columns follow the order gaze_x,
        gaze_y, saccade_velocity, pupil_size, blink_bin; any of these that is
        missing from the file is silently left out, which changes n_features.
    """
    df = pd.read_csv(file_path)
    df.columns = [c.lower().strip() for c in df.columns]

    # Blink indicator from the category column (case/whitespace-insensitive)
    cat_col = 'category_binocular' if 'category_binocular' in df.columns else 'category binocular'
    df['blink_bin'] = df[cat_col].astype(str).str.strip().str.lower().eq('blink').astype(int)

    features = ['gaze_x', 'gaze_y', 'saccade_velocity', 'pupil_size', 'blink_bin']
    df = df[[f for f in features if f in df.columns]].copy()

    # Carry forward, then backward for leading gaps, then 0 for empty columns.
    # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form.
    df = df.ffill().bfill().fillna(0)

    # Pad or truncate; exact-length recordings skip the zero-row concat.
    if len(df) > max_len:
        df = df.iloc[:max_len]
    elif len(df) < max_len:
        pad_len = max_len - len(df)
        padding = pd.DataFrame(np.zeros((pad_len, df.shape[1])), columns=df.columns)
        df = pd.concat([df, padding], axis=0)

    return df.values.reshape(1, max_len, len(df.columns))  # reshape for LSTM: (1, timesteps, features)



In [None]:
# Example: load your new test file
# NOTE(review): the upload output above reports the newest upload was saved as
# 'processed_eye_data (22) (1).csv'; confirm this path points at the intended file.
new_file_path = 'processed_eye_data (22).csv'  # Change this to your file name

# Shape (1, timesteps, features): a single-sample batch for the model
X_new = preprocess_new_file(new_file_path)

# Predict
# `model` is whichever network the most recently executed cell bound to that
# name — NOTE(review): verify it is the one trained on the full dataset.
prediction = model.predict(X_new)[0][0]  # Output is probability

# Threshold at 0.5
if prediction > 0.5:
    print("🧠 Predicted: Parkinson’s Patient (Prob =", round(prediction, 3), ")")
else:
    print("✅ Predicted: Non-Parkinson’s (Prob =", round(prediction, 3), ")")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
✅ Predicted: Non-Parkinson’s (Prob = 0.001 )


  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)


In [None]:
def load_windowed_sequences(file_path, label, window_size=500, step=500):
    """Cut one recording into fixed-size windows that all carry the same label.

    Parameters
    ----------
    file_path : str
        CSV with a binocular category column (``category_binocular`` or
        ``category binocular`` after lowercasing/stripping the header).
    label : any
        Value repeated once per window in the returned label list.
    window_size : int, default 500
        Rows per window.
    step : int, default 500
        Row offset between consecutive window starts.

    Returns
    -------
    (list of numpy.ndarray, list)
        Windows of shape ``(window_size, n_features)`` and their labels. Only
        complete windows are produced: trailing rows that do not fill a window
        are dropped, and a recording shorter than ``window_size`` yields two
        empty lists.
    """
    df = pd.read_csv(file_path)
    df.columns = [c.lower().strip() for c in df.columns]

    # Blink indicator from the category column (case/whitespace-insensitive)
    cat_col = 'category_binocular' if 'category_binocular' in df.columns else 'category binocular'
    df['blink_bin'] = df[cat_col].astype(str).str.strip().str.lower().eq('blink').astype(int)

    features = ['gaze_x', 'gaze_y', 'saccade_velocity', 'pupil_size', 'blink_bin']
    df = df[[f for f in features if f in df.columns]].copy()

    # Carry forward, then backward for leading gaps, then 0 for empty columns.
    # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form.
    df = df.ffill().bfill().fillna(0)

    # Convert once and slice the array, instead of building a frame per window.
    values = df.values

    sequences = []
    labels = []

    for start in range(0, len(values) - window_size + 1, step):
        sequences.append(values[start:start + window_size])
        labels.append(label)

    return sequences, labels


In [None]:
folder = 'new_folder'
X, y = [], []

# Collect the windows of every CSV; the P_ filename prefix marks patients (label 1).
for file in (name for name in os.listdir(folder) if name.endswith('.csv')):
    windows, labels = load_windowed_sequences(
        os.path.join(folder, file),
        1 if file.startswith('P_') else 0,
    )
    X += windows
    y += labels

X = np.array(X)  # Shape: (samples, timesteps, features)
y = np.array(y)


  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense

# Same architecture as before, sized for the windowed data. The input shape is
# declared with an explicit Input layer rather than input_shape= on the LSTM,
# which is what triggered the Keras UserWarning in the output.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_split holds out the LAST 20% of the arrays exactly as passed in,
# before any shuffling. X / y were built file by file, so without reordering
# the validation set would consist of windows from the last file(s) only.
# A fixed-seed permutation mixes the files while keeping the run reproducible.
# NOTE: windows from one recording can still land on both sides of the split,
# so val_accuracy remains an optimistic estimate.
shuffle_idx = np.random.default_rng(42).permutation(len(X))
model.fit(X[shuffle_idx], y[shuffle_idx], epochs=15, batch_size=8, validation_split=0.2)


Epoch 1/15


  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 341ms/step - accuracy: 0.6528 - loss: 0.6524 - val_accuracy: 0.5000 - val_loss: 0.6106
Epoch 2/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.5556 - loss: 0.6275 - val_accuracy: 1.0000 - val_loss: 0.5201
Epoch 3/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 1.0000 - loss: 0.5250 - val_accuracy: 1.0000 - val_loss: 0.4675
Epoch 4/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 1.0000 - loss: 0.5340 - val_accuracy: 1.0000 - val_loss: 0.4523
Epoch 5/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 1.0000 - loss: 0.4290 - val_accuracy: 1.0000 - val_loss: 0.4062
Epoch 6/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 1.0000 - loss: 0.3423 - val_accuracy: 1.0000 - val_loss: 0.3592
Epoch 7/15
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7f418d2c87d0>

In [None]:
# Window the new recording. The label argument is a placeholder: the returned
# label list is discarded via `_`.
# load_windowed_sequences only emits complete windows, so a recording shorter
# than its window_size yields no windows at all.
X_new, _ = load_windowed_sequences('best_segment_cleaned (23).csv', label=0)
X_new = np.array(X_new)
# One sigmoid probability per window, then averaged into a single score.
preds = model.predict(X_new)
mean_pred = preds.mean()

print("Mean prediction probability across windows:", round(mean_pred, 4))

# Decision with a dead band: scores between 0.4 and 0.6 are reported as uncertain.
if mean_pred > 0.6:
    print("🧠 Parkinson’s")
elif mean_pred < 0.4:
    print("✅ Non-Parkinson’s")
else:
    print("🤔 Uncertain")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Mean prediction probability across windows: 0.9999
🧠 Parkinson’s


  df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)


In [None]:
'''import pandas as pd
import numpy as np

def extract_features(file_path):
    df = pd.read_csv(file_path)

    # Clean columns
    df.columns = df.columns.str.replace('\xa0', ' ', regex=True)
    df.columns = df.columns.str.strip().str.lower()

    # Debug: Check if key columns exist
    required_columns = ['recordingtime [ms]', 'blink', 'fixation', 'saccade_velocity', 'por_binocular_x', 'por_binocular_y']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in {file_path}")

    # Calculate time differences (in seconds)
    time_diff_series = df['recordingtime [ms]'].diff() / 1000.0  # ms to seconds
    time_diff_series.iloc[0] = 0  # First row has no previous row

    # Remove invalid or zero time differences to avoid division errors
    time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')

    total_time_s = time_diff_series.sum()

    # Blink rate (blinks per second)
    total_blinks = df['blink'].sum()
    blink_rate = total_blinks / total_time_s if total_time_s > 0 else 0

    # Calculate average blink duration
    blink_durations = []
    current_blink_duration = 0

    for i in range(len(df)):
        if df['blink'].iloc[i] == 1:
            current_blink_duration += time_diff_series.iloc[i]
        elif current_blink_duration > 0:
            blink_durations.append(current_blink_duration)
            current_blink_duration = 0

    if current_blink_duration > 0:
        blink_durations.append(current_blink_duration)

    avg_blink_duration = np.mean(blink_durations) if blink_durations else 0

    # Fixation stability
    fixation_percentage = df['fixation'].sum() / len(df) if len(df) > 0 else 0

    # Saccade statistics
    saccade_mean = df['saccade_velocity'].mean()
    saccade_max = df['saccade_velocity'].max()

    # Saccade frequency
    saccade_threshold = 30  # Adjustable threshold
    saccade_count = (df['saccade_velocity'] > saccade_threshold).sum()
    saccade_frequency = saccade_count / total_time_s if total_time_s > 0 else 0

    # POR (Point of Regard) velocity
    por_x_diff = df['por_binocular_x'].diff()
    por_y_diff = df['por_binocular_y'].diff()

    por_velocity = np.sqrt(por_x_diff**2 + por_y_diff**2) / time_diff_series
    por_velocity = por_velocity.replace([np.inf, -np.inf], np.nan).dropna()

    por_velocity_mean = por_velocity.mean() if not por_velocity.empty else 0
    por_velocity_std = por_velocity.std() if not por_velocity.empty else 0

    return {
        'blink_rate': blink_rate,
        'avg_blink_duration': avg_blink_duration,
        'fixation_percentage': fixation_percentage,
        'saccade_mean': saccade_mean,
        'saccade_max': saccade_max,
        'saccade_frequency': saccade_frequency,
        'por_velocity_mean': por_velocity_mean,
        'por_velocity_std': por_velocity_std
    }
'''

In [None]:
'''import pandas as pd
import os

# Assuming your extract_features function is ready

folder_path = 'new_folder'
all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

data = []
labels = []

for file in all_files:
    features = extract_features(file)
    data.append(features)

    if os.path.basename(file).startswith('P_'):
        labels.append(1)  # Patient
    else:
        labels.append(0)  # Non-patient

# Create final dataset
feature_df = pd.DataFrame(data)
feature_df['label'] = labels

print("Dataset preview:")
print(feature_df.head())
print(f"Total samples: {len(feature_df)}")
'''

  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')
  time_diff_series = time_diff_series.replace(0, np.nan).fillna(method='bfill')


Dataset preview:
   blink_rate  avg_blink_duration  fixation_percentage  saccade_mean  \
0    1.814881            0.051240             0.050900      0.477543   
1  404.268887            0.000004             0.001576    133.214295   
2  456.430625            0.000553             0.053601    147.147547   
3    0.567997            0.032902             0.076700      0.519501   
4    1.491164            0.041827             0.025500      0.166146   

    saccade_max  saccade_frequency  por_velocity_mean  por_velocity_std  label  
0     24.837986           0.000000        1344.243839      11064.820758      1  
1   2829.905201      147933.536270      825573.992902     603382.572876      0  
2  15092.420380      128818.766330      600359.325224     664740.769795      0  
3     34.077435           0.006406         953.024440       2092.690415      1  
4     38.099731           0.002018        1797.759761       3941.515127      1  
Total samples: 8


In [None]:
'''from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and labels
X = feature_df.drop('label', axis=1)
y = feature_df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)'''





In [None]:
import os
import pandas as pd

folder_path = 'new_folder'

# Full paths of every CSV in the working folder
all_files = []
for f in os.listdir(folder_path):
    if f.endswith('.csv'):
        all_files.append(os.path.join(folder_path, f))

all_data = []

for file in all_files:
    # The P_ filename prefix marks a patient recording; everything else is non-patient
    is_patient = os.path.basename(file).startswith('P_')

    df = pd.read_csv(file)
    df['label'] = 1 if is_patient else 0  # same label on every row of the file

    all_data.append(df)

# Combine all rows from all files into one frame with a fresh 0..n-1 index
final_df = pd.concat(all_data, ignore_index=True)


In [None]:
# Select relevant features for training
features = ['blink', 'saccade_velocity', 'fixation', 'pupil_size']  # You can add more

# Fill gaps the same way the sequence loaders in this notebook do: carry
# forward, then backward for leading gaps, then 0 for all-empty columns.
# Missing values fed into the network are the likely cause of the `loss: nan`
# seen when training on these rows.
# NOTE: final_df is a concatenation of all files, so a forward/backward fill
# can borrow a value across a file boundary.
X = final_df[features].ffill().bfill().fillna(0)
y = final_df['label']


In [None]:
import numpy as np

sequence_length = 50  # You can try 30, 50, 100

def create_sequences(X, y, sequence_length):
    """Build stride-1 sliding windows over the rows of X, labelled by their last row.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_rows, n_features)
        Per-row features.
    y : Series or array-like, shape (n_rows,)
        Per-row labels.
    sequence_length : int
        Rows per window; must be at least 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_seq`` of shape ``(n_rows - sequence_length + 1, sequence_length,
        n_features)`` and ``y_seq`` holding the label of the last row of each
        window. When there are fewer rows than ``sequence_length`` both arrays
        have zero windows.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # Convert once; slicing a NumPy array per window is far cheaper than .iloc
    rows = np.asarray(X)
    row_labels = np.asarray(y)

    # Window i covers rows i .. i+sequence_length-1, so the last valid start is
    # n_rows - sequence_length (inclusive). The previous range() stopped one
    # short and dropped the final window.
    n_windows = len(rows) - sequence_length + 1
    if n_windows <= 0:
        empty_X = np.empty((0, sequence_length) + rows.shape[1:], dtype=rows.dtype)
        empty_y = np.empty((0,), dtype=row_labels.dtype)
        return empty_X, empty_y

    X_seq = np.stack([rows[i:i + sequence_length] for i in range(n_windows)])
    y_seq = row_labels[sequence_length - 1:]  # Label of the last row in each sequence

    return X_seq, y_seq

# Window the combined frame. X holds the rows of all files concatenated, so
# windows near a file boundary mix rows from two different recordings.
X_sequences, y_sequences = create_sequences(X, y, sequence_length)

# Expected layout: (n_windows, sequence_length, n_features) and (n_windows,)
print(f"X shape: {X_sequences.shape}, y shape: {y_sequences.shape}")


X shape: (86906, 50, 4), y shape: (86906,)


In [None]:
from sklearn.model_selection import train_test_split

# Random, label-stratified 70/30 split over individual windows.
# NOTE(review): the windows were built with stride 1, so neighbouring windows
# share all but one row; a random split therefore places near-duplicate windows
# of the same recording in both train and test. Scores from this split are
# optimistic — consider splitting by file instead.
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.3, random_state=42, stratify=y_sequences)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense

# Binary classifier over (sequence_length, n_features) windows. The input shape
# is declared with an explicit Input layer rather than input_shape= on the LSTM,
# which is what triggered the Keras UserWarning in the output.
model = Sequential([
    Input(shape=(sequence_length, len(features))),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [None]:
# Train for 10 epochs. The (X_test, y_test) split is passed as validation data,
# so the val_* metrics are computed on the same data that is evaluated afterwards.
# `history` keeps the object returned by fit().
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.5436 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 2/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.5404 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 3/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - accuracy: 0.5413 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 4/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.5401 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 5/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.5432 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 6/10
[1m1902/1902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.5416 - loss: nan - val_accuracy: 0.5403 - val_loss: nan
Epoch 7/10
[1m1902/1902[0m [32m━━━━━━━━━━━━

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Keras' own loss / accuracy on the held-out windows
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Sigmoid probabilities -> hard 0/1 labels at a 0.5 threshold
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Print each sklearn summary under its heading, in the original order
for heading, summarise in (("Confusion Matrix:", confusion_matrix),
                           ("Classification Report:", classification_report)):
    print(heading)
    print(summarise(y_test, y_pred_classes))


[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5447 - loss: nan
Test Accuracy: 0.5403
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Confusion Matrix:
[[14087     0]
 [11985     0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70     14087
           1       0.00      0.00      0.00     11985

    accuracy                           0.54     26072
   macro avg       0.27      0.50      0.35     26072
weighted avg       0.29      0.54      0.38     26072



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
'''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Build the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation (harder test)
scores = cross_val_score(model, X_train_scaled, y_train, cv=3)

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean cross-validation accuracy: {scores.mean():.4f}")'''




Cross-validation accuracy scores: [1. 1. 1.]
Mean cross-validation accuracy: 1.0000
