# Setups

In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Relative path to the preprocessed AdVitam Exp4 data; dataset paths are joined onto it.
root_data = "../.data/AdVitam/Exp4/Preprocessed"

In [None]:
# Read the three tables explored in this notebook, each located under `root_data`:
# period-level physio features, take-over interval features, and the questionnaire.
period, takeover, questions = (
    pd.read_csv(os.path.join(root_data, relative_path))
    for relative_path in (
        "Physio/periods/features_segm_5.csv",
        "Physio/takeover_interval/features_tor_10s_30s_with_driving_features.csv",
        "Questionnaire/Exp4_Database.csv",
    )
)

# EDA - Periods 


## Single file EDA

In [None]:
# Preview the first rows of the period features table.
period.head()

In [None]:
# Preview the first rows of the take-over interval features table.
takeover.head()

In [None]:
# Show every row of the period table that belongs to subject 1.
period[period['subject_id'] == 1]

In [None]:
# Count participants per sleep condition. Rows are first reduced to one per
# (subject_id, label_sleep) pair; only `label_sleep` is then tallied, because
# summing the whole frame would also add up the subject ids, which is meaningless.
period[['subject_id', 'label_sleep']].drop_duplicates()['label_sleep'].value_counts()

## Dataset EDA 

The data does not appear to be consistent: the percentage of missing values differs across periods/segments.

In [None]:
def analyze_all_datasets_missing_data(pattern=None):
    """Summarise, per segment file, how many columns contain missing values.

    Args:
        pattern: Glob pattern selecting the CSV files to inspect. Defaults to
            the ``features_segm_*.csv`` files under ``root_data``.

    Returns:
        A DataFrame with one row per file and the columns ``File``,
        ``Total_Columns`` and ``Missing_Columns``. Rows are ordered by the
        numeric suffix of the file name (names without one come last). The
        frame is empty, but keeps its columns, when no file matches.
    """
    if pattern is None:
        pattern = os.path.join(root_data, "Physio/periods/features_segm_*.csv")

    def segment_order(path):
        # Sort "..._2" before "..._10"; glob itself returns files in arbitrary order.
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem.rsplit('_', 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), stem)
        return (1, 0, stem)

    results = []
    for file_path in sorted(glob.glob(pattern), key=segment_order):
        df = pd.read_csv(file_path)
        missing_count = df.isnull().sum()
        results.append({
            'File': os.path.basename(file_path),
            'Total_Columns': len(df.columns),
            'Missing_Columns': int((missing_count > 0).sum()),
        })

    # Explicit columns keep the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=['File', 'Total_Columns', 'Missing_Columns'])

# Run the summary; as the last expression of the cell, the returned table is displayed.
analyze_all_datasets_missing_data()

In [None]:
def analyze_all_datasets_missing_data(pattern=None):
    """Compare per-feature missing-value percentages across segment files.

    Args:
        pattern: Glob pattern selecting the CSV files to compare. Defaults to
            the ``features_segm_*.csv`` files under ``root_data``.

    Returns:
        A DataFrame indexed by feature name with one ``segm_<n>`` column per
        file (percentage of missing values, 0-100) plus an ``Average_Missing``
        column. Only features with missing data in at least one file are
        kept, sorted by ``Average_Missing`` in descending order.
    """
    if pattern is None:
        pattern = os.path.join(root_data, "Physio/periods/features_segm_*.csv")

    # Missing percentage per feature, keyed by the "segm_<n>" part of the file name.
    all_missing_percentages = {}
    for file_path in sorted(glob.glob(pattern)):
        df = pd.read_csv(file_path)
        filename = os.path.basename(file_path)
        segm_name = filename.replace('features_', '').replace('.csv', '')
        all_missing_percentages[segm_name] = df.isnull().mean() * 100

    comparison_df = pd.DataFrame(all_missing_percentages)

    # Only keep rows (features) that have missing data in at least one file.
    comparison_df = comparison_df[(comparison_df > 0).any(axis=1)]

    # A NaN here means the feature does not exist in that file; report it as 0.
    comparison_df = comparison_df.fillna(0)

    def segment_order(name):
        # Numeric order ("segm_2" before "segm_10"); a non-numeric suffix sorts
        # last instead of raising ValueError.
        suffix = name.split('_')[1]
        if suffix.isdigit():
            return (0, int(suffix), name)
        return (1, 0, name)

    cols = [col for col in comparison_df.columns if col.startswith('segm_')]
    # Copy so the column added below is not written into a slice of the old frame.
    comparison_df = comparison_df[sorted(cols, key=segment_order)].copy()

    comparison_df['Average_Missing'] = comparison_df.mean(axis=1)

    return comparison_df.sort_values('Average_Missing', ascending=False)

# Run the comparison; as the last expression of the cell, the returned table is displayed.
analyze_all_datasets_missing_data()

In [None]:
# List the column names of the questionnaire table.
questions.columns

In [None]:
# Per-column missing-value summary of the questionnaire table; only the columns
# that actually contain missing values are printed.
null_counts = questions.isnull().sum()
missing_info = pd.DataFrame({
    'Missing Count': null_counts,
    'Missing Percentage': (null_counts / len(questions)) * 100,
})
has_missing = missing_info['Missing Count'] > 0
print(missing_info[has_missing])

In [None]:
# Show the questionnaire rows whose `time_sleep_raw` value is missing.
questions[questions['time_sleep_raw'].isna()]

In [None]:
def _plot_sleep_conditions(ax, data, feature, conditions):
    """Draw the per-segment mean of *feature* with standard-error bars, one line per sleep condition.

    Args:
        ax: Matplotlib axes to draw on.
        data: Rows of one driving environment; must contain ``label_sleep``,
            ``segment_id`` and *feature*.
        feature: Name of the column to summarise.
        conditions: Iterable of ``(label_sleep value, legend label, colour,
            marker)`` tuples, one per line to draw.

    Returns:
        True if at least one line was drawn, False otherwise.
    """
    summary = (
        data.groupby(['label_sleep', 'segment_id'])[feature]
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )

    drew_any = False
    for sleep_value, legend_label, color, marker in conditions:
        subset = summary[summary['label_sleep'] == sleep_value]
        if subset.empty:
            continue
        # Standard error of the mean for each segment.
        standard_error = subset['std'] / np.sqrt(subset['count'])
        ax.errorbar(subset['segment_id'], subset['mean'],
                    yerr=standard_error, color=color, linewidth=3,
                    marker=marker, markersize=8, capsize=5,
                    label=legend_label, alpha=0.8)
        drew_any = True
    return drew_any


def create_sleep_deprivation_analysis(period_data):
    """Plot physiological features over ``segment_id``, split by urban/rural periods.

    The figure is a 4 x 3 grid: rows 0-1 hold the six features for rows whose
    ``period`` contains "urban", rows 2-3 the same six features for "rural".
    Every panel shows the per-segment mean with standard-error bars for
    ``label_sleep == 0`` (normal sleep) and ``label_sleep == 1`` (sleep deprived).

    Args:
        period_data: DataFrame with the columns ``period``, ``label_sleep``,
            ``segment_id`` and the feature columns listed in ``key_features``.
            A feature that is absent leaves its panel empty.

    Returns:
        The created matplotlib Figure.
    """
    # NOTE: the original cell was marked "Claude generated"; the research
    # rationale in the comments below has not been verified here.

    # Key features selected based on sleep deprivation research findings
    key_features = [
        'ECG_Rate_Mean_Dr',      # Heart rate - fundamental autonomic response, increases with fatigue/stress
                                 # Research: Direct indicator of sympathetic activation during sleep deprivation

        'EDA_filtered_mean_Dr',  # EDA mean - sympathetic nervous system activation marker
                                 # Research: Greco et al. (2017) found EDA higher frequencies most sensitive to sleep deprivation
                                 # Reduced SNS activation occurs with physical fatigue from sleep loss

        'HRV_RMSSD_Dr',          # Root Mean Square of Successive Differences - parasympathetic tone indicator
                                 # Research: RMSSD reflects recovery capacity and vagal tone
                                 # Decreases significantly with sleep deprivation, indicating reduced autonomic recovery

        'HRV_LF_Dr',             # Low Frequency HRV (0.04-0.15 Hz) - sympathetic activity marker
                                 # Research: LF power changes reflect sympathetic nervous system modulation
                                 # Important for detecting autonomic dysfunction during fatigue

        'HRV_HF_Dr',             # High Frequency HRV (0.15-0.4 Hz) - parasympathetic activity marker
                                 # Research: HF power directly reflects vagal tone and respiratory sinus arrhythmia
                                 # Decreases with sleep deprivation indicating autonomic imbalance

        'SCR_Peaks_freq_Dr',     # Skin Conductance Response peak frequency - arousal/alertness indicator
                                 # Research: SCR frequency correlates with cognitive arousal and attention
                                 # Decreases with sleepiness, making it ideal for driver alertness monitoring
    ]

    # Feature selection rationale:
    # 1. ECG_Rate_Mean_Dr: Most fundamental and reliable autonomic measure, changes predictably with fatigue
    # 2. EDA_filtered_mean_Dr: Research shows EDA is most sensitive to sleep deprivation effects on SNS
    # 3. HRV_RMSSD_Dr: Gold standard for parasympathetic assessment, critical for recovery capacity
    # 4. HRV_LF_Dr & HRV_HF_Dr: Complementary autonomic measures (sympathetic vs parasympathetic balance)
    # 5. SCR_Peaks_freq_Dr: Direct measure of moment-to-moment alertness, crucial for driving safety
    #
    # These 6 features provide comprehensive autonomic nervous system assessment:
    # - Basic cardiac response (heart rate)
    # - Sympathetic activation (EDA, LF HRV)
    # - Parasympathetic function (RMSSD, HF HRV)
    # - Alertness/arousal (SCR frequency)

    # Axis labels, in the same order as `key_features`.
    feature_labels = [
        'Heart Rate (BPM)',
        'EDA Mean (μS)',
        'HRV RMSSD (ms)',
        'HRV LF Power',
        'HRV HF Power',
        'SCR Peak Frequency',
    ]

    # (label_sleep value, legend label, colour, marker) for each plotted line.
    conditions = (
        (0, 'Normal Sleep', '#2E86AB', 'o'),
        (1, 'Sleep Deprived', '#A23B72', 's'),
    )

    # (title prefix, substring searched in `period`, first grid row of the environment).
    environments = (
        ('Urban', 'urban', 0),
        ('Rural', 'rural', 2),
    )

    # 4 rows x 3 columns: two rows of three features per environment.
    fig, axes = plt.subplots(4, 3, figsize=(18, 16))

    for title_prefix, keyword, first_row in environments:
        # Assumes the `period` column contains 'urban' / 'rural' in its values.
        env_data = period_data[period_data['period'].str.contains(keyword, case=False, na=False)]

        for idx, (feature, label) in enumerate(zip(key_features, feature_labels)):
            ax = axes[first_row + idx // 3, idx % 3]

            if feature not in env_data.columns:
                continue

            drew_any = _plot_sleep_conditions(ax, env_data, feature, conditions)

            ax.set_xlabel('Segment ID', fontsize=12, fontweight='bold')
            ax.set_ylabel(label, fontsize=12, fontweight='bold')
            ax.set_title(f'{title_prefix}: {label}', fontsize=14, fontweight='bold')
            ax.grid(True, alpha=0.3)
            # Asking for a legend on an empty panel only produces a warning.
            if drew_any:
                ax.legend(fontsize=10)

    fig.suptitle('Physiological Changes Over Segments: Urban vs Rural Driving\n' +
                 'Sleep Deprived vs Normal Sleep Conditions',
                 fontsize=16, fontweight='bold', y=0.98)

    # Lay out the panels below the title: reserving the top strip keeps the
    # two-line suptitle from overlapping the first row of subplot titles.
    fig.tight_layout(rect=(0, 0, 1, 0.95))

    return fig

# Build the urban/rural figure from the period features table and render it.
fig = create_sleep_deprivation_analysis(period)
plt.show()

# Visualization (Correlation)

In [None]:
# Load the features_segm_10.csv period features used by the cells below.
# The path is joined onto `root_data`, like the other loaders, so the dataset
# location is defined in a single place.
df = pd.read_csv(os.path.join(root_data, "Physio/periods/features_segm_10.csv"))

In the classification paper, this seems odd to me: the top 10 features listed here include baseline values. However, a subject's baseline values are identical across periods (because they come from the same subject). Does a baseline feature therefore essentially act as a subject ID?

Note that the table below contains no duplicate subject IDs (total rows = 63).

In [None]:
# Distinct combinations of subject id and three baseline (`_Bl`) feature values.
df[['subject_id', 'SCR_Peaks_Amplitude_Mean_Bl', 'EDA_tonic_min_Bl', 'SCR_Peaks_freq_Bl']].drop_duplicates()

In [None]:
# Top 10 features from Meteier et al. paper (Table 8)
# These are the most predictive features for sleep deprivation detection
# NOTE(review): the "(Bl)"/"(Dr)" tags in the trailing comments do not always
# match the column suffix (e.g. #10 is a `_Dr` column annotated "(Bl)", and
# #1, #6, #7 are `_Bl` columns with no tag) — confirm each entry against the
# paper's table.
top_10_features = [
    'SCR_Peaks_Amplitude_Mean_Bl',  # 1. Mean Amplitude of SCRs
    'EDA_tonic_min_Bl',             # 2. Minimum tonic EDA level (Bl)
    'EDA_filtered_std_Bl',          # 3. SD of raw EDA level (Bl)
    'SCR_Peaks_N_Bl',               # 4. Number of SCRs per minute (Bl)
    'EDA_tonic_max_Bl',             # 5. Maximum tonic EDA level (Bl)
    'SCR_Peaks_freq_Bl',            # 6. Number of SCRs per minute
    'EDA_filtered_min_Bl',          # 7. Minimum raw EDA level
    'SCR_Peaks_Amplitude_Mean_Dr',  # 8. Mean Amplitude of SCRs (Dr)
    'EDA_filtered_max_Bl',          # 9. Maximum raw EDA level (Bl)
    'EDA_filtered_std_Dr'           # 10. SD of raw EDA level (Bl)
]

# Heatmap of the pairwise correlations between the sleep label and the ten features.
sns.heatmap(df[['label_sleep'] + top_10_features].corr())

In [None]:
# Get all feature columns (exclude metadata)
metadata_cols = ['subject_id', 'label_sleep', 'label_first_scenario', 'label_time_exp',
                 'period', 'segment_id', 'time_start', 'time_end']
feature_cols = [col for col in df.columns if col not in metadata_cols]

# Correlate each feature with the sleep label directly. `corrwith` computes one
# coefficient per feature instead of the full feature-by-feature matrix, of
# which only the `label_sleep` column was needed.
correlations = df[feature_cols].corrwith(df['label_sleep'])

# Rank by absolute strength; the signed value is looked up again for printing.
top_10_correlated = correlations.abs().sort_values(ascending=False).head(10)

print("Top 10 most correlated features with sleep deprivation:")
for i, feat in enumerate(top_10_correlated.index, 1):
    print(f"{i:2d}. {feat}: {correlations[feat]:.3f}")

# Heatmap of top 10 correlated features
plt.figure(figsize=(10, 8))
sns.heatmap(df[['label_sleep'] + top_10_correlated.index.tolist()].corr(),
            annot=True, cmap='RdBu_r', center=0, fmt='.3f')
plt.title('Top 10 Most Correlated Features with Sleep Deprivation')
plt.tight_layout()
plt.show()

# Test a simple model

In [None]:
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

# Design matrix and target. Missing values are filled with 0 first, then any
# infinite values are replaced by 0 as well.
X = df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
y = df['label_sleep']
groups = df['subject_id']  # grouping key for the participant-wise split

print(f"Features: {X.shape[1]}")
print(f"Samples: {len(y)}")
print(f"Sleep distribution: {y.value_counts().to_dict()}")

# Single shuffled split that keeps all rows of a participant on the same side.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Number of distinct participants on each side of the split.
n_train_participants = groups.iloc[train_idx].nunique()
n_test_participants = groups.iloc[test_idx].nunique()
print(f"\nTrain: {len(X_train)} samples, {n_train_participants} participants")
print(f"Test: {len(X_test)} samples, {n_test_participants} participants")

# Fit an XGBoost classifier with default hyper-parameters.
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

# Evaluate on the held-out participants.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nTest Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Ten features with the highest model importance.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
    .head(10)
)

print("\nTop 10 Feature Importances:")
print(feature_importance)

# Test for similarity amongst baseline for the between-subject groups

In [None]:
from scipy import stats

# Check baseline differences between sleep-deprived and well-rested groups
baseline_features = [f for f in df.columns if f.endswith('_Bl')]

# Compare participants, not rows: if `df` holds several rows per subject, a
# subject's baseline values are repeated on each of them, and testing on all
# rows would inflate the sample size. `first()` keeps the first non-missing
# value of every column for each subject.
per_subject = df.groupby('subject_id')[baseline_features + ['label_sleep']].first()

# p-value per feature, kept so the summary does not have to re-run the tests.
p_values = {}

for feature in baseline_features:
    # Drop missing values explicitly; otherwise a single NaN turns the p-value
    # into NaN, which would silently be reported as "not significant".
    sleep_depr = per_subject.loc[per_subject['label_sleep'] == 1, feature].dropna()
    well_rest = per_subject.loc[per_subject['label_sleep'] == 0, feature].dropna()

    # T-test
    t_stat, p_value = stats.ttest_ind(sleep_depr, well_rest)
    p_values[feature] = p_value

    print(f"{feature}:")
    print(f"  Sleep Deprived: μ={sleep_depr.mean():.3f}, σ={sleep_depr.std():.3f}")
    print(f"  Well Rested:    μ={well_rest.mean():.3f}, σ={well_rest.std():.3f}")
    print(f"  T-test p-value: {p_value:.3f}")
    print(f"  Significant:    {'YES' if p_value < 0.05 else 'NO'}")
    print()

# Summary
# NOTE: p-values are not corrected for multiple comparisons.
significant_count = int(sum(p < 0.05 for p in p_values.values()))
# Guard against a table without any `_Bl` column.
significant_fraction = significant_count / len(baseline_features) if baseline_features else 0.0

print(f"SUMMARY: {significant_count}/{len(baseline_features)} baseline features differ significantly")
print(f"Percentage: {significant_fraction*100:.1f}%")

if significant_fraction > 0.5:
    print("🚨 MAJOR PROBLEM: Most baselines differ - model likely learning individual differences")
elif significant_fraction > 0.3:
    print("⚠️ PROBLEM: Many baselines differ - results questionable")
else:
    print("✅ Baselines look reasonable")