# Late Fusion Model by Visit Number

Train and evaluate the late fusion combined model (gaze, pose, and audio features) separately for each visit number:
- Visit 1 only (first-time subjects)
- Visit 2 only (second-time subjects)
- Visit 3 only (third-time subjects)

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn later in the notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Load Session Mapping

In [2]:
# Read the session table. The three key columns are loaded as strings so they
# can be concatenated as text into the join key below.
session_mapping = pd.read_csv(
    '../data/results/session_mapping.csv',
    dtype={'mmdd': str, 'hhmm': str, 'user_id': str},
).assign(
    # Join key of the form mmdd_hhmm_userid
    subject_id=lambda df: df['mmdd'] + '_' + df['hhmm'] + '_' + df['user_id']
)

# Number of sessions per visit number, ordered by visit number
visit_counts = session_mapping['visit_number'].value_counts().sort_index()

print(f"Session mapping: {session_mapping.shape}")
print("\nVisit number distribution:")
print(visit_counts)
session_mapping.head()

Session mapping: (99, 10)

Visit number distribution:
visit_number
1.0    51
2.0    34
3.0     8
Name: count, dtype: int64


Unnamed: 0,filename,date,time,user_id,mmdd,hhmm,team,session,visit_number,subject_id
0,preprocessing_0816_1400_9M4VCHG.json,08/16/2021,14:00,9M4VCHG,816,1400,Team 12,Session 3,2.0,0816_1400_9M4VCHG
1,preprocessing_0924_1600_9M4VCHG.json,09/24/2021,16:00,9M4VCHG,924,1600,Team 24,Session 3,3.0,0924_1600_9M4VCHG
2,preprocessing_0825_1000_9M4VCHG.json,08/25/2021,10:00,9M4VCHG,825,1000,Team 17,Session 2,2.0,0825_1000_9M4VCHG
3,preprocessing_0813_1600_539136F.json,08/13/2021,16:00,539136F,813,1600,Team 12,Session 2,1.0,0813_1600_539136F
4,preprocessing_0826_1000_539136F.json,08/26/2021,10:00,539136F,826,1000,Team 17,Session 3,3.0,0826_1000_539136F


## Load Feature Data

Load the preprocessed behavioral features (gaze, pose, audio)

In [3]:
# Load preprocessed features (a pickled mapping keyed by subject ID).
FEATURES_PATH = '../data/results/preprocessing/preprocessed_features.pkl'

try:
    with open(FEATURES_PATH, 'rb') as f:
        # CAUTION: pickle.load can execute arbitrary code; only load files
        # produced by this project's own pipeline.
        data = pickle.load(f)
except FileNotFoundError as err:
    # Fail with an actionable message: every later cell depends on `data`,
    # so a missing file would otherwise surface as confusing NameErrors.
    raise FileNotFoundError(
        f"Preprocessed features not found at '{FEATURES_PATH}' "
        "(relative to the notebook's working directory). "
        "Create this file or correct the path before running the remaining cells."
    ) from err

print(f"Loaded data for {len(data)} subjects")
print(f"\nSample subject keys: {list(data.keys())[:5]}")

# Check structure of first subject (skipped when nothing was loaded, where
# indexing the first key would raise IndexError)
if data:
    first_subject = next(iter(data))
    print(f"\nKeys for subject {first_subject}:")
    print(data[first_subject].keys())
else:
    print("\nNo subjects found in the loaded data.")

FileNotFoundError: [Errno 2] No such file or directory: '../data/results/preprocessing/preprocessed_features.pkl'

## Prepare Data by Visit Number

In [None]:
def prepare_data_for_visit(data, session_mapping, visit_num):
    """
    Filter data to include only subjects from a specific visit number.

    Parameters
    ----------
    data : dict
        Mapping from subject ID to that subject's data.
    session_mapping : pandas.DataFrame
        Table with a 'visit_number' column and a 'subject_id' column whose
        values are matched against the keys of ``data``.
    visit_num : int or float
        Visit number to keep; compared with ``==`` against 'visit_number'.

    Returns
    -------
    dict
        The entries of ``data`` whose key belongs to the requested visit, in
        the original order of ``data``. ``data`` itself is not modified.
    """
    # Get subject IDs for this visit. A set makes each membership test O(1)
    # instead of scanning a list once per key in `data`.
    in_visit = session_mapping['visit_number'] == visit_num
    visit_subjects = set(session_mapping.loc[in_visit, 'subject_id'])

    # Filter data dictionary
    filtered_data = {k: v for k, v in data.items() if k in visit_subjects}

    print(f"Visit {int(visit_num)}: {len(filtered_data)} subjects")
    return filtered_data

# Build one filtered dictionary per visit number (evaluated in order 1, 2, 3)
data_visit1, data_visit2, data_visit3 = (
    prepare_data_for_visit(data, session_mapping, visit)
    for visit in (1.0, 2.0, 3.0)
)

## Feature Extraction Function

In [None]:
def extract_features_and_labels(data_dict):
    """
    Extract combined behavioral features (gaze, pose, audio) and labels.

    Parameters
    ----------
    data_dict : dict
        Mapping from subject ID to a dict holding 'gaze_features',
        'pose_features', 'audio_features' and 'labels'. The three feature
        arrays are concatenated along axis 1, so they must be 2-D and share
        the same number of rows. Subjects missing any of the four entries
        are skipped (and reported).

    Returns
    -------
    X : numpy.ndarray
        Row-stacked combined features of all retained subjects.
    y : numpy.ndarray
        Concatenated labels, one per row of ``X``.
    subjects : numpy.ndarray
        Subject ID of each row, usable as cross-validation groups.

    Raises
    ------
    ValueError
        If a subject's feature row count differs from its label count, or if
        no subject has complete data.
    """
    X_list = []
    y_list = []
    subject_list = []
    skipped = []

    for subject_id, subject_data in data_dict.items():
        # Get features
        gaze_features = subject_data.get('gaze_features')
        pose_features = subject_data.get('pose_features')
        audio_features = subject_data.get('audio_features')
        labels = subject_data.get('labels')

        parts = (gaze_features, pose_features, audio_features, labels)
        if any(part is None for part in parts):
            skipped.append(subject_id)
            continue

        # Concatenate the three modalities column-wise into one feature matrix.
        # NOTE(review): concatenating features ahead of a single classifier is
        # usually called early (feature-level) fusion, while the notebook title
        # says "late fusion" - confirm the intended terminology.
        combined_features = np.concatenate([
            gaze_features,
            pose_features,
            audio_features
        ], axis=1)

        # Rows and labels must line up; otherwise X would silently fall out of
        # step with y and the per-row subject IDs built from len(labels).
        if combined_features.shape[0] != len(labels):
            raise ValueError(
                f"Subject {subject_id}: {combined_features.shape[0]} feature rows "
                f"but {len(labels)} labels"
            )

        X_list.append(combined_features)
        y_list.append(labels)
        subject_list.extend([subject_id] * len(labels))

    if skipped:
        print(f"Skipped {len(skipped)} subject(s) with missing features or labels: {skipped}")

    if not X_list:
        # np.vstack([]) would fail with an unhelpful message; say what is wrong.
        raise ValueError(
            "No subjects with complete gaze, pose, audio features and labels were found"
        )

    X = np.vstack(X_list)
    y = np.concatenate(y_list)
    subjects = np.array(subject_list)

    print(f"Features shape: {X.shape}")
    print(f"Labels shape: {y.shape}")
    print(f"Unique subjects: {len(np.unique(subjects))}")
    # np.bincount only accepts non-negative integer labels
    print(f"Label distribution: {np.bincount(y)}")

    return X, y, subjects

## LOSO Cross-Validation Function

In [None]:
def run_loso_cv(X, y, subjects, random_state=42):
    """
    Run Leave-One-Subject-Out cross-validation.

    Each distinct value in ``subjects`` is held out once as the test fold
    while a 100-tree random forest is fitted on all remaining rows.

    Returns a dict with the pooled 'accuracy' and weighted 'f1_score' over
    all held-out predictions, the per-fold 'subject_accuracies', and the
    pooled 'y_true' / 'y_pred' lists.
    """
    truths = []
    preds = []
    per_fold_accuracy = []

    splitter = LeaveOneGroupOut()
    for fit_rows, held_out_rows in splitter.split(X, y, subjects):
        # Fit on every group except the held-out one
        forest = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
        forest.fit(X[fit_rows], y[fit_rows])

        # Score the held-out group
        fold_truth = y[held_out_rows]
        fold_pred = forest.predict(X[held_out_rows])

        truths.extend(fold_truth)
        preds.extend(fold_pred)
        per_fold_accuracy.append(accuracy_score(fold_truth, fold_pred))

    # Pooled metrics across all held-out predictions
    return {
        'accuracy': accuracy_score(truths, preds),
        'f1_score': f1_score(truths, preds, average='weighted'),
        'subject_accuracies': per_fold_accuracy,
        'y_true': truths,
        'y_pred': preds,
    }

## Run Model for Visit 1

In [None]:
print("="*70)
print("VISIT 1: First Time Subjects")
print("="*70)

X1, y1, subjects1 = extract_features_and_labels(data_visit1)

# The keys in subjects1 are session identifiers of the form 'mmdd_hhmm_userid'.
# The session mapping shows that one participant can have several sessions with
# the same visit number, so grouping folds by session key could train on the
# held-out participant's other session. Group by the trailing user id instead
# so that a participant is never in both the training and the test fold.
groups1 = np.array([session_key.rsplit('_', 1)[-1] for session_key in subjects1])
results_v1 = run_loso_cv(X1, y1, groups1)

print(f"\nOverall Accuracy: {results_v1['accuracy']:.4f}")
print(f"Overall F1 Score: {results_v1['f1_score']:.4f}")
print(f"Mean Subject Accuracy: {np.mean(results_v1['subject_accuracies']):.4f} ± {np.std(results_v1['subject_accuracies']):.4f}")
print(f"\nClassification Report:")
print(classification_report(results_v1['y_true'], results_v1['y_pred']))

## Run Model for Visit 2

In [None]:
print("="*70)
print("VISIT 2: Second Time Subjects")
print("="*70)

X2, y2, subjects2 = extract_features_and_labels(data_visit2)

# The keys in subjects2 are session identifiers of the form 'mmdd_hhmm_userid'.
# The session mapping lists user 9M4VCHG with two visit-2 sessions, so grouping
# folds by session key would train on the held-out participant's other session.
# Group by the trailing user id instead so that a participant is never in both
# the training and the test fold.
groups2 = np.array([session_key.rsplit('_', 1)[-1] for session_key in subjects2])
results_v2 = run_loso_cv(X2, y2, groups2)

print(f"\nOverall Accuracy: {results_v2['accuracy']:.4f}")
print(f"Overall F1 Score: {results_v2['f1_score']:.4f}")
print(f"Mean Subject Accuracy: {np.mean(results_v2['subject_accuracies']):.4f} ± {np.std(results_v2['subject_accuracies']):.4f}")
print(f"\nClassification Report:")
print(classification_report(results_v2['y_true'], results_v2['y_pred']))

## Run Model for Visit 3

In [None]:
print("="*70)
print("VISIT 3: Third Time Subjects")
print("="*70)

X3, y3, subjects3 = extract_features_and_labels(data_visit3)

# The keys in subjects3 are session identifiers of the form 'mmdd_hhmm_userid'.
# The session mapping shows that one participant can have several sessions with
# the same visit number, so grouping folds by session key could train on the
# held-out participant's other session. Group by the trailing user id instead
# so that a participant is never in both the training and the test fold.
groups3 = np.array([session_key.rsplit('_', 1)[-1] for session_key in subjects3])
results_v3 = run_loso_cv(X3, y3, groups3)

print(f"\nOverall Accuracy: {results_v3['accuracy']:.4f}")
print(f"Overall F1 Score: {results_v3['f1_score']:.4f}")
print(f"Mean Subject Accuracy: {np.mean(results_v3['subject_accuracies']):.4f} ± {np.std(results_v3['subject_accuracies']):.4f}")
print(f"\nClassification Report:")
print(classification_report(results_v3['y_true'], results_v3['y_pred']))

## Compare Results Across Visits

In [None]:
# Summary comparison
comparison_df = pd.DataFrame({
    'Visit': [1, 2, 3],
    'N_subjects': [len(data_visit1), len(data_visit2), len(data_visit3)],
    'Accuracy': [results_v1['accuracy'], results_v2['accuracy'], results_v3['accuracy']],
    'F1_Score': [results_v1['f1_score'], results_v2['f1_score'], results_v3['f1_score']],
    'Mean_Subject_Acc': [
        np.mean(results_v1['subject_accuracies']),
        np.mean(results_v2['subject_accuracies']),
        np.mean(results_v3['subject_accuracies'])
    ],
    'Std_Subject_Acc': [
        np.std(results_v1['subject_accuracies']),
        np.std(results_v2['subject_accuracies']),
        np.std(results_v3['subject_accuracies'])
    ]
})

print("\n" + "="*70)
print("COMPARISON ACROSS VISITS")
print("="*70)
print(comparison_df.to_string(index=False))

## Visualization: Accuracy Comparison

In [None]:
visit_ticks = [1, 2, 3]
fig, (ax_overall, ax_subject) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: pooled accuracy per visit
ax_overall.bar(visit_ticks, comparison_df['Accuracy'],
               color=['blue', 'orange', 'green'], alpha=0.7)

# Right panel: mean per-fold accuracy with ±1 std error bars
ax_subject.errorbar(visit_ticks, comparison_df['Mean_Subject_Acc'],
                    yerr=comparison_df['Std_Subject_Acc'],
                    marker='o', capsize=5, linewidth=2, markersize=10)

# Formatting shared by both panels
panel_text = [
    (ax_overall, 'Overall Accuracy', 'Overall Accuracy by Visit'),
    (ax_subject, 'Mean Subject Accuracy', 'Mean Subject Accuracy by Visit (±std)'),
]
for ax, y_label, panel_title in panel_text:
    ax.set_xlabel('Visit Number')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.set_ylim(0, 1)
    ax.set_xticks(visit_ticks)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Distribution of Subject Accuracies

In [None]:
# Box plot of the per-fold accuracies, one box per visit
plt.figure(figsize=(10, 6))

data_for_box = [
    results_v1['subject_accuracies'],
    results_v2['subject_accuracies'],
    results_v3['subject_accuracies']
]
visit_labels = ['Visit 1', 'Visit 2', 'Visit 3']

# patch_artist=True makes the boxes fillable patches so they can be colored.
bp = plt.boxplot(data_for_box, patch_artist=True)

# Name the boxes through the x ticks rather than boxplot's `labels=` keyword:
# that keyword is deprecated in recent Matplotlib (renamed `tick_labels`),
# whereas xticks works on old and new versions. Boxes sit at x = 1..N.
plt.xticks(range(1, len(visit_labels) + 1), visit_labels)

# Color the boxes
colors = ['blue', 'orange', 'green']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

plt.ylabel('Subject Accuracy')
plt.title('Distribution of Subject Accuracies by Visit')
plt.ylim(0, 1.05)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## Save Results

In [None]:
# Write the per-visit summary table as CSV
comparison_path = '../data/results/late_fusion_by_visit_comparison.csv'
comparison_df.to_csv(comparison_path, index=False)
print("✓ Saved comparison to: ../data/results/late_fusion_by_visit_comparison.csv")

# Pickle the full per-visit result dictionaries (metrics, per-fold accuracies,
# pooled true/predicted labels)
results_path = '../data/results/late_fusion_by_visit_results.pkl'
results_dict = dict(visit_1=results_v1, visit_2=results_v2, visit_3=results_v3)

with open(results_path, 'wb') as f:
    pickle.dump(results_dict, f)
print("✓ Saved detailed results to: ../data/results/late_fusion_by_visit_results.pkl")