# Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from scipy.spatial.transform import Rotation as R
import joblib
import gc
import os
import warnings

warnings.filterwarnings('ignore')

DATA_DIR = '/kaggle/input/cmi-detect-behavior-with-sensor-data'
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
TRAIN_DEMO_CSV = os.path.join(DATA_DIR, 'train_demographics.csv')
TOF_FILL_VALUE = 500 # Value to replace -1 in TOF

# --- Output Paths ---
OUTPUT_DIR = '/kaggle/working/data/preprocessed' # Save outputs in working directory subfolder
SCALER_PATH = os.path.join(OUTPUT_DIR, 'standard_scaler.joblib')
LABEL_ENCODER_PATH = os.path.join(OUTPUT_DIR, 'label_encoder.joblib')
IMU_COLS_PATH = os.path.join(OUTPUT_DIR, 'imu_feature_cols_3branch.pkl')
THM_COLS_PATH = os.path.join(OUTPUT_DIR, 'thm_feature_cols_3branch.pkl')
TOF_COLS_PATH = os.path.join(OUTPUT_DIR, 'tof_feature_cols_3branch.pkl')
ALL_COLS_PATH = os.path.join(OUTPUT_DIR, 'all_feature_cols_3branch.pkl')
PROCESSED_TRAIN_PATH = os.path.join(OUTPUT_DIR, 'train_processed.parquet') # Save processed data

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created/exists at: {OUTPUT_DIR}")


# ==============================================================================
#                 HELPER FUNCTIONS (Preprocessing Steps)
# ==============================================================================

# --- Define Sensor Columns ---
# Define these globally or pass them around as needed
acc_cols = ['acc_x', 'acc_y', 'acc_z']
rot_cols = ['rot_x', 'rot_y', 'rot_z', 'rot_w']
# Dynamically get thm/tof cols after loading
try:
    _temp_df = pd.read_csv(TRAIN_CSV, nrows=0) # Read only header
    thm_cols = sorted([col for col in _temp_df.columns if 'thm_' in col])
    tof_cols = sorted([col for col in _temp_df.columns if 'tof_' in col])
    del _temp_df
except Exception as e:
    print(f"Warning: Could not dynamically determine thm/tof columns from header: {e}")
    # Fallback to static definition if dynamic fails
    thm_cols = ['thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5']
    tof_cols = sorted([f'tof_{s}_v{i}' for s in range(1, 6) for i in range(64)])


imu_cols = acc_cols + rot_cols
non_imu_sensor_cols = thm_cols + tof_cols
all_initial_sensor_cols = imu_cols + non_imu_sensor_cols

TARGET_GESTURES = [
    'Above ear - pull hair', 'Cheek - pinch skin', 'Eyebrow - pull hair',
    'Eyelash - pull hair', 'Forehead - pull hairline', 'Forehead - scratch',
    'Neck - pinch skin', 'Neck - scratch'
]

def preprocess_impute(df, tof_fill_value=500):
    """Handles imputation for sensor data."""
    print(f"Imputing with TOF fill value: {tof_fill_value}...")
    present_tof_cols = [col for col in tof_cols if col in df.columns]
    present_thm_cols = [col for col in thm_cols if col in df.columns]
    present_all_initial_cols = [col for col in all_initial_sensor_cols if col in df.columns]

    if present_tof_cols:
        df[present_tof_cols] = df[present_tof_cols].replace(-1, tof_fill_value)
    if present_thm_cols:
        for col in present_thm_cols:
             df[col] = df[col].apply(lambda x: np.nan if x < 20 else x)
        if 'sequence_id' in df.columns:
            df[present_thm_cols] = df.groupby('sequence_id')[present_thm_cols].transform(
                lambda x: x.interpolate(method='linear', limit_direction='both', axis=0)
            )
        else:
            df[present_thm_cols] = df[present_thm_cols].interpolate(method='linear', limit_direction='both', axis=0)

    group_cols_to_fill = list(set(present_all_initial_cols) - set(present_thm_cols))
    if 'sequence_id' in df.columns and group_cols_to_fill:
         df[group_cols_to_fill] = df.groupby('sequence_id')[group_cols_to_fill].transform(
             lambda x: x.ffill().bfill()
         )
    elif group_cols_to_fill:
         df[group_cols_to_fill] = df[group_cols_to_fill].ffill().bfill()

    if present_all_initial_cols:
        df[present_all_initial_cols] = df[present_all_initial_cols].fillna(0)
    print("Imputation complete.")
    return df

def correct_handedness(df, demo_df):
    """Corrects sensor readings based on subject handedness."""
    print("Correcting handedness...")
    if 'handedness' not in df.columns:
      if 'subject' in df.columns and 'subject' in demo_df.columns:
          df['subject'] = df['subject'].astype(str)
          demo_df['subject'] = demo_df['subject'].astype(str)
          df = df.merge(demo_df[['subject', 'handedness']], on='subject', how='left')
          df['handedness'] = df['handedness'].fillna('Right')
      else:
          df['handedness'] = 'Right'

    left_handed_mask = df['handedness'] == 'Left'
    if left_handed_mask.any():
        df.loc[left_handed_mask, 'acc_x'] *= -1
        df.loc[left_handed_mask, 'rot_y'] *= -1
        df.loc[left_handed_mask, 'rot_z'] *= -1
    print("Handedness correction complete.")
    return df

def correct_upside_down(df):
    """Corrects sensor readings for known upside-down subjects."""
    print("Correcting upside-down subjects...")
    upside_down_subjects = ['SUBJ_019262', 'SUBJ_045235']
    if 'subject' in df.columns:
        df['subject'] = df['subject'].astype(str)
        ud_mask = df['subject'].isin(upside_down_subjects)
        if ud_mask.any():
            df.loc[ud_mask, ['acc_x', 'acc_y']] *= -1
            df.loc[ud_mask, ['rot_x', 'rot_y']] *= -1
    print("Upside-down correction complete.")
    return df

def add_linear_acceleration(df):
    """Calculates linear acceleration by removing gravity."""
    print("Calculating linear acceleration...")
    if not all(col in df.columns for col in rot_cols + acc_cols):
        print("Warning: Acc/Rot columns missing. Assigning raw acceleration to linear.")
        # Ensure target columns exist before assignment
        for col in acc_cols:
             if col not in df.columns: df[col] = 0.0 # Add missing acc cols if needed
        df['lin_acc_x'], df['lin_acc_y'], df['lin_acc_z'] = df['acc_x'], df['acc_y'], df['acc_z']
        return df

    # Ensure float type before processing
    for col in rot_cols + acc_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    quats = df[rot_cols].values
    accels = df[acc_cols].values
    linear_accel = np.zeros_like(accels)
    gravity_world = np.array([0, 0, 1.0])

    quats_numeric = np.nan_to_num(quats)
    valid_quat_mask = ~np.all(quats_numeric == 0, axis=1) & (np.abs(np.linalg.norm(quats_numeric, axis=1) - 1.0) < 1e-2)
    valid_quats_data = quats_numeric[valid_quat_mask]

    if len(valid_quats_data) > 0:
        try:
            norms = np.linalg.norm(valid_quats_data, axis=1, keepdims=True)
            norms[norms < 1e-6] = 1.0
            valid_quats_normalized = valid_quats_data / norms
            valid_quats_normalized = np.clip(valid_quats_normalized, -1.0, 1.0)
            r = R.from_quat(valid_quats_normalized)
            r_inv = r.inv()
            gravity_sensor_frame = r_inv.apply(gravity_world)
            linear_accel[valid_quat_mask] = accels[valid_quat_mask] - gravity_sensor_frame
        except Exception as e:
            print(f"Warning: Scipy Rotation error during gravity removal: {e}. Using raw accel for some rows.")
            linear_accel[valid_quat_mask] = accels[valid_quat_mask] # Fallback

    invalid_quat_mask = ~valid_quat_mask
    linear_accel[invalid_quat_mask] = accels[invalid_quat_mask] # Use raw accel if quat invalid

    df['lin_acc_x'], df['lin_acc_y'], df['lin_acc_z'] = linear_accel[:, 0], linear_accel[:, 1], linear_accel[:, 2]
    print("Linear acceleration added.")
    return df

def add_basic_features(df):
    """Adds magnitude features."""
    print("Adding basic features (magnitudes)...")
     # Ensure float type
    acc_mag_cols = ['acc_x', 'acc_y', 'acc_z']
    lin_acc_mag_cols = ['lin_acc_x', 'lin_acc_y', 'lin_acc_z']
    rot_mag_cols = ['rot_x', 'rot_y', 'rot_z']
    for col in acc_mag_cols + lin_acc_mag_cols + rot_mag_cols:
         if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    if all(col in df.columns for col in acc_mag_cols):
        df['acc_mag'] = np.linalg.norm(df[acc_mag_cols].values, axis=1)
    else: df['acc_mag'] = 0.0

    if all(col in df.columns for col in lin_acc_mag_cols):
        df['lin_acc_mag'] = np.linalg.norm(df[lin_acc_mag_cols].values, axis=1)
    else: df['lin_acc_mag'] = 0.0

    if all(col in df.columns for col in rot_mag_cols):
        df['rot_mag'] = np.linalg.norm(df[rot_mag_cols].values, axis=1)
    else: df['rot_mag'] = 0.0
    print("Basic features added.")
    return df

def full_preprocess_pipeline(df, demo_df, scaler=None, fit_scaler=False):
    """Applies the full preprocessing pipeline using CPU libraries."""
    print("-" * 30)
    print("Starting Full Preprocessing Pipeline...")
    # Ensure IDs are numeric/string
    if 'sequence_id' in df.columns and df['sequence_id'].dtype == 'object':
         df['sequence_id'] = df['sequence_id'].str.extract('(\d+)').astype(int)
    if 'subject' in df.columns: df['subject'] = df['subject'].astype(str)
    if 'subject' in demo_df.columns: demo_df['subject'] = demo_df['subject'].astype(str)

    # --- Preprocessing Steps ---
    df_processed = preprocess_impute(df.copy(), tof_fill_value=TOF_FILL_VALUE)
    df_processed = correct_handedness(df_processed, demo_df)
    df_processed = correct_upside_down(df_processed)
    df_processed = add_linear_acceleration(df_processed)
    df_processed = add_basic_features(df_processed)

    # --- Scaling ---
    engineered_imu_cols = ['lin_acc_x', 'lin_acc_y', 'lin_acc_z', 'acc_mag', 'lin_acc_mag', 'rot_mag']
    potential_cols_to_scale = list(set(imu_cols + non_imu_sensor_cols + engineered_imu_cols))
    # Ensure columns exist and sort for consistent order
    cols_to_scale = sorted([col for col in potential_cols_to_scale if col in df_processed.columns])

    if not cols_to_scale:
         print("Warning: No columns identified for scaling.")
         return df_processed, scaler, cols_to_scale # Return early if no cols

    if fit_scaler:
        print(f"Fitting StandardScaler on {len(cols_to_scale)} features.")
        scaler = StandardScaler()
        # Ensure data is numeric before fitting
        df_processed[cols_to_scale] = df_processed[cols_to_scale].apply(pd.to_numeric, errors='coerce').fillna(0)
        df_processed[cols_to_scale] = scaler.fit_transform(df_processed[cols_to_scale])
        # Store feature names IN THE ORDER THEY WERE FITTED
        # Use get_feature_names_out if available, otherwise use cols_to_scale
        try:
             scaler.feature_names_in_ = scaler.get_feature_names_out()
        except AttributeError:
             scaler.feature_names_in_ = cols_to_scale
        print("Scaler fitted.")
    elif scaler is not None:
        print(f"Transforming features using loaded scaler...")
        # Ensure scaler has feature names
        if not hasattr(scaler, 'feature_names_in_') or scaler.feature_names_in_ is None:
             raise ValueError("Loaded scaler is missing 'feature_names_in_'. Cannot proceed.")

        # Ensure columns match scaler's expectations
        cols_available_ordered = [col for col in scaler.feature_names_in_ if col in df_processed.columns]
        # Create a DataFrame with the exact columns and order the scaler expects
        df_ordered = pd.DataFrame(0.0, index=df_processed.index, columns=scaler.feature_names_in_)
        # Fill with available data, respecting scaler's column order
        df_ordered[cols_available_ordered] = df_processed[cols_available_ordered]
        if len(cols_available_ordered) < len(scaler.feature_names_in_):
            missing_cols = list(set(scaler.feature_names_in_) - set(cols_available_ordered))
            print(f"Warning: Filled {len(missing_cols)} missing columns with zeros before scaling: {missing_cols}")
        # Ensure data is numeric before transforming
        df_ordered[scaler.feature_names_in_] = df_ordered[scaler.feature_names_in_].apply(pd.to_numeric, errors='coerce').fillna(0)
        # Transform using the reordered/filled dataframe
        df_processed[scaler.feature_names_in_] = scaler.transform(df_ordered[scaler.feature_names_in_])
        print("Transformation complete.")
    else:
        raise ValueError("Scaler must be provided if fit_scaler is False")

    # Drop handedness column after scaling (if it wasn't used as feature)
    if 'handedness' in df_processed.columns and 'handedness' not in cols_to_scale:
        df_processed = df_processed.drop(columns=['handedness'])

    print("Full Preprocessing Pipeline Finished.")
    print("-" * 30)
    return df_processed, scaler, cols_to_scale

# ==============================================================================
#                      MAIN EXECUTION (Preprocessing Only)
# ==============================================================================
print("Running Preprocessing Script...")

# --- Load Data ---
print("Loading data...")
try:
    train_df_main = pd.read_csv(TRAIN_CSV)
    train_demo_df_main = pd.read_csv(TRAIN_DEMO_CSV)
    print(f"Loaded Train shape: {train_df_main.shape}")
except FileNotFoundError:
    print(f"Error: Training files not found at {DATA_DIR}. Exiting.")
    exit()
except Exception as e:
    print(f"Error loading data: {e}. Exiting.")
    exit()

# --- Apply Full Preprocessing to Training Data ---
train_processed_df_main, scaler_main, feature_columns_main = full_preprocess_pipeline(
    train_df_main, train_demo_df_main, scaler=None, fit_scaler=True
)
print(f"Processed Train shape: {train_processed_df_main.shape}")
print(f"Number of features scaled: {len(feature_columns_main)}")

# --- Define Feature Lists for Branches (Example) ---
imu_eng_cols_main = ['lin_acc_x', 'lin_acc_y', 'lin_acc_z', 'acc_mag', 'lin_acc_mag', 'rot_mag']
model_imu_feature_cols_main = sorted([col for col in feature_columns_main if col in imu_cols + imu_eng_cols_main])
model_thm_feature_cols_main = sorted([col for col in feature_columns_main if col in thm_cols])
model_tof_feature_cols_main = sorted([col for col in feature_columns_main if col in tof_cols])

N_FEATURES_IMU_main = len(model_imu_feature_cols_main)
N_FEATURES_THM_main = len(model_thm_feature_cols_main)
N_FEATURES_TOF_main = len(model_tof_feature_cols_main)
print(f"Branch Features: IMU={N_FEATURES_IMU_main}, THM={N_FEATURES_THM_main}, TOF={N_FEATURES_TOF_main}")


# --- Save Objects ---
print("\nSaving preprocessor objects...")
joblib.dump(scaler_main, SCALER_PATH)
joblib.dump(model_imu_feature_cols_main, IMU_COLS_PATH)
joblib.dump(model_thm_feature_cols_main, THM_COLS_PATH)
joblib.dump(model_tof_feature_cols_main, TOF_COLS_PATH)
joblib.dump(feature_columns_main, ALL_COLS_PATH) # Save list of all scaled features
print(f"Objects saved to: {OUTPUT_DIR}")

# --- Label Encode and Save Encoder ---
print("Encoding target and saving encoder...")
label_encoder_main = LabelEncoder()
if 'gesture' in train_processed_df_main.columns:
    train_processed_df_main['gesture_encoded'] = label_encoder_main.fit_transform(train_processed_df_main['gesture'])
    joblib.dump(label_encoder_main, LABEL_ENCODER_PATH)
    print(f"Target Classes ({len(label_encoder_main.classes_)}): {label_encoder_main.classes_}")
    print(f"Label encoder saved to: {LABEL_ENCODER_PATH}")
else:
    print("Warning: 'gesture' column not found in processed data. Cannot save label encoder.")

# --- Save Processed Data (Optional) ---
print(f"\nSaving processed training data to {PROCESSED_TRAIN_PATH}...")
try:
    # Ensure directory exists before saving
    os.makedirs(os.path.dirname(PROCESSED_TRAIN_PATH), exist_ok=True)
    train_processed_df_main.to_parquet(PROCESSED_TRAIN_PATH, index=False)
    print("Processed data saved.")
except Exception as e:
    print(f"Error saving processed data: {e}")

print("\nPreprocessing script finished.")

# Clean up large dataframes from memory
del train_df_main, train_demo_df_main, train_processed_df_main
gc.collect()

Output directory created/exists at: /kaggle/working/data/preprocessed
Running Preprocessing Script...
Loading data...
Loaded Train shape: (574945, 341)
------------------------------
Starting Full Preprocessing Pipeline...
Imputing with TOF fill value: 500...
Imputation complete.
Correcting handedness...
Handedness correction complete.
Correcting upside-down subjects...
Upside-down correction complete.
Calculating linear acceleration...
Linear acceleration added.
Adding basic features (magnitudes)...
Basic features added.
Fitting StandardScaler on 338 features.
Scaler fitted.
Full Preprocessing Pipeline Finished.
------------------------------
Processed Train shape: (574945, 347)
Number of features scaled: 338
Branch Features: IMU=13, THM=5, TOF=320

Saving preprocessor objects...
Objects saved to: /kaggle/working/data/preprocessed
Encoding target and saving encoder...
Target Classes (18): ['Above ear - pull hair' 'Cheek - pinch skin' 'Drink from bottle/cup'
 'Eyebrow - pull hair' 'Eye

0

In [2]:
import pandas as pd
import numpy as np
import joblib
import os
from tqdm import tqdm # For progress bar

# ==============================================================================
#                            CONFIGURATION
# ==============================================================================
# --- Input Paths ---
# Should match the OUTPUT_DIR and filenames from preprocessing.py
PREPROCESSED_DIR = './data/preprocessed'
PROCESSED_TRAIN_PATH = os.path.join(PREPROCESSED_DIR, 'train_processed.parquet')
ALL_COLS_PATH = os.path.join(PREPROCESSED_DIR, 'all_feature_cols_3branch.pkl') # List of scaled + passthrough

# --- Model Input Parameters ---
MAX_LENGTH = 192 # Sequence length for padding/truncating

# --- Output Paths (Optional: Save the prepared arrays) ---
OUTPUT_NP_DIR = './data/model_input'
X_PATH = os.path.join(OUTPUT_NP_DIR, 'X_train.npy')
Y_PATH = os.path.join(OUTPUT_NP_DIR, 'y_train.npy')
GROUPS_PATH = os.path.join(OUTPUT_NP_DIR, 'groups_train.npy')

os.makedirs(OUTPUT_NP_DIR, exist_ok=True)


# ==============================================================================
#                      LOAD PREPROCESSED DATA & OBJECTS
# ==============================================================================
print("Loading preprocessed data and feature lists...")
try:
    train_processed_df = pd.read_parquet(PROCESSED_TRAIN_PATH)
    # Load the list of all columns that were included after preprocessing (scaled + passthrough)
    final_column_names = joblib.load(ALL_COLS_PATH)
    # Identify which of these are the features (i.e., not IDs, labels, etc.)
    # Define columns that are NOT features
    non_feature_cols = [
        'sequence_id', 'subject', 'gesture', 'row_id', 'sequence_counter',
        'orientation', 'behavior', 'phase', 'sequence_type', 'handedness', # If kept
        'gesture_encoded' # The target label
    ]
    # Filter final_column_names to get only feature columns
    feature_cols = sorted([col for col in final_column_names if col not in non_feature_cols])

    print(f"Loaded processed data with shape: {train_processed_df.shape}")
    print(f"Identified {len(feature_cols)} feature columns.")
    # print("Feature columns:", feature_cols) # Uncomment to verify features

    # Verify essential columns exist
    if 'sequence_id' not in train_processed_df.columns: raise ValueError("Missing 'sequence_id'")
    if 'gesture_encoded' not in train_processed_df.columns: raise ValueError("Missing 'gesture_encoded'")
    if 'subject' not in train_processed_df.columns: raise ValueError("Missing 'subject'")

except FileNotFoundError as e:
    print(f"Error: Required file not found. Ensure preprocessing script ran successfully. Missing: {e.filename}")
    exit()
except Exception as e:
    print(f"An error occurred during loading: {e}")
    exit()

# ==============================================================================
#                      PREPARE MODEL INPUT ARRAYS
# ==============================================================================
print("\nPreparing model input arrays (Grouping, Padding/Truncating)...")

all_sequences = []
all_labels = []
all_groups = []
num_features = len(feature_cols)

# Group by sequence ID
grouped_data = train_processed_df.groupby('sequence_id')
total_sequences = len(grouped_data)
print(f"Processing {total_sequences} sequences...")

# Iterate through each sequence group
for name, group in tqdm(grouped_data, total=total_sequences, desc="Processing Sequences"):
    # Extract features, label, and group
    sequence_features = group[feature_cols].values.astype(np.float32) # Ensure float32
    label = group['gesture_encoded'].iloc[0] # Get the single label for the sequence
    subject = group['subject'].iloc[0]       # Get the subject ID

    current_length = sequence_features.shape[0]
    padded_sequence = np.zeros((MAX_LENGTH, num_features), dtype=np.float32) # Initialize with zeros

    if current_length == 0:
        print(f"Warning: Sequence {name} is empty. Skipping.")
        continue # Skip empty sequences

    # Pad or Truncate
    if current_length >= MAX_LENGTH:
        # Truncate: Take the last MAX_LENGTH steps
        padded_sequence = sequence_features[-MAX_LENGTH:]
    else:
        # Pad: Add zeros at the beginning (pre-padding)
        pad_width = MAX_LENGTH - current_length
        padded_sequence[pad_width:] = sequence_features

    # Append results to lists
    all_sequences.append(padded_sequence)
    all_labels.append(label)
    all_groups.append(subject)

# Convert lists to NumPy arrays
X = np.array(all_sequences)
y = np.array(all_labels)
groups = np.array(all_groups)

print("\nModel input preparation complete.")
print(f"Shape of X: {X.shape}") # Should be (num_sequences, MAX_LENGTH, num_features)
print(f"Shape of y: {y.shape}")   # Should be (num_sequences,)
print(f"Shape of groups: {groups.shape}") # Should be (num_sequences,)

# ==============================================================================
#                      SAVE MODEL INPUT ARRAYS (Optional)
# ==============================================================================
print(f"\nSaving prepared NumPy arrays to {OUTPUT_NP_DIR}...")
try:
    np.save(X_PATH, X)
    np.save(Y_PATH, y)
    np.save(GROUPS_PATH, groups)
    print("Arrays saved successfully.")
except Exception as e:
    print(f"Error saving NumPy arrays: {e}")

print("\nScript finished. You now have X, y, and groups ready for model training.")

Loading preprocessed data and feature lists...
Loaded processed data with shape: (574945, 348)
Identified 338 feature columns.

Preparing model input arrays (Grouping, Padding/Truncating)...
Processing 8151 sequences...


Processing Sequences: 100%|██████████| 8151/8151 [00:07<00:00, 1040.18it/s]



Model input preparation complete.
Shape of X: (8151, 192, 338)
Shape of y: (8151,)
Shape of groups: (8151,)

Saving prepared NumPy arrays to ./data/model_input...
Arrays saved successfully.

Script finished. You now have X, y, and groups ready for model training.


# Model Building and Evaluation

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import linear_sum_assignment
import lightgbm as lgb
import joblib
import gc
import os
import warnings
from tqdm import tqdm # Keep tqdm if installed, otherwise remove

warnings.filterwarnings('ignore')

print("Starting LGBM Pipeline...")
# ==============================================================================
#                            CONFIGURATION
# ==============================================================================
# --- Input Paths ---
PREPROCESSED_DIR = './data/preprocessed'
PROCESSED_TRAIN_PATH = os.path.join(PREPROCESSED_DIR, 'train_processed.parquet')
LABEL_ENCODER_PATH = os.path.join(PREPROCESSED_DIR, 'label_encoder.joblib')
ALL_COLS_PATH = os.path.join(PREPROCESSED_DIR, 'all_feature_cols_3branch.pkl') # List of all scaled columns from preprocessor
IMU_COLS_PATH = os.path.join(PREPROCESSED_DIR, 'imu_feature_cols_3branch.pkl') # List of IMU-related scaled columns from preprocessor

# --- Model/Output Paths ---
OUTPUT_DIR = './data/lgbm_simple_output' # New directory
MODEL_ALL_TMPL = os.path.join(OUTPUT_DIR, "model_lgbm_simple_all_fold_{fold}.txt")
MODEL_IMU_TMPL = os.path.join(OUTPUT_DIR, "model_lgbm_simple_imu_fold_{fold}.txt")
OOF_CSV_PATH = os.path.join(OUTPUT_DIR, "oof_predictions_lgbm_simple.csv")
LGBM_ALL_FEATURES_PATH = os.path.join(OUTPUT_DIR, 'lgbm_simple_all_feature_names.pkl')
LGBM_IMU_FEATURES_PATH = os.path.join(OUTPUT_DIR, 'lgbm_simple_imu_feature_names.pkl')


os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Training Parameters ---
N_SPLITS = 2 # Changed back to 5 splits
SEED = 42

# LGBM Parameters (Simplified, tune these)
LGBM_PARAMS = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'n_estimators': 1500, # Use early stopping
    'learning_rate': 0.03,
    'feature_fraction': 0.7, # Reduced slightly
    'bagging_fraction': 0.7, # Reduced slightly
    'bagging_freq': 1,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
    'num_leaves': 31, # Kept moderate
    'verbose': -1,
    'n_jobs': -1,
    'seed': SEED,
    'boosting_type': 'gbdt',
    # 'num_class': 18 # Will be set dynamically
}

# --- Target Gestures ---
TARGET_GESTURES = [
    'Above ear - pull hair', 'Cheek - pinch skin', 'Eyebrow - pull hair',
    'Eyelash - pull hair', 'Forehead - pull hairline', 'Forehead - scratch',
    'Neck - pinch skin', 'Neck - scratch'
]

np.random.seed(SEED)

# ==============================================================================
#                      LOAD PREPROCESSED DATA & OBJECTS
# ==============================================================================
print("Loading preprocessed data and objects...")
try:
    train_processed_df = pd.read_parquet(PROCESSED_TRAIN_PATH)
    label_encoder = joblib.load(LABEL_ENCODER_PATH)
    all_feature_cols = joblib.load(ALL_COLS_PATH) # All scaled + engineered features from preprocessor
    imu_feature_cols = joblib.load(IMU_COLS_PATH) # Only IMU related scaled + engineered from preprocessor

    # --- Verification ---
    required_original_cols = ['sequence_id', 'gesture_encoded', 'subject']
    if not all(col in train_processed_df.columns for col in required_original_cols):
        raise ValueError(f"Missing one or more required columns: {required_original_cols}")
    # Ensure feature lists are subsets of dataframe columns
    all_feature_cols = [c for c in all_feature_cols if c in train_processed_df.columns]
    imu_feature_cols = [c for c in imu_feature_cols if c in train_processed_df.columns]
    if not all_feature_cols: raise ValueError("No ALL feature columns found in DataFrame after loading list.")
    if not imu_feature_cols: raise ValueError("No IMU feature columns found in DataFrame after loading list.")
    # --- End Verification ---

    N_CLASSES = len(label_encoder.classes_)
    LGBM_PARAMS['num_class'] = N_CLASSES

    print(f"Loaded processed data with shape: {train_processed_df.shape}")
    print(f"Number of ALL base features identified: {len(all_feature_cols)}")
    print(f"Number of IMU base features identified: {len(imu_feature_cols)}")
    print(f"Number of classes: {N_CLASSES}")

except FileNotFoundError as e:
    print(f"Error: Required file not found. Ensure preprocessing ran. Missing: {e.filename}"); exit()
except Exception as e:
    print(f"An error occurred during loading: {e}"); exit()

# ==============================================================================
#            FEATURE AGGREGATION FOR LGBM (Reduced Statistics)
# ==============================================================================

def aggregate_features_for_lgbm_simple(df, feature_cols_to_agg, sequence_id_col='sequence_id', subject_col='subject', label_col='gesture_encoded'):
    """ Aggregates time-series features using a reduced set of statistics. """
    print(f"Aggregating {len(feature_cols_to_agg)} features (Simple Stats)...")

    # --- REDUCED Aggregations ---
    # Core stats + maybe first/last
    aggs = ['mean', 'std', 'min', 'max', 'median', 'first', 'last']
    # If computation time still too long, reduce further to ['mean', 'std', 'min', 'max']

    # Group by sequence and aggregate
    agg_dict = {col: aggs for col in feature_cols_to_agg}

    # Use tqdm if available
    try:
        from tqdm.auto import tqdm
        tqdm.pandas()
        agg_df = df.groupby(sequence_id_col).progress_agg(agg_dict) # Requires tqdm>=4.42.0
    except ImportError:
        print("tqdm not found or version too old, aggregation progress bar disabled.")
        agg_df = df.groupby(sequence_id_col).agg(agg_dict)
    except AttributeError: # If progress_agg doesn't exist
         print("progress_agg not available, using standard agg.")
         agg_df = df.groupby(sequence_id_col).agg(agg_dict)


    # Flatten multi-level column index
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
    agg_df = agg_df.reset_index()

    # Add sequence-level metadata
    meta_cols = [sequence_id_col, subject_col, label_col]
    meta_df = df[meta_cols].drop_duplicates(subset=[sequence_id_col])

    # Merge aggregated features with metadata
    final_df = pd.merge(meta_df, agg_df, on=sequence_id_col, how='left')

    # Fill NaNs/Infs
    final_df = final_df.fillna(0)
    final_df.replace([np.inf, -np.inf], 0, inplace=True)

    print(f"Aggregation complete. Shape: {final_df.shape}")
    return final_df

# --- Aggregate for ALL features ---
train_agg_all_df = aggregate_features_for_lgbm_simple(train_processed_df, all_feature_cols)
lgbm_all_feature_names = [col for col in train_agg_all_df.columns if col not in ['sequence_id', 'subject', 'gesture_encoded']]
print(f"Number of aggregated features (ALL): {len(lgbm_all_feature_names)}")

# --- Aggregate for IMU features ---
train_agg_imu_df = aggregate_features_for_lgbm_simple(train_processed_df, imu_feature_cols)
lgbm_imu_feature_names = [col for col in train_agg_imu_df.columns if col not in ['sequence_id', 'subject', 'gesture_encoded']]
print(f"Number of aggregated features (IMU): {len(lgbm_imu_feature_names)}")

# Save the list of aggregated feature names
joblib.dump(lgbm_all_feature_names, LGBM_ALL_FEATURES_PATH)
joblib.dump(lgbm_imu_feature_names, LGBM_IMU_FEATURES_PATH)
print("Saved aggregated feature name lists.")

del train_processed_df; gc.collect() # Clean up

# ==============================================================================
#                      COMPETITION METRIC FUNCTION (UNCHANGED)
# ==============================================================================
def hierarchical_macro_f1(y_true, y_pred_labels, target_gestures_list, le):
    """Calculates the CMI competition metric with corrected variable scope."""
    # Ensure inputs are not empty first
    if len(y_true) == 0 or len(y_pred_labels) == 0:
        print("Warning: hierarchical_macro_f1 received empty input.")
        return 0.0, 0.0, 0.0

    # --- Ensure assignments happen AFTER the initial check ---
    y_true_np = np.asarray(y_true)
    y_pred_labels_np = np.asarray(y_pred_labels)
    # --- End Assignment Fix ---

    # --- Check for valid label indices ---
    known_labels = np.arange(len(le.classes_))
    # Create mask where predicted labels are within the known range
    valid_pred_mask = np.isin(y_pred_labels_np, known_labels)

    # Filter both true and predicted labels based on the mask
    y_true_filtered = y_true_np[valid_pred_mask]
    y_pred_filtered = y_pred_labels_np[valid_pred_mask]

    # If filtering removed all samples, return 0
    if len(y_true_filtered) == 0:
        print("Warning: No valid predictions found after filtering in hierarchical_macro_f1.")
        return 0.0, 0.0, 0.0
    # --- End Validity Check ---

    # --- Proceed with metric calculation using filtered arrays ---
    try:
        y_true_str = le.inverse_transform(y_true_filtered)
        y_pred_str = le.inverse_transform(y_pred_filtered)
    except ValueError as e:
        print(f"LabelEncoder Error in metric: {e}. Check label range. True: {np.unique(y_true_filtered)}, Pred: {np.unique(y_pred_filtered)}")
        return 0.0, 0.0, 0.0 # Return 0 if inverse_transform fails

    y_true_bin = np.isin(y_true_str, target_gestures_list)
    y_pred_bin = np.isin(y_pred_str, target_gestures_list)
    binary_f1 = f1_score(y_true_bin, y_pred_bin, pos_label=True, zero_division=0)

    y_true_mc = np.where(y_true_bin, y_true_str, 'non_target')
    y_pred_mc = np.where(y_pred_bin, y_pred_str, 'non_target')
    unique_labels_mc_present = np.unique(np.concatenate((y_true_mc, y_pred_mc)))

    # Ensure 'non_target' class is handled if present
    # Get all possible labels expected by f1_score
    all_possible_classes = np.append(le.classes_, 'non_target')
    labels_for_f1 = [lbl for lbl in all_possible_classes if lbl in unique_labels_mc_present]


    gesture_f1 = f1_score(y_true_mc, y_pred_mc, labels=labels_for_f1, average='macro', zero_division=0)

    metric = 0.5 * binary_f1 + 0.5 * gesture_f1
    return metric, binary_f1, gesture_f1
# ==============================================================================
#                      CROSS-VALIDATION TRAINING (LGBM)
# ==============================================================================
print(f"\nStarting {N_SPLITS}-Fold Cross-Validation (LGBM - Simple Features)...")

# --- Prepare data for CV ---
X_all = train_agg_all_df[lgbm_all_feature_names]
X_imu = train_agg_imu_df[lgbm_imu_feature_names]
y = train_agg_all_df['gesture_encoded']
groups = train_agg_all_df['subject']
sequence_ids_cv = train_agg_all_df['sequence_id']

# Use pandas DataFrames directly for LGBM (often easier)
gkf = GroupKFold(n_splits=N_SPLITS)

# OOF storage
oof_preds_all = np.zeros((len(train_agg_all_df), N_CLASSES))
oof_preds_imu = np.zeros((len(train_agg_imu_df), N_CLASSES))
oof_labels_all = np.zeros(len(train_agg_all_df))
oof_labels_imu = np.zeros(len(train_agg_imu_df))
oof_groups_store = np.empty(len(train_agg_all_df), dtype=object)
oof_seq_ids_store = np.zeros(len(train_agg_all_df))

fold_metrics_all = []
fold_metrics_imu = []
feature_importances_all = pd.DataFrame()
feature_importances_imu = pd.DataFrame()

for fold, (train_idx, val_idx) in enumerate(gkf.split(X_all, y, groups)):
    print(f"\n===== Fold {fold+1}/{N_SPLITS} =====")

    # --- Data for ALL Features Model ---
    X_train_all_fold, X_val_all_fold = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # --- Data for IMU Features Model ---
    # Need to align IMU data with the same train/val indices
    X_imu_aligned = train_agg_imu_df.set_index('sequence_id').loc[sequence_ids_cv].reset_index() # Ensure same order as X_all
    X_train_imu_fold, X_val_imu_fold = X_imu_aligned.iloc[train_idx][lgbm_imu_feature_names], X_imu_aligned.iloc[val_idx][lgbm_imu_feature_names]

    # Store validation info
    oof_groups_store[val_idx] = groups.iloc[val_idx].values
    oof_seq_ids_store[val_idx] = sequence_ids_cv.iloc[val_idx].values

    # --- Train ALL Features Model ---
    print(f"\n--- Training ALL Features Model (Fold {fold+1}) ---")
    model_all = lgb.LGBMClassifier(**LGBM_PARAMS)
    model_all.fit(X_train_all_fold, y_train_fold,
                  eval_set=[(X_val_all_fold, y_val_fold)],
                  callbacks=[lgb.early_stopping(150, verbose=False), lgb.log_evaluation(period=500)]) # Log less often

    fold_oof_preds_all = model_all.predict_proba(X_val_all_fold)
    oof_preds_all[val_idx] = fold_oof_preds_all
    oof_labels_all[val_idx] = np.argmax(fold_oof_preds_all, axis=1)
    metric_all, _, _ = hierarchical_macro_f1(y_val_fold, np.argmax(fold_oof_preds_all, axis=1), TARGET_GESTURES, label_encoder)
    fold_metrics_all.append(metric_all)
    print(f"Fold {fold+1} ALL Features Hierarchical F1: {metric_all:.4f}")
    model_all.booster_.save_model(MODEL_ALL_TMPL.format(fold=fold+1)); print(f"ALL features model saved.")
    fold_importance_all_df = pd.DataFrame({"feature": lgbm_all_feature_names, "importance": model_all.feature_importances_, "fold": fold + 1})
    feature_importances_all = pd.concat([feature_importances_all, fold_importance_all_df], axis=0)

    # --- Train IMU Features Model ---
    print(f"\n--- Training IMU Features Model (Fold {fold+1}) ---")
    model_imu = lgb.LGBMClassifier(**LGBM_PARAMS)
    model_imu.fit(X_train_imu_fold, y_train_fold, # Use same y_train_fold
                  eval_set=[(X_val_imu_fold, y_val_fold)], # Use same y_val_fold
                  callbacks=[lgb.early_stopping(150, verbose=False), lgb.log_evaluation(period=500)])

    fold_oof_preds_imu = model_imu.predict_proba(X_val_imu_fold)
    oof_preds_imu[val_idx] = fold_oof_preds_imu
    oof_labels_imu[val_idx] = np.argmax(fold_oof_preds_imu, axis=1)
    metric_imu, _, _ = hierarchical_macro_f1(y_val_fold, np.argmax(fold_oof_preds_imu, axis=1), TARGET_GESTURES, label_encoder)
    fold_metrics_imu.append(metric_imu)
    print(f"Fold {fold+1} IMU Features Hierarchical F1: {metric_imu:.4f}")
    model_imu.booster_.save_model(MODEL_IMU_TMPL.format(fold=fold+1)); print(f"IMU features model saved.")
    fold_importance_imu_df = pd.DataFrame({"feature": lgbm_imu_feature_names, "importance": model_imu.feature_importances_, "fold": fold + 1})
    feature_importances_imu = pd.concat([feature_importances_imu, fold_importance_imu_df], axis=0)

    del X_train_all_fold, X_val_all_fold, X_train_imu_fold, X_val_imu_fold, y_train_fold, y_val_fold
    del model_all, model_imu; gc.collect()

# ==============================================================================
#                      OVERALL OOF EVALUATION
# ==============================================================================
print("\n===== Overall OOF Evaluation (LGBM Simple - Before PP) =====")
# --- ALL Features ---
valid_oof_mask_all = np.isin(oof_labels_all.astype(int), np.arange(N_CLASSES))
oof_metric_overall_all, oof_bin_f1_all, oof_gest_f1_all = hierarchical_macro_f1(y.iloc[valid_oof_mask_all].values, oof_labels_all.astype(int)[valid_oof_mask_all], TARGET_GESTURES, label_encoder)
print(f"Overall OOF ALL Features Hierarchical F1: {oof_metric_overall_all:.4f}")
print(f"Mean Fold Metric (ALL): {np.mean(fold_metrics_all):.4f} +/- {np.std(fold_metrics_all):.4f}")
print("\nOOF Classification Report (ALL Features - Before PP):")
print(classification_report(y.iloc[valid_oof_mask_all].values, oof_labels_all.astype(int)[valid_oof_mask_all], target_names=label_encoder.classes_, zero_division=0))

# --- IMU Features ---
print("\n--- IMU Features ---")
valid_oof_mask_imu = np.isin(oof_labels_imu.astype(int), np.arange(N_CLASSES))
oof_metric_overall_imu, oof_bin_f1_imu, oof_gest_f1_imu = hierarchical_macro_f1(y.iloc[valid_oof_mask_imu].values, oof_labels_imu.astype(int)[valid_oof_mask_imu], TARGET_GESTURES, label_encoder)
print(f"Overall OOF IMU Features Hierarchical F1: {oof_metric_overall_imu:.4f}")
print(f"Mean Fold Metric (IMU): {np.mean(fold_metrics_imu):.4f} +/- {np.std(fold_metrics_imu):.4f}")

# --- Save OOF Predictions (using ALL features primarily) ---
oof_df = pd.DataFrame({'sequence_id': oof_seq_ids_store, 'subject': oof_groups_store, 'true_label': y.values, 'pred_label_raw': oof_labels_all.astype(int)})
for i in range(N_CLASSES): oof_df[f'pred_proba_{i}'] = oof_preds_all[:, i]
oof_df.to_csv(OOF_CSV_PATH, index=False); print(f"\nOOF predictions saved to {OOF_CSV_PATH}")

print("\n===== FULL LGBM SIMPLE PIPELINE FINISHED (Training and OOF Evaluation Only) =====")

Starting LGBM Pipeline...
Loading preprocessed data and objects...
Loaded processed data with shape: (574945, 348)
Number of ALL base features identified: 338
Number of IMU base features identified: 13
Number of classes: 18
Aggregating 338 features (Simple Stats)...
progress_agg not available, using standard agg.
Aggregation complete. Shape: (8151, 2369)
Number of aggregated features (ALL): 2366
Aggregating 13 features (Simple Stats)...
progress_agg not available, using standard agg.
Aggregation complete. Shape: (8151, 94)
Number of aggregated features (IMU): 91
Saved aggregated feature name lists.

Starting 2-Fold Cross-Validation (LGBM - Simple Features)...

===== Fold 1/2 =====

--- Training ALL Features Model (Fold 1) ---
Fold 1 ALL Features Hierarchical F1: 0.7505
ALL features model saved.

--- Training IMU Features Model (Fold 1) ---
Fold 1 IMU Features Hierarchical F1: 0.6619
IMU features model saved.

===== Fold 2/2 =====

--- Training ALL Features Model (Fold 2) ---
Fold 2 ALL