In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# ==========================================
# 1. DATA PROCESSING PIPELINE
# ==========================================
class AccelPipeline:
    """Feature-engineering pipeline for raw tri-axial accelerometer samples.

    The input frame must provide ``subject`` and ``local_ts`` columns. The
    individual steps additionally read ``x``/``y``/``z`` (raw sensor counts)
    and, when resampling, ``behavioral_category`` if it is present.

    The pipeline works on a private copy of the input (``self.df``); every
    public step adds columns to that copy and returns it.
    """

    # Per-axis acceleration columns produced by ``convert_to_gravity``.
    _GRAVITY_COLS = ['x_g', 'y_g', 'z_g']

    def __init__(self, df):
        """Copy ``df``, parse ``local_ts`` and order rows by subject, then time.

        Args:
            df: DataFrame with at least ``subject`` and ``local_ts`` columns.
        """
        self.df = df.copy()

        # Ensure Timestamp format
        self.df['local_ts'] = pd.to_datetime(self.df['local_ts'])

        # Sort by Subject THEN Time so time-based rolling windows see
        # monotonically increasing timestamps within each subject.
        self.df = self.df.sort_values(by=['subject', 'local_ts']).reset_index(drop=True)

    def convert_to_gravity(self, scale=16384.0):
        """Add per-axis acceleration in g, vector magnitude and ENMO.

        Args:
            scale: Raw counts corresponding to 1 g. The default of 16384.0 is
                the value the original code hard-coded (presumably a 16-bit
                sensor in +/-2 g mode -- confirm against the device datasheet).

        Returns:
            ``self.df`` with ``x_g``, ``y_g``, ``z_g``, ``mag`` and ``enmo``.

        Raises:
            ValueError: If ``scale`` is zero.
        """
        print("--- Converting to Gravity Units & Calculating ENMO ---")
        if scale == 0:
            raise ValueError("scale must be non-zero")

        for axis in ('x', 'y', 'z'):
            self.df[f'{axis}_g'] = self.df[axis] / scale

        # Magnitude (Euclidean norm of the three axes)
        self.df['mag'] = np.sqrt(
            self.df['x_g'] ** 2 + self.df['y_g'] ** 2 + self.df['z_g'] ** 2
        )

        # ENMO: max(mag - 1, 0)
        self.df['enmo'] = np.maximum(self.df['mag'] - 1, 0)
        return self.df

    def _get_dynamic_component(self, window_seconds=1):
        """Return raw acceleration minus its per-subject rolling mean.

        The rolling mean over the trailing ``window_seconds`` is used as the
        static (gravity) estimate for each axis.

        The static estimate is computed per subject and re-attached by row
        label. It is deliberately NOT merged back on (subject, local_ts):
        duplicate timestamps would make such a merge emit extra rows and shift
        every later value onto the wrong sample.

        Args:
            window_seconds: Length of the trailing time window, in seconds.

        Returns:
            DataFrame indexed like ``self.df`` with columns ``x_d``, ``y_d``,
            ``z_d``. Rows with no static estimate are filled with 0.

        Raises:
            KeyError: If ``convert_to_gravity`` has not produced the ``*_g``
                columns yet.
        """
        cols = self._GRAVITY_COLS
        missing = [col for col in cols if col not in self.df.columns]
        if missing:
            raise KeyError(
                f"Missing columns {missing}; call convert_to_gravity() first."
            )

        window = f'{window_seconds}s'
        static_parts = []
        for _, group in self.df.groupby('subject', sort=False):
            if group.empty:
                continue
            # Time-based rolling needs a monotonic DatetimeIndex; the stable
            # sort keeps the existing order among equal timestamps.
            ordered = group.sort_values('local_ts', kind='stable')
            rolled = ordered.set_index('local_ts')[cols].rolling(window).mean()
            # Restore the original row labels so values align with self.df.
            rolled.index = ordered.index
            static_parts.append(rolled)

        if static_parts:
            static = pd.concat(static_parts).reindex(self.df.index)
        else:
            static = pd.DataFrame(index=self.df.index, columns=cols, dtype=float)

        # Dynamic = Raw - Static
        dynamic = (self.df[cols] - static).fillna(0)
        dynamic.columns = ['x_d', 'y_d', 'z_d']
        return dynamic

    def calc_odba(self):
        """Add ``odba``: the sum of absolute dynamic acceleration over the axes.

        Returns:
            ``self.df`` with the ``odba`` column added.
        """
        print("--- Calculating ODBA ---")
        dyn = self._get_dynamic_component()
        self.df['odba'] = dyn['x_d'].abs() + dyn['y_d'].abs() + dyn['z_d'].abs()
        return self.df

    def calc_vedba(self):
        """Add ``vedba``: the Euclidean norm of the dynamic acceleration.

        Returns:
            ``self.df`` with the ``vedba`` column added.
        """
        print("--- Calculating VeDBA ---")
        dyn = self._get_dynamic_component()
        self.df['vedba'] = np.sqrt(dyn['x_d']**2 + dyn['y_d']**2 + dyn['z_d']**2)
        return self.df

    @staticmethod
    def _mode_or_nan(values):
        """Return the first mode of ``values``, or NaN when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def resample_data(self, interval_seconds=5):
        """Aggregate samples into fixed time windows per subject.

        Numeric features are averaged and ``behavioral_category`` is reduced
        to its mode. Only columns that exist in ``self.df`` are aggregated, so
        steps that were not run (e.g. ``calc_vedba``) are simply left out
        rather than raising.

        Args:
            interval_seconds: Window length in seconds.

        Returns:
            A new DataFrame with one row per (subject, window). Windows that
            contain a NaN in any aggregated column are dropped.

        Raises:
            ValueError: If none of the known columns is available.
        """
        print(f"--- Resampling data to {interval_seconds} second windows ---")

        mean_cols = ['x_g', 'y_g', 'z_g', 'mag', 'enmo', 'odba', 'vedba']
        agg_dict = {col: 'mean' for col in mean_cols if col in self.df.columns}
        if 'behavioral_category' in self.df.columns:
            # Mode for behavioral_category
            agg_dict['behavioral_category'] = self._mode_or_nan
        if not agg_dict:
            raise ValueError("No columns available to resample.")

        resampled_df = (
            self.df.set_index('local_ts')
            .groupby('subject')
            .resample(f'{interval_seconds}s')
            .agg(agg_dict)
        )

        return resampled_df.dropna().reset_index()

# ==========================================
# 2. MACHINE LEARNING MODELER
# ==========================================



# 1. Load Data


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

class ActivityModeler:
    """Benchmarks several classifiers across several accelerometer feature sets.

    ``run_experiments`` trains every algorithm on every available feature set
    and reports accuracy plus weighted precision/recall/F1 on a held-out
    split of ``data``.
    """

    # Column layout of the frame returned by ``run_experiments``.
    _RESULT_COLUMNS = [
        'Algorithm', 'Feature_Set', 'Accuracy', 'Precision', 'Recall',
        'F1_Score', 'Target_Counts', 'Features_Used',
    ]

    def __init__(self, data, target_col='behavioral_category'):
        """Store the data set and validate that the target column exists.

        Args:
            data: DataFrame holding the feature columns and the target column.
            target_col: Name of the label column.

        Raises:
            ValueError: If ``target_col`` is not a column of ``data``.
        """
        self.data = data
        self.target_col = target_col

        # Check target before building anything else
        if self.target_col not in self.data.columns:
            raise ValueError(f"Target column '{self.target_col}' not found.")

        self.le = LabelEncoder()

        # Results of the most recent run_experiments() call
        self.results_log = []

        # Capture Value Counts as a string for the report
        counts = self.data[self.target_col].value_counts().to_dict()
        self.target_distribution_str = str(counts)

    def prepare_data(self, feature_cols):
        """Select features, label-encode the target and split 70/30.

        No scaling happens here; ``_evaluate_model`` scales the split it is
        given.

        Args:
            feature_cols: Names of the columns of ``self.data`` to use as X.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` where the y arrays
            hold the integer codes produced by ``self.le``.
        """
        X = self.data[feature_cols]
        y = self.data[self.target_col]

        # Encode labels
        y_encoded = self.le.fit_transform(y)

        # Split
        # Try stratify, fallback to a plain random split if train_test_split
        # rejects the stratified request (e.g. classes that are too small).
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
            )
        except ValueError:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_encoded, test_size=0.3, random_state=42
            )

        return X_train, X_test, y_train, y_test

    def _evaluate_model(self, model, X_train, X_test, y_train, y_test):
        """Scale the features, fit ``model`` and score it on the test split.

        The scaler is fitted on the training split only and then applied to
        the test split.

        Returns:
            Tuple ``(accuracy, precision, recall, f1)``; the last three are
            support-weighted averages over the classes.
        """
        # Scale data for algorithms that need it (SVM, KNN, LogReg).
        # Applied to every model, Random Forest included, for uniformity.
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
        X_test_s = scaler.transform(X_test)

        # Fit
        model.fit(X_train_s, y_train)

        # Predict
        y_pred = model.predict(X_test_s)

        # Weighted averaging weights each class by its support;
        # zero_division=0 scores classes that were never predicted as 0.
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted', zero_division=0
        )

        return accuracy, precision, recall, f1

    @staticmethod
    def _build_algorithms():
        """Return a fresh, unfitted estimator for every benchmarked algorithm."""
        return {
            "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
            "SVM (RBF)": SVC(kernel='rbf', random_state=42),
            "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
            "LogisticReg": LogisticRegression(random_state=42, max_iter=1000)
        }

    def run_experiments(self):
        """
        Loops through 4 Feature Sets AND Multiple Algorithms.

        Feature sets whose columns are missing from ``self.data`` are skipped.
        ``self.results_log`` is reset at the start of every call, so running
        this method again (e.g. re-executing a notebook cell) does not
        duplicate rows.

        Returns:
            A Pandas DataFrame of results sorted by ``F1_Score`` (best first).
            If every feature set was skipped, an empty frame with the same
            columns is returned.
        """
        # 1. Define Feature Sets
        feature_sets = {
            "Raw Accel (XYZ)": ['x_g', 'y_g', 'z_g'],
            "ODBA Only": ['odba'],
            "VeDBA Only": ['vedba'],
            "Magnitude": ['mag']
        }

        # Start from a clean log so repeated calls are idempotent
        self.results_log = []

        print(f"Starting Experiments on {len(self.data)} rows...")

        for feat_name, cols in feature_sets.items():
            # Skip if columns missing
            if not set(cols).issubset(self.data.columns):
                print(f"Skipping {feat_name} (Missing columns)")
                continue

            # Prepare data once per feature set
            X_train, X_test, y_train, y_test = self.prepare_data(cols)

            # 2. Fresh estimators per feature set: no fitted state is carried
            # over from one feature set to the next.
            for algo_name, model in self._build_algorithms().items():
                print(f"  -> Testing {algo_name} on {feat_name}...")

                acc, prec, rec, f1 = self._evaluate_model(
                    model, X_train, X_test, y_train, y_test
                )

                # Log results
                self.results_log.append({
                    'Algorithm': algo_name,
                    'Feature_Set': feat_name,
                    'Accuracy': round(acc, 4),
                    'Precision': round(prec, 4),
                    'Recall': round(rec, 4),
                    'F1_Score': round(f1, 4),
                    'Target_Counts': self.target_distribution_str,
                    'Features_Used': str(cols)
                })

        # Nothing ran: sorting an empty, column-less frame by 'F1_Score'
        # would raise KeyError, so return an empty frame with the schema.
        if not self.results_log:
            return pd.DataFrame(columns=self._RESULT_COLUMNS)

        # Convert to DataFrame
        results_df = pd.DataFrame(self.results_log)

        # Sort by F1 Score to show best models first
        return results_df.sort_values(by='F1_Score', ascending=False)

In [None]:
for root, dirs, files in os.walk():
        for file in files: