In [16]:
import pandas as pd
import numpy as np
import csv
import sys
import json
from IPython.display import display

class NoNaNJSONEncoder(json.JSONEncoder):
    """JSON encoder that writes NaN and +/-Infinity as ``null``.

    The standard encoder serializes floats natively and only calls
    ``default()`` for objects it does not know how to encode, so a
    ``default()`` override alone never sees a float. Non-finite values are
    therefore replaced *before* encoding, in ``iterencode()``, which both
    ``json.dump`` and ``json.dumps`` go through.

    NumPy scalars and arrays are also accepted and converted to plain Python
    values (with the same NaN/Inf replacement).
    """

    def iterencode(self, o, _one_shot=False):
        """Encode ``o`` after replacing every non-finite float with ``None``."""
        return super().iterencode(self._replace_non_finite(o), _one_shot)

    def default(self, obj):
        """Convert NumPy scalars/arrays; defer everything else to the base class.

        Raises:
            TypeError: (from the base class) if ``obj`` is not serializable.
        """
        if isinstance(obj, np.generic):
            return self._replace_non_finite(obj.item())
        if isinstance(obj, np.ndarray):
            return self._replace_non_finite(obj.tolist())
        return super().default(obj)

    @classmethod
    def _replace_non_finite(cls, obj):
        """Return a copy of ``obj`` in which NaN/Inf floats are ``None``.

        Recurses into dicts, lists and tuples; any other object is returned
        unchanged. Dictionary keys are left as they are.
        """
        if isinstance(obj, float):
            # float() also normalizes float subclasses such as np.float64.
            return float(obj) if np.isfinite(obj) else None
        if isinstance(obj, dict):
            return {key: cls._replace_non_finite(val) for key, val in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [cls._replace_non_finite(val) for val in obj]
        return obj

def csv_to_labelstudio_json(input_csv: str, output_json: str, skip_count: int = 4):
    """Convert a raw sensor CSV into a single-task Label Studio JSON file.

    Steps:
      1) Read ``input_csv``.
      2) Build one timestamp per row: ``sensortime_wakeup_ns`` unless it is
         ``'*'``, in which case ``sensortime_nonwakeup_ns`` is used. Values
         that cannot be parsed as numbers become NaN. The result is divided
         by 1e9 and stored as ``timestamp_s``.
      3) Rename the corrected accelerometer/gyroscope columns to
         ``accel_x/y/z`` and ``gyro_x/y/z``.
      4) Keep only those six sensor columns plus ``timestamp_s``.
      5) Drop the first ``skip_count`` rows.
      6) Drop every remaining row in which any sensor column equals ``'*'``.
      7) Write ``[{"data": {"tsData": {column: [values...]}}}]`` to
         ``output_json``; sensor values are converted to ``float``.

    Args:
        input_csv: Path of the CSV file to read.
        output_json: Path of the JSON file to write.
        skip_count: Number of leading rows to discard.

    Raises:
        KeyError: If an expected column is missing from the CSV.
        ValueError: If a sensor value is neither ``'*'`` nor convertible to float.
    """
    sensor_cols = ["accel_x", "accel_y", "accel_z", "gyro_x", "gyro_y", "gyro_z"]
    rename_map = {
        "accelerometer_corrected_non_wakeup_ms2_x": "accel_x",
        "accelerometer_corrected_non_wakeup_ms2_y": "accel_y",
        "accelerometer_corrected_non_wakeup_ms2_z": "accel_z",
        "gyroscope_corrected_non_wakeup_dps_x": "gyro_x",
        "gyroscope_corrected_non_wakeup_dps_y": "gyro_y",
        "gyroscope_corrected_non_wakeup_dps_z": "gyro_z"
    }

    # -----------------------
    # STEP 1) READ CSV
    # -----------------------
    raw = pd.read_csv(input_csv)

    # -----------------------
    # STEP 2) MERGE TIMESTAMPS
    # -----------------------
    wakeup_ns = raw["sensortime_wakeup_ns"]
    merged_ns = wakeup_ns.mask(wakeup_ns == "*", raw["sensortime_nonwakeup_ns"])
    timestamp_ns = pd.to_numeric(merged_ns, errors="coerce")

    # -----------------------
    # STEP 3) RENAME COLUMNS (on a new frame; the frame read above is not mutated)
    # -----------------------
    renamed = raw.rename(columns=rename_map)
    renamed["timestamp_s"] = timestamp_ns / 1e9

    # -----------------------
    # STEP 4 + 5) FILTER COLUMNS, SKIP FIRST N ROWS
    # -----------------------
    kept = (
        renamed[sensor_cols + ["timestamp_s"]]
        .iloc[skip_count:]
        .reset_index(drop=True)
    )

    # -----------------------
    # STEP 6) DROP ROWS WITH A '*' PLACEHOLDER IN ANY SENSOR COLUMN
    # -----------------------
    # Comparing the string form handles both all-numeric columns and columns
    # that pandas read as text because they contain '*'. This is done for the
    # whole frame at once instead of row by row.
    has_placeholder = kept[sensor_cols].astype(str).eq("*").any(axis=1)
    valid = kept.loc[~has_placeholder]

    # -----------------------
    # BUILD THE TIME-SERIES DATA FOR LABEL STUDIO
    # -----------------------
    # One list per column; sensor columns are converted to float so that
    # they are written as JSON numbers.
    timeseries_dict = {}
    for col in kept.columns:
        if col in sensor_cols:
            timeseries_dict[col] = valid[col].astype(float).tolist()
        else:
            timeseries_dict[col] = valid[col].tolist()

    tasks = [
        {
            "data": {
                "tsData": timeseries_dict
            }
        }
    ]

    # -----------------------
    # STEP 7) DUMP TO JSON
    # -----------------------
    with open(output_json, "w") as f:
        json.dump(tasks, f, indent=2, cls=NoNaNJSONEncoder)

    display(f"Processed data (skipping first {skip_count} samples) saved to '{output_json}'.")

# ------------------------------------------------------------------------------
# Example usage: convert one recorded CSV into a Label Studio import file.
# The three names below are notebook-level variables; edit them to process a
# different recording.
# ------------------------------------------------------------------------------
input_csv_path = "mar_25_data/DURURKEN-OTURMAK.csv"
output_json_path = "mar_25_data_label_studio_input/DURURKEN-OTURMAK.json"
skip_count = 0  # 0 keeps every row; the function's own default is 4

csv_to_labelstudio_json(
    input_csv=input_csv_path,
    output_json=output_json_path,
    skip_count=skip_count
)

"Processed data (skipping first 0 samples) saved to 'mar_25_data_label_studio_input/DURURKEN-OTURMAK.json'."

In [2]:
# ------------------------------------------------------------------------------
# AT THIS STEP, IMPORT THE GIVEN JSON FILE TO LABEL STUDIO WITH THE FOLLOWING CONFIGURATION
# ------------------------------------------------------------------------------
#
# <View>
#   <TimeSeriesLabels name="tsLabels" toName="ts">
#     <Label value="FALL" background="red"/>
#     <Label value="MOTION" background="#ffea00"/>
#     <Label value="NO MOTION" background="#05ff16"/>
#   </TimeSeriesLabels>
#
#   <TimeSeries name="ts" valueType="json" value="$tsData" timeColumn="timestamp_s" ordered="true">
#     <Channel column="accel_x" legend="accel_x"/>
#     <Channel column="accel_y" legend="accel_y"/>
#     <Channel column="accel_z" legend="accel_z"/>
#     <Channel column="gyro_x" legend="gyro_x"/>
#     <Channel column="gyro_y" legend="gyro_y"/>
#     <Channel column="gyro_z" legend="gyro_z"/>
#   </TimeSeries>
# </View>
#
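# This configuration tells Label Studio to:
#  - Use the "timestamp_s" column for the time axis,
#  - Display multiple channels (accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z),
#  - Provide the labeling options "FALL", "MOTION", and "NO MOTION".
#
# NOTE: the conversion step later in this notebook only recognizes the labels
# FALL, TRANSITION, LAYING, SITTING, WALKING and STANDING. Samples inside an
# interval labeled "MOTION" or "NO MOTION" are dropped there, so make the
# <Label> values above match the label set you intend to train on.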
# ------------------------------------------------------------------------------

In [3]:
# ---------------------------------------------------------------------------------------
# EXPORT THE LABELED DATA FROM LABEL STUDIO AS CSV AND COPY IT INTO THE DATA DIRECTORY
# (the next cell reads "mar_25_data_label_studio_input/merged_labeled_data.csv")
# ---------------------------------------------------------------------------------------

In [17]:
import csv
import sys
import json

def convert_labelstudio_tsdata_with_timeserieslabels(input_csv, output_csv):
    """
    Convert a Label Studio CSV export into a row-per-sample labeled CSV.

    The input CSV is expected to have the columns:
      - 'tsData'   (JSON dict of equally indexed arrays of sensor data)
      - 'tsLabels' (JSON list of intervals, each like:
           {
             "start": float,
             "end": float,
             "instant": bool,
             "timeserieslabels": [label]
           }
        )

    Every sample of every task is turned into one output row
    (accel_x..timestamp_s) and labeled via
    determine_label_timeserieslabels(); samples for which that function
    returns None (no covering interval / no recognized label) are removed.

    Postprocessing:
      - A row whose six sensor values (accel_x..gyro_z) are all None is removed.

    Input rows that cannot be used (empty or unparsable 'tsData', missing
    sensor keys) are skipped with a printed warning. An unparsable or missing
    'tsLabels' cell is treated as "no intervals", so all samples of that task
    are dropped. The output file is always written, even when it only
    contains the header.

    Output columns:
      accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z, timestamp_s, label

    Args:
        input_csv: Path of the Label Studio CSV export.
        output_csv: Path of the CSV file to write.
    """
    # Expected keys in tsData. Defined up front so that the output header can
    # be written even when the export contains no usable rows.
    sensor_keys = [
        "accel_x", "accel_y", "accel_z",
        "gyro_x", "gyro_y", "gyro_z",
        "timestamp_s"
    ]
    value_keys = sensor_keys[:-1]  # the six sensor channels, without the timestamp

    # Raise the field size limit for the large JSON cells. sys.maxsize does
    # not fit into a C long on every platform, so back off until accepted.
    field_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            field_limit //= 10

    all_rows = []

    with open(input_csv, 'r', newline='', encoding='utf-8') as f_in:
        reader = csv.DictReader(f_in)

        for row in reader:
            ts_str = row.get('tsData', '')
            if not ts_str:
                # No sensor data in this row, skip
                continue

            # Parse the sensor dictionary-of-arrays
            try:
                ts_dict = json.loads(ts_str)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not parse tsData JSON. Skipping. Error: {e}")
                continue
            if not isinstance(ts_dict, dict):
                print("Warning: tsData is not a JSON object. Skipping row.")
                continue

            # Parse intervals from 'tsLabels'. DictReader yields None for a
            # cell that is missing from a short row, hence the TypeError.
            intervals_str = row.get('tsLabels', '[]')
            try:
                intervals = json.loads(intervals_str)
            except (json.JSONDecodeError, TypeError) as e:
                print(f"Warning: Could not parse tsLabels JSON. Using empty intervals. Error: {e}")
                intervals = []
            if not isinstance(intervals, list):
                print("Warning: tsLabels is not a JSON list. Using empty intervals.")
                intervals = []

            missing = [k for k in sensor_keys if k not in ts_dict]
            if missing:
                print(f"Warning: Missing keys {missing} in tsData. Skipping row.")
                continue

            # Convert dictionary-of-arrays -> row-based. The timestamp array
            # sets the number of samples; shorter arrays are padded with None.
            length = len(ts_dict["timestamp_s"])
            for i in range(length):
                row_data = {}
                for k in sensor_keys:
                    arr = ts_dict[k]
                    row_data[k] = arr[i] if i < len(arr) else None

                # Returns None if no interval covers the timestamp.
                label = determine_label_timeserieslabels(
                    row_data["timestamp_s"],
                    intervals
                )
                if label is None:
                    continue

                row_data["label"] = label

                # >>> POSTPROCESSING STEP <<<
                # Skip if *all* sensor values are None/empty
                if all(row_data[k] is None for k in value_keys):
                    continue

                all_rows.append(row_data)

    # Write final CSV
    out_cols = sensor_keys + ["label"]
    with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=out_cols)
        writer.writeheader()
        writer.writerows(all_rows)


def determine_label_timeserieslabels(timestamp, intervals):
    """
    Return the label of the first interval that contains ``timestamp``.

    Each element of ``intervals`` is a dict of the form:
      {
        "start": float,
        "end": float,
        "instant": bool,
        "timeserieslabels": [label]
      }

    An interval matches when start <= timestamp < end (both bounds must be
    present). Only the FIRST matching interval is considered. Its labels are
    compared case-insensitively and the highest-priority one is returned:

      FALL > TRANSITION > LAYING > SITTING > WALKING > STANDING

    Returns None when ``timestamp`` is None, when no interval contains it, or
    when the first matching interval carries none of the labels above.
    """
    priority = ("FALL", "TRANSITION", "LAYING", "SITTING", "WALKING", "STANDING")

    if timestamp is None:
        return None

    for span in intervals:
        lower = span.get("start")
        upper = span.get("end")
        if lower is None or upper is None:
            continue
        if not (lower <= timestamp < upper):
            continue

        # First covering interval decides the outcome, recognized label or not.
        present = {name.upper() for name in span.get("timeserieslabels", [])}
        for candidate in priority:
            if candidate in present:
                return candidate
        return None

    # No interval covers the timestamp; the caller drops the sample.
    return None


# Example usage (when running as script, or adjust if using in a notebook):
# reads the Label Studio CSV export and writes one labeled row per sample.
if __name__ == "__main__":
    # Label Studio export (input) and row-based training CSV (output).
    input_file = "mar_25_data_label_studio_input/merged_labeled_data.csv"
    output_file = "mar_25_data_label_studio_input/cleaned_training_data.csv"
    convert_labelstudio_tsdata_with_timeserieslabels(input_file, output_file)
    print(f"Done! Wrote row-based CSV with label to '{output_file}'.")


Done! Wrote row-based CSV with label to 'mar_25_data_label_studio_input/cleaned_training_data.csv'.


In [18]:
import pandas as pd

def sort_by_timestamp(input_csv, output_csv):
    """Write a copy of ``input_csv`` sorted by ascending 'timestamp_s'.

    A stable sort is used, so rows that share a timestamp keep the relative
    order they had in the input file (the default quicksort gives no such
    guarantee). Rows with a missing timestamp are placed last.

    Args:
        input_csv: Path of the CSV to read; must contain 'timestamp_s'.
        output_csv: Path of the sorted CSV to write (without the index).

    Raises:
        KeyError: If the 'timestamp_s' column is missing.
    """
    # Read the CSV
    df = pd.read_csv(input_csv)

    # "mergesort" is pandas' stable algorithm.
    df_sorted = df.sort_values(by='timestamp_s', ascending=True, kind='mergesort')

    # Write the sorted data to a new CSV
    df_sorted.to_csv(output_csv, index=False)
    print(f"Sorted data saved to '{output_csv}'.")

# Example usage: sort the row-based training CSV by timestamp and store the
# result under a new file name.
if __name__ == "__main__":
    input_file = "mar_25_data_label_studio_input/cleaned_training_data.csv"
    output_file = "mar_25_data_label_studio_input/cleaned_training_data_sorted.csv"
    sort_by_timestamp(input_file, output_file)


Sorted data saved to 'mar_25_data_label_studio_input/cleaned_training_data_sorted.csv'.


In [None]:
#!/usr/bin/env python3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import kurtosis, skew
from scipy.fft import rfft
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Replace RandomOverSampler with SMOTE
from imblearn.over_sampling import SMOTE

###############################################################################
# 1) READ CSV
###############################################################################

def read_data(csv_path):
    """Load ``csv_path`` and return its rows ordered by ascending 'timestamp_s'.

    The returned frame has a fresh 0..n-1 index.
    """
    loaded = pd.read_csv(csv_path)
    ordered = loaded.sort_values(by="timestamp_s")
    return ordered.reset_index(drop=True)

###############################################################################
# 2) EXTRACT ADVANCED FEATURES
###############################################################################

def extract_advanced_features_from_timeseries(
    df_timeseries,
    window_size=0.150,
    step_size=0.120,
    do_fft=False
):
    """
    Turn a labeled sample stream into one feature row per time window.

    Windows are [start, start + window_size) and advance by ``step_size``
    from the smallest to the largest timestamp; they overlap whenever
    ``step_size < window_size``. Windows that contain no sample are skipped.

    For every window the row contains, in this column order:
      - start_time, end_time, label
      - pairwise correlations of the accelerometer and of the gyroscope axes
      - per-channel time-domain statistics (mean, var, skew, kurt, median,
        range, rms, p10, p25, p50, p75)
      - if ``do_fft``: per-channel FFT magnitude sum and peak bin index

    Window label: 'FALL' if ANY sample in the window is 'FALL', otherwise the
    most frequent label. A window whose samples are all unlabeled is skipped.

    Samples are sorted by timestamp (stable) before windowing and samples
    without a timestamp are ignored, so the result does not depend on the row
    order of the input. The input frame itself is not modified.

    Args:
        df_timeseries: Frame with columns accel_x/y/z, gyro_x/y/z,
            timestamp_s and label.
        window_size: Window length, in the unit of timestamp_s.
        step_size: Distance between consecutive window starts; must be > 0.
        do_fft: Whether to add the FFT features.

    Returns:
        DataFrame with one row per non-empty, labeled window. If there is no
        such window, an empty frame with the columns start_time, end_time
        and label.

    Raises:
        ValueError: If ``step_size`` is not positive (the window would never
            advance).
    """
    if step_size <= 0:
        raise ValueError("step_size must be positive")

    # (input column, prefix of the derived feature names)
    channels = [
        ("accel_x", "acc_x_"), ("accel_y", "acc_y_"), ("accel_z", "acc_z_"),
        ("gyro_x", "gyro_x_"), ("gyro_y", "gyro_y_"), ("gyro_z", "gyro_z_"),
    ]
    # (feature name, first input column, second input column)
    corr_pairs = [
        ("acc_xy_corr", "accel_x", "accel_y"),
        ("acc_xz_corr", "accel_x", "accel_z"),
        ("acc_yz_corr", "accel_y", "accel_z"),
        ("gyro_xy_corr", "gyro_x", "gyro_y"),
        ("gyro_xz_corr", "gyro_x", "gyro_z"),
        ("gyro_yz_corr", "gyro_y", "gyro_z"),
    ]
    stat_names = ["mean", "var", "skew", "kurt", "median", "range",
                  "rms", "p10", "p25", "p50", "p75"]

    def safe_corr(a, b):
        # A correlation needs at least two samples.
        if len(a) > 1:
            return np.corrcoef(a, b)[0, 1]
        return np.nan

    def compute_time_stats(vals, prefix="acc_x_"):
        if len(vals) == 0:
            return {f"{prefix}{name}": np.nan for name in stat_names}
        p10, p25, p50, p75 = np.percentile(vals, [10, 25, 50, 75])
        return {
            f"{prefix}mean": np.mean(vals),
            f"{prefix}var": np.var(vals),
            f"{prefix}skew": skew(vals),
            f"{prefix}kurt": kurtosis(vals),
            f"{prefix}median": np.median(vals),
            f"{prefix}range": np.max(vals) - np.min(vals),
            f"{prefix}rms": np.sqrt(np.mean(vals**2)),
            f"{prefix}p10": p10,
            f"{prefix}p25": p25,
            f"{prefix}p50": p50,
            f"{prefix}p75": p75
        }

    def compute_fft_stats(vals, prefix="acc_x_"):
        if len(vals) < 2:
            return {f"{prefix}fft_sum_amp": np.nan, f"{prefix}fft_peak_idx": np.nan}
        mag = np.abs(rfft(vals))
        return {
            f"{prefix}fft_sum_amp": np.sum(mag),
            f"{prefix}fft_peak_idx": np.argmax(mag)
        }

    def window_label(labels):
        # FALL wins over everything; otherwise majority vote. None means the
        # window has no labeled sample at all.
        if (labels == "FALL").any():
            return "FALL"
        counts = labels.value_counts()
        if counts.empty:
            return None
        return counts.idxmax()

    empty_result = pd.DataFrame(columns=["start_time", "end_time", "label"])

    df = df_timeseries.copy()
    df["timestamp_s"] = df["timestamp_s"].astype(float)
    # Sorting once lets every window be located by binary search instead of
    # scanning the whole frame per window.
    df = df[df["timestamp_s"].notna()].sort_values("timestamp_s", kind="mergesort")
    if df.empty:
        return empty_result

    timestamps = df["timestamp_s"].values
    start_time = timestamps[0]
    end_time = timestamps[-1]

    windows_data = []
    current_start = start_time

    while current_start <= end_time:
        current_end = current_start + window_size
        # Half-open window [current_start, current_end).
        lo = np.searchsorted(timestamps, current_start, side="left")
        hi = np.searchsorted(timestamps, current_end, side="left")

        if hi > lo:
            df_w = df.iloc[lo:hi]
            lab = window_label(df_w["label"])

            if lab is not None:
                signals = {col: df_w[col].values for col, _ in channels}

                row_dict = {
                    "start_time": current_start,
                    "end_time": current_end,
                    "label": lab,
                }
                for feat_name, col_a, col_b in corr_pairs:
                    row_dict[feat_name] = safe_corr(signals[col_a], signals[col_b])
                for col, prefix in channels:
                    row_dict.update(compute_time_stats(signals[col], prefix))
                if do_fft:
                    for col, prefix in channels:
                        row_dict.update(compute_fft_stats(signals[col], prefix))

                windows_data.append(row_dict)

        current_start += step_size

    if not windows_data:
        return empty_result
    return pd.DataFrame(windows_data)

###############################################################################
# 3) QUICK RF FEATURE IMPORTANCE => PRINT TOP 10
###############################################################################

def print_top_10_features(df_features):
    """Print the ten most important features of a quick random forest.

    The forest is fit on all non-FALL windows of ``df_features`` to predict
    the window label; missing feature values are mean-imputed first. Every
    column except start_time, end_time and label is treated as a feature.

    Feature columns that contain no value at all are left out: the mean
    imputer would silently drop them from the matrix, which would shift the
    importances against the list of column names.

    Args:
        df_features: Window-level feature frame with a 'label' column.

    Returns:
        None. The ranking is printed.
    """
    df_non_fall = df_features[df_features["label"] != "FALL"]
    candidate_cols = [c for c in df_non_fall.columns if c not in ["start_time", "end_time", "label"]]
    if len(candidate_cols) == 0:
        print("No feature columns found!")
        return
    if df_non_fall.empty:
        print("No non-FALL windows found!")
        return

    # Keep only columns the imputer will keep, so that position i of the
    # importances always refers to all_feat_cols[i].
    all_feat_cols = [c for c in candidate_cols if df_non_fall[c].notna().any()]
    if len(all_feat_cols) == 0:
        print("No feature columns found!")
        return

    X = df_non_fall[all_feat_cols].values
    y = df_non_fall["label"].values

    imputer = SimpleImputer(strategy="mean")
    X_imp = imputer.fit_transform(X)

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_imp, y_enc)

    importances = rf.feature_importances_
    top_10 = np.argsort(importances)[::-1][:10]

    print("\n=== Quick RF Feature Importance Ranking ===")
    for rank, idx in enumerate(top_10, start=1):
        print(f"{rank:2d}) {all_feat_cols[idx]:>20s}: {importances[idx]:.4f}")
    print("(...only showing top 10)\n")

###############################################################################
# 4) DATA AUGMENTATION + CLASSIFIER
###############################################################################

def augment_with_noise(X, y, noise_level=0.01, num_copies=2, random_state=None):
    """Append noisy copies of ``X`` (with unchanged labels) to the data.

    Each copy is ``X + N(0, noise_level) * std``, where ``std`` is the
    per-column standard deviation of ``X``, so the noise scales with the
    spread of every feature.

    Args:
        X: 2-D feature array of shape (n_samples, n_features).
        y: Labels, one per row of ``X``.
        noise_level: Standard deviation of the noise, as a fraction of each
            column's standard deviation.
        num_copies: Number of noisy copies to append.
        random_state: Seed (or ``numpy.random.Generator``) for reproducible
            noise. ``None`` draws from NumPy's global random state, which is
            the previous behavior and still honors ``np.random.seed``.

    Returns:
        (X_aug, y_aug): the original rows followed by ``num_copies`` noisy
        copies, i.e. ``(num_copies + 1) * n_samples`` rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    std_dev = X.std(axis=0, keepdims=True)

    X_list = [X]
    y_list = [y]
    for _ in range(num_copies):
        noise = rng.normal(loc=0.0, scale=noise_level, size=X.shape)
        X_list.append(X + noise * std_dev)
        y_list.append(y)

    X_aug = np.concatenate(X_list, axis=0)
    y_aug = np.concatenate(y_list, axis=0)
    return X_aug, y_aug

# -------------------- SMOTE Oversampling Instead of RandomOverSampler -------------------- #
from imblearn.over_sampling import SMOTE

def train_posture_classifier(df_features, feat_cols):
    """
    Fit the posture classifier on every window whose label is not FALL.

    Pipeline: mean-impute missing feature values, append two noisy copies of
    the data, balance the classes with SMOTE, encode the string labels and
    fit a 100-tree random forest.

    Returns:
        (clf, le): the fitted forest and the LabelEncoder that maps its
        integer classes back to the posture names.
    """
    posture_rows = df_features[df_features["label"] != "FALL"]
    features = posture_rows[feat_cols].values
    targets = posture_rows["label"].values

    # Fill gaps with the column mean
    features_filled = SimpleImputer(strategy="mean").fit_transform(features)

    # Data augmentation with noise
    features_aug, targets_aug = augment_with_noise(
        features_filled, targets, noise_level=0.01, num_copies=2
    )

    # SMOTE oversampling of the minority classes
    features_bal, targets_bal = SMOTE(random_state=42).fit_resample(features_aug, targets_aug)

    le = LabelEncoder()
    targets_enc = le.fit_transform(targets_bal)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(features_bal, targets_enc)

    return clf, le

def train_fall_classifier(df_features, feat_cols):
    """
    Fit the binary fall detector (1 = window labeled FALL, 0 = anything else).

    Pipeline: mean-impute missing feature values, append two noisy copies of
    the data, balance the two classes with SMOTE and fit a 100-tree random
    forest. Returns the fitted forest.
    """
    features = df_features[feat_cols].values
    is_fall = df_features["label"].eq("FALL").astype("int64").values

    # Fill gaps with the column mean
    features_filled = SimpleImputer(strategy="mean").fit_transform(features)

    # Data augmentation with noise
    features_aug, is_fall_aug = augment_with_noise(
        features_filled, is_fall, noise_level=0.01, num_copies=2
    )

    # SMOTE oversampling of the minority class
    features_bal, is_fall_bal = SMOTE(random_state=42).fit_resample(features_aug, is_fall_aug)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(features_bal, is_fall_bal)
    return clf


###############################################################################
# 5) VITERBI CHAIN
###############################################################################

def build_transition_info_6():
    """
    Returns the transition matrix and label list of the 6-state chain:
    STANDING, SITTING, LAYING, WALKING, TRANSITION, FALL.

    ``trans_mat_6[i, j]`` is the probability of moving from state i to
    state j, with rows and columns ordered like the label list. Every row
    sums to 1; the FALL row is exactly uniform (1/6 per state).
    """
    label_list_6 = ["STANDING","SITTING","LAYING","WALKING","TRANSITION","FALL"]
    uniform = 1.0 / 6.0  # exact value; a rounded 0.1667 makes the row sum to 1.0002
    trans_mat_6 = np.array([
        [0.4, 0.0, 0.0, 0.2, 0.3, 0.1],   # from STANDING
        [0.0, 0.5, 0.0, 0.0, 0.4, 0.1],   # from SITTING
        [0.0, 0.0, 0.6, 0.0, 0.3, 0.1],   # from LAYING
        [0.3, 0.0, 0.0, 0.5, 0.0, 0.2],   # from WALKING
        [0.2, 0.2, 0.2, 0.0, 0.2, 0.2],   # from TRANSITION
        [uniform] * 6,                    # from FALL
    ], dtype=np.float64)
    return trans_mat_6, label_list_6

def combine_posteriors(posture_probs, fall_probs):
    """
    Build one 6-state distribution per window from the two classifiers.

    The fall probability is clipped to [0, 1] and becomes column 5; the five
    posture probabilities are scaled by (1 - fall probability) and fill
    columns 0..4. No extra weighting factor is applied.
    """
    n_windows = posture_probs.shape[0]

    # Clip just to be safe
    p_fall = np.clip(np.asarray(fall_probs, dtype=float)[:n_windows], 0.0, 1.0)

    merged = np.zeros((n_windows, 6))
    # Posture distribution, scaled by the probability of "not a fall"
    merged[:, :5] = posture_probs * (1.0 - p_fall)[:, np.newaxis]
    # Fall probability
    merged[:, 5] = p_fall
    return merged

def viterbi_decode_6state(combined_probs, trans_mat_6):
    """Return the most likely state sequence for the given observations.

    Standard Viterbi decoding in log space. The first window uses the
    observation probabilities only (no prior over the initial state). A
    small constant (1e-12) is added before taking logarithms so that zero
    probabilities stay finite. When several predecessors are equally good,
    the one with the lowest index is chosen.

    Args:
        combined_probs: Array of shape (N, n_states) with the per-window
            state probabilities.
        trans_mat_6: Array of shape (n_states, n_states); entry [i, j] is the
            probability of moving from state i to state j.

    Returns:
        List of N state indices (plain ints); empty if N == 0.

    Raises:
        ValueError: If the number of columns of ``combined_probs`` does not
            match the size of ``trans_mat_6``.
    """
    combined_probs = np.asarray(combined_probs, dtype=float)
    trans_mat_6 = np.asarray(trans_mat_6, dtype=float)

    N = combined_probs.shape[0]
    if N == 0:
        return []

    n_states = trans_mat_6.shape[0]
    if combined_probs.ndim != 2 or combined_probs.shape[1] != n_states:
        raise ValueError(
            f"combined_probs must have shape (N, {n_states}), got {combined_probs.shape}"
        )

    log_obs = np.log(combined_probs + 1e-12)
    log_trans = np.log(trans_mat_6 + 1e-12)

    viterbi_mat = np.full((N, n_states), -np.inf)
    backpointer = np.zeros((N, n_states), dtype=int)

    # Initialize
    viterbi_mat[0] = log_obs[0]

    # Viterbi recursion: candidates[prev, cur] is the score of reaching
    # state `cur` at time t through state `prev` at time t-1.
    for t in range(1, N):
        candidates = (viterbi_mat[t - 1][:, np.newaxis] + log_trans) + log_obs[t][np.newaxis, :]
        backpointer[t] = np.argmax(candidates, axis=0)
        viterbi_mat[t] = np.max(candidates, axis=0)

    # Backtrack
    last_state = int(np.argmax(viterbi_mat[N - 1]))
    best_path = [last_state]
    for t in range(N - 1, 0, -1):
        last_state = int(backpointer[t, last_state])
        best_path.append(last_state)
    best_path.reverse()
    return best_path

def decode_path_to_labels(best_path, label_list_6):
    """Translate a sequence of state indices into the corresponding label names."""
    decoded = []
    for state_idx in best_path:
        decoded.append(label_list_6[state_idx])
    return decoded

def evaluate_viterbi_accuracy(df_features):
    """Print confusion matrix, classification report and accuracy of the
    decoded states.

    Compares the 'label' column (ground truth) with the 'viterbi_6state'
    column (decoded sequence) of ``df_features``, window by window.
    """
    states = ["STANDING","SITTING","LAYING","WALKING","TRANSITION","FALL"]
    truth = df_features["label"].values
    decoded = df_features["viterbi_6state"].values

    print("\n=== Markov Chain Confusion Matrix (Window-Level) ===")
    matrix = confusion_matrix(truth, decoded, labels=states)
    print("Order:", states)
    print(matrix)

    print("\n=== Markov Chain Classification Report ===")
    report = classification_report(truth, decoded, labels=states, zero_division=0)
    print(report)

    accuracy_pct = accuracy_score(truth, decoded) * 100
    print(f"=== Overall Markov Chain Accuracy: {accuracy_pct:.2f}% ===\n")

###############################################################################
# MAIN
###############################################################################

def main():
    """Run the full pipeline: features -> classifiers -> Viterbi smoothing.

    1. Read the sorted, labeled sample CSV and extract window-level features.
    2. Print a quick feature-importance ranking (informational only).
    3. Split the windows by time: first 80% for training, last 20% for test.
    4. Train the posture classifier (non-FALL windows) and the binary fall
       classifier on the training windows.
    5. Evaluate both classifiers directly on the test windows.
    6. Merge their probabilities into a 6-state distribution, decode it with
       the Viterbi algorithm and evaluate the decoded sequence.

    Test features are mean-imputed with statistics learned from the training
    windows, mirroring the imputation the training functions apply.
    """
    csv_path = "mar_25_data_label_studio_input/cleaned_training_data_sorted.csv"
    df_raw = read_data(csv_path)
    print(f"Read {len(df_raw)} rows from {csv_path}")

    # Extract advanced features
    df_features = extract_advanced_features_from_timeseries(
        df_raw,
        window_size=0.150,
        step_size=0.120,
        do_fft=True
    )
    print(f"\nExtracted {len(df_features)} window-level advanced feature rows.")
    print("Columns in df_features:", df_features.columns)

    # Just for reference => see top features
    print_top_10_features(df_features)

    # -------------------------------------------------------------------------
    # TIME-BASED SPLIT (instead of random split)
    # -------------------------------------------------------------------------
    # Sort by time in case not already sorted.
    df_features = df_features.sort_values("start_time").reset_index(drop=True)
    split_index = int(0.8 * len(df_features))  # 80% for training, last 20% for testing

    train_df = df_features.iloc[:split_index].copy()
    test_df  = df_features.iloc[split_index:].copy()

    print(f"\nTrain windows (time-based): {len(train_df)}, Test windows (time-based): {len(test_df)}")

    # We'll pick all feature columns except these
    feat_cols = [c for c in df_features.columns if c not in ["start_time","end_time","label"]]

    # Train posture + fall classifiers
    posture_clf, posture_le = train_posture_classifier(train_df, feat_cols)
    fall_clf = train_fall_classifier(train_df, feat_cols)

    # Show the actual label-encoder order for the posture classes:
    print("\nPosture label encoder classes:", posture_le.classes_)

    # The classifiers were fit on mean-imputed features, so the test features
    # get the same treatment (using training means only, to avoid leaking
    # test information).
    test_imputer = SimpleImputer(strategy="mean")
    test_imputer.fit(train_df[feat_cols].values)
    X_test_all = test_imputer.transform(test_df[feat_cols].values)

    ################################################################
    # 1) Evaluate posture classifier ignoring FALL
    ################################################################
    is_posture_row = (test_df["label"] != "FALL").values
    X_test_posture = X_test_all[is_posture_row]
    y_test_posture = test_df.loc[is_posture_row, "label"].values

    y_test_posture_enc = posture_le.transform(y_test_posture)
    y_pred_posture_enc = posture_clf.predict(X_test_posture)

    # Passing the full class list keeps matrix and report aligned with
    # posture_le.classes_ even if a class does not occur in the test split.
    posture_class_ids = np.arange(len(posture_le.classes_))

    print("\n=== [Direct] Posture Classifier (Ignoring FALL) on Test ===")
    print("Confusion Matrix (encoded):")
    print(confusion_matrix(y_test_posture_enc, y_pred_posture_enc, labels=posture_class_ids))
    print("Classification Report (encoded):")
    print(classification_report(
        y_test_posture_enc,
        y_pred_posture_enc,
        labels=posture_class_ids,
        zero_division=0,
        target_names=posture_le.classes_
    ))

    ################################################################
    # 2) Evaluate fall vs. not-fall classifier
    #    USING A CUSTOM THRESHOLD of 0.20
    ################################################################
    y_test_fall = np.array([1 if lbl == "FALL" else 0 for lbl in test_df["label"].values])

    # Get FALL probabilities from the model
    fall_probs_test = fall_clf.predict_proba(X_test_all)[:, 1]

    # Use a threshold (0.20 shown here) to classify
    y_pred_fall = (fall_probs_test >= 0.20).astype(int)

    print("\n=== [Direct] Fall vs. Not-Fall Classifier on Test (Threshold=0.20) ===")
    print("Confusion Matrix (binary): [ [TN, FP], [FN, TP] ]")
    print(confusion_matrix(y_test_fall, y_pred_fall))
    print("Classification Report (binary):")
    print(classification_report(y_test_fall, y_pred_fall, zero_division=0))

    ################################################################
    # 3) Combine both into a 6-state Markov chain (posture + FALL)
    ################################################################
    posture_probs_test = posture_clf.predict_proba(X_test_all)

    # We want them in the order: ["STANDING","SITTING","LAYING","WALKING","TRANSITION"]
    # A posture that never occurred in training has no probability column;
    # it keeps probability 0.
    desired_5class_order = ["STANDING","SITTING","LAYING","WALKING","TRANSITION"]
    class_to_idx = {label: i for i, label in enumerate(posture_le.classes_)}
    posture_probs_reordered = np.zeros((posture_probs_test.shape[0], len(desired_5class_order)))
    for target_col, cls_name in enumerate(desired_5class_order):
        if cls_name in class_to_idx:
            posture_probs_reordered[:, target_col] = posture_probs_test[:, class_to_idx[cls_name]]

    # We'll combine these posture probabilities with the raw fall probabilities
    # (No scale factor). The fall probabilities computed above are reused.
    combined_6 = combine_posteriors(posture_probs_reordered, fall_probs_test)

    trans_mat_6, label_list_6 = build_transition_info_6()
    best_path = viterbi_decode_6state(combined_6, trans_mat_6)
    best_labels = decode_path_to_labels(best_path, label_list_6)

    test_df["viterbi_6state"] = best_labels

    print("\n=== Markov Chain results (First 20 Windows) ===")
    for i in range(min(20, len(test_df))):
        real_lab = test_df["label"].iloc[i]
        pred_lab = test_df["viterbi_6state"].iloc[i]
        print(f"Win {i:3d}: real={real_lab}, pred={pred_lab}")

    evaluate_viterbi_accuracy(test_df)
    print("\n=== Pipeline Complete ===")

# Run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Read 324217 rows from mar_25_data_label_studio_input/cleaned_training_data_sorted.csv

Extracted 9253 window-level advanced feature rows.
Columns in df_features: Index(['start_time', 'end_time', 'label', 'acc_xy_corr', 'acc_xz_corr',
       'acc_yz_corr', 'gyro_xy_corr', 'gyro_xz_corr', 'gyro_yz_corr',
       'acc_x_mean', 'acc_x_var', 'acc_x_skew', 'acc_x_kurt', 'acc_x_median',
       'acc_x_range', 'acc_x_rms', 'acc_x_p10', 'acc_x_p25', 'acc_x_p50',
       'acc_x_p75', 'acc_y_mean', 'acc_y_var', 'acc_y_skew', 'acc_y_kurt',
       'acc_y_median', 'acc_y_range', 'acc_y_rms', 'acc_y_p10', 'acc_y_p25',
       'acc_y_p50', 'acc_y_p75', 'acc_z_mean', 'acc_z_var', 'acc_z_skew',
       'acc_z_kurt', 'acc_z_median', 'acc_z_range', 'acc_z_rms', 'acc_z_p10',
       'acc_z_p25', 'acc_z_p50', 'acc_z_p75', 'gyro_x_mean', 'gyro_x_var',
       'gyro_x_skew', 'gyro_x_kurt', 'gyro_x_median', 'gyro_x_range',
       'gyro_x_rms', 'gyro_x_p10', 'gyro_x_p25', 'gyro_x_p50', 'gyro_x_p75',
       'gyro_y_mea