In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score
from xgboost import XGBClassifier

In [21]:
def detect_anomaly_range_ml(df, speed_col='speed', label_col='pred_label',
                            timestamp_col='indo_time', test_size=0.3, random_state=42,
                            save_path='anomaly_range_plot.png'
                            ):
    """Train an XGBoost classifier on speed-derived features and report anomaly ranges.

    Steps:
      1. Build rolling / difference / EWMA / lag features from ``df[speed_col]``.
      2. Split rows into train and test with a stratified ``train_test_split``.
      3. Fit an ``XGBClassifier`` (class-weighted via ``scale_pos_weight``) on the
         train rows and print evaluation metrics for the test rows.
      4. Predict on *all* rows (train rows included), merge consecutive rows
         predicted as 1 into intervals, and save a scatter plot for up to the
         first five intervals as ``anomaly_interval_<k>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; rows are taken to be in chronological order.
    speed_col : str
        Column the features are derived from.
    label_col : str
        Target column. The class weighting and the interval extraction look for
        the values 0 and 1 specifically.
    timestamp_col : str
        Column used for the returned interval bounds and the plot x-axis.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int
        Seed for both the split and the classifier.
    save_path : str
        Currently unused: plots are always written to ``anomaly_interval_<k>.png``
        in the working directory. Kept so existing callers do not break.

    Returns
    -------
    list of tuple
        ``(start_timestamp, end_timestamp)`` for every predicted anomaly interval,
        or an empty list when no row is predicted as 1.

    Notes
    -----
    ``train_test_split`` shuffles rows by default, so neighbouring rows whose
    rolling windows overlap land on both sides of the split; the printed test
    metrics are therefore likely optimistic for unseen time ranges.
    """
    speed = df[speed_col]
    features = speed.to_frame(name='Speed')

    window_short = 60
    window_medium = 300
    window_long = 900

    # (window, min_periods): the short window emits a value from the first row,
    # the longer ones wait until 20% of the window is available.
    windows = [
        (window_short, 1),
        (window_medium, int(window_medium * 0.2)),
        (window_long, int(window_long * 0.2)),
    ]
    rolling = {w: speed.rolling(window=w, min_periods=mp) for w, mp in windows}

    # Rolling mean / std for each time scale (column order is kept identical to
    # the original hand-written sequence so the trained model is unchanged).
    for w, _ in windows:
        features[f'Speed_rolling_mean_{w}'] = rolling[w].mean()
        features[f'Speed_rolling_std_{w}'] = rolling[w].std().fillna(0)

    # 1. Temporal transition features
    first_diff = speed.diff(periods=1)
    features['Speed_diff_1'] = first_diff.fillna(0)
    features['Speed_diff_5'] = speed.diff(periods=5).fillna(0)  # change over 5 rows
    features['Speed_acceleration'] = first_diff.diff(periods=1).fillna(0)

    # 2. Contextual window features: rolling z-score (epsilon avoids division by
    #    zero where the rolling std is 0) and mean-median discrepancy.
    for w, _ in windows:
        features[f'Speed_rolling_zscore_{w}'] = (
            (speed - features[f'Speed_rolling_mean_{w}'])
            / (features[f'Speed_rolling_std_{w}'] + 1e-9)
        )

    for w, _ in windows:
        features[f'Speed_rolling_median_{w}'] = rolling[w].median()
        features[f'Speed_mean_median_diff_{w}'] = (
            features[f'Speed_rolling_mean_{w}'] - features[f'Speed_rolling_median_{w}']
        )

    # 3. Persistence metric: running sum of positive deviations from the rolling
    #    mean. The sum runs over the whole series and is never reset.
    for w in (window_short, window_medium):
        features[f'Speed_positive_deviation_{w}'] = np.maximum(
            0, speed - features[f'Speed_rolling_mean_{w}']
        )
        features[f'Speed_cusum_positive_{w}'] = features[f'Speed_positive_deviation_{w}'].cumsum()

    # 4. Additional rolling statistics
    for w in (window_short, window_medium):
        features[f'Speed_rolling_min_{w}'] = rolling[w].min()
        features[f'Speed_rolling_max_{w}'] = rolling[w].max()
        features[f'Speed_rolling_q25_{w}'] = rolling[w].quantile(0.25)
        features[f'Speed_rolling_q75_{w}'] = rolling[w].quantile(0.75)

    # 5. Exponentially weighted statistics. span=10 corresponds to
    #    alpha = 2 / (span + 1) ~= 0.18; the "alpha_0.1" column names are
    #    historical and kept so feature names stay stable.
    ewm = speed.ewm(span=10, adjust=False, min_periods=1)
    features['Speed_ewm_mean_alpha_0.1'] = ewm.mean()
    features['Speed_ewm_std_alpha_0.1'] = ewm.std().fillna(0)

    # 6. Lagged features (rows without history are filled with 0)
    for lag in (1, 5, 60):
        features[f'Speed_lag_{lag}'] = speed.shift(lag).fillna(0)

    # Remaining NaNs (warm-up rows of the longer windows) get the column mean.
    X = features.fillna(features.mean())
    y = df[label_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")
    print(f"Anomaly distribution in training set:\n{y_train.value_counts(normalize=True)}")
    print(f"Anomaly distribution in test set:\n{y_test.value_counts(normalize=True)}")

    # Weight the positive class by the negative/positive ratio of the train split.
    train_counts = y_train.value_counts()
    neg_count = train_counts.get(0, 0)
    pos_count = train_counts.get(1, 0)
    scale_pos_weight = neg_count / pos_count if pos_count > 0 else 1

    print("\nTraining XGBClassifier...")
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as unused and only emits a warning for it.
    model = XGBClassifier(n_estimators=100, random_state=random_state,
                          eval_metric='logloss', scale_pos_weight=scale_pos_weight)
    model.fit(X_train, y_train)
    print("Training complete.")

    # Predict on the test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    print("\n--- Model Evaluation Metrics on Test Data ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.4f}")
    try:
        print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    except ValueError:
        print("ROC AUC Score: Not applicable (only one class present in y_test or y_pred_proba).")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\n--- Feature Importances (XGBoost) ---")
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print(feature_importances.to_string(index=False))

    # Full-data prediction (covers the training rows as well).
    y_full_pred = model.predict(X)

    # Locate runs of consecutive 1-predictions. Padding with False on both sides
    # makes every run produce exactly one rising and one falling edge.
    is_anomaly = np.asarray(y_full_pred) == 1
    padded = np.concatenate(([False], is_anomaly, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    # Intervals are inclusive *row positions* (not index labels).
    intervals_idx = [(int(s), int(e) - 1) for s, e in zip(edges[::2], edges[1::2])]

    if not intervals_idx:
        print("No predicted anomaly intervals (model output == 1).")
        return []

    # Positions are resolved with .iloc so a non-default index on `df`
    # (e.g. after filtering or set_index) still maps to the right rows.
    timestamps = df[timestamp_col]
    intervals_ts = [(timestamps.iloc[s_idx], timestamps.iloc[e_idx])
                    for s_idx, e_idx in intervals_idx]

    context_window = 60  # rows of context plotted on each side of an interval
    last_row = len(df) - 1

    for i, (s_idx, e_idx) in enumerate(intervals_idx[:5]):
        start_context = max(0, s_idx - context_window)
        end_context = min(last_row, e_idx + context_window)

        # +1 because iloc slicing excludes the stop position.
        df_seg = df.iloc[start_context:end_context + 1]
        s_ts, e_ts = intervals_ts[i]

        fig, ax = plt.subplots(figsize=(12, 5))
        # Points are coloured by the ground-truth label column.
        ax.scatter(
            df_seg[timestamp_col],
            df_seg[speed_col],
            c=df_seg[label_col],
            cmap='coolwarm',
            s=15,
            edgecolors='none'
        )

        ax.axvline(x=s_ts, color='green', linestyle='--', linewidth=1.2, label='Anomaly Start')
        ax.axvline(x=e_ts, color='red', linestyle='--', linewidth=1.2, label='Anomaly End')

        ax.set_xlabel('Time')
        ax.set_ylabel('Speed')
        ax.set_title(f'Interval {i+1}: {s_ts} → {e_ts}')
        ax.legend(loc='upper right')
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()

        filename = f"anomaly_interval_{i+1}.png"
        fig.savefig(filename)
        plt.close(fig)  # release the figure; only the saved file is needed
        print(f"Saved plot: {filename}")

    return intervals_ts

In [22]:
# Driver cell: load the labelled speed data and run the anomaly-range pipeline.
file_path = 'AGC_Data.csv'

try:
    data_df = pd.read_csv(file_path, parse_dates=['indo_time'])
    intervals_ts = detect_anomaly_range_ml(df=data_df, speed_col='speed', label_col='pred_label', timestamp_col='indo_time')

    if intervals_ts:
        print(f"\nFinal Result: Found {len(intervals_ts)} anomaly ranges.")
    else:
        print("\nFinal Result: No anomaly range found.")
except Exception as e:
    # Keep the notebook running, but do not discard the traceback: the one-line
    # message alone does not show where inside the pipeline the failure occurred.
    import traceback

    print(f"An error occurred during execution: {e}")
    traceback.print_exc()

Training data shape: (70000, 36)
Testing data shape: (30000, 36)
Anomaly distribution in training set:
pred_label
0    0.956943
1    0.043057
Name: proportion, dtype: float64
Anomaly distribution in test set:
pred_label
0    0.956967
1    0.043033
Name: proportion, dtype: float64

Training XGBClassifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

--- Model Evaluation Metrics on Test Data ---
Accuracy: 0.9999
Balanced Accuracy: 0.9988
ROC AUC Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28709
           1       1.00      1.00      1.00      1291

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000


Confusion Matrix:
[[28708     1]
 [    3  1288]]

--- Feature Importances (XGBoost) ---
                     Feature  Importance
       Speed_rolling_std_300    0.576651
    Speed_ewm_mean_alpha_0.1    0.117772
        Speed_rolling_min_60    0.056618
    Speed_rolling_median_300    0.050634
       Speed_rolling_min_300    0.041285
  Speed_mean_median_diff_300    0.033804
        Speed_rolling_std_60    0.019560
        Speed_rolling_q25_60    0.015761
      Speed_rolling_mean_900    0.015355
       Speed_rollin

In [23]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt

In [None]:
def detect_anomaly_range_ml(
    df,
    speed_col='speed',
    label_col='pred_label',
    timestamp_col='indo_time',
    test_size=0.1,
    random_state=42,
    save_path_prefix='anomaly_interval'
):
    """Train/validate an XGBoost anomaly classifier and report anomaly ranges on the test split.

    Steps:
      1. Build rolling / difference / EWMA / lag features from ``df[speed_col]``.
      2. Split rows into train / validation / test with two stratified
         ``train_test_split`` calls; validation and test each receive
         ``test_size`` of the full data.
      3. Fit an ``XGBClassifier`` on the train split and print metrics for the
         validation split.
      4. Predict on the test split, order the test rows by their position in
         ``df``, merge consecutive test rows predicted as 1 into intervals and
         save a plot for up to the first five intervals as
         ``<save_path_prefix>_test_interval_<k>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; rows are taken to be in chronological order.
    speed_col : str
        Column the features are derived from.
    label_col : str
        Target column. The class weighting and the interval extraction look for
        the values 0 and 1 specifically.
    timestamp_col : str
        Column used for the returned interval bounds and the plot x-axis.
    test_size : float
        Fraction of all rows assigned to the test split (and, equally, to the
        validation split). Must satisfy ``0 < test_size < 0.5``.
    random_state : int
        Seed for both splits and the classifier.
    save_path_prefix : str
        Prefix of the plot file names.

    Returns
    -------
    list of tuple
        ``(start_timestamp, end_timestamp)`` per predicted anomaly interval in
        the test split, in chronological order; empty list if there is none.

    Raises
    ------
    ValueError
        If ``test_size`` is not strictly between 0 and 0.5.

    Notes
    -----
    The test split is a random sample of rows, so an interval spans consecutive
    *sampled* rows; rows in between that fell into train/validation are not
    inspected. ``train_test_split`` also shuffles, so rows with overlapping
    rolling windows sit in different splits and the printed metrics are likely
    optimistic for unseen time ranges.
    """
    # The validation fraction below is test_size / (1 - test_size); outside this
    # range it is <= 0 or >= 1 and the second split would fail with a less
    # helpful error.
    if not 0 < test_size < 0.5:
        raise ValueError(
            f"test_size must be a fraction strictly between 0 and 0.5, got {test_size!r}"
        )

    speed = df[speed_col]
    features = speed.to_frame(name='Speed')

    window_short = 60
    window_medium = 300
    window_long = 900

    # (window, min_periods): the short window emits a value from the first row,
    # the longer ones wait until 20% of the window is available.
    windows = [
        (window_short, 1),
        (window_medium, int(window_medium * 0.2)),
        (window_long, int(window_long * 0.2)),
    ]
    rolling = {w: speed.rolling(window=w, min_periods=mp) for w, mp in windows}

    # Rolling-window means and stds (column order matches the original
    # hand-written sequence so the trained model is unchanged).
    for w, _ in windows:
        features[f'Speed_rolling_mean_{w}'] = rolling[w].mean()
        features[f'Speed_rolling_std_{w}'] = rolling[w].std().fillna(0)

    # Temporal-transition differences
    first_diff = speed.diff(periods=1)
    features['Speed_diff_1'] = first_diff.fillna(0)
    features['Speed_diff_5'] = speed.diff(periods=5).fillna(0)
    features['Speed_acceleration'] = first_diff.diff(periods=1).fillna(0)

    # Rolling-window z-scores (epsilon avoids division by zero where std is 0)
    for w, _ in windows:
        features[f'Speed_rolling_zscore_{w}'] = (
            (speed - features[f'Speed_rolling_mean_{w}'])
            / (features[f'Speed_rolling_std_{w}'] + 1e-9)
        )

    # Rolling medians and mean-median differences
    for w, _ in windows:
        features[f'Speed_rolling_median_{w}'] = rolling[w].median()
        features[f'Speed_mean_median_diff_{w}'] = (
            features[f'Speed_rolling_mean_{w}'] - features[f'Speed_rolling_median_{w}']
        )

    # Running sum of positive deviations from the rolling mean; the sum runs
    # over the whole series and is never reset.
    for w in (window_short, window_medium):
        features[f'Speed_positive_deviation_{w}'] = np.maximum(
            0, speed - features[f'Speed_rolling_mean_{w}']
        )
        features[f'Speed_cusum_positive_{w}'] = features[f'Speed_positive_deviation_{w}'].cumsum()

    # Additional rolling statistics (min, max, quartiles)
    for w in (window_short, window_medium):
        features[f'Speed_rolling_min_{w}'] = rolling[w].min()
        features[f'Speed_rolling_max_{w}'] = rolling[w].max()
        features[f'Speed_rolling_q25_{w}'] = rolling[w].quantile(0.25)
        features[f'Speed_rolling_q75_{w}'] = rolling[w].quantile(0.75)

    # Exponentially weighted statistics. span=60 corresponds to
    # alpha = 2 / (span + 1) ~= 0.033, not 0.1; the "alpha_0.1" column names
    # are historical and kept so feature names stay stable.
    ewm = speed.ewm(span=60, adjust=False, min_periods=1)
    features['Speed_ewm_mean_alpha_0.1'] = ewm.mean()
    features['Speed_ewm_std_alpha_0.1'] = ewm.std().fillna(0)

    # Lagged features (rows without history are filled with 0)
    for lag in (1, 60, 3600):
        features[f'Speed_lag_{lag}'] = speed.shift(lag).fillna(0)

    # Remaining NaNs (warm-up rows of the longer windows) get the column mean.
    X = features.fillna(features.mean())
    y = df[label_col]

    # Row positions travel through the split alongside X and y so that test
    # predictions can be put back into chronological order afterwards without
    # relying on the index labels of `df`. Passing an extra array does not
    # change which rows are selected.
    row_positions = np.arange(len(df))

    X_train_val, X_test, y_train_val, y_test, _, test_pos = train_test_split(
        X,
        y,
        row_positions,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # Fraction of the remaining rows that yields a validation split as large as
    # the test split.
    val_fraction_within = test_size / (1 - test_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=val_fraction_within,
        stratify=y_train_val,
        random_state=random_state
    )

    print(f"Train shape: {X_train.shape}")
    print(f"Validation shape: {X_val.shape}")
    print(f"Test shape: {X_test.shape}\n")

    print("Anomaly distribution in training set:")
    print(y_train.value_counts(normalize=True))
    print("\nAnomaly distribution in validation set:")
    print(y_val.value_counts(normalize=True))
    print("\nAnomaly distribution in test set:")
    print(y_test.value_counts(normalize=True))
    print("\n")

    # Weight the positive class by the negative/positive ratio of the train split.
    train_counts = y_train.value_counts()
    neg_count = train_counts.get(0, 0)
    pos_count = train_counts.get(1, 0)
    scale_pos_weight = neg_count / pos_count if pos_count > 0 else 1

    # ======== Train XGBClassifier on Training Split ========
    print("Training XGBClassifier on TRAIN split...")
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as unused and only emits a warning for it.
    model = XGBClassifier(
        n_estimators=100,
        random_state=random_state,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight
    )
    model.fit(X_train, y_train)
    print("Training complete.\n")

    # ======== Evaluate on Validation Split ========
    print("--- MODEL EVALUATION ON VALIDATION SPLIT ---")
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]

    print(f"Accuracy (Val): {accuracy_score(y_val, y_val_pred):.4f}")
    print(f"Balanced Accuracy (Val): {balanced_accuracy_score(y_val, y_val_pred):.4f}")
    try:
        print(f"ROC AUC (Val): {roc_auc_score(y_val, y_val_proba):.4f}")
    except ValueError:
        print("ROC AUC (Val): N/A (only one class present).")

    print("\nClassification Report (Val):")
    print(classification_report(y_val, y_val_pred))
    print("Confusion Matrix (Val):")
    print(confusion_matrix(y_val, y_val_pred))
    print("\n")

    # Show feature importances
    print("--- FEATURE IMPORTANCES (XGBoost) ---")
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print(feature_importances.to_string(index=False))
    print("\n")

    # ======== Predict & Plot on Test Split ========
    print("Predicting on TEST split and plotting anomaly intervals...")
    y_test_pred = model.predict(X_test)

    # train_test_split returns the test rows in shuffled order. Grouping
    # "consecutive" predictions in that order would pair unrelated rows (and can
    # yield start > end), so restore chronological order first.
    test_pos = np.asarray(test_pos)
    chronological = np.argsort(test_pos)
    sorted_pos = test_pos[chronological]
    is_anomaly = np.asarray(y_test_pred)[chronological] == 1

    # Padding with False on both sides makes every run of 1-predictions produce
    # exactly one rising and one falling edge.
    padded = np.concatenate(([False], is_anomaly, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    # Inclusive (start, end) row positions in `df` for each run.
    intervals_idx = [
        (int(sorted_pos[run_start]), int(sorted_pos[run_stop - 1]))
        for run_start, run_stop in zip(edges[::2], edges[1::2])
    ]

    if not intervals_idx:
        print("No predicted anomaly intervals in TEST split.")
        return []

    # Positions are resolved with .iloc so a non-default index on `df` still
    # maps to the right rows.
    timestamps = df[timestamp_col]
    intervals_ts = [(timestamps.iloc[s_idx], timestamps.iloc[e_idx])
                    for s_idx, e_idx in intervals_idx]

    context_window = 60  # rows of context plotted on each side of an interval
    last_row = len(df) - 1

    for i, (s_idx, e_idx) in enumerate(intervals_idx[:5]):
        start_context = max(0, s_idx - context_window)
        end_context = min(last_row, e_idx + context_window)

        # +1 because iloc slicing excludes the stop position.
        df_seg = df.iloc[start_context:end_context + 1]
        s_ts, e_ts = intervals_ts[i]

        fig, ax = plt.subplots(figsize=(12, 5))
        # Points are coloured by the ground-truth label column.
        ax.scatter(
            df_seg[timestamp_col],
            df_seg[speed_col],
            c=df_seg[label_col],
            cmap='coolwarm',
            s=15,
            edgecolors='none'
        )

        ax.axvline(x=s_ts, color='green', linestyle='--', linewidth=1.2, label='Anomaly Start')
        ax.axvline(x=e_ts, color='red', linestyle='--', linewidth=1.2, label='Anomaly End')

        ax.set_xlabel('Time')
        ax.set_ylabel('Speed')
        ax.set_title(f'Test Interval {i+1}: {s_ts} → {e_ts}')
        ax.legend(loc='upper right')
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()

        filename = f"{save_path_prefix}_test_interval_{i+1}.png"
        fig.savefig(filename)
        plt.close(fig)  # release the figure; only the saved file is needed
        print(f"Saved plot: {filename}")

    return intervals_ts

In [32]:
# Driver cell: load the labelled speed data and run the train/val/test pipeline.
if __name__ == "__main__":
    file_path = 'AGC_Data.csv'
    try:
        data_df = pd.read_csv(file_path, parse_dates=['indo_time'])
        intervals_ts = detect_anomaly_range_ml(
            df=data_df,
            speed_col='speed',
            label_col='pred_label',
            timestamp_col='indo_time',
            random_state=42,
            save_path_prefix='anomaly_interval'
        )

        if intervals_ts:
            print(f"\nFinal Result: Found {len(intervals_ts)} anomaly ranges on TEST split.")
        else:
            print("\nFinal Result: No anomaly ranges found on TEST split.")
    except Exception as e:
        # Keep the notebook running, but do not discard the traceback: the
        # one-line message alone does not show where the pipeline failed.
        import traceback

        print(f"An error occurred during execution: {e}")
        traceback.print_exc()

Train shape: (80000, 36)
Validation shape: (10000, 36)
Test shape: (10000, 36)

Anomaly distribution in training set:
pred_label
0    0.95695
1    0.04305
Name: proportion, dtype: float64

Anomaly distribution in validation set:
pred_label
0    0.9569
1    0.0431
Name: proportion, dtype: float64

Anomaly distribution in test set:
pred_label
0    0.957
1    0.043
Name: proportion, dtype: float64


Training XGBClassifier on TRAIN split...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

--- MODEL EVALUATION ON VALIDATION SPLIT ---
Accuracy (Val): 0.9998
Balanced Accuracy (Val): 0.9977
ROC AUC (Val): 1.0000

Classification Report (Val):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9569
           1       1.00      1.00      1.00       431

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix (Val):
[[9569    0]
 [   2  429]]


--- FEATURE IMPORTANCES (XGBoost) ---
                     Feature  Importance
       Speed_rolling_std_300    0.474951
        Speed_rolling_min_60    0.098253
    Speed_rolling_median_300    0.076284
    Speed_rolling_zscore_900    0.051370
    Speed_ewm_mean_alpha_0.1    0.050463
  Speed_mean_median_diff_300    0.041971
       Speed_rolling_min_300    0.037774
       Speed_rolling_mean_60    0.027805
        Speed_rolling_q25_60    0.022048


In [33]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt

def detect_anomaly_range_ml(
    df,
    speed_col='speed',
    label_col='pred_label',
    timestamp_col='indo_time',
    random_state=42,
    save_path_prefix='anomaly_interval'
):
    """
    1) Builds rolling / lagged features on `speed_col`.
    2) Splits data into 60% train / 20% validation / 20% test via two train_test_split calls.
    3) Trains XGBClassifier on the 60% TRAIN split.
    4) Evaluates on the 20% VALIDATION split.
    5) Predicts on the 20% TEST split, then sorts by original index so that
       contiguous original‐index anomalies are grouped correctly. Finally,
       plots up to 5 anomaly intervals (with a ±60‐sample context) from the test set.
    """

    # ——————————————————————————————————————————————————————————————
    # 1) BUILD FEATURE‐ENGINEERING DATAFRAME
    # ——————————————————————————————————————————————————————————————
    features = pd.DataFrame(df[speed_col]).copy()
    features.columns = ['Speed']

    window_short = 60
    window_medium = 300
    window_long = 900

    # Rolling‐window means + stds
    features[f'Speed_rolling_mean_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .mean()
    )
    features[f'Speed_rolling_std_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .std()
        .fillna(0)
    )

    features[f'Speed_rolling_mean_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .mean()
    )
    features[f'Speed_rolling_std_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .std()
        .fillna(0)
    )

    features[f'Speed_rolling_mean_{window_long}'] = (
        df[speed_col]
        .rolling(window=window_long, min_periods=int(window_long * 0.2))
        .mean()
    )
    features[f'Speed_rolling_std_{window_long}'] = (
        df[speed_col]
        .rolling(window=window_long, min_periods=int(window_long * 0.2))
        .std()
        .fillna(0)
    )

    # 1a) Temporal‐transition features
    features['Speed_diff_1'] = df[speed_col].diff(periods=1).fillna(0)
    features['Speed_diff_5'] = df[speed_col].diff(periods=5).fillna(0)
    features['Speed_acceleration'] = df[speed_col].diff(periods=1).diff(periods=1).fillna(0)

    # 1b) Rolling‐zscore
    features[f'Speed_rolling_zscore_{window_short}'] = (
        (df[speed_col] - features[f'Speed_rolling_mean_{window_short}']) /
        (features[f'Speed_rolling_std_{window_short}'] + 1e-9)
    )
    features[f'Speed_rolling_zscore_{window_medium}'] = (
        (df[speed_col] - features[f'Speed_rolling_mean_{window_medium}']) /
        (features[f'Speed_rolling_std_{window_medium}'] + 1e-9)
    )
    features[f'Speed_rolling_zscore_{window_long}'] = (
        (df[speed_col] - features[f'Speed_rolling_mean_{window_long}']) /
        (features[f'Speed_rolling_std_{window_long}'] + 1e-9)
    )

    # 1c) Rolling‐median + mean‐median difference
    features[f'Speed_rolling_median_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .median()
    )
    features[f'Speed_mean_median_diff_{window_short}'] = (
        features[f'Speed_rolling_mean_{window_short}'] -
        features[f'Speed_rolling_median_{window_short}']
    )

    features[f'Speed_rolling_median_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .median()
    )
    features[f'Speed_mean_median_diff_{window_medium}'] = (
        features[f'Speed_rolling_mean_{window_medium}'] -
        features[f'Speed_rolling_median_{window_medium}']
    )

    features[f'Speed_rolling_median_{window_long}'] = (
        df[speed_col]
        .rolling(window=window_long, min_periods=int(window_long * 0.2))
        .median()
    )
    features[f'Speed_mean_median_diff_{window_long}'] = (
        features[f'Speed_rolling_mean_{window_long}'] -
        features[f'Speed_rolling_median_{window_long}']
    )

    # 1d) CUSUM of positive deviations
    features[f'Speed_positive_deviation_{window_short}'] = np.maximum(
        0, df[speed_col] - features[f'Speed_rolling_mean_{window_short}']
    )
    features[f'Speed_cusum_positive_{window_short}'] = (
        features[f'Speed_positive_deviation_{window_short}'].cumsum()
    )

    features[f'Speed_positive_deviation_{window_medium}'] = np.maximum(
        0, df[speed_col] - features[f'Speed_rolling_mean_{window_medium}']
    )
    features[f'Speed_cusum_positive_{window_medium}'] = (
        features[f'Speed_positive_deviation_{window_medium}'].cumsum()
    )

    # 1e) Additional rolling stats (min, max, quartiles)
    features[f'Speed_rolling_min_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .min()
    )
    features[f'Speed_rolling_max_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .max()
    )
    features[f'Speed_rolling_q25_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .quantile(0.25)
    )
    features[f'Speed_rolling_q75_{window_short}'] = (
        df[speed_col]
        .rolling(window=window_short, min_periods=1)
        .quantile(0.75)
    )

    features[f'Speed_rolling_min_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .min()
    )
    features[f'Speed_rolling_max_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .max()
    )
    features[f'Speed_rolling_q25_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .quantile(0.25)
    )
    features[f'Speed_rolling_q75_{window_medium}'] = (
        df[speed_col]
        .rolling(window=window_medium, min_periods=int(window_medium * 0.2))
        .quantile(0.75)
    )

    # 1f) EWMA (alpha=0.1)
    features['Speed_ewm_mean_alpha_0.1'] = (
        df[speed_col]
        .ewm(span=10, adjust=False, min_periods=1)
        .mean()
    )
    features['Speed_ewm_std_alpha_0.1'] = (
        df[speed_col]
        .ewm(span=10, adjust=False, min_periods=1)
        .std()
        .fillna(0)
    )

    # 1g) Lagged features
    features['Speed_lag_1'] = df[speed_col].shift(1).fillna(0)
    features['Speed_lag_5'] = df[speed_col].shift(5).fillna(0)
    features['Speed_lag_60'] = df[speed_col].shift(60).fillna(0)

    # Final X, y
    X = features.fillna(features.mean())
    y = df[label_col]

    # ——————————————————————————————————————————————————————————————
    # 2) 60% / 20% / 20% SPLIT VIA train_test_split
    # ——————————————————————————————————————————————————————————————
    #
    # Step A: Split off 20% as TEST
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X,
        y,
        test_size=0.20,             # 20% of entire dataset → TEST
        stratify=y,
        random_state=random_state
    )

    # Step B: From the remaining 80% (X_train_val), split off 25% of that
    # → which is 0.25 * 0.80 = 0.20 of the entire dataset → VALIDATION.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=0.25,             # 25% of TRAIN_VAL = 20% of total → VALIDATION
        stratify=y_train_val,
        random_state=random_state
    )

    print(f"Shapes →  Train: {X_train.shape},  Val: {X_val.shape},  Test: {X_test.shape}\n")
    print("Anomaly distribution in TRAIN:",  y_train.value_counts(normalize=True).to_dict())
    print("Anomaly distribution in VALID:",  y_val.value_counts(normalize=True).to_dict())
    print("Anomaly distribution in TEST: ",  y_test.value_counts(normalize=True).to_dict())
    print("\n")

    # Compute scale_pos_weight (for imbalance) on TRAIN split
    neg_count = y_train.value_counts().get(0, 0)
    pos_count = y_train.value_counts().get(1, 0)
    scale_pos_weight = (neg_count / pos_count) if (pos_count > 0) else 1

    # ——————————————————————————————————————————————————————————————
    # 3) TRAIN XGBClassifier ON TRAIN SPLIT
    # ——————————————————————————————————————————————————————————————
    print("Training XGBClassifier on TRAIN split...")
    model = XGBClassifier(
        n_estimators=100,
        random_state=random_state,
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight
    )
    model.fit(X_train, y_train)
    print("Training complete.\n")

    # ——————————————————————————————————————————————————————————————
    # 4) EVALUATE ON VALIDATION SPLIT
    # ——————————————————————————————————————————————————————————————
    print("--- MODEL EVALUATION ON VALIDATION SPLIT ---")
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]

    print(f"Accuracy (Val): {accuracy_score(y_val, y_val_pred):.4f}")
    print(f"Balanced Accuracy (Val): {balanced_accuracy_score(y_val, y_val_pred):.4f}")
    try:
        print(f"ROC AUC (Val): {roc_auc_score(y_val, y_val_proba):.4f}")
    except ValueError:
        print("ROC AUC (Val): N/A (only one class present).")

    print("\nClassification Report (Val):")
    print(classification_report(y_val, y_val_pred))
    print("Confusion Matrix (Val):")
    print(confusion_matrix(y_val, y_val_pred))
    print("\n")

    # Show feature importances
    print("--- FEATURE IMPORTANCES (XGBoost) ---")
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print(feature_importances.to_string(index=False))
    print("\n")

    # ——————————————————————————————————————————————————————————————
    # 5) PREDICT & PLOT ON TEST SPLIT
    # ——————————————————————————————————————————————————————————————
    print("Predicting on TEST split and plotting anomaly intervals…")

    # When we called train_test_split, X_test.index still refers to the original df indices.
    # To get correct “contiguous in time” grouping, we must sort by that original index.
    y_test_pred = model.predict(X_test)
    test_idx = X_test.index.values

    # Build a small DataFrame of (orig_idx, pred_label), then sort by orig_idx:
    test_df = pd.DataFrame({
        'orig_idx': test_idx,
        'pred': y_test_pred
    })
    test_df = test_df.sort_values(by='orig_idx').reset_index(drop=True)

    # Now scan for runs where pred == 1
    intervals_idx = []
    in_anomaly = False
    start_idx = None

    for row in test_df.itertuples(index=False):
        orig_i = row.orig_idx
        label = row.pred

        if (label == 1) and (not in_anomaly):
            # start a new anomaly run
            in_anomaly = True
            start_idx = orig_i

        elif (label == 0) and in_anomaly:
            # close the anomaly run at the previous index
            intervals_idx.append((start_idx, prev_i))
            in_anomaly = False

        prev_i = orig_i

    # If we ended while still in an anomaly run, close it now:
    if in_anomaly:
        intervals_idx.append((start_idx, prev_i))

    if not intervals_idx:
        print("No predicted anomaly intervals in TEST split.")
        return []

    # Convert each (start_idx, end_idx) to timestamps
    intervals_ts = []
    for (s_idx, e_idx) in intervals_idx:
        s_ts = df.loc[s_idx, timestamp_col]
        e_ts = df.loc[e_idx, timestamp_col]
        intervals_ts.append((s_ts, e_ts))

    # Plot up to FIRST 5 anomaly intervals (with ±60‐sample context)
    context_window = 60
    for i, (s_idx, e_idx) in enumerate(intervals_idx[:5]):
        start_context = max(0, s_idx - context_window)
        end_context   = min(len(df) - 1, e_idx + context_window)

        df_seg = df.loc[start_context : end_context].copy()

        plt.figure(figsize=(12, 5))
        sc = plt.scatter(
            df_seg[timestamp_col],
            df_seg[speed_col],
            c=df_seg[label_col],
            cmap='coolwarm',
            s=15,
            edgecolors='none'
        )

        s_ts = df.loc[s_idx, timestamp_col]
        e_ts = df.loc[e_idx, timestamp_col]

        plt.axvline(x=s_ts, color='green', linestyle='--', linewidth=1.2, label='Anomaly Start')
        plt.axvline(x=e_ts, color='red', linestyle='--', linewidth=1.2, label='Anomaly End')

        plt.xlabel('Time')
        plt.ylabel('Speed')
        plt.title(f'Test Interval {i+1}: {s_ts} → {e_ts}')
        plt.legend(loc='upper right')
        plt.xticks(rotation=45)
        plt.tight_layout()

        filename = f"{save_path_prefix}_test_interval_{i+1}.png"
        plt.savefig(filename)
        plt.close()
        print(f"Saved plot: {filename}")

    return intervals_ts


# ——————————————————————————————————————————————————————————————
# USAGE EXAMPLE (if run as a script)
# ——————————————————————————————————————————————————————————————
if __name__ == "__main__":
    # Usage example: load the CSV, run the detector, and summarise how many
    # anomaly intervals were predicted on the TEST split.

    # Local import: only this entry point needs it for error reporting.
    import traceback

    file_path = 'AGC_Data.csv'
    # The same column has to be parsed as datetimes on load AND handed to the
    # detector as its timestamp column; naming it once keeps the two in sync.
    timestamp_col = 'indo_time'

    try:
        data_df = pd.read_csv(file_path, parse_dates=[timestamp_col])
        intervals_ts = detect_anomaly_range_ml(
            df=data_df,
            speed_col='speed',
            label_col='pred_label',
            timestamp_col=timestamp_col,
            random_state=42,
            save_path_prefix='anomaly_interval'
        )

        # The detector returns a list of (start, end) timestamp pairs; an empty
        # list means no anomaly run was predicted on the TEST split.
        if intervals_ts:
            print(f"\nFinal Result: Found {len(intervals_ts)} anomaly ranges on TEST split.")
        else:
            print("\nFinal Result: No anomaly ranges found on TEST split.")
    except Exception as e:
        # Still catches everything so the cell finishes without raising, but the
        # exception type and the full traceback are now reported: str(e) alone
        # is often unreadable (a KeyError prints only the missing key).
        print(f"An error occurred during execution: {type(e).__name__}: {e}")
        traceback.print_exc()


Shapes →  Train: (60000, 36),  Val: (20000, 36),  Test: (20000, 36)

Anomaly distribution in TRAIN: {0: 0.95695, 1: 0.04305}
Anomaly distribution in VALID: {0: 0.95695, 1: 0.04305}
Anomaly distribution in TEST:  {0: 0.95695, 1: 0.04305}


Training XGBClassifier on TRAIN split...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

--- MODEL EVALUATION ON VALIDATION SPLIT ---
Accuracy (Val): 0.9997
Balanced Accuracy (Val): 0.9982
ROC AUC (Val): 1.0000

Classification Report (Val):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19139
           1       1.00      1.00      1.00       861

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix (Val):
[[19136     3]
 [    3   858]]


--- FEATURE IMPORTANCES (XGBoost) ---
                     Feature  Importance
       Speed_rolling_std_300    0.519172
    Speed_ewm_mean_alpha_0.1    0.171661
    Speed_rolling_median_300    0.070850
  Speed_mean_median_diff_300    0.039620
        Speed_rolling_min_60    0.033039
       Speed_rolling_min_300    0.028034
        Speed_rolling_std_60    0.020926
        Speed_rolling_q75_60    0.017053
      Speed_rolling_mean_300    0.016

In [34]:
if __name__ == "__main__":
    # Usage example: load the CSV, run the detector, and summarise how many
    # anomaly intervals were predicted on the TEST split.

    # Local import: only this entry point needs it for error reporting.
    import traceback

    file_path = 'AGC_Data.csv'
    # The same column has to be parsed as datetimes on load AND handed to the
    # detector as its timestamp column; naming it once keeps the two in sync.
    timestamp_col = 'indo_time'

    try:
        data_df = pd.read_csv(file_path, parse_dates=[timestamp_col])
        intervals_ts = detect_anomaly_range_ml(
            df=data_df,
            speed_col='speed',
            label_col='pred_label',
            timestamp_col=timestamp_col,
            random_state=42,
            save_path_prefix='anomaly_interval'
        )

        # The detector returns a list of (start, end) timestamp pairs; an empty
        # list means no anomaly run was predicted on the TEST split.
        if intervals_ts:
            print(f"\nFinal Result: Found {len(intervals_ts)} anomaly ranges on TEST split.")
        else:
            print("\nFinal Result: No anomaly ranges found on TEST split.")
    except Exception as e:
        # Still catches everything so the cell finishes without raising, but the
        # exception type and the full traceback are now reported: str(e) alone
        # is often unreadable (a KeyError prints only the missing key).
        print(f"An error occurred during execution: {type(e).__name__}: {e}")
        traceback.print_exc()

Shapes →  Train: (60000, 36),  Val: (20000, 36),  Test: (20000, 36)

Anomaly distribution in TRAIN: {0: 0.95695, 1: 0.04305}
Anomaly distribution in VALID: {0: 0.95695, 1: 0.04305}
Anomaly distribution in TEST:  {0: 0.95695, 1: 0.04305}


Training XGBClassifier on TRAIN split...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

--- MODEL EVALUATION ON VALIDATION SPLIT ---
Accuracy (Val): 0.9997
Balanced Accuracy (Val): 0.9982
ROC AUC (Val): 1.0000

Classification Report (Val):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19139
           1       1.00      1.00      1.00       861

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix (Val):
[[19136     3]
 [    3   858]]


--- FEATURE IMPORTANCES (XGBoost) ---
                     Feature  Importance
       Speed_rolling_std_300    0.519172
    Speed_ewm_mean_alpha_0.1    0.171661
    Speed_rolling_median_300    0.070850
  Speed_mean_median_diff_300    0.039620
        Speed_rolling_min_60    0.033039
       Speed_rolling_min_300    0.028034
        Speed_rolling_std_60    0.020926
        Speed_rolling_q75_60    0.017053
      Speed_rolling_mean_300    0.016