### This is the main model that we created

In [1]:
import pandas as pd
import numpy as np
from interpret.glassbox import ExplainableBoostingRegressor
import shap
import joblib
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [None]:

# 1. Model Creation Function
def create_ebm_model(feature_cols, monotone_features=None):
    """
    Create Explainable Boosting Machine regressor with monotonicity constraints.

    :param feature_cols: feature column names, in the same order as the
        columns the model will later be fitted on
    :param monotone_features: dict {feature_name: +1/-1/0}, e.g. {'speed_mean': 1};
        names that do not appear in feature_cols are ignored
    :return: unfitted ebm model instance
    """
    feature_cols = list(feature_cols)

    # Keep only the constraints that refer to a known feature and are non-zero.
    requested = {}
    if monotone_features is not None:
        requested = {
            feat: int(mono)
            for feat, mono in monotone_features.items()
            if feat in feature_cols and mono
        }

    # The regressor expects one entry per feature (0 = unconstrained), so the
    # sparse {name: direction} mapping is expanded to a positional list.
    # None is passed when nothing is constrained, i.e. the library default.
    monotone_constraints = None
    if requested:
        monotone_constraints = [requested.get(col, 0) for col in feature_cols]

    # Previously the constraints were computed and then dropped; they are now
    # actually handed to the model.
    ebm = ExplainableBoostingRegressor(monotone_constraints=monotone_constraints)
    return ebm


# 2. Data Loading Function
def load_data(path):
    """
    Read a CSV file into a DataFrame.

    :param path: location of the CSV file, passed straight to pandas
    :return: DataFrame with the file contents
    """
    return pd.read_csv(path)


# 3. Training Function Per Cluster (EBM residual-based anomalies)
def train_cluster_models(df, feature_cols, target_col, monotone_features=None):
    """
    Fit one EBM regressor per cluster and score rows by standardized residual.

    For every value in df['cluster'] with at least 20 rows, a model is fitted
    on that cluster's rows, saved under 'models/', and the standardized
    absolute residuals are written to df['anomaly_score_ebm']. A SHAP summary
    plot (PNG) and a force plot of the cluster's first row (HTML) are saved
    under 'explanations/'.

    The frame is modified in place: 'anomaly_score_ebm' is reset to NaN and
    'alert_ebm' to False before training. Rows of skipped clusters keep NaN.

    :param df: DataFrame containing a 'cluster' column, the features and the target
    :param feature_cols: names of the feature columns; the target column is
        removed from this list if present
    :param target_col: name of the column to regress on
    :param monotone_features: dict {feature_name: +1/-1/0} forwarded to
        create_ebm_model
    :return: the same DataFrame with the score and alert columns set
    """
    os.makedirs('models', exist_ok=True)
    os.makedirs('explanations', exist_ok=True)

    # A model that sees the target among its inputs can simply copy it, which
    # makes the residual-based anomaly score meaningless.
    if target_col in feature_cols:
        print(f"Removing target column '{target_col}' from the feature list")
    feature_cols = [col for col in feature_cols if col != target_col]

    df['anomaly_score_ebm'] = np.nan
    df['alert_ebm'] = False

    for cluster_id in df['cluster'].unique():
        in_cluster = df['cluster'] == cluster_id
        df_cluster = df[in_cluster].copy()

        # Skip very small clusters
        if len(df_cluster) < 20:
            print(f"Skipping cluster {cluster_id} (too few samples)")
            continue

        print(f"Training EBM model for cluster {cluster_id}")

        X = df_cluster[feature_cols]
        y = df_cluster[target_col]

        model = create_ebm_model(feature_cols, monotone_features)
        model.fit(X, y)

        # Save model
        model_path = f'models/ebm_regressor_cluster_{cluster_id}.joblib'
        joblib.dump(model, model_path)

        # Predict and residuals (standardized)
        y_pred = model.predict(X)
        residuals = np.abs(y - y_pred)
        spread = residuals.std()
        if spread > 0:
            residuals = (residuals - residuals.mean()) / spread
        else:
            # Zero (or undefined) spread: every row is equally well explained,
            # so avoid dividing by zero and score them all as 0.
            residuals = pd.Series(0.0, index=residuals.index)
        df.loc[in_cluster, 'anomaly_score_ebm'] = residuals

        # SHAP explainability
        explainer = shap.Explainer(model.predict, X)
        shap_values = explainer(X)

        # Save summary plot
        shap.summary_plot(shap_values, X, show=False)
        plt.savefig(f'explanations/shap_summary_cluster_{cluster_id}.png')
        plt.close()

        # Save force plot of first row
        force_plot = shap.force_plot(
            shap_values.base_values[0],
            shap_values.values[0],
            X.iloc[0],
            matplotlib=False
        )
        # Explicit UTF-8 so the write does not depend on the platform's
        # default encoding.
        force_path = f'explanations/shap_force_cluster_{cluster_id}.html'
        with open(force_path, "w", encoding="utf-8") as f:
            f.write(shap.getjs() + force_plot.html())

    return df


# 4. Adaptive Threshold & Alerting for EBM
def generate_alerts_ebm(df, percentile=95, min_scores=10):
    """
    Flag the highest EBM anomaly scores of each cluster as alerts.

    For every value in df['cluster'], the given percentile of that cluster's
    non-NaN 'anomaly_score_ebm' values is used as threshold, and rows whose
    score is strictly above it get 'alert_ebm' set to True. Clusters with
    fewer than min_scores non-NaN scores are left untouched. The frame is
    modified in place; existing True values in 'alert_ebm' are not cleared.

    :param df: DataFrame with 'cluster' and 'anomaly_score_ebm' columns
    :param percentile: per-cluster percentile (0-100) used as alert threshold
    :param min_scores: minimum number of non-NaN scores a cluster needs
    :return: the same DataFrame with 'alert_ebm' updated
    """
    # Without this, assigning True below would create the column with NaN
    # (not False) for every row that is not an alert.
    if 'alert_ebm' not in df.columns:
        df['alert_ebm'] = False

    for cluster_id in df['cluster'].unique():
        in_cluster = df['cluster'] == cluster_id
        cluster_scores = df.loc[in_cluster, 'anomaly_score_ebm'].dropna()
        if len(cluster_scores) < min_scores:
            continue
        threshold = np.percentile(cluster_scores, percentile)
        above = df['anomaly_score_ebm'] > threshold
        df.loc[in_cluster & above, 'alert_ebm'] = True
    return df


# 5. Isolation Forest Unsupervised Detector
def run_isolation_forest(df, feature_cols):
    """
    Score every row with an Isolation Forest fitted on the scaled features.

    The feature columns are standardized, a forest with contamination 0.05
    and a fixed random_state is fitted on them, and two columns are written
    to the frame in place: 'anomaly_score_iforest' (negated decision
    function, so larger means more anomalous) and 'alert_iforest' (True
    where the forest predicts -1).

    :param df: DataFrame holding the feature columns
    :param feature_cols: names of the columns fed to the detector
    :return: the same DataFrame with the two columns added
    """
    print("\nRunning Isolation Forest (unsupervised)...")

    standardized = StandardScaler().fit_transform(df[feature_cols])

    forest = IsolationForest(contamination=0.05, random_state=42)
    # fit_predict labels outliers as -1 and inliers as 1
    labels = forest.fit_predict(standardized)
    # decision_function is lower for outliers; flip the sign so that a
    # higher score means a more anomalous row
    outlier_scores = -forest.decision_function(standardized)

    df['anomaly_score_iforest'] = outlier_scores
    df['alert_iforest'] = labels == -1
    return df


# 6. Main Execution Flow
# Script entry point: load the clustered data, train per-cluster EBM models,
# raise EBM alerts, run the Isolation Forest and write the combined result.
if __name__ == '__main__':
    df = load_data('CSV_files/final_clustered_data.csv')

    target_col = 'kpl_mean'  # example target — change if not in dataset
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in the dataset")

    # Identifier, grouping and previously generated output columns are never
    # used as model inputs (adjust as needed).
    non_feature_cols = {'deviceID', 'tripID', 'cluster', 'label',
                        'anomaly_score_ebm', 'alert_ebm',
                        'anomaly_score_iforest', 'alert_iforest'}

    # The unsupervised detector looks at every measurement column,
    # including the target.
    iforest_cols = [col for col in df.columns if col not in non_feature_cols]

    # The regressor must not receive its own target as an input; otherwise
    # it can reproduce the target and the residuals carry no information.
    feature_cols = [col for col in iforest_cols if col != target_col]

    # Specify monotonic features (domain knowledge example)
    monotone_features = {'speed_mean': 1, 'rpm_mean': 1}

    # 1. Train EBM residual-based models
    df = train_cluster_models(df, feature_cols, target_col, monotone_features)

    # 2. Generate alerts (EBM)
    df = generate_alerts_ebm(df)

    # 3. Run Isolation Forest (unsupervised)
    df = run_isolation_forest(df, iforest_cols)

    # Save final data with anomaly scores and alerts
    df.to_csv('CSV_files/Final_Anomalies_Clustered_data.csv', index=False)
    print("\nCompleted EBM training, Isolation Forest, scoring, explanation, and alerting.")


Training EBM model for cluster 1


  warn(
PermutationExplainer explainer: 4170it [03:22, 20.16it/s]                          
  shap.summary_plot(shap_values, X, show=False)


Training EBM model for cluster 2


  warn(
PermutationExplainer explainer: 622it [00:14, 13.88it/s]                         
  shap.summary_plot(shap_values, X, show=False)



Running Isolation Forest (unsupervised)...

Completed EBM training, Isolation Forest, scoring, explanation, and alerting.
