In [None]:
import pandas as pd
import numpy as np

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest

# Model and evaluation
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Replace this with the path or method to load your actual dataset
df = pd.read_csv("../Dataset/preprocessed_trendedpointalarm.csv")

# Quick look at the data
print("Initial DataFrame shape:", df.shape)
df.head()


Initial DataFrame shape: (102319, 15)


Unnamed: 0,DateTime,ProcessID,AssetID,AlarmSeverityName,State,TransactionMessage,Stage,AlarmClassName,Year,Month,Day,DayOfWeek,Season,Hour,ProcessedMessage
0,2/1/2018 22:45,IBMS/201801024100783,1-JK1-JK1-01-E.02-AC-ACON-VAVU-0047,3 - Low,A2N,VAV-J09-01-029 SPACE TEMP ALARM,Cancelled,General-ELV,2018,1,2,Tuesday,Winter,22,vavj0901029 space temp alarm
1,3/1/2018 1:41,IBMS/201801024101029,1-JK1-JK1-00-C.27-AC-ACON-VAVU-0019,3 - Low,A2N,VAV-J09-00-027 SPACE TEMP ALARM,Cancelled,General-ELV,2018,1,3,Wednesday,Winter,1,vavj0900027 space temp alarm
2,3/1/2018 3:08,IBMS/201801034101175,1-JK1-JK1-00-C.27-AC-ACON-VAVU-0019,3 - Low,A2N,VAV-J09-00-027 SPACE TEMP ALARM,Cancelled,General-ELV,2018,1,3,Wednesday,Winter,3,vavj0900027 space temp alarm
3,3/1/2018 7:12,IBMS/201801034101667,1-JK1-JK1-00-D.01-AC-ACON-VAVU-0021,3 - Low,A2N,VAV-J09-00-029 SPACE TEMP ALARM,Cancelled,General-ELV,2018,1,3,Wednesday,Winter,7,vavj0900029 space temp alarm
4,3/1/2018 9:04,IBMS/201801034102043,0-JK1-JK1-B2-1.01-AC-ACON-MAHU-0003,2 - Medium,A2N,MAU-JK1-B1-001 Dis Air Temp,Cancelled,General-ELV,2018,1,3,Wednesday,Winter,9,maujk1b1001 dis air temp


In [3]:
# 3.1 Convert DateTime to datetime type
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')

# 3.2 Drop unnecessary columns (example columns to drop—modify as needed)
cols_to_drop = ['ProcessID', 'TransactionMessage', 'ProcessedMessage']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# 3.3 Check for missing values
print("Missing values before imputation:")
print(df.isnull().sum())


Missing values before imputation:
DateTime             63331
AssetID                271
AlarmSeverityName        0
State                    0
Stage                 2984
AlarmClassName           0
Year                     0
Month                    0
Day                      0
DayOfWeek                0
Season                   0
Hour                     0
dtype: int64


In [4]:
# 4.1 Target variable
target_col = 'AlarmSeverityName'

# 4.2 Features
feature_cols = [col for col in df.columns if col not in [target_col, 'DateTime']]

X = df[feature_cols]
y = df[target_col]

print("Features:", feature_cols)
print("Target:", target_col)


Features: ['AssetID', 'State', 'Stage', 'AlarmClassName', 'Year', 'Month', 'Day', 'DayOfWeek', 'Season', 'Hour']
Target: AlarmSeverityName


In [5]:
# 5.1 Detect categorical columns (object or category dtype)
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# Convert them to category if not already
for col in cat_cols:
    X[col] = X[col].astype('category')

# Label-encode the categorical columns to ensure your final dataset X has only numeric columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# 5.2 Remove rare classes(for example if "Low" repeats only once that would be a rare class) in the target (any class with <=1 occurrence, for example)
y_counts = y.value_counts()
rare_classes = y_counts[y_counts <= 1].index
df = df[~df[target_col].isin(rare_classes)]

# Update X and y after removing rare-class rows
X = df[feature_cols]
y = df[target_col]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [6]:
# Reapply Label Encoding to categorical columns in X
cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    # Optionally, store the encoder for later use:
    label_encoders[col] = le

# Label-encode the target variable as well
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y.astype(str))

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# Proceed to split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Preserves class distribution
)

# Convert X_train and X_test to DataFrames with proper column names
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=feature_cols)
if not isinstance(X_test, pd.DataFrame):
    X_test = pd.DataFrame(X_test, columns=feature_cols)

# Optionally, convert all columns to numeric (they should already be numeric now)
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.


In [7]:
# Ensure X_train is a DataFrame (if it isn’t already)
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=feature_cols)  # assign appropriate column names

# Identify categorical columns (those of object type)
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply Label Encoding to each categorical column
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le

# Do the same for your test set if needed:
if isinstance(X_test, np.ndarray):
    X_test = pd.DataFrame(X_test, columns=X_train.columns)
else:
    for col in categorical_cols:
        X_test[col] = label_encoders[col].transform(X_test[col].astype(str))

# Now, try SMOTE again by generating synthetic samples for the minority classes
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

print("Class distribution before SMOTE:", np.bincount(y_train))
print("Class distribution after SMOTE:", np.bincount(y_train_sm))

Class distribution before SMOTE: [51233 13449 16928    83   125    36]
Class distribution after SMOTE: [51233 51233 51233 51233 51233 51233]


In [8]:
from sklearn.ensemble import IsolationForest

# Apply IsolationForest on the SMOTE-resampled training data
iso_forest = IsolationForest(contamination=0.01, random_state=42)
y_train_outliers = iso_forest.fit_predict(X_train_sm)  # X_train_sm from SMOTE

# Keep only inliers (labels == 1)
inlier_mask = (y_train_outliers == 1)
X_train_clean = X_train_sm[inlier_mask]
y_train_clean = y_train_sm[inlier_mask]

print("Training set size before outlier removal:", X_train_sm.shape[0])
print("Training set size after outlier removal:", X_train_clean.shape[0])


Training set size before outlier removal: 307398
Training set size after outlier removal: 304329


In [9]:
# Outlier detection BEFORE scaling
iso_forest_orig = IsolationForest(contamination=0.01, random_state=42)
outlier_preds_orig = iso_forest_orig.fit_predict(X_train_sm)

In [10]:
from sklearn.preprocessing import RobustScaler


# Apply RobustScaler on the SMOTE-resampled training data
scaler = RobustScaler()
X_train_sm_scaled = scaler.fit_transform(X_train_sm)

# Initialize IsolationForest for the scaled data
iso_forest_scaled = IsolationForest(contamination=0.01, random_state=42)
outlier_preds_scaled = iso_forest_scaled.fit_predict(X_train_sm_scaled)

In [11]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

# Convert y_train_clean to a pandas Series
y_train_clean_series = pd.Series(y_train_clean)

seeds = [42, 123, 999, 31415, 2718]
models_list = []

for seed in seeds:
    rng = np.random.RandomState(seed)

    # Generate bootstrap indices
    boot_indices = rng.choice(len(X_train_clean), size=len(X_train_clean), replace=True)
    X_boot = X_train_clean.iloc[boot_indices]       # X_train_clean is a DataFrame
    y_boot = y_train_clean_series.iloc[boot_indices]  # Now you can use .iloc on the Series

    # Initialize XGBoost with some randomness
    model_ens = XGBClassifier(
        random_state=seed,
        use_label_encoder=False,
        eval_metric='logloss',
        subsample=0.8,
        colsample_bytree=0.8
    )

    model_ens.fit(X_boot, y_boot)
    models_list.append(model_ens)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



# Static three classes in a dashboard

In [None]:
import lightningchart as lc
import numpy as np

lc.set_license('my-license-key')

predictions_for_class0 = []
predictions_for_class1 = []
predictions_for_class2 = []

for m in models_list:
    proba = m.predict_proba(X_test)  # shape: (num_samples, n_classes)
    # Append the probabilities for each class
    predictions_for_class0.append(proba[:, 0])  # Class=0 (Low)
    predictions_for_class1.append(proba[:, 1])  # Class=1 (Medium)
    predictions_for_class2.append(proba[:, 2])  # Class=2 (High)

# Convert each list into a NumPy array of shape (num_models, num_samples)
predictions_for_class0 = np.array(predictions_for_class0)  # (num_models, num_samples)
predictions_for_class1 = np.array(predictions_for_class1)
predictions_for_class2 = np.array(predictions_for_class2)

# --- 2) Helper function to create a chart with uncertainty bands. ---
def create_uncertainty_chart(dashboard, row_idx, col_idx, predictions_array, class_label):
    """
    Creates a chart in the given (row_idx, col_idx) of the Dashboard showing:
      - Median line
      - 5–95% band (outer)
      - 25–75% band (inner)
    for a set of predictions_array shaped (num_models, num_samples).
    """
    # Compute percentiles
    median_pred = np.median(predictions_array, axis=0)
    p5 = np.percentile(predictions_array, 5, axis=0)
    p25 = np.percentile(predictions_array, 25, axis=0)
    p75 = np.percentile(predictions_array, 75, axis=0)
    p95 = np.percentile(predictions_array, 95, axis=0)

    # Sort by median for a smoother band
    order = np.argsort(median_pred)
    x_sorted = np.arange(len(order)).tolist()
    median_sorted = median_pred[order].tolist()
    p5_sorted = p5[order].tolist()
    p25_sorted = p25[order].tolist()
    p75_sorted = p75[order].tolist()
    p95_sorted = p95[order].tolist()

    # Create chart for this class
    chart = dashboard.ChartXY(row_index=row_idx, column_index=col_idx, 
                              title=f"Uncertainty Band (Class={class_label})")

    # Create the median line series
    median_series = chart.add_line_series(data_pattern="ProgressiveX")
    median_series.append_samples(x_values=x_sorted,
                                 y_values=[float(val) for val in median_sorted])
    median_series.set_name(f"Median (Class={class_label})")

    # Outer polygon: 5th-95th percentile
    outer_polygon = chart.add_area_series()
    outer_points = []
    for x, y in zip(x_sorted, p5_sorted):
        outer_points.append({"x": float(x), "y": float(y)})
    for x, y in zip(x_sorted[::-1], p95_sorted[::-1]):
        outer_points.append({"x": float(x), "y": float(y)})
    outer_polygon.add(outer_points)
    outer_polygon.set_fill_color(lc.Color(0, 0, 255))  # Blue
    outer_polygon.set_name("5th–95th Percentile")

    # Inner polygon: 25th-75th percentile
    inner_polygon = chart.add_area_series()
    inner_points = []
    for x, y in zip(x_sorted, p25_sorted):
        inner_points.append({"x": float(x), "y": float(y)})
    for x, y in zip(x_sorted[::-1], p75_sorted[::-1]):
        inner_points.append({"x": float(x), "y": float(y)})
    inner_polygon.add(inner_points)
    inner_polygon.set_fill_color(lc.Color(255, 255, 0))  # Yellow
    inner_polygon.set_name("25th–75th Percentile")

    # Create and add legend with the three series
    legend = chart.add_legend()
    legend.add(median_series)
    legend.add(outer_polygon)
    legend.add(inner_polygon)
    legend.set_title("Legend")

    chart.get_default_x_axis().set_title("Sorted Sample Index")
    chart.get_default_y_axis().set_title("Predicted Probability")
    return chart

# --- 3) Create a 3-row dashboard (one chart per class). ---
dashboard = lc.Dashboard(rows=3, columns=1, theme=lc.Themes.Light)

# Row 0 -> Class 0 (Low severity)
create_uncertainty_chart(
    dashboard=dashboard,
    row_idx=0,
    col_idx=0,
    predictions_array=predictions_for_class0,
    class_label="Low"
)

# Row 1 -> Class 1 (Medium severity)
create_uncertainty_chart(
    dashboard=dashboard,
    row_idx=1,
    col_idx=0,
    predictions_array=predictions_for_class1,
    class_label="Medium"
)
create_uncertainty_chart(
    dashboard=dashboard,
    row_idx=2,
    col_idx=0,
    predictions_array=predictions_for_class2,
    class_label="High"
)

dashboard.open(method="browser")


127.0.0.1 - - [04/Mar/2025 10:08:15] "GET / HTTP/1.1" 200 -


<lightningchart.charts.dashboard.Dashboard at 0x23b48abf150>