In [2]:
# One-time install in a notebook cell (comment out after first run)
#!pip install pandas numpy scikit-learn xgboost

In [3]:
# required imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

In [4]:
# Load & peek at the data
# What to look for: do all the columns you expect show up? Are timestamps parsed correctly?
df = pd.read_csv("sample_operational_metrics.csv", parse_dates=["timestamp_utc"])
df.head()

  df = pd.read_csv("sample_operational_metrics.csv", parse_dates=["timestamp_utc"])


Unnamed: 0,timestamp_utc,host_id,service,cpu_pct,mem_pct,disk_used_pct,error_rate,is_holiday,change_deployed_prev_1h,incident_open_in_next_2h,p1_or_p2_next_incident
0,2025-05-14 00:00:00,app-01,payments-api,52.967142,62.317104,69.665187,0.049,0,0,0,
1,2025-05-14 00:05:00,app-01,payments-api,46.617357,50.005628,70.025097,0.002,0,0,0,
2,2025-05-14 00:10:00,app-01,payments-api,54.476885,46.711582,69.501059,0.023,0,0,0,
3,2025-05-14 00:15:00,app-01,payments-api,63.230299,54.159466,71.238636,0.001,0,0,0,
4,2025-05-14 00:20:00,app-01,payments-api,45.658466,57.902765,69.937843,0.003,0,0,0,


In [5]:
# Basic preprocessing
# 1. Sort by time (safety first)
df = df.sort_values("timestamp_utc")

# 2. Handle obvious outliers or impossible negatives
metric_cols = ["cpu_pct", "mem_pct", "disk_used_pct", "error_rate"]
df[metric_cols] = df[metric_cols].clip(lower=0, upper=100)

# 3. Fill tiny gaps (if any) using forward-fill
df[metric_cols] = df[metric_cols].ffill()

In [10]:
# Feature engineering (rolling windows & lags)
# We’ll create simple rolling-mean features for CPU & memory and a “minutes since last deployment” feature.
# Start small; you can add fancier ones later.

# Rolling Window (a moving average)
# It’s like looking at a sliding window of recent history for a metric (e.g. CPU %).
# Imagine a 30-minute moving average of CPU usage:
# If current CPU is 80%, but it’s been slowly rising from 50% over the last 30 minutes… that’s a risk signal.
# What am I doing here -> df["cpu_mean_30m"] = df["cpu_pct"].rolling(6).mean()
# I am taking the last 6 rows (each = 5 min), average them, and store it.
# Same for 2 hours (24 rows at 5-minute intervals): df["cpu_mean_2h"] = df["cpu_pct"].rolling(24).mean()

# Lag-like Feature: Time since last deployment
# When something breaks, it often happens right after a code change (deployment).
# We want the model to know how long it’s been since the last change.
# This helps the model learn things like: “If we just deployed something and CPU is rising… a problem might be coming.”

# Why we need them ?????
# Without history, the model only sees what’s happening right now.
# With rolling windows and lags, it sees what has been building up over time.
# Just like weather forecasts look at past temperatures and pressure over the last few hours, 
# incident prediction needs trends — not just snapshots.



# Rolling means over the past 30 and 120 minutes (6 and 24 rows if 5-min freq)
df["cpu_mean_30m"] = df["cpu_pct"].rolling(6, min_periods=1).mean()
df["cpu_mean_2h"]  = df["cpu_pct"].rolling(24, min_periods=1).mean()

df["mem_mean_30m"] = df["mem_pct"].rolling(6, min_periods=1).mean()
df["mem_mean_2h"]  = df["mem_pct"].rolling(24, min_periods=1).mean()

# Minutes since last deployment (simple cumulative counter)
df["minutes_since_deploy"] = (
    df.groupby("host_id")["change_deployed_prev_1h"]
      .transform(lambda s: (~s.astype(bool)).cumsum())
)

In [11]:
# Choose a target to start with - first thing you want to predict :-)
# Pick one target first so you don’t feel overwhelmed. Let’s begin with:

target = "incident_open_in_next_2h"  # binary 0/1
feature_cols = [
    "cpu_pct", "mem_pct", "disk_used_pct", "error_rate",
    "cpu_mean_30m", "cpu_mean_2h",
    "mem_mean_30m", "mem_mean_2h",
    "minutes_since_deploy", "is_holiday"
]

# You’ll repeat similar steps later for the P1/P2 label, CPU spikes, and storage-90% breach.

In [12]:
# Time-aware train/test split
# Traditional random splits leak future info. Use the latest 20 % of rows as “future” test data:
split_idx = int(len(df) * 0.8)
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

X_train, y_train = train_df[feature_cols], train_df[target]
X_test,  y_test  = test_df[feature_cols],  test_df[target]

In [16]:
# dataset is could be small or the positive class (incident_open_in_next_2h == 1) is very rare, 
# so the last 20% of rows could end up being all 0s.
# This could be valid case - so no worries

print("Train label distribution:\n", y_train.value_counts())
print("Test label distribution:\n", y_test.value_counts())

Train label distribution:
 incident_open_in_next_2h
0    420475
1         5
Name: count, dtype: int64
Test label distribution:
 incident_open_in_next_2h
0    105120
Name: count, dtype: int64


In [17]:
# stratified split instead (to preserve class balance)
# This randomly shuffles rows but preserves class proportions — useful for rare event classification 
# when you don’t have a huge time series yet.
# Downside: It breaks temporal ordering, so it's not ideal for real forecasting tasks — but great for learning and prototyping.

from sklearn.model_selection import train_test_split

X = df[feature_cols]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [18]:
# Train with Gutch's favorite model (XGBoost)
model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=(y_train==0).sum() / (y_train==1).sum()  # handle class imbalance
)
model.fit(X_train, y_train)

In [19]:
# Evaluate
# Precision – when the model says an incident is coming, how often is it right?
# Recall – how many of the real incidents did it catch?
# ROC-AUC – overall ranking ability (1.0 = perfect, 0.5 = random).

proba = model.predict_proba(X_test)[:, 1]
preds = (proba >= 0.5).astype(int)

print(classification_report(y_test, preds, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, proba).round(3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000    105119
           1      0.000     0.000     0.000         1

    accuracy                          1.000    105120
   macro avg      0.500     0.500     0.500    105120
weighted avg      1.000     1.000     1.000    105120

ROC-AUC: 0.999


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
# To predict I will have an issue in the next 24 hours

# First, copy the original data so we don't overwrite anything important
df_24h = df.copy()

# Create a new label column
df_24h["incident_open_in_next_24h"] = 0

# Get the indices where incidents occurred (label == 1)
incident_indices = df_24h.index[df_24h["incident_open_in_next_2h"] == 1]

# For each incident, label the 24 hours (288 rows = 5 min x 288 = 24h) BEFORE it as 1
for idx in incident_indices:
    start_idx = max(idx - 288, 0)
    df_24h.loc[start_idx:idx, "incident_open_in_next_24h"] = 1


In [22]:
# Train the model with the new target
# Update your training code to use the new label:

target = "incident_open_in_next_24h"
feature_cols = [
    "cpu_pct", "mem_pct", "disk_used_pct", "error_rate",
    "cpu_mean_30m", "cpu_mean_2h",
    "mem_mean_30m", "mem_mean_2h",
    "minutes_since_deploy", "is_holiday"
]

X = df_24h[feature_cols]
y = df_24h[target]

# Use stratified split to avoid class imbalance issues
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Retrain your model
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
)
model.fit(X_train, y_train)


In [24]:
# Predict if an issue is coming in the next 24h
# Take the latest row
latest_input = df_24h[feature_cols].iloc[-1:]

# Predict probability
issue_prob = model.predict_proba(latest_input)[0][1]
print(f"Probability of issue in next 24 hours: {issue_prob:.3f}")

# Convert to prediction
will_have_issue = (issue_prob >= 0.5)
print("🚨 Risk of a P1/P2 issue in the next 24h!" if will_have_issue else "✅ Likely safe for the next 24h.")


Probability of issue in next 24 hours: 0.000
✅ Likely safe for the next 24h.


In [25]:
# Predict if an issue is coming in the next 7 days
# Create a new label for “incident in next 7 days”
# Each 7 days = 7 × 24 × 12 = 2016 rows if you’re using 5-minute intervals.

# Copy the working DataFrame
df_7d = df.copy()

# Create the new 7-day incident label
df_7d["incident_open_in_next_7d"] = 0

# Reuse the original 2h incident indicator to find where issues happened
incident_indices = df_7d.index[df_7d["incident_open_in_next_2h"] == 1]

# Tag all rows BEFORE each incident within the previous 7 days (2016 rows)
for idx in incident_indices:
    start_idx = max(idx - 2016, 0)
    df_7d.loc[start_idx:idx, "incident_open_in_next_7d"] = 1

In [26]:
# If an incident happens on a future date, we want today’s row to say:
# "Yes, there's an incident coming sometime in the next 7 days."
# This lets us teach the model to raise a flag when risk is building up.

In [27]:
# Train the model on the new target “incident in next 7 days”
target = "incident_open_in_next_7d"
feature_cols = [
    "cpu_pct", "mem_pct", "disk_used_pct", "error_rate",
    "cpu_mean_30m", "cpu_mean_2h",
    "mem_mean_30m", "mem_mean_2h",
    "minutes_since_deploy", "is_holiday"
]

X = df_7d[feature_cols]
y = df_7d[target]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
)
model.fit(X_train, y_train)


In [28]:
# Predict if an issue is coming in the next 7 days
# Use the most recent row of input
latest_input = df_7d[feature_cols].iloc[-1:]

# Predict the probability of a 7-day incident
issue_prob = model.predict_proba(latest_input)[0][1]
print(f"Probability of issue in next 7 days: {issue_prob:.3f}")

# Interpret the prediction
will_have_issue = (issue_prob >= 0.5)
print("🚨 Risk of incident in next 7 days!" if will_have_issue else "✅ Likely stable for the next 7 days.")


Probability of issue in next 7 days: 0.000
✅ Likely stable for the next 7 days.
