In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib

# Single seed shared by every stochastic step in this notebook.
SEED = 42

# NOTE: Python reads PYTHONHASHSEED only at interpreter start-up, so this
# assignment cannot change hashing in the already-running kernel; it is
# visible to code that reads the environment afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed the global NumPy and stdlib random generators with the same value.
np.random.seed(SEED)
random.seed(SEED)

print(f"Reproducibility seed fixed to {SEED}")

Reproducibility seed fixed to 42


# Global Config

In [None]:
# Tunable parameters for the whole pipeline, looked up by key in later cells.
CONFIG = dict(
    # cpu_avg cut-offs used by the recommendation rules
    CPU_IDLE_THRESHOLD=10,
    CPU_SCALE_THRESHOLD=80,
    # End points of the linear power model, in watts per core
    P_IDLE=60,
    P_MAX=250,
    # Length of one sampling interval (Azure sampling), in minutes
    INTERVAL_MINUTES=5,
    # Fraction of idle energy reported as potential savings
    SAVINGS_FACTOR=0.30,
    # Contamination passed to the anomaly detector
    ANOMALY_RATE=0.02,
    # Number of clusters for the VM clustering step
    N_CLUSTERS=3,
)

# One sampling interval expressed in hours (used to turn watts into kWh).
INTERVAL_HOURS = CONFIG["INTERVAL_MINUTES"] / 60

# CPU-reading trace files to process; only the first entry is loaded below.
CPU_FILES = ["vm_cpu_readings-file-1-of-195.csv.gz"]

print("Using CPU files:", CPU_FILES)

Using CPU files: ['vm_cpu_readings-file-1-of-195.csv.gz']



# DOWNLOAD FILES


In [None]:
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/trace_data/vm_cpu_readings/vm_cpu_readings-file-1-of-195.csv.gz
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/trace_data/vm_cpu_readings/vm_cpu_readings-file-2-of-195.csv.gz
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/trace_data/vm_cpu_readings/vm_cpu_readings-file-3-of-195.csv.gz
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/azure2019_data/cores.txt
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/azure2019_data/memory.txt
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/azure2019_data/category.txt
# !wget https://azurepublicdatasettraces.blob.core.windows.net/azurepublicdatasetv2/trace_data/vmtable/vmtable.csv.gz


# LOAD DATA

In [None]:
# Per-interval CPU readings for each VM. The file is read without a header
# row, so the column names below are assigned positionally.
# Bound to `df_cpu` (not `ddf_cpu`): the merge cell reads `df_cpu`, and the
# old name made a fresh-kernel run fail with a NameError.
df_cpu = pd.read_csv(
    CPU_FILES[0],
    compression="gzip",
    header=None,
    names=["timestamp", "vm_id", "cpu_min", "cpu_max", "cpu_avg"]
)

# VM metadata table, also header-less; only vm_id, core_bucket and
# vm_category are used by the merge step.
df_vm = pd.read_csv(
    "vmtable.csv.gz",
    compression="gzip",
    header=None,
    names=[
        "vm_id",
        "subscription_id",
        "deployment_id",
        "vm_created_ts",
        "vm_deleted_ts",
        "vm_cpu_max",
        "vm_cpu_avg",
        "vm_cpu_p95",
        "vm_category",
        "core_bucket",
        "memory_bucket",
    ],
)

# Tab-separated lookup joined on core_bucket to obtain core_count.
# NOTE(review): assumes cores.txt is a two-column bucket -> count mapping
# with no header line — confirm against the downloaded file.
df_cores = pd.read_csv(
    "cores.txt",
    sep="\t",
    header=None,
    names=["core_bucket", "core_count"]
)

# CLEAN & MERGE

In [None]:
# Columns that must be numeric for the power model; unparsable values
# become NaN and the affected rows are dropped.
numeric_cols = ["cpu_avg", "core_count"]

# VM attributes needed downstream (core bucket for the core-count lookup).
vm_attrs = df_vm[["vm_id", "core_bucket", "vm_category"]]

df = (
    df_cpu
    .merge(vm_attrs, on="vm_id", how="left")
    .merge(df_cores, on="core_bucket", how="left")
    .pipe(
        lambda frame: frame.assign(
            **{name: pd.to_numeric(frame[name], errors="coerce") for name in numeric_cols}
        )
    )
    .dropna(subset=numeric_cols)
    .sort_values(["timestamp", "vm_id"])
    .reset_index(drop=True)
)

# POWER ESTIMATION

In [None]:
# Linear power model: each core draws P_IDLE at 0% CPU and P_MAX at 100%,
# scaled by the VM's core count.
idle_w = CONFIG["P_IDLE"]
dynamic_range_w = CONFIG["P_MAX"] - idle_w
per_core_w = idle_w + (df["cpu_avg"] / 100) * dynamic_range_w
df["est_power_w"] = df["core_count"] * per_core_w

print("\nSample power estimates:")
sample_cols = ["cpu_avg", "core_count", "est_power_w"]
print(df[sample_cols].head())


Sample power estimates:
     cpu_avg  core_count  est_power_w
0  27.828400        22.9  2584.813682
1   1.691972        58.2  3679.098268
2   3.468860        58.2  3875.586594
3   2.508941        58.2  3769.438654
4   1.604812        58.2  3669.460102


# AGGREGATE TO DATA CENTER

In [None]:
# Total estimated power across all VMs at each timestamp, ordered by time
# with a fresh 0..n-1 index.
dc = (
    df.groupby("timestamp")["est_power_w"]
    .sum()
    .rename("dc_power_w")
    .reset_index()
    .sort_values("timestamp")
    .reset_index(drop=True)
)

# ENERGY PREDICTION (FORECASTING)

In [None]:
# Lag features: data-centre power one and two intervals earlier.
dc["lag_1"] = dc["dc_power_w"].shift(1)
dc["lag_2"] = dc["dc_power_w"].shift(2)

# Rolling mean over the three PREVIOUS intervals only. Without the shift the
# window would include the current row, making the target recoverable as
# 3 * rolling_mean_3 - lag_1 - lag_2 (target leakage).
dc["rolling_mean_3"] = dc["dc_power_w"].shift(1).rolling(3).mean()

# Drop the warm-up rows that lack a full feature history; copy so the column
# assignment below writes to an independent frame.
dc = dc.dropna().copy()

X_pred = dc[["lag_1", "lag_2", "rolling_mean_3"]]
y_pred = dc["dc_power_w"]

forecast_model = GradientBoostingRegressor(random_state=SEED)
forecast_model.fit(X_pred, y_pred)

# NOTE(review): predictions are made on the same rows the model was fitted
# on, so they show in-sample fit rather than out-of-sample forecast accuracy.
dc["predicted_power_w"] = forecast_model.predict(X_pred)

print("\nPrediction sample:")
print(dc[["dc_power_w", "predicted_power_w"]].head())


Prediction sample:
     dc_power_w  predicted_power_w
2  8.703744e+08       8.706598e+08
3  8.658754e+08       8.660018e+08
4  8.670336e+08       8.670048e+08
5  8.749024e+08       8.744862e+08
6  8.681625e+08       8.684008e+08


# ANOMALY DETECTION

In [None]:
# Flag unusual data-centre power levels; the expected outlier share comes
# from CONFIG["ANOMALY_RATE"].
anomaly_model = IsolationForest(
    random_state=SEED,
    contamination=CONFIG["ANOMALY_RATE"],
)
dc["anomaly"] = anomaly_model.fit_predict(dc[["dc_power_w"]])

print("\nAnomaly counts:")
print(dc["anomaly"].value_counts())

# Timestamps labelled -1 are the ones treated as anomalous.
anomaly_times = dc.loc[dc["anomaly"] == -1, "timestamp"]

# Rank VMs by their summed estimated power over the anomalous timestamps
# and keep the ten largest contributors.
in_anomaly_window = df["timestamp"].isin(anomaly_times)
power_per_vm = (
    df[in_anomaly_window]
    .groupby("vm_id", as_index=False)["est_power_w"]
    .sum()
)
root_cause = power_per_vm.sort_values("est_power_w", ascending=False).head(10)

print("\nTop anomaly-causing VMs:")
print(root_cause)


Anomaly counts:
anomaly
 1    42
-1     1
Name: count, dtype: int64

Top anomaly-causing VMs:
                                                   vm_id   est_power_w
12652  gFUC9Muk/Bc1zcumogstmB3dje7MldDcUWAT4UrsFOd2RM...  14468.376508
2488   6utJfUjR2MkrhAV1WfUcG6rP+1K5H4yZ0TTLCU/NMJtTfw...  14460.267351
6307   KFZccyakM+a86jruxkOOuxu5P9RAj+IXZik5sztQ+PtXtO...  14446.397927
3798   BW+M9go1zbu8ZOIvpVwoCuVfqSOIg/1cRbRq+T82+zDPv4...  14445.570058
14257  lnpENQmRRTOft2eOXq0WPsqEw6SUZgYYd+5GMnyc1NO4Ol...  14441.720570
14511  meSI+id0ktFMFryWTSy+8nxrYnFUvMF4b4uED6bVoK1J5U...  14437.784801
4507   E2YkoS/FKMZhEPW3Q3fuKgNalyUIo3lVNsGHCsXKRIkLea...  14430.801356
4407   DitLEX1Mj9EcIWw2YuaTYfzSbkbMC8glOGzD9wvMJDIM9C...  14425.946457
16626  u5BVJO8+1dj/huvZMA1WQniAv3CBz6R3nDmnkMbvoHqAf0...  14420.811355
10337  Y2iJ08mEXdN4T4hNMRRt+IEejXBLnNqdkiN3Ep296tcEb7...  14418.820662


# CLUSTERING & PATTERN RECOGNITION

In [None]:
# Features every VM is clustered on.
feature_cols = ["cpu_avg", "core_count", "est_power_w"]

# One row per VM: mean CPU, first observed core count, mean estimated power.
vm_features = df.groupby("vm_id", as_index=False).agg(
    cpu_avg=("cpu_avg", "mean"),
    core_count=("core_count", "first"),
    est_power_w=("est_power_w", "mean"),
)

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(vm_features[feature_cols])

kmeans = KMeans(random_state=SEED, n_clusters=CONFIG["N_CLUSTERS"])
vm_features["cluster"] = kmeans.fit_predict(X_scaled)

# Per-cluster feature means, in original (unscaled) units.
cluster_summary = vm_features.groupby("cluster")[feature_cols].mean()

print("\nCluster characteristics:")
print(cluster_summary)


Cluster characteristics:
           cpu_avg  core_count  est_power_w
cluster                                    
0         6.768044   20.150612  1461.427292
1        53.816396   54.298138  8665.145765
2         7.127862   58.200000  4280.198984


# OPTIMIZATION RECOMMENDATIONS

In [None]:
def recommend(row):
    """Return the optimisation advice for a single VM reading.

    Args:
        row: Mapping-like object exposing a ``"cpu_avg"`` entry.

    Returns:
        ``"Downsize or shut down VM"`` when ``cpu_avg`` is strictly below
        ``CONFIG["CPU_IDLE_THRESHOLD"]``, ``"Scale out / add capacity"`` when
        it is strictly above ``CONFIG["CPU_SCALE_THRESHOLD"]``, and
        ``"Operating normally"`` in every other case.
    """
    utilization = row["cpu_avg"]
    if utilization < CONFIG["CPU_IDLE_THRESHOLD"]:
        return "Downsize or shut down VM"
    if utilization > CONFIG["CPU_SCALE_THRESHOLD"]:
        return "Scale out / add capacity"
    return "Operating normally"

# Label every VM reading in one vectorised pass instead of a row-wise
# df.apply, which is very slow on millions of rows. np.select takes the
# first matching condition, so this mirrors the if/elif order of recommend()
# and falls back to "Operating normally" (including for NaN readings).
df["recommendation"] = np.select(
    [
        df["cpu_avg"] < CONFIG["CPU_IDLE_THRESHOLD"],
        df["cpu_avg"] > CONFIG["CPU_SCALE_THRESHOLD"],
    ],
    ["Downsize or shut down VM", "Scale out / add capacity"],
    default="Operating normally",
)

# Readings (VM, timestamp pairs) that fall below the idle threshold.
idle_rows = df[df["recommendation"] == "Downsize or shut down VM"]

idle_occurrences = len(idle_rows)
# A VM counts as idle here if it was idle in at least one interval.
idle_unique_vms = idle_rows["vm_id"].nunique()
total_unique_vms = df["vm_id"].nunique()

# Guard against an empty frame so the summary still prints.
idle_vm_ratio = idle_unique_vms / total_unique_vms if total_unique_vms else 0.0

# Energy per reading = power (W) * interval length (h) / 1000 -> kWh.
idle_energy_kwh = (
    idle_rows["est_power_w"] * INTERVAL_HOURS / 1000
).sum()

potential_savings_kwh = idle_energy_kwh * CONFIG["SAVINGS_FACTOR"]

print("\n================ OPTIMIZATION SUMMARY ================")
print(f"Idle VM occurrences (VM–timestamps): {idle_occurrences:,}")
print(f"Unique idle VMs: {idle_unique_vms:,}")
print(f"Total unique VMs: {total_unique_vms:,}")
print(f"Idle VM ratio: {idle_vm_ratio:.2%}")
print(f"Estimated potential savings: {potential_savings_kwh:,.2f} kWh")


Idle VM occurrences (VM–timestamps): 7,494,310
Unique idle VMs: 211,599
Total unique VMs: 241,224
Idle VM ratio: 87.72%
Estimated potential savings: 619,235.23 kWh


# FINAL SUMMARY

In [None]:
# Closing report: number of distinct VMs and of data-centre rows that
# remain in `dc` after feature construction.
print(
    "\nPIPELINE COMPLETED SUCCESSFULLY",
    f"Total VMs analyzed: {total_unique_vms}",
    f"Total timestamps analyzed: {len(dc)}",
    sep="\n",
)


PIPELINE COMPLETED SUCCESSFULLY
Total VMs analyzed: 241224
Total timestamps analyzed: 43


# Save State

In [None]:
# Fitted estimators, keyed by the file each one is written to.
model_artifacts = {
    "forecast_model.joblib": forecast_model,
    "anomaly_model.joblib": anomaly_model,
    "kmeans_model.joblib": kmeans,
    "scaler.joblib": scaler,
}
for model_path, fitted_model in model_artifacts.items():
    joblib.dump(fitted_model, model_path)

# Every result table is stored as Parquet.
parquet_tables = [
    (df, "vm_level_data.parquet"),
    (dc, "datacenter_timeseries.parquet"),
    (vm_features, "vm_features.parquet"),
    (cluster_summary, "cluster_summary.parquet"),
]
for table, parquet_path in parquet_tables:
    table.to_parquet(parquet_path)

# The two main tables are additionally exported as CSV without the index.
csv_tables = [
    (df, "vm_level_data.csv"),
    (dc, "datacenter_timeseries.csv"),
]
for table, csv_path in csv_tables:
    table.to_csv(csv_path, index=False)

print("\nArtifacts saved successfully.")



Artifacts saved successfully.
