In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib



In [7]:
df = pd.read_csv("../data/processed/merged_cleaned.csv")

df["duration_hours"] = df["DURATION_SEC"] / 3600
df["Energy_MWh"] = df["MW_mean"] * df["duration_hours"]

FEATURES = [
    "DURATION_SEC", "MW_mean",
    "TEMP_mean", "TEMP_p95",
    "VALO2_mean", "VALO2_p95",
    "O2_AMOUNT_sum", "GAS_AMOUNT_sum",
    "O2_FLOW_mean", "GAS_FLOW_mean",
    "duration_hours"
]

X = df[FEATURES].copy()
y = df["Energy_MWh"].copy()

mask = ~X.isna().any(axis=1) & ~y.isna()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

len(X), len(y)


(20813, 20813)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

len(X_train), len(X_test)


(16650, 4163)

In [9]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), FEATURES)
    ]
)

models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42, n_estimators=200),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

results = {}


In [10]:
for name, model in models.items():
    pipe = Pipeline([("pre", preprocess), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}

results


{'LinearRegression': {'MAE': 6.855496894590884,
  'RMSE': np.float64(22.01325884130972),
  'R2': 0.7823381741358714},
 'RandomForest': {'MAE': 0.46713962425984906,
  'RMSE': np.float64(4.581904356242556),
  'R2': 0.9905701317233309},
 'GradientBoosting': {'MAE': 1.1556187664583761,
  'RMSE': np.float64(5.101602465324775),
  'R2': 0.9883096688055903}}

In [11]:
best_model_name = max(results, key=lambda x: results[x]["R2"])
best_model = Pipeline([("pre", preprocess), ("model", models[best_model_name])])
best_model.fit(X_train, y_train)

joblib.dump(best_model, "../models/energy_model.pkl")

best_model_name, results[best_model_name]


('RandomForest',
 {'MAE': 0.46713962425984906,
  'RMSE': np.float64(4.581904356242556),
  'R2': 0.9905701317233309})

In [12]:
y_pred = best_model.predict(X_test)

pred_df = pd.DataFrame({
    "Actual_MWh": y_test,
    "Predicted_MWh": y_pred
})
pred_df.to_csv("../outputs/predictions.csv", index=False)

pred_df.head()


Unnamed: 0,Actual_MWh,Predicted_MWh
15625,113.735556,113.896827
10863,112.895972,112.835342
812,211.213542,211.398676
3689,117.9,117.777382
2140,144.2875,144.212394


In [13]:
sample = X_test.iloc[:5]
best_model.predict(sample)


array([113.89682668, 112.83534212, 211.39867636, 117.77738242,
       144.21239403])

In [14]:
import joblib, pandas as pd
model = joblib.load("models/energy_model.pkl")

row = pd.DataFrame([{
    "DURATION_SEC": 120000,
    "MW_mean": 6.5,
    "TEMP_mean": 1635,
    "TEMP_p95": 1650,
    "VALO2_mean": 800,
    "VALO2_p95": 1000,
    "O2_AMOUNT_sum": 500000,
    "GAS_AMOUNT_sum": 150000,
    "O2_FLOW_mean": 2000,
    "GAS_FLOW_mean": 800,
    "duration_hours": 120000 / 3600
}])

print(model.predict(row))


FileNotFoundError: [Errno 2] No such file or directory: 'models/energy_model.pkl'