In [0]:
from pyspark.sql.functions import *
from prophet import Prophet
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error


In [0]:
# Schema that holds the training table. Issuing `use` makes it the session's
# current schema for the Spark SQL that follows.
schema_name = "artemis"
spark.sql("use " + schema_name)

In [0]:
# Load the training series and rename the columns to what Prophet expects:
# `ds` for the timestamp and `y` for the value being forecast (here `kwh`).
# `miner_id` is kept so a single miner can be selected later.
training_query = f"""
               select 
                time as ds
                , miner_id
                , kwh as y
               from {schema_name}.training_data
               """
df = spark.sql(training_query)
display(df)

In [0]:
# Columns that none of the following cells read. Spark's DataFrame.drop silently
# ignores names that are not present in `df`, so listing a column that was never
# selected is harmless (and has no effect).
columns_to_drop = [
    'mac',
    'serial',
    'name',
    'client_id',
    'client_name',
    'pickaxe_id',
    'group_id',
    'rack_id',
    'hashing_uptime_low',
    'hashing_uptime_normal',
    'hashing_uptime_high',
]
df = df.drop(*columns_to_drop)
display(df)

In [0]:
# Forecast settings.
# `interval_width` and `freq` are consumed by the Prophet cells below; the
# remaining names are only referenced by the commented-out template at the end
# of the notebook (as far as this file shows).
forecast_frequency = 'H'   # pandas offset alias for hourly steps
forecast_periods = 24
include_history = True
interval_width = 0.8       # passed to Prophet(interval_width=...)

# Short alias used by the make_future_dataframe calls.
freq = forecast_frequency

In [0]:
# Single-miner forecast: fit Prophet on one miner's readings inside a fixed
# time window, extend the series a few steps past the end, and plot the result.
target_miner_id = 10796964
window_start = '2025-03-28'                    # exclusive lower bound on ds
window_end = '2025-03-31T10:00:00.000+00:00'   # exclusive upper bound on ds
horizon_periods = 8                            # steps of `freq` to predict past the history

# One combined predicate instead of three chained filters; the rows selected
# are the same.
in_window = (col('ds') > window_start) & (col('ds') < window_end)
pdf = df.where((col('miner_id') == target_miner_id) & in_window).toPandas()

# changepoint_prior_scale is set far above Prophet's documented default (0.05),
# which lets the fitted trend bend much more freely.
model = Prophet(interval_width=interval_width, changepoint_prior_scale=0.90)
model.fit(pdf)

# The future frame contains the training timestamps plus the extra horizon.
future = model.make_future_dataframe(periods=horizon_periods, freq=freq)
fcst = model.predict(future)
fig = model.plot(fcst)

In [0]:
# Single-miner forecast with multiplicative seasonality: fit Prophet on one
# miner's readings inside a fixed time window, extend the series a few steps
# past the end, and plot the result.
target_miner_id = 10442585
window_start = '2025-03-27'                    # exclusive lower bound on ds
window_end = '2025-03-30T19:00:00.000+00:00'   # exclusive upper bound on ds
horizon_periods = 8                            # steps of `freq` to predict past the history

# One combined predicate instead of three chained filters; the rows selected
# are the same.
in_window = (col('ds') > window_start) & (col('ds') < window_end)
pdf = df.where((col('miner_id') == target_miner_id) & in_window).toPandas()

# changepoint_prior_scale is set far above Prophet's documented default (0.05),
# which lets the fitted trend bend much more freely.
model = Prophet(
    interval_width=interval_width,
    changepoint_prior_scale=0.85,
    seasonality_mode='multiplicative',
)
model.fit(pdf)

# The future frame contains the training timestamps plus the extra horizon.
future = model.make_future_dataframe(periods=horizon_periods, freq=freq)
fcst = model.predict(future)
fig = model.plot(fcst)

In [0]:
# Attach the observed values to the forecast so every row carries both `y`
# (actual) and `yhat` (prediction). It is a left join on `ds`, so forecast rows
# with no matching reading in `pdf` (the future horizon) get y = NaN.
#
# A `y` column left over from a previous run of this cell is dropped first.
# Without that, re-running the cell merges a second `y`, pandas renames the
# clashing pair to `y_x` / `y_y`, and any later lookup of `fcst['y']` fails.
fcst = (
    fcst
    .drop(columns=['y'], errors='ignore')
    .merge(pdf[['ds', 'y']], on='ds', how='left')
)
display(fcst)

In [0]:
# Root-mean-squared error between observed and predicted values.
# Only rows that have both an actual and a prediction are scored; forecast-only
# rows (y is NaN after the left join) are excluded. Because `y` comes from the
# same frame the model was fitted on, this measures in-sample fit, not
# out-of-sample accuracy.
scored = fcst.dropna(subset=['y', 'yhat'])
y_true = scored[['y']]
y_pred = scored[['yhat']]

# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6, where it raises TypeError. Taking the square root of the
# MSE gives the same number for a single target and works on every version.
rmse = mean_squared_error(y_true, y_pred) ** 0.5
rmse


In [0]:
# # remove rows with missing values (the original note says these are more likely at day-store-item level; this notebook's data is keyed by miner_id)
# history_pd = history_pd.dropna()

# extra_cols = [col for col in history_pd.columns if col not in ['ds', 'y']]

# # train and configure the model
# model = Prophet( interval_width=interval_width )
# model.fit( history_pd )

# # make predictions
# future_pd = model.make_future_dataframe(
#     periods=forecast_periods, 
#     freq=forecast_frequency, 
#     include_history=include_history
# )
# forecast_pd = model.predict( future_pd )
# forecast_pd = pd.merge(forecast_pd, history_pd[['ds','y']], on='ds', how='left')


# # forecast_pd['y'] = history_pd['y']

# for c in extra_cols:
#     forecast_pd[c] = history_pd[c]

# output_spark_df = (spark.createDataFrame(forecast_pd)
#     .withColumn("yhat_lower", greatest(col("yhat_lower"), lit(0)))
#     .withColumn("yhat_upper", greatest(col("yhat_upper"), lit(0)))
#     .withColumn("YearMonth", date_format(col("ds"), "yyyy-MM"))

# )

# display(output_spark_df)