In [1]:
# Set up Spark

from pyspark.sql import SparkSession
# Spark settings for this notebook; each entry is applied to the session
# builder via .config(key, value) in the order listed.
_spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.driver.maxResultSize": "2g",
    "spark.sql.shuffle.partitions": "50",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

# Local master ("local[*]") with the application name used for this analysis.
_builder = SparkSession.builder.master("local[*]").appName("SCADA-Forecasting")
for _option, _value in _spark_settings.items():
    _builder = _builder.config(_option, _value)

# Session handle used by the later cells to build Spark DataFrames.
spark = _builder.getOrCreate()

# Load Data

import pandas as pd
from sktime.forecasting.model_selection import temporal_train_test_split

# Load the pre-processed SCADA export and keep only the rows of turbine
# "1_Kelmarsh", indexed by timestamp in chronological order.
pdf = pd.read_parquet(r"scada_prepro.parquet")
pdf_turbine1 = pdf[pdf["item_id"] == "1_Kelmarsh"].copy()
pdf_turbine1 = pdf_turbine1.drop(columns=["item_id"])
pdf_turbine1["timestamp"] = pd.to_datetime(pdf_turbine1["timestamp"])
pdf_turbine1 = pdf_turbine1.set_index("timestamp").sort_index()
# NOTE(review): `pdft` is not referenced by any cell visible here - confirm it
# is still needed before removing it.
pdft = pdf_turbine1[3:]

# Drop every column that contains a missing value, EXCEPT "target".
# A blanket dropna(axis=1) would also drop "target" as soon as it has a single
# gap, and the target interpolation below would then fail with a KeyError.
# When "target" has no gaps the selected columns (and their order) are the
# same as with dropna(axis=1).
_complete_or_target = (
    pdf_turbine1.notna().all(axis=0).to_numpy()
    | (pdf_turbine1.columns == "target")
)
pdf_turbine1_no_NaN = pdf_turbine1.loc[:, _complete_or_target]

# Re-index onto a regular 10-minute grid; timestamps missing from the data
# become all-NaN rows, which are filled right below.
pdf_turbine1_no_NaN = pdf_turbine1_no_NaN.asfreq("10min")

# Fill gaps: time-weighted interpolation first, then forward/backward fill for
# anything interpolation leaves open at the edges of the series.
pdf_turbine1_no_NaN["target"] = pdf_turbine1_no_NaN["target"].interpolate("time").ffill().bfill()
X_cols = [c for c in pdf_turbine1_no_NaN.columns if c != "target"]
pdf_turbine1_no_NaN[X_cols] = pdf_turbine1_no_NaN[X_cols].interpolate("time").ffill().bfill()


# Chronological split: test_size=0.2 holds out 20% of the rows as the test set.
train_pdf, test_pdf = temporal_train_test_split(pdf_turbine1_no_NaN, test_size=0.2)
y_train = train_pdf["target"]

y_test = test_pdf["target"]
X_test= test_pdf.drop(columns=["target"])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/16 14:55:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import sys

# Make the in-repository RTDIP SDK importable from this notebook.
# NOTE(review): this is an absolute dev-container path; it only resolves when
# the repository is checked out at exactly this location.
SDK_SRC_PATH = '/workspaces/amos2025ws03-rtdip-timeseries-forecasting/src/sdk/python'
# Guarded so that re-running this cell does not keep appending the same entry
# to sys.path.
if SDK_SRC_PATH not in sys.path:
    sys.path.append(SDK_SRC_PATH)

from rtdip_sdk.pipelines.forecasting.spark.catboost_timeseries import CatboostTimeSeries

In [3]:
# Column names handed to the forecaster: "target" is the column created in the
# preparation cell, "timestamp" is the former index name.
_forecaster_columns = {
    "target_col": "target",
    "timestamp_col": "timestamp",
}
cbts = CatboostTimeSeries(**_forecaster_columns)

In [4]:
# reset_index() turns the timestamp index back into a regular column before the
# pandas frame is converted to a Spark DataFrame and passed to the forecaster.
spark_train = spark.createDataFrame(train_pdf.reset_index())
cbts.train(spark_train)

0:	learn: 648.1804997	total: 1.22s	remaining: 5m 3s
1:	learn: 619.1597785	total: 2.03s	remaining: 4m 11s
2:	learn: 591.5538326	total: 2.83s	remaining: 3m 53s
3:	learn: 565.3112375	total: 3.55s	remaining: 3m 38s
4:	learn: 540.3151533	total: 4.29s	remaining: 3m 30s
5:	learn: 516.3928463	total: 4.98s	remaining: 3m 22s
6:	learn: 494.2503217	total: 5.67s	remaining: 3m 16s
7:	learn: 472.9118067	total: 6.39s	remaining: 3m 13s
8:	learn: 452.5242805	total: 7.1s	remaining: 3m 10s
9:	learn: 433.5954722	total: 7.81s	remaining: 3m 7s
10:	learn: 415.5698927	total: 8.5s	remaining: 3m 4s
11:	learn: 398.5450693	total: 9.14s	remaining: 3m 1s
12:	learn: 382.6125773	total: 9.77s	remaining: 2m 58s
13:	learn: 367.4884276	total: 10.4s	remaining: 2m 55s
14:	learn: 352.8734485	total: 11s	remaining: 2m 52s
15:	learn: 339.3263482	total: 11.7s	remaining: 2m 50s
16:	learn: 326.3954555	total: 12.3s	remaining: 2m 48s
17:	learn: 314.1838754	total: 13s	remaining: 2m 47s
18:	learn: 302.7153239	total: 13.6s	remaining: 2

In [5]:
# Same conversion as for training: index back to a column, then to Spark.
_test_frame = test_pdf.reset_index()
spark_test = spark.createDataFrame(_test_frame)

# Score the fitted forecaster on the held-out split and keep the result.
metrics = cbts.evaluate(spark_test)


CatBoost Metrics:
--------------------------------------------------------------------------------
MAE                 : 104.7876
RMSE                : 154.5119
MAPE                : 5.2792
MASE                : 1.0964
SMAPE               : 111.9547
