# Elektri hinna ennustamine kasutades teeki `XGBoost`

[`XGBoost`](xgboost.readthedocs.io)
on teek, mis ilplementeerib Extreme Gradient Boosting algoritmi.
Järgisime osaliselt seda [tutoriali](https://machinelearningmastery.com/xgboost-for-time-series-forecasting/).

In [1]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [4]:
# arvutamine ja joonistamine
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# aegridade spetsiifiline
from xgboost import XGBRegressor

In [6]:
# laen treening- ja testandmed
data_raw =  pd.read_csv('../data/df_clean.csv')
test_raw = pd.read_csv('../data/df_clean_test.csv')

In [7]:
data = data_raw.set_index(pd.to_datetime(data_raw['timestamp']))[["consumption"]]
data = data[~data.index.duplicated()].asfreq("H").sort_index()

test = test_raw.set_index(pd.to_datetime(test_raw['timestamp']))[[]]
test = test[~test.index.duplicated()].asfreq("H").sort_index()
test = test.tz_localize("+03:00")

In [9]:
relevant_data = data[(data.index.month > 4) | (data.index.month < 9)]
relevant_data

Unnamed: 0_level_0,consumption
timestamp,Unnamed: 1_level_1
2021-09-01 00:00:00,0.577
2021-09-01 01:00:00,0.594
2021-09-01 02:00:00,0.685
2021-09-01 03:00:00,1.016
2021-09-01 04:00:00,0.677
...,...
2022-08-24 19:00:00,0.678
2022-08-24 20:00:00,0.457
2022-08-24 21:00:00,0.500
2022-08-24 22:00:00,2.321


In [10]:
def series_to_slicing_window(data, n_in=1, n_out=1):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols = list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    # put it all together
    agg = pd.concat(cols, axis=1)
    # drop rows with NaN values
    agg.dropna(inplace=True)
    return agg.values

In [11]:
 # transform list into array
 train_data = series_to_slicing_window(relevant_data, 10)
 # split into input and output columns
 trainX, trainy = train_data[:, :-1], train_data[:, -1]
 # fit model
 model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
 model.fit(trainX, trainy)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [12]:
prediction_data = list(trainX[-1])
prediction_data.pop(0)
prediction_data.append(trainy[0])

predictions = []


for _ in range(168):
    predictions.append(model.predict([prediction_data])[0])
    prediction_data.pop(0)
    prediction_data.append(predictions[-1])

In [None]:
submission = pd.read_csv("../data/sample_submission.csv")
submission['consumption'] = predictions
submission.to_csv("../submissions/xgboost-submission-iterative.csv", index=False)