In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
import json

df = pd.read_csv("../public/harga-properti.csv")[['Quarter', 'Value']].dropna()

class LaggedXGBRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_lags=4, n_estimators=100, learning_rate=0.1):
        self.n_lags = n_lags
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.model = None

    def create_lag_features(self, y):
        df_lag = pd.DataFrame()
        for i in range(self.n_lags):
            df_lag[f'lag_{i+1}'] = y.shift(i+1)
        df_lag['target'] = y
        df_lag.dropna(inplace=True)
        return df_lag

    def fit(self, X, y):
        data = self.create_lag_features(y)
        X_lag = data.drop(columns='target').values
        y_lag = data['target'].values
        self.model = XGBRegressor(n_estimators=self.n_estimators, learning_rate=self.learning_rate)
        self.model.fit(X_lag, y_lag)
        self.history = y.values
        return self

    def predict(self, X):
        last_known = self.history[-self.n_lags:].tolist()
        preds = []
        for _ in range(len(X)):
            input_features = np.array(last_known[-self.n_lags:]).reshape(1, -1)
            pred = self.model.predict(input_features)[0]
            preds.append(pred)
            last_known.append(pred)
        return np.array(preds)

tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    'n_lags': [4, 8, 12, 16, 20],
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1]
}

y_series = df['Value']

grid_search = GridSearchCV(LaggedXGBRegressor(), param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', verbose=1)
X_dummy = np.zeros((len(y_series), 1))
grid_search.fit(X_dummy, y_series)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

future_X = np.zeros((88, 1))
forecast = best_model.predict(future_X)

future_quarters = np.arange(df['Quarter'].max() + 1, df['Quarter'].max() + 89)

all_quarters = df['Quarter'].tolist() + future_quarters.tolist()
all_values = df['Value'].tolist() + forecast.tolist()

combined_data = [{"Quarter": q, "Value": float(f"{v:.2f}")} for q, v in zip(all_quarters, all_values)]

with open("../backend/model/combined_forecast.json", "w") as f:
    json.dump(combined_data, f, indent=4)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'learning_rate': 0.05, 'n_estimators': 100, 'n_lags': 12}
Best RMSE: 3.8109394750117485
