In [None]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
from interpret.glassbox import ExplainableBoostingRegressor
from mapie.regression import MapieQuantileRegressor
from quantile_forest import RandomForestQuantileRegressor


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..')))
from src.utils.model import split_dataset, compare_models_per_station, create_deep_model

In [None]:
# Replace PROJECT_DIR with the root of your own checkout; the other
# directories are derived from it so only one path has to be edited.
PROJECT_DIR = "/N/lustre/project/proj-212/Ramtelpp/PersonalProject/Coda/"

DATASET_DIR = f"{PROJECT_DIR}data/"
INPUT_DIR = f"{DATASET_DIR}input/"
MODEL_DIR = f"{PROJECT_DIR}model/"

SEED = 42
NUMBER_OF_WEEK = 4  # Number of weeks to predict; one model is trained per week

FINAL_MODELS = ["qrf"]

# Container for the fitted QRF models (filled by the training cell).
qrf = {}

# Weekly target columns ("water_flow_week1" ... "water_flow_week<NUMBER_OF_WEEK>").
# Derived from NUMBER_OF_WEEK so the list cannot drift from the forecast horizon.
COLUMNS_TO_DROP = [f"water_flow_week{week}" for week in range(1, NUMBER_OF_WEEK + 1)]

### 2. Data Loading
Load the baseline dataset and create the directory where the trained models are saved.

In [None]:
# Load the pre-processed baseline dataset and index it by the "ObsDate" column.
# Reading the file inside this cell keeps the notebook runnable from a fresh
# kernel and makes the cell safe to re-run (set_index is applied to a new frame).
# NOTE(review): assumes dataset_baseline.csv is stored in DATASET_DIR — confirm
# against the output location of the preprocessing step.
dataset_train = pd.read_csv(f"{DATASET_DIR}dataset_baseline.csv").set_index("ObsDate")

# Create the output directory for the final models; exist_ok avoids the
# check-then-create race and lets the cell be executed repeatedly.
os.makedirs(f"{MODEL_DIR}final/", exist_ok=True)

Data pre-processing: remove the weekly target columns from the feature matrix and set up one target series per forecast week.

In [None]:
# Feature matrix: every column of the training set except the weekly targets.
X_train = dataset_train.drop(columns=COLUMNS_TO_DROP)

# One target series per forecast week, keyed by 0-based week index
# (key 0 holds "water_flow_week1", key 1 holds "water_flow_week2", ...).
y_train = {
    i: dataset_train[f"water_flow_week{i+1}"]
    for i in range(NUMBER_OF_WEEK)
}


### 3. Model Development (QRF)

- **Training:**  
  Initializes a `RandomForestQuantileRegressor` with the following parameters:
  - 1000 estimators
  - Maximum depth of 25
  - Minimum of 25 samples per leaf

  These parameters allow for relatively fast training, though they are not optimized for peak performance. 
  
  One model per forecast week is then fitted using `X_train_qrf` (the selected features) and the corresponding weekly target `y_train[i]`.

In [None]:
# Seed passed to the QRF models; SEED itself is defined once in the config cell.
random_state = SEED

# Features used to train the QRF models.
feature = [
    "water_flow_lag_1w",
    "water_flow_lag_2w",
    "soil_moisture_region",
    "precipitation_region_lag_1w",
    "catchment",
    "soil_moisture",
    "temperatures",
    "precipitation_sector_lag_1w",
    "soil_moisture_sub_sector",
    "precipitation_sub_sector_lag_1w",
    "precipitation_zone",
    "precipitation_sub_sector",
    "evaporation_sub_sector_lag_1w",
    "temperature_region"
]  # selected features

# Fail early with an explicit message if the dataset lacks a selected feature
# (same exception type pandas would raise on the column selection below).
missing_features = [name for name in feature if name not in X_train.columns]
if missing_features:
    raise KeyError(f"Selected features missing from X_train: {missing_features}")

# Drop the 'station_code' column (done once), then keep only the selected features.
X_train_qrf = X_train.drop(columns=["station_code"])[feature]

In [None]:
# Train one RandomForestQuantileRegressor per forecast week and save each
# fitted model to "<MODEL_DIR>final/" as a timestamped pickle file.
if "qrf" in FINAL_MODELS:
    for i in range(NUMBER_OF_WEEK):
        print(f"Training week {i}")
        # Train RandomForestQuantileRegressor on the target of week index i
        qrf[i] = RandomForestQuantileRegressor(
            n_estimators=1000,
            max_depth=25,
            min_samples_leaf=25,
            random_state=random_state,
        )
        qrf[i].fit(X_train_qrf, y_train[i])

        # Named `timestamp` (not `time`) so the notebook's global namespace
        # does not shadow the standard-library `time` module.
        timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_path = f"{MODEL_DIR}final/qrf_quantile_{timestamp}_week_{i}.pkl"
        joblib.dump(qrf[i], model_path)

