In [None]:
import numpy as np
from ipywidgets import interactive, IntSlider, VBox, Label, Layout, fixed
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt

def plot_model_predictions_over_days(*args, **kwargs):
    """
    Placeholder for the prediction-over-days plot.

    Any positional or keyword arguments are accepted and ignored. This file
    calls the function with several arguments (see ``rf_regression``), so a
    zero-parameter placeholder would fail with a ``TypeError`` about the
    argument count instead of the intended ``NotImplementedError``.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("This function is not yet implemented.")

# IPython magic: execute the referenced notebook from within this session.
# NOTE(review): presumably this notebook supplies the real
# plot_model_predictions_over_days, replacing the placeholder defined
# above -- confirm against that notebook.
%run example_definitions/lecture_06/plot_phenotyping_regression.ipynb

def rf_regression(df_loaded, n_estimators=100, max_depth=2, min_samples_split=2):
    """
    Fit a random forest regressor on the phenotyping data and report test metrics.

    The "species" column is one-hot encoded (first category dropped), the
    remaining feature columns are passed through unchanged, and the encoded
    data is split 70/30 into train and test sets. Test-set RMSE and R² are
    printed, then ``plot_model_predictions_over_days`` is called with the
    fitted preprocessor and model.

    Args:
        df_loaded: DataFrame containing the columns "days_of_phenotyping",
            "nitrogen_applied", "drought_stress", "species" and
            "digital_biomass".
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        min_samples_split: Minimum number of samples required to split a node.

    Returns:
        None. Results are printed and plotted rather than returned.
    """
    model_name = "Random Forest Regression"
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
    )

    # ----------------------
    # Define Features/Target
    # ----------------------
    passthrough_cols = [
        "days_of_phenotyping",
        "nitrogen_applied",
        "drought_stress",
    ]
    target_col = "digital_biomass"

    preprocessor = ColumnTransformer(
        transformers=[
            ("other_features", "passthrough", passthrough_cols),  # Leave these unchanged
            ("species_ohe", OneHotEncoder(drop="first", sparse_output=False), ["species"]),
        ]
    )

    # The preprocessor is fitted on the full frame, i.e. before the
    # train/test split, so the encoder sees every species in df_loaded.
    encoded = preprocessor.fit_transform(df_loaded)

    # Output columns follow the transformer order: passthrough columns
    # first, then the one-hot encoded species columns.
    species_names = (
        preprocessor.named_transformers_["species_ohe"]
        .get_feature_names_out(["species"])
        .tolist()
    )
    feature_names = passthrough_cols + species_names
    X_data_init = pd.DataFrame(encoded, columns=feature_names)
    y_data_init = df_loaded[target_col]

    # --------------------------
    # Train/Test Split
    # --------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X_data_init, y_data_init, test_size=0.3, random_state=42
    )

    # --------------------------
    # Fit Model
    # --------------------------
    # Hyperparameters come from the caller; no grid search is performed here.
    model.fit(X_train, y_train)

    # --------------------------
    # Evaluate
    # --------------------------
    y_pred = model.predict(X_test)

    r2_test = r2_score(y_test, y_pred)
    mse_test = np.mean((y_test - y_pred) ** 2)
    rmse_test = np.sqrt(mse_test)

    print("--------------------------------------------------")
    print(f"{model_name} - Regression Results:")
    print(f"Test Set RMSE: {rmse_test:.3e}")
    print(f"Test Set R² score: {r2_test:.3f}")
    print("--------------------------------------------------")

    # Test predictions over days for different nitrogen levels.
    # NOTE(review): the second positional argument is passed as None;
    # its meaning is defined by the plotting helper -- confirm there.
    plot_model_predictions_over_days(
        preprocessor,
        None,
        model,
        model_name,
        df_loaded,
        feature_names=feature_names,
    )

# ---------------------------------------------------
# 5. Interactive controls
# ---------------------------------------------------
def random_forest_regression_interact(df_loaded):
    """
    Display slider controls that re-run ``rf_regression`` on ``df_loaded``.

    Three integer sliders drive the ``n_estimators``, ``max_depth`` and
    ``min_samples_split`` arguments of ``rf_regression``; ``df_loaded`` is
    passed through as a fixed (non-widget) argument.

    Args:
        df_loaded: DataFrame forwarded unchanged to ``rf_regression``.

    Returns:
        None. The widgets are shown via ``display``.
    """

    def _make_slider(description, value, minimum, maximum):
        # All sliders share step, styling and layout; only the label and
        # the numeric range differ.
        return IntSlider(
            value=value,
            min=minimum,
            max=maximum,
            step=1,
            description=description,
            continuous_update=False,
            style={"description_width": "160px"},
            layout=Layout(width="400px"),
        )

    n_estimator_slider = _make_slider(
        "Number of Estimators:", value=10, minimum=1, maximum=20
    )
    max_depth_slider = _make_slider(
        "Max Depth:", value=7, minimum=1, maximum=20
    )
    min_samples_split_slider = _make_slider(
        "Min Samples Split:", value=15, minimum=5, maximum=30
    )

    ui_box = VBox([
        Label(value="📊 Controls", layout=Layout(margin="0 0 0 0")),
    ])

    interactive_plot = interactive(
        rf_regression,
        df_loaded=fixed(df_loaded),
        n_estimators=n_estimator_slider,
        max_depth=max_depth_slider,
        min_samples_split=min_samples_split_slider,
    )

    display(ui_box, interactive_plot)