In [4]:
#bootstrapping repo root + .env

import sys
from pathlib import Path
from dotenv import load_dotenv
import os

# Walk upwards from the current working directory until a directory holding a
# .env file is found; that directory is treated as the repository root.
start_dir = Path().resolve()
root = start_dir
while root != root.parent and not (root / ".env").exists():
    root = root.parent

# The walk also stops at the filesystem root when no ancestor contains a .env.
# Do not silently adopt that directory as the repo root (it would be appended to
# sys.path and become the base of every artifact path); fall back to the
# starting directory and say so instead.
if not (root / ".env").exists():
    print("Warning: no .env found in", start_dir, "or any parent directory; using it as root dir")
    root = start_dir

root_dir = str(root)
print("Root dir:", root_dir)

# Put the root on sys.path so modules located there can be imported.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# The result of load_dotenv is not inspected; the assert below is what actually
# verifies that the API key is available in the environment.
load_dotenv(root / ".env")

assert os.getenv("HOPSWORKS_API_KEY"), "Missing HOPSWORKS_API_KEY in .env"
print("Loaded .env successfully")

Root dir: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project
Loaded .env successfully


In [5]:
import json
import joblib
import hopsworks
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, confusion_matrix

In [6]:
# Training configuration and output paths

# Seed and hold-out fraction used as defaults by the split and model helpers.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Columns stripped from the feature matrix when present
# (the feature view query includes event_time, so it is dropped as well).
DROP_FEATURE_COLS = ["subject_id", "day_in_study", "event_time"]

# Feature Views created in Notebook 2.
# key -> (feature_view_name, feature_view_version, target_col, human_label)
DATASETS = {
    "energy_modea": ("mcphases_energy_modea_fv", 1, "y_energy_cls3", "Energy (Mode A)"),
    "energy_modeb": ("mcphases_energy_modeb_fv", 1, "y_energy_cls3", "Energy (Mode B, lag1)"),
    "mood_modea": ("mcphases_mood_modea_fv", 1, "y_mood_stability_cls3", "Mood stability (Mode A)"),
    "mood_modeb": ("mcphases_mood_modeb_fv", 1, "y_mood_stability_cls3", "Mood stability (Mode B, lag1)"),
}

# Base folder for saved models, created up front if it does not exist yet.
ARTIFACTS_DIR = Path(root_dir, "artifacts", "models")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

In [12]:
# Log in to Hopsworks with the Python engine and fetch the project's feature
# store handle. `fs` is read by load_xy_from_fv and `project` by the model
# registry cell further down.
# NOTE(review): presumably authenticates via HOPSWORKS_API_KEY from the
# environment - confirm.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
print("Logged in to Hopsworks, got Feature Store")

2026-01-11 19:26:34,836 INFO: Closing external client and cleaning up certificates.
2026-01-11 19:26:34,856 INFO: Connection closed.
2026-01-11 19:26:34,866 INFO: Initializing external client
2026-01-11 19:26:34,867 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 19:26:35,834 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3208
Logged in to Hopsworks, got Feature Store.


In [13]:
#loading and prepping dataset from feature views

def load_xy_from_fv(fv_name: str, fv_version: int, target_col: str, drop_cols=DROP_FEATURE_COLS):
    """Read a feature view's training data and split it into model inputs.

    Args:
        fv_name: Name of the feature view to fetch from the global feature store ``fs``.
        fv_version: Version of that feature view.
        target_col: Name of the label column; it must be present in the data read.
        drop_cols: Column names removed from the feature matrix when present.

    Returns:
        A tuple ``(X, y, groups, df)``: the feature frame (drop columns, target and
        subject id removed, bool columns cast to int, +/-inf replaced by NaN), the
        target cast to int, the subject id cast to int, and the full
        features+labels frame the other three were derived from.

    Raises:
        AssertionError: If the subject id column cannot be identified uniquely or
            the target column is missing.
    """
    feature_view = fs.get_feature_view(fv_name, version=fv_version)

    # Arguments shared by both spellings of the primary-key flag.
    read_kwargs = {
        "description": f"{fv_name}_inmem_training",
        "event_time": True,
        "training_helper_columns": True,
    }
    try:
        features_df, labels_df = feature_view.training_data(primary_key=True, **read_kwargs)
    except TypeError:
        # Retry with the plural keyword when the singular spelling is rejected.
        features_df, labels_df = feature_view.training_data(primary_keys=True, **read_kwargs)

    df = pd.concat([features_df, labels_df], axis=1)

    # Locate the subject id column: exact name first, otherwise a unique suffix match.
    sid_col = "subject_id"
    if sid_col not in df.columns:
        candidates = [col for col in df.columns if col.endswith("subject_id")]
        assert len(candidates) == 1, f"Could not uniquely find subject_id column. Found: {candidates}"
        sid_col = candidates[0]

    assert target_col in df.columns, f"{fv_name} missing target {target_col}"

    y = df[target_col].astype(int)
    groups = df[sid_col].astype(int)

    # Everything that must not reach the model: configured drops, label, subject id.
    present_drop_cols = [col for col in drop_cols if col in df.columns]
    X = df.drop(columns=present_drop_cols + [target_col, sid_col])

    # Cast boolean columns to integers, then turn infinities into missing values.
    bool_cols = [col for col in X.columns if X[col].dtype == bool]
    for col in bool_cols:
        X[col] = X[col].astype(int)
    X = X.replace([np.inf, -np.inf], np.nan)

    return X, y, groups, df

In [9]:
#subject-wise training/testing split, so that no leakage across people

def group_split(X, y, groups, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Make a single group-wise train/test split of features, target and groups.

    Args:
        X: Feature frame.
        y: Target series, row-aligned with ``X``.
        groups: Group label per row (the subject id), row-aligned with ``X``.
        test_size: Passed to ``GroupShuffleSplit`` as ``test_size``.
        random_state: Seed passed to ``GroupShuffleSplit``.

    Returns:
        ``(X_train, X_test, y_train, y_test, g_train, g_test)``, each an
        independent copy of the selected rows.
    """
    def _take(obj, positions):
        # Positional selection, copied so callers never get a view of the input.
        return obj.iloc[positions].copy()

    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_pos, test_pos = next(gss.split(X, y, groups=groups))

    return (
        _take(X, train_pos), _take(X, test_pos),
        _take(y, train_pos), _take(y, test_pos),
        _take(groups, train_pos), _take(groups, test_pos),
    )

In [14]:
#training and evaluating the model

def train_and_eval_tree_model(X_train, y_train, X_test, y_test, random_state=RANDOM_STATE):
    """Fit a median-imputed random forest and score it on the test split.

    Args:
        X_train, y_train: Data the pipeline is fitted on.
        X_test, y_test: Data the fitted pipeline is evaluated on.
        random_state: Seed for the random forest.

    Returns:
        ``(model, metrics, report)``: the fitted ``Pipeline``, a dict with
        accuracy, balanced accuracy, macro F1, the confusion matrix (nested
        lists) and the majority-class baseline accuracy, and the text
        classification report.
    """
    forest = RandomForestClassifier(
        n_estimators=800,
        max_features="sqrt",
        min_samples_leaf=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=random_state,
    )
    model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("clf", forest),
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_pred)),
        "f1_macro": float(f1_score(y_test, y_pred, average="macro")),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
    }

    # Reference point: always predict the most frequent class of the training split.
    most_common_class = int(y_train.value_counts().idxmax())
    metrics["majority_baseline_accuracy"] = float((y_test == most_common_class).mean())

    return model, metrics, classification_report(y_test, y_pred, digits=4)

In [15]:
# Train, evaluate and persist one model per entry in DATASETS (four in total).
# `results` maps dataset key -> metrics dict and `trained_models` maps
# dataset key -> fitted pipeline; `results` is printed by a later cell.

results = {}
trained_models = {}

for key, (fv_name, fv_version, target_col, label) in DATASETS.items():
    # Banner so each dataset's section is easy to find in the cell output.
    print("\n" + "="*80)
    print(label)
    print("Feature View:", fv_name, f"v{fv_version}", "| Target:", target_col)

    # raw_df (the full features+labels frame) is not used further in this cell.
    X, y, groups, raw_df = load_xy_from_fv(fv_name, fv_version, target_col)

    print("X shape:", X.shape, "| y labeled:", y.shape[0])
    print("Classes:", sorted(y.unique().tolist()))
    print("Feature count:", X.shape[1])
    
    # Split by subject id (groups) using the configured test size and seed.
    X_train, X_test, y_train, y_test, g_train, g_test = group_split(X, y, groups)

    print("Train subjects:", g_train.nunique(), "| Test subjects:", g_test.nunique())
    print("Train rows:", len(X_train), "| Test rows:", len(X_test))

    model, metrics, report = train_and_eval_tree_model(
        X_train, y_train, X_test, y_test,
   )

    print("Accuracy:", metrics["accuracy"])
    print("Balanced acc:", metrics["balanced_accuracy"])
    print("F1 macro:", metrics["f1_macro"])
    print("Majority baseline acc:", metrics["majority_baseline_accuracy"])
    print("\nClassification report:\n", report)
    print("\nConfusion matrix:\n", np.array(metrics["confusion_matrix"]))
    
    # Each dataset key gets its own sub-directory, so the models of different
    # keys do not overwrite each other. Files of the same key are written to
    # fixed names on every run.
    out_dir = ARTIFACTS_DIR / key
    out_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(model, out_dir / "model.joblib")

    # Save the feature column names in the order of X's columns.
    feature_cols = list(X.columns)
    (out_dir / "feature_columns.json").write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

    # Save the full metrics dict, including the confusion matrix.
    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")

    # Save run metadata: source feature view, target, and the split/seed settings.
    (out_dir / "info.json").write_text(json.dumps({
        "feature_view": f"{fv_name}_v{fv_version}",
        "target_col": target_col,
        "label": label,
        "drop_feature_cols": DROP_FEATURE_COLS,
        "random_state": RANDOM_STATE,
        "test_size": TEST_SIZE
    }, indent=2), encoding="utf-8")

    results[key] = metrics
    trained_models[key] = model

print("\nDone. Saved models to:", ARTIFACTS_DIR)


Energy (Mode A)
Feature View: mcphases_energy_modea_fv v1 | Target: y_energy_cls3
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.51s) 
2026-01-11 19:27:03,259 INFO: Computing insert statistics

X shape: (3331, 12) | y labeled: 3331
Classes: [0, 1, 2]
Feature count: 12


Train subjects: 33 | Test subjects: 9
Train rows: 2597 | Test rows: 734















































Accuracy: 0.6553133514986376
Balanced acc: 0.6627464467237402
F1 macro: 0.6574023902889995
Majority baseline acc: 0.41825613079019075

Classification report:
               precision    recall  f1-score   support

           0     0.5447    0.6809    0.6052       188
           1     0.6489    0.5961    0.6214       307
           2     0.7834    0.7113    0.7456       239

    accuracy                         0.6553       734
   macro avg     0.6590    0.6627    0.6574       734
weighted avg     0.6660    0.6553    0.6577       734


Confusion matrix:
 [[128  50  10]
 

In [16]:
# Show the metrics collected for every trained model as indented JSON.
print(json.dumps(results, indent=2))

{
  "energy_modea": {
    "accuracy": 0.6553133514986376,
    "balanced_accuracy": 0.6627464467237402,
    "f1_macro": 0.6574023902889995,
    "confusion_matrix": [
      [
        128,
        50,
        10
      ],
      [
        87,
        183,
        37
      ],
      [
        20,
        49,
        170
      ]
    ],
    "majority_baseline_accuracy": 0.41825613079019075
  },
  "energy_modeb": {
    "accuracy": 0.6903703703703704,
    "balanced_accuracy": 0.6895658263305323,
    "f1_macro": 0.687837659196919,
    "confusion_matrix": [
      [
        107,
        55,
        8
      ],
      [
        65,
        179,
        36
      ],
      [
        7,
        38,
        180
      ]
    ],
    "majority_baseline_accuracy": 0.4148148148148148
  },
  "mood_modea": {
    "accuracy": 0.6158038147138964,
    "balanced_accuracy": 0.5388652555855296,
    "f1_macro": 0.5349849151857761,
    "confusion_matrix": [
      [
        26,
        43,
        53
      ],
      [
       

In [18]:
# Model registry handle of the logged-in project; register_model below uses it.
mr = project.get_model_registry()

def numeric_only(metrics: dict) -> dict:
    """Return only the numeric entries of ``metrics``, each converted to float.

    Hopsworks only accepts numeric values as model metrics, so every value that
    is not an int or a float is left out. Booleans are excluded explicitly even
    though ``bool`` is a subclass of ``int``.
    """
    return {
        name: float(value)
        for name, value in metrics.items()
        if not isinstance(value, bool) and isinstance(value, (int, float))
    }

def register_model(model_key: str, model_dir: Path, metrics: dict, description: str):
    """Create a model in the Hopsworks registry and upload its artifact folder.

    Args:
        model_key: Dataset key; lower-cased and embedded in the registry name
            ``mcphases_<model_key>_randomforest``.
        model_dir: Local folder uploaded as the model's artifacts.
        metrics: Metrics dict; only its numeric entries are sent to the registry.
        description: Free-text description stored with the model.
    """
    registry_name = f"mcphases_{model_key.lower()}_randomforest"  # lower-case for safety
    hw_metrics = numeric_only(metrics)

    registry_entry = mr.python.create_model(
        name=registry_name,
        metrics=hw_metrics,
        description=description,
    )
    # Upload the whole folder rather than just model.joblib, so the feature
    # column list and the full metrics.json travel with the model.
    registry_entry.save(str(model_dir))
    print(f"Registered: {registry_name} | metrics={list(hw_metrics.keys())}")


# Register every model found on disk, using the metrics stored next to it.
for key, (fv_name, fv_version, target_col, label) in DATASETS.items():
    model_dir = ARTIFACTS_DIR / key

    # Both files must already exist in the model folder before registering.
    for required_file in ("model.joblib", "metrics.json"):
        assert (model_dir / required_file).exists(), f"Missing {required_file} in {model_dir}"

    metrics_path = model_dir / "metrics.json"
    metrics_full = json.loads(metrics_path.read_text(encoding="utf-8"))

    model_description = f"{label}. RandomForestClassifier. Target={target_col}. Subject-wise split."
    register_model(
        model_key=key,
        model_dir=model_dir,
        metrics=metrics_full,
        description=model_description,
    )

2026-01-11 19:32:11,543 INFO: Closing external client and cleaning up certificates.
2026-01-11 19:32:11,548 INFO: Connection closed.
2026-01-11 19:32:11,550 INFO: Initializing external client
2026-01-11 19:32:11,551 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 19:32:12,967 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3208
Logged in to Hopsworks, got Model Registry.



  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_energy_modea_randomforest/1
Registered: mcphases_energy_modea_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/energy_mod…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_energy_modeb_randomforest/1
Registered: mcphases_energy_modeb_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modea…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modea…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modea…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modea…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_mood_modea_randomforest/1
Registered: mcphases_mood_modea_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeb…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeb…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeb…

Uploading /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project/artifacts/models/mood_modeb…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/3208/models/mcphases_mood_modeb_randomforest/1
Registered: mcphases_mood_modeb_randomforest | metrics=['accuracy', 'balanced_accuracy', 'f1_macro', 'majority_baseline_accuracy']
