In [12]:
#bootstrapping repo root + .env

import sys
from pathlib import Path
from dotenv import load_dotenv
import os

# Find the project root: the nearest directory at or above the current working
# directory that contains a ".env" file. If no such directory exists, the walk
# stops at the filesystem root.
root = Path().resolve()
while True:
    at_filesystem_root = root.parent == root
    if at_filesystem_root or (root / ".env").exists():
        break
    root = root.parent

root_dir = str(root)
print("Root dir:", root_dir)

# Put the project root on the import path (once) so later cells can import from it.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load variables from the discovered .env file into the process environment.
load_dotenv(root / ".env")

# Fail early if the Hopsworks credential is absent or empty.
assert os.environ.get("HOPSWORKS_API_KEY"), "Missing HOPSWORKS_API_KEY in .env"
print("Loaded .env successfully")

Root dir: /Users/sreenijaveladri/Downloads/llm_project_starter/scalable-ml-project
Loaded .env successfully


In [13]:
#loading models and feature column order from the Hopsworks Model Registry

import hopsworks
import json
import joblib
from pathlib import Path

import warnings
# Suppress the "is_sparse is deprecated" message and every DeprecationWarning.
warnings.filterwarnings(action="ignore", message=".*is_sparse is deprecated.*")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# One Hopsworks login provides both handles used by the rest of the notebook.
project = hopsworks.login(engine="python")
fs, mr = project.get_feature_store(), project.get_model_registry()
print("Logged in, Got Feature Store + Model Registry")

# Dataset keys used in training: each target crossed with each mode.
MODEL_KEYS = [
    f"{target}_{mode}"
    for target in ("energy", "mood")
    for mode in ("modea", "modeb")
]

def normalize_feature_name(name: str) -> str:
    """Return ``name`` lower-cased, with each ``"__"`` pair replaced by ``"_"``.

    The replacement is a single left-to-right pass, so a run of three
    underscores ends up as two, not one.
    """
    lowered = name.lower()
    return lowered.replace("__", "_")

def load_latest_model(model_name: str):
    """Download the highest-version registry entry for ``model_name`` and load it.

    Reads ``model.joblib`` and ``feature_columns.json`` from the directory
    returned by the registry entry's ``download()``.

    Returns:
        A 5-tuple ``(registry_entry, model, feature_cols_raw, feature_cols,
        local_dir)``. ``feature_cols`` is ``feature_cols_raw`` with every name
        passed through ``normalize_feature_name``; ``local_dir`` is a ``Path``.

    Raises:
        ValueError: if the registry returns nothing for ``model_name``.
    """
    candidates = mr.get_models(model_name)
    if not candidates:
        raise ValueError(f"No models found in registry for: {model_name}")

    # Stable ascending sort by version; the final element is the newest.
    by_version = list(candidates)
    by_version.sort(key=lambda entry: entry.version)
    newest = by_version[-1]

    artifact_dir = Path(newest.download())

    # NOTE(review): joblib.load unpickles the artifact, which can execute
    # arbitrary code - only load from a registry you trust.
    estimator = joblib.load(artifact_dir / "model.joblib")

    with open(artifact_dir / "feature_columns.json", encoding="utf-8") as handle:
        raw_columns = json.load(handle)

    # Normalised so the names match Feature Store column names.
    normalised_columns = [normalize_feature_name(column) for column in raw_columns]

    return newest, estimator, raw_columns, normalised_columns, artifact_dir

# Registry of loaded models, keyed by dataset key. Each entry holds the model
# object, its registry name/version, and both raw and normalised feature names.
MODELS = {}
for key in MODEL_KEYS:
    model_name = f"mcphases_{key}_randomforest"
    meta, model, cols_raw, cols_norm, model_dir = load_latest_model(model_name)

    MODELS[key] = dict(
        model_name=model_name,
        model_version=meta.version,
        model=model,
        feature_cols_raw=cols_raw,
        feature_cols=cols_norm,
        model_dir=str(model_dir),
    )

    print(f"Loaded {model_name} v{meta.version} | n_features={len(cols_norm)}")


2026-01-11 20:13:41,265 INFO: Closing external client and cleaning up certificates.
2026-01-11 20:13:41,272 INFO: Connection closed.
2026-01-11 20:13:41,274 INFO: Initializing external client
2026-01-11 20:13:41,276 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 20:13:42,065 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3208
Logged in. Got Feature Store + Model Registry.


Downloading: 0.000%|          | 0/64414439 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/323 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/241 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/244 elapsed<00:00 remaining<?

Loaded mcphases_energy_modea_randomforest v1 | n_features=12


Downloading: 0.000%|          | 0/57745975 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/319 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/247 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/261 elapsed<00:00 remaining<?

Loaded mcphases_energy_modeb_randomforest v1 | n_features=13


Downloading: 0.000%|          | 0/58067239 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/321 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/255 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/244 elapsed<00:00 remaining<?

Loaded mcphases_mood_modea_randomforest v1 | n_features=12


Downloading: 0.000%|          | 0/49410935 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/320 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/261 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/259 elapsed<00:00 remaining<?

Loaded mcphases_mood_modeb_randomforest v1 | n_features=13


In [14]:
import numpy as np
import pandas as pd

# Feature Views created in Hopsworks, one per dataset key.
FV_NAME = dict(
    energy_modea="mcphases_energy_modea_fv",
    energy_modeb="mcphases_energy_modeb_fv",
    mood_modea="mcphases_mood_modea_fv",
    mood_modeb="mcphases_mood_modeb_fv",
)

# Label column per dataset key; both modes of a target share one label.
LABEL_COL = dict(
    energy_modea="y_energy_cls3",
    energy_modeb="y_energy_cls3",
    mood_modea="y_mood_stability_cls3",
    mood_modeb="y_mood_stability_cls3",
)

# Version requested for every feature view above.
FV_VERSION = 1

# One-hot phase columns, including the "phase_nan" bucket.
PHASE_DUMMIES = [
    "phase_fertility",
    "phase_follicular",
    "phase_luteal",
    "phase_menstrual",
    "phase_nan",
]

# Feature-view handles already fetched, keyed by lower-cased dataset key.
_FV_CACHE = {}

def _get_fv(dataset_key: str):
    """Return the feature view for ``dataset_key``, fetching it at most once.

    The key is lower-cased before use. On a cache miss the view name is looked
    up in ``FV_NAME`` and requested from the feature store at ``FV_VERSION``;
    the handle is then stored in ``_FV_CACHE``.

    Raises:
        KeyError: if the lower-cased key has no entry in ``FV_NAME``.
    """
    cache_key = dataset_key.lower()
    if cache_key not in _FV_CACHE:
        _FV_CACHE[cache_key] = fs.get_feature_view(FV_NAME[cache_key], version=FV_VERSION)
    return _FV_CACHE[cache_key]

def _normalize_key(name: str) -> str:
    """Replace each ``"__"`` pair in ``name`` with ``"_"`` (single pass), then lower-case.

    Same transformation as ``normalize_feature_name``.
    """
    collapsed = "_".join(name.split("__"))
    return collapsed.lower()

#single-row df aligned to model feature cols
def prepare_X_from_inputs(inputs: dict, feature_cols: list[str]) -> pd.DataFrame:
    """Build a one-row DataFrame from ``inputs`` with columns ordered as ``feature_cols``.

    Input keys are stringified and normalised with ``_normalize_key``. A
    non-``None`` ``"phase"`` value is expanded into the ``PHASE_DUMMIES``
    one-hot columns, overwriting any dummy values supplied directly; a phase
    with no matching dummy sets ``phase_nan`` instead. Features absent from
    ``inputs`` become ``0`` for phase dummies and ``NaN`` for everything else.
    Boolean columns are cast to ``int``.
    """
    values = {_normalize_key(str(key)): value for key, value in inputs.items()}

    # phase -> one-hot encoded dummies
    phase = values.get("phase")
    if phase is not None:
        # Start from all-zero dummies, then switch on exactly one of them.
        values.update(dict.fromkeys(PHASE_DUMMIES, 0))
        ph = str(phase).strip().lower()
        candidate = f"phase_{ph}"
        values[candidate if candidate in PHASE_DUMMIES else "phase_nan"] = 1
        values.pop("phase", None)

    # Build the row in exact feature order: 0 for missing phase dummies,
    # np.nan for any other missing feature.
    ordered = [_normalize_key(column) for column in feature_cols]
    row = {
        column: int(values.get(column, 0)) if column in PHASE_DUMMIES else values.get(column, np.nan)
        for column in ordered
    }

    X = pd.DataFrame([row])

    # Convert bool columns to int.
    bool_columns = [column for column in X.columns if X[column].dtype == bool]
    if bool_columns:
        X = X.astype({column: int for column in bool_columns})

    return X

#fetching a single row from the online store using the dataset's feature view, and aligning it to the model's feature order
def fetch_online_X(dataset_key: str, subject_id: int, day_in_study: int, passed_features: dict | None = None) -> pd.DataFrame:
    """Fetch one online feature vector and align it to the model's feature order.

    The vector is requested as a named (pandas) result and features are picked
    by name. Inferring the layout from the vector's length is not reliable:
    "keys + event_time + features" and "keys + features + label" have the same
    length, so a label-bearing vector would be zipped against shifted names
    without any error.

    Args:
        dataset_key: Key into ``MODELS`` / ``FV_NAME`` (case-insensitive).
        subject_id: Serving-key value; cast to ``int``.
        day_in_study: Serving-key value; cast to ``int``.
        passed_features: Optional request-time feature values. Keys are
            normalised with ``_normalize_key``; an empty dict is treated as
            ``None``.

    Returns:
        A one-row DataFrame whose columns are exactly
        ``MODELS[key]["feature_cols"]``, in that order, with boolean columns
        cast to ``int``.

    Raises:
        RuntimeError: if the feature view does not return exactly one row, or
            if the returned vector lacks any feature the model expects.
    """
    k = dataset_key.lower()
    fv = _get_fv(k)

    entry = {"subject_id": int(subject_id), "day_in_study": int(day_in_study)}

    pf = None
    if passed_features:
        pf = {_normalize_key(str(kk)): vv for kk, vv in passed_features.items()}

    # Ask for a named result so features are selected by name, not by position.
    vec = fv.get_feature_vector(entry=entry, passed_features=pf, return_type="pandas")
    records = vec.to_dict(orient="records")
    if len(records) != 1:
        raise RuntimeError(
            f"Expected exactly one feature vector for {k} entry={entry}, got {len(records)}"
        )
    data = {_normalize_key(str(name)): value for name, value in records[0].items()}

    # Pull the feature order from the model and fail loudly on any gap instead
    # of silently feeding NaN columns to the model.
    feature_cols = MODELS[k]["feature_cols"]
    missing = [c for c in feature_cols if c not in data]
    if missing:
        raise RuntimeError(
            f"Feature vector for {k} is missing model features: {missing}. "
            f"Returned columns: {sorted(data)}"
        )

    # Build X in exact model feature order; keys, event time and label are dropped.
    X = pd.DataFrame([{c: data[c] for c in feature_cols}])

    # bool -> int
    for c in X.columns:
        if X[c].dtype == bool:
            X[c] = X[c].astype(int)

    return X


In [15]:
#predicting from manual inputs or from the online feature store
def predict_dataset(dataset_key: str,
                    subject_id: int | None = None,
                    day_in_study: int | None = None,
                    inputs: dict | None = None,
                    passed_features: dict | None = None) -> dict:
    """Predict with the model registered under ``dataset_key``.

    Args:
        dataset_key: Key into ``MODELS`` (case-insensitive).
        subject_id: Online serving key; required when ``inputs`` is ``None``.
        day_in_study: Online serving key; required when ``inputs`` is ``None``.
        inputs: Manual feature values. When given, the feature store is not
            queried and ``passed_features`` is not used.
        passed_features: Request-time feature values forwarded to the online
            lookup.

    Returns:
        ``{"dataset": <key actually used>, "pred": int, "proba": list | None}``.
        ``proba`` is ``None`` when the model has no ``predict_proba``.

    Raises:
        ValueError: if neither ``inputs`` nor both serving keys are provided.
    """
    k = dataset_key.lower()
    model = MODELS[k]["model"]
    feature_cols = MODELS[k]["feature_cols"]

    if inputs is not None:
        X = prepare_X_from_inputs(inputs, feature_cols)
    else:
        if subject_id is None or day_in_study is None:
            raise ValueError("For online inference, provide subject_id and day_in_study")
        X = fetch_online_X(k, subject_id, day_in_study, passed_features=passed_features)

    # If the lag feature is missing/NaN, fall back to the modea model.
    if k.endswith("modeb"):
        needed = "lag1_energy" if k.startswith("energy") else "lag1_mood"
        # pd.isna already covers None, so no separate "is None" check is needed.
        if needed in X.columns and pd.isna(X.loc[0, needed]):
            fallback = k.replace("modeb", "modea")
            return predict_dataset(fallback, subject_id=subject_id, day_in_study=day_in_study, inputs=inputs)

    pred = int(model.predict(X)[0])
    proba = None
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X)[0].tolist()

    return {"dataset": k, "pred": pred, "proba": proba}

def predict_energy(subject_id: int | None = None,
                   day_in_study: int | None = None,
                   inputs: dict | None = None,
                   lag1_energy: int | None = None) -> dict:
    """Predict the energy class, using modeb when a lag value is supplied.

    Args:
        subject_id: Online serving key (used when ``inputs`` is ``None``).
        day_in_study: Online serving key (used when ``inputs`` is ``None``).
        inputs: Manual feature values; when given, the feature store is not
            queried.
        lag1_energy: Optional lag value; cast to ``int``. When given it
            overrides any ``lag1_energy`` already present in ``inputs``.

    Returns:
        The dict produced by ``predict_dataset``.
    """
    # No lag supplied: use modea.
    if lag1_energy is None:
        return predict_dataset("energy_modea", subject_id, day_in_study, inputs=inputs)

    lag = {"lag1_energy": int(lag1_energy)}
    if inputs is not None:
        # predict_dataset does not use passed_features when inputs are given,
        # so the lag has to travel inside inputs; otherwise it would be dropped
        # and the call would fall back to modea. The caller's dict is not mutated.
        inputs = {**inputs, **lag}
    return predict_dataset("energy_modeb", subject_id, day_in_study, inputs=inputs, passed_features=lag)

def predict_mood(subject_id: int | None = None,
                 day_in_study: int | None = None,
                 inputs: dict | None = None,
                 lag1_mood: int | None = None) -> dict:
    """Predict the mood-stability class, using modeb when a lag value is supplied.

    Args:
        subject_id: Online serving key (used when ``inputs`` is ``None``).
        day_in_study: Online serving key (used when ``inputs`` is ``None``).
        inputs: Manual feature values; when given, the feature store is not
            queried.
        lag1_mood: Optional lag value; cast to ``int``. When given it
            overrides any ``lag1_mood`` already present in ``inputs``.

    Returns:
        The dict produced by ``predict_dataset``.
    """
    # No lag supplied: use modea.
    if lag1_mood is None:
        return predict_dataset("mood_modea", subject_id, day_in_study, inputs=inputs)

    lag = {"lag1_mood": int(lag1_mood)}
    if inputs is not None:
        # predict_dataset does not use passed_features when inputs are given,
        # so the lag has to travel inside inputs; otherwise it would be dropped
        # and the call would fall back to modea. The caller's dict is not mutated.
        inputs = {**inputs, **lag}
    return predict_dataset("mood_modeb", subject_id, day_in_study, inputs=inputs, passed_features=lag)

In [18]:
# ONLINE TEST - take an existing key from the engineered FG (offline read)
fg_test = fs.get_feature_group("mcphases_energy_modea_fg", version=2)
sample = fg_test.read().iloc[0]

sid, day = int(sample["subject_id"]), int(sample["day_in_study"])
online_key = {"subject_id": sid, "day_in_study": day}

print("Testing online on existing row:", online_key)
print("Energy:", predict_energy(**online_key))
print("Mood:  ", predict_mood(**online_key))

# MANUAL TEST - caller-supplied inputs, no feature-store lookup
manual_inputs = dict(
    is_weekend=0,
    sleep_duration_minutes=420,
    resting_heart_rate_value=58,
    cramps_num=1,
    stress_num=2,
    headaches_num=0,
    sleepissue_num=0,
    phase="Luteal",
)

print("\nTesting manual inputs (modeA):")
print("Energy:", predict_energy(inputs=manual_inputs))
print("Mood:  ", predict_mood(inputs=manual_inputs))

# modeB needs the lag feature, so it is added to a copy of the manual inputs.
print("\nTesting manual inputs (modeB with lag overrides):")
print(predict_dataset("energy_modeb", inputs=dict(manual_inputs, lag1_energy=1)))
print(predict_dataset("mood_modeb",   inputs=dict(manual_inputs, lag1_mood=2)))

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.39s) 
Testing ONLINE on existing row: {'subject_id': 1, 'day_in_study': 1}
Energy: {'dataset': 'energy_modea', 'pred': 0, 'proba': [0.7226359244246936, 0.21123167931956804, 0.06613239625573691]}
Mood:   {'dataset': 'mood_modea', 'pred': 2, 'proba': [0.11279066663745003, 0.4151351066108977, 0.4720742267516524]}

Testing MANUAL inputs (ModeA):
Energy: {'dataset': 'energy_modea', 'pred': 1, 'proba': [0.07013379836121844, 0.6295087774167568, 0.30035742422202544]}
Mood:   {'dataset': 'mood_modea', 'pred': 2, 'proba': [0.08181239428744776, 0.04909231087115665, 0.8690952948413957]}

Testing MANUAL inputs (ModeB with lag overrides):
{'dataset': 'energy_modeb', 'pred': 1, 'proba': [0.08401855394836258, 0.6878914090909224, 0.2280900369607157]}
{'dataset': 'mood_modeb', 'pred': 2, 'proba': [0.026562904750626518, 0.05033375716744299, 0.9231033380819308]}
