In [3]:
%%bash
# Register the project's virtual environment as a selectable Jupyter kernel.
#
# Fail fast: without `set -e`, bash keeps going after a failed step, so a
# missing .env would let the install commands below run against whichever
# `python` happens to be first on PATH instead of the project environment.
set -e

# Activate the virtual environment (path is relative to the notebook's
# working directory) so the following commands target it.
source .env/bin/activate

# ipykernel is what lets Jupyter launch this environment as a kernel.
uv pip install ipykernel -q

# Install the kernelspec for the current user only.
#   --name          internal kernelspec identifier (directory name on disk)
#   --display-name  label shown in the Jupyter kernel picker
python -m ipykernel install --user --name=mlops --display-name="Python (oppe2)"

Installed kernelspec mlops in /home/jupyter/.local/share/jupyter/kernels/mlops


In [4]:
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump

# NOTE(review): this suppresses every warning for the rest of the kernel
# session, including deprecation and convergence warnings. Consider narrowing
# it to specific categories once the noisy ones are known.
warnings.filterwarnings('ignore')

# --- Configuration: everything a reader might want to tune -----------------
DATA_PATH = "data/housing.csv"
MODEL_PATH = Path("./artifacts/california_rf_model.joblib")
RANDOM_SEED = 42      # shared by the split and the forest for reproducibility
TEST_FRACTION = 0.2   # share of rows held out for evaluation

# 1. Load and prepare data
print("Loading California Housing Dataset...")
df = pd.read_csv(DATA_PATH)

# Separate features and target. `ocean_proximity` is dropped together with
# the target, so it is not part of the feature matrix.
X = df.drop(["median_house_value", "ocean_proximity"], axis=1)
y = df["median_house_value"]

# Create a synthetic sensitive attribute: 1 when a row's median_income is
# strictly above the dataset-wide median, else 0.
# NOTE(review): the column is added to X, so it is also fed to the model as a
# feature (and is a deterministic function of median_income). Confirm that is
# intended rather than keeping it only as a side-channel for later analysis.
X['high_income'] = (X['median_income'] > X['median_income'].median()).astype(int)
# Covers ALL rows (pre-split); select by index to align it with X_train/X_test.
sensitive_feature = X['high_income']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_SEED
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# 2. Train a simple but reasonably strong RandomForest model
model = RandomForestRegressor(
    n_estimators=100,   # same as scikit-learn's default (since 0.22); stated explicitly
    max_depth=15,       # cap tree depth to limit overfitting and training time
    random_state=RANDOM_SEED,
    n_jobs=-1           # use all CPU cores for speed
)

print("Training the model...")
model.fit(X_train, y_train)

# 3. Evaluate model on the held-out split
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # same units as the target
r2 = r2_score(y_test, y_pred)

print(f"RMSE on test set: {rmse:.2f}")
print(f"R2 Score on test set: {r2:.4f}")

# 4. Save model using joblib
# Create the output directory first: on a fresh checkout ./artifacts may not
# exist, and dump() would then fail with FileNotFoundError.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
dump(model, MODEL_PATH)
print("Model saved as 'california_rf_model.joblib'")

Loading California Housing Dataset...
Training set size: 16512
Test set size: 4128
Training the model...
RMSE on test set: 50201.61
R2 Score on test set: 0.8077
Model saved as 'california_rf_model.joblib'
