In [2]:
import pandas as pd
import numpy as np
import os
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump

# === Read the pre-split train/test CSVs (paths relative to the working dir) ===
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train.csv",
        "../data/processed/test.csv",
    )
)

# === Separate the feature matrix from the regression target ===
target_column = "MedHouseVal"  # Update based on your data
X_train, y_train = train_data.drop(columns=target_column), train_data[target_column]
X_test, y_test = test_data.drop(columns=target_column), test_data[target_column]

# Show which feature columns remain once the target has been removed.
print(X_train.columns.to_list())

# === Define experiment ===
# Point the MLflow client at the local tracking server and select the
# experiment under which this run is recorded.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("housing-DecisionTree")

# Single source of truth for the hyperparameters: the same dict both builds
# the model and is logged, so the values recorded in MLflow cannot drift from
# the values the model was actually trained with.
model_params = {"max_depth": 5, "random_state": 42}

with mlflow.start_run():
    # Model and training
    model = DecisionTreeRegressor(**model_params)
    mlflow.log_params(model_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out test split
    y_pred = model.predict(X_test)

    # Metrics: RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)

    # Save model: a local joblib copy in the working directory, plus the
    # model logged to the active MLflow run under "model".
    dump(model, "model.joblib")
    mlflow.sklearn.log_model(model, "model")

    print(f"✅ Decision Tree Model logged with R2 score: {r2:.4f}")


['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
✅ Decision Tree Model logged with R2 score: 0.5997
